From c1c0b1a76d274047b60a63822aa345f3b32eabc3 Mon Sep 17 00:00:00 2001 From: Debian Science Team Date: Sun, 2 Feb 2025 11:17:13 +0000 Subject: [PATCH] Use fixed seeds for reproducible pseudorandomness Author: Rebecca N. Palmer Forwarded: no Gbp-Pq: Name fix_random_seeds.patch --- .../comparison/comparison_with_r.rst | 1 + doc/source/user_guide/advanced.rst | 1 + doc/source/user_guide/style.ipynb | 50 +++++++++++++++++-- doc/source/user_guide/visualization.rst | 1 + pandas/plotting/_core.py | 4 ++ pandas/plotting/_misc.py | 3 ++ 6 files changed, 56 insertions(+), 4 deletions(-) diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index a6cfcd46..836e3669 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -237,6 +237,7 @@ In pandas we may use :meth:`~pandas.pivot_table` method to handle this: import random import string + random.seed(123456) # for reproducibility baseball = pd.DataFrame( { diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 45353609..5126c732 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -590,6 +590,7 @@ they need to be sorted. As with any index, you can use :meth:`~DataFrame.sort_in import random + random.seed(123456) # for reproducibility random.shuffle(tuples) s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples)) s diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index f22a5064..46c671c5 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -78,8 +78,37 @@ "source": [ "import pandas as pd\n", "import numpy as np\n", - "import matplotlib as mpl\n", - "\n", + "import matplotlib as mpl\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# For reproducibility - this doesn't respect uuid_len or positionally-passed uuid but the places here that use that coincidentally bypass this anyway\n", + "from pandas.io.formats.style import Styler\n", + "next_uuid = 1000\n", + "class StylerReproducible(Styler):\n", + " def __init__(self, *args, uuid=None, **kwargs):\n", + " global next_uuid\n", + " if uuid is None:\n", + " uuid = str(next_uuid)\n", + " next_uuid = next_uuid + 1\n", + " super().__init__(*args, uuid=uuid, **kwargs)\n", + "Styler = StylerReproducible\n", + "pd.DataFrame.style = property(lambda self: StylerReproducible(self))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "df = pd.DataFrame({\n", " \"strings\": [\"Adam\", \"Mike\"],\n", " \"ints\": [1, 3],\n", @@ -104,6 +133,7 @@ "metadata": {}, "outputs": [], "source": [ + "np.random.seed(25) # for reproducibility\n", "weather_df = pd.DataFrame(np.random.rand(10,2)*5, \n", " index=pd.date_range(start=\"2021-01-01\", periods=10),\n", " columns=[\"Tokyo\", \"Beijing\"])\n", @@ -1394,7 +1424,6 @@ "outputs": [], "source": [ "# Hide the construction of the display chart from the user\n", - "import pandas as pd\n", "from IPython.display import HTML\n", "\n", "# Test series\n", @@ -1925,6 +1954,18 @@ "from pandas.io.formats.style import Styler" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "# For reproducibility\n", + "Styler = StylerReproducible\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2126,7 +2167,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" - } + }, + "record_timing": false }, "nbformat": 4, "nbformat_minor": 1 diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 9081d13e..b786b181 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1086,6 +1086,7 @@ are what constitutes the bootstrap plot. :suppress: np.random.seed(123456) + random.seed(123456) # for reproducibility - bootstrap_plot uses random.sample .. ipython:: python diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index cb5598a9..cb020ec6 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -604,6 +604,7 @@ def boxplot_frame_groupby( .. plot:: :context: close-figs + >>> np.random.seed(1234) >>> import itertools >>> tuples = [t for t in itertools.product(range(1000), range(4))] >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) @@ -1328,6 +1329,7 @@ class PlotAccessor(PandasObject): .. plot:: :context: close-figs + >>> np.random.seed(1234) >>> data = np.random.randn(25, 4) >>> df = pd.DataFrame(data, columns=list('ABCD')) >>> ax = df.plot.box() @@ -1392,6 +1394,7 @@ class PlotAccessor(PandasObject): .. plot:: :context: close-figs + >>> np.random.seed(1234) >>> df = pd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) >>> df['two'] = df['one'] + np.random.randint(1, 7, 6000) >>> ax = df.plot.hist(bins=12, alpha=0.5) @@ -1811,6 +1814,7 @@ class PlotAccessor(PandasObject): .. plot:: :context: close-figs + >>> np.random.seed(1234) >>> n = 10000 >>> df = pd.DataFrame({'x': np.random.randn(n), ... 'y': np.random.randn(n)}) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 18db460d..fd180e62 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -438,6 +438,8 @@ def bootstrap_plot( .. plot:: :context: close-figs + >>> np.random.seed(1234) + >>> random.seed(1234) # for reproducibility >>> s = pd.Series(np.random.uniform(size=100)) >>> pd.plotting.bootstrap_plot(s) # doctest: +SKIP
@@ -597,6 +599,7 @@ def autocorrelation_plot(series: Series, ax: Axes | None = None, **kwargs) -> Ax .. plot:: :context: close-figs + >>> np.random.seed(1234) >>> spacing = np.linspace(-9 * np.pi, 9 * np.pi, num=1000) >>> s = pd.Series(0.7 * np.random.rand(1000) + 0.3 * np.sin(spacing)) >>> pd.plotting.autocorrelation_plot(s) # doctest: +SKIP -- 2.30.2