From 4a4ab3ff5d8b6ea8015c94fd2d8566ae14f0b858 Mon Sep 17 00:00:00 2001
From: Debian Science Maintainers
 <debian-science-maintainers@lists.alioth.debian.org>
Date: Sat, 5 Oct 2024 11:43:25 +0100
Subject: [PATCH] Use cached datasets in tests and examples

Also remove a download that isn't actually used in that example.

This allows the tests to pass, and at least some of the examples to
be built, in an offline environment such as a Debian buildd.

The cached data is extracted from R packages by debian/datasets/*.

Author: Diane Trout <diane@ghic.org>, Rebecca N. Palmer <rebecca_palmer@zoho.com>
Forwarded: not-needed


Gbp-Pq: Name use_cached_datasets.patch
---
 docs/source/contingency_tables.rst                      | 4 ++--
 docs/source/datasets/index.rst                          | 2 +-
 docs/source/duration.rst                                | 4 ++--
 docs/source/example_formulas.rst                        | 2 +-
 docs/source/gee.rst                                     | 2 +-
 docs/source/gettingstarted.rst                          | 2 +-
 docs/source/index.rst                                   | 2 +-
 docs/source/mixed_linear.rst                            | 2 +-
 docs/source/release/version0.6.rst                      | 2 +-
 examples/notebooks/markov_regression.ipynb              | 9 +--------
 examples/notebooks/mixed_lm_example.ipynb               | 4 ++--
 examples/notebooks/regression_diagnostics.ipynb         | 4 ++--
 statsmodels/stats/tests/test_dist_dependant_measures.py | 4 ++--
 13 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/docs/source/contingency_tables.rst b/docs/source/contingency_tables.rst
index 6a21a7b..c3cf075 100644
--- a/docs/source/contingency_tables.rst
+++ b/docs/source/contingency_tables.rst
@@ -49,7 +49,7 @@ contingency table cell counts:
     import pandas as pd
     import statsmodels.api as sm
 
-    df = sm.datasets.get_rdataset("Arthritis", "vcd").data
+    df = sm.datasets.get_rdataset("Arthritis", "vcd", cache=True).data
     df.fillna({"Improved":"None"}, inplace=True)
 
     tab = pd.crosstab(df['Treatment'], df['Improved'])
@@ -185,7 +185,7 @@ contingency table.
 
 .. ipython:: python
 
-    df = sm.datasets.get_rdataset("VisualAcuity", "vcd").data
+    df = sm.datasets.get_rdataset("VisualAcuity", "vcd", cache=True).data
     df = df.loc[df.gender == "female", :]
     tab = df.set_index(['left', 'right'])
     del tab["gender"]
diff --git a/docs/source/datasets/index.rst b/docs/source/datasets/index.rst
index b088486..1ea669c 100644
--- a/docs/source/datasets/index.rst
+++ b/docs/source/datasets/index.rst
@@ -30,7 +30,7 @@ The `Rdatasets project <https://vincentarelbundock.github.io/Rdatasets/>`__ give
 .. ipython:: python
 
    import statsmodels.api as sm
-   duncan_prestige = sm.datasets.get_rdataset("Duncan", "carData")
+   duncan_prestige = sm.datasets.get_rdataset("Duncan", "carData", cache=True)
    print(duncan_prestige.__doc__)
    duncan_prestige.data.head(5)
 
diff --git a/docs/source/duration.rst b/docs/source/duration.rst
index 104f6e4..32dfa19 100644
--- a/docs/source/duration.rst
+++ b/docs/source/duration.rst
@@ -42,7 +42,7 @@ We fit the survival distribution only for the female subjects.
 
    import statsmodels.api as sm
 
-   data = sm.datasets.get_rdataset("flchain", "survival").data
+   data = sm.datasets.get_rdataset("flchain", "survival", cache=True).data
    df = data.loc[data.sex == "F", :]
    sf = sm.SurvfuncRight(df["futime"], df["death"])
 
@@ -169,7 +169,7 @@ depending on the value of the covariates.
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
 
-   data = sm.datasets.get_rdataset("flchain", "survival").data
+   data = sm.datasets.get_rdataset("flchain", "survival", cache=True).data
    del data["chapter"]
    data = data.dropna()
    data["lam"] = data["lambda"]
diff --git a/docs/source/example_formulas.rst b/docs/source/example_formulas.rst
index a10b348..a9b1439 100644
--- a/docs/source/example_formulas.rst
+++ b/docs/source/example_formulas.rst
@@ -47,7 +47,7 @@ and list-wise delete to remove missing observations:
 
 .. ipython:: python
 
-    df = sm.datasets.get_rdataset("Guerry", "HistData").data
+    df = sm.datasets.get_rdataset("Guerry", "HistData", cache=True).data
     df = df[['Lottery', 'Literacy', 'Wealth', 'Region']].dropna()
     df.head()
 
diff --git a/docs/source/gee.rst b/docs/source/gee.rst
index da3859f..3eefda6 100644
--- a/docs/source/gee.rst
+++ b/docs/source/gee.rst
@@ -24,7 +24,7 @@ within clusters using data on epilepsy seizures.
     import statsmodels.api as sm
     import statsmodels.formula.api as smf
 
-    data = sm.datasets.get_rdataset('epil', package='MASS').data
+    data = sm.datasets.get_rdataset('epil', package='MASS', cache=True).data
 
     fam = sm.families.Poisson()
     ind = sm.cov_struct.Exchangeable()
diff --git a/docs/source/gettingstarted.rst b/docs/source/gettingstarted.rst
index 8bf3863..5cef84d 100644
--- a/docs/source/gettingstarted.rst
+++ b/docs/source/gettingstarted.rst
@@ -50,7 +50,7 @@ We could download the file locally and then load it using ``read_csv``, but
 
 .. ipython:: python
 
-    df = sm.datasets.get_rdataset("Guerry", "HistData").data
+    df = sm.datasets.get_rdataset("Guerry", "HistData", cache=True).data
 
 The `Input/Output doc page <iolib.html>`_ shows how to import from various
 other formats.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 00fbf80..c48e8e9 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -25,7 +25,7 @@ Here is a simple example using ordinary least squares:
     import statsmodels.formula.api as smf
 
     # Load data
-    dat = sm.datasets.get_rdataset("Guerry", "HistData").data
+    dat = sm.datasets.get_rdataset("Guerry", "HistData", cache=True).data
 
     # Fit regression model (using the natural log of one of the regressors)
     results = smf.ols('Lottery ~ Literacy + np.log(Pop1831)', data=dat).fit()
diff --git a/docs/source/mixed_linear.rst b/docs/source/mixed_linear.rst
index 50ea7bb..612de35 100644
--- a/docs/source/mixed_linear.rst
+++ b/docs/source/mixed_linear.rst
@@ -83,7 +83,7 @@ Examples
   import statsmodels.api as sm
   import statsmodels.formula.api as smf
 
-  data = sm.datasets.get_rdataset("dietox", "geepack").data
+  data = sm.datasets.get_rdataset("dietox", "geepack", cache=True).data
 
   md = smf.mixedlm("Weight ~ Time", data, groups=data["Pig"])
   mdf = md.fit()
diff --git a/docs/source/release/version0.6.rst b/docs/source/release/version0.6.rst
index c2fcb63..7995eb8 100644
--- a/docs/source/release/version0.6.rst
+++ b/docs/source/release/version0.6.rst
@@ -41,7 +41,7 @@ covariates.
    import statsmodels.api as sm
    import statsmodels.formula.api as smf
 
-   data = sm.datasets.get_rdataset("epil", "MASS").data
+   data = sm.datasets.get_rdataset("epil", "MASS", cache=True).data
 
    md = smf.gee("y ~ age + trt + base", "subject", data,
                 cov_struct=sm.cov_struct.Independence(), 
diff --git a/examples/notebooks/markov_regression.ipynb b/examples/notebooks/markov_regression.ipynb
index 6e93771..74064df 100644
--- a/examples/notebooks/markov_regression.ipynb
+++ b/examples/notebooks/markov_regression.ipynb
@@ -30,14 +30,7 @@
     "import pandas as pd\n",
     "import statsmodels.api as sm\n",
     "import matplotlib.pyplot as plt\n",
-    "\n",
-    "# NBER recessions\n",
-    "from pandas_datareader.data import DataReader\n",
-    "from datetime import datetime\n",
-    "\n",
-    "usrec = DataReader(\n",
-    "    \"USREC\", \"fred\", start=datetime(1947, 1, 1), end=datetime(2013, 4, 1)\n",
-    ")"
+    "from datetime import datetime\n"
    ]
   },
   {
diff --git a/examples/notebooks/mixed_lm_example.ipynb b/examples/notebooks/mixed_lm_example.ipynb
index 2b9bd2d..ee170c9 100644
--- a/examples/notebooks/mixed_lm_example.ipynb
+++ b/examples/notebooks/mixed_lm_example.ipynb
@@ -86,7 +86,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data = sm.datasets.get_rdataset(\"dietox\", \"geepack\").data\n",
+    "data = sm.datasets.get_rdataset(\"dietox\", \"geepack\", cache=True).data\n",
     "md = smf.mixedlm(\"Weight ~ Time\", data, groups=data[\"Pig\"])\n",
     "mdf = md.fit(method=[\"lbfgs\"])\n",
     "print(mdf.summary())"
@@ -318,7 +318,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "data = sm.datasets.get_rdataset(\"Sitka\", \"MASS\").data\n",
+    "data = sm.datasets.get_rdataset(\"Sitka\", \"MASS\", cache=True).data\n",
     "endog = data[\"size\"]\n",
     "data[\"Intercept\"] = 1\n",
     "exog = data[[\"Intercept\", \"Time\"]]"
diff --git a/examples/notebooks/regression_diagnostics.ipynb b/examples/notebooks/regression_diagnostics.ipynb
index 55a5cc0..e9d03ee 100644
--- a/examples/notebooks/regression_diagnostics.ipynb
+++ b/examples/notebooks/regression_diagnostics.ipynb
@@ -47,8 +47,8 @@
     "import matplotlib.pyplot as plt\n",
     "\n",
     "# Load data\n",
-    "url = \"https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/HistData/Guerry.csv\"\n",
-    "dat = pd.read_csv(url)\n",
+    "import statsmodels.datasets\n",
+    "dat = statsmodels.datasets.get_rdataset(\"Guerry\", \"HistData\", cache=True).data\n",
     "\n",
     "# Fit regression model (using the natural log of one of the regressors)\n",
     "results = smf.ols(\"Lottery ~ Literacy + np.log(Pop1831)\", data=dat).fit()\n",
diff --git a/statsmodels/stats/tests/test_dist_dependant_measures.py b/statsmodels/stats/tests/test_dist_dependant_measures.py
index 0c6b252..f51ad09 100644
--- a/statsmodels/stats/tests/test_dist_dependant_measures.py
+++ b/statsmodels/stats/tests/test_dist_dependant_measures.py
@@ -140,7 +140,7 @@ class TestDistDependenceMeasures:
         0.1025087
         """
         try:
-            iris = get_rdataset("iris").data.values[:, :4]
+            iris = get_rdataset("iris", cache=True).data.values[:, :4]
         except IGNORED_EXCEPTIONS:
             pytest.skip('Failed with HTTPError or URLError, these are random')
 
@@ -180,7 +180,7 @@ class TestDistDependenceMeasures:
         30.01526
         """
         try:
-            quakes = get_rdataset("quakes").data.values[:, :3]
+            quakes = get_rdataset("quakes", cache=True).data.values[:, :3]
         except IGNORED_EXCEPTIONS:
             pytest.skip('Failed with HTTPError or URLError, these are random')
 
-- 
2.30.2