From bc123a260e2597bdf636c6db7513352805b953bf Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Fri, 30 Sep 2016 01:29:45 +0000
Subject: [PATCH 1/1] Import scikit-learn_0.18.orig.tar.gz

[dgit import orig scikit-learn_0.18.orig.tar.gz]
---
 .coveragerc | 8 + .gitattributes | 30 + .gitignore | 65 + .landscape.yml | 5 + .mailmap | 125 + .travis.yml | 82 + AUTHORS.rst | 71 + CONTRIBUTING.md | 232 + COPYING | 32 + ISSUE_TEMPLATE.md | 54 + MANIFEST.in | 9 + Makefile | 73 + PULL_REQUEST_TEMPLATE.md | 26 + README.rst | 161 + appveyor.yml | 93 + benchmarks/bench_20newsgroups.py | 97 + benchmarks/bench_covertype.py | 190 + benchmarks/bench_glm.py | 58 + benchmarks/bench_glmnet.py | 128 + benchmarks/bench_isolation_forest.py | 128 + benchmarks/bench_isotonic.py | 103 + benchmarks/bench_lasso.py | 96 + benchmarks/bench_mnist.py | 177 + benchmarks/bench_multilabel_metrics.py | 190 + .../bench_plot_approximate_neighbors.py | 167 + benchmarks/bench_plot_fastkmeans.py | 138 + benchmarks/bench_plot_incremental_pca.py | 156 + benchmarks/bench_plot_lasso_path.py | 117 + benchmarks/bench_plot_neighbors.py | 185 + benchmarks/bench_plot_nmf.py | 166 + benchmarks/bench_plot_omp_lars.py | 123 + benchmarks/bench_plot_parallel_pairwise.py | 46 + benchmarks/bench_plot_randomized_svd.py | 455 ++ benchmarks/bench_plot_svd.py | 82 + benchmarks/bench_plot_ward.py | 43 + benchmarks/bench_random_projections.py | 254 + benchmarks/bench_rcv1_logreg_convergence.py | 236 + .../bench_sample_without_replacement.py | 207 + benchmarks/bench_sgd_regression.py | 151 + benchmarks/bench_sparsify.py | 104 + benchmarks/bench_tree.py | 124 + build_tools/appveyor/install.ps1 | 229 + build_tools/appveyor/requirements.txt | 16 + build_tools/appveyor/run_with_env.cmd | 88 + build_tools/circle/build_doc.sh | 55 + build_tools/circle/check_build_doc.py | 66 + build_tools/circle/push_doc.sh | 33 + build_tools/cythonize.py | 198 + build_tools/travis/after_success.sh | 19 + build_tools/travis/flake8_diff.sh | 118 + build_tools/travis/install.sh | 120 + build_tools/travis/test_script.sh | 49 + .../windows/windows_testing_downloader.ps1 | 270 + circle.yml | 23 + doc/Makefile | 105 + doc/README.md | 37 + doc/about.rst | 229 + doc/conf.py | 282 + doc/data_transforms.rst | 35 + doc/datasets/covtype.rst | 20 + doc/datasets/index.rst | 306 + doc/datasets/kddcup99.rst | 36 + doc/datasets/labeled_faces.rst | 118 + doc/datasets/labeled_faces_fixture.py | 15 + doc/datasets/mldata.rst | 72 + doc/datasets/mldata_fixture.py | 45 + doc/datasets/olivetti_faces.rst | 37 + doc/datasets/rcv1.rst | 52 + doc/datasets/rcv1_fixture.py | 23 + doc/datasets/twenty_newsgroups.rst | 218 + doc/datasets/twenty_newsgroups_fixture.py | 15 + doc/developers/advanced_installation.rst | 430 ++ doc/developers/contributing.rst | 1066 +++ doc/developers/debugging.rst | 51 + doc/developers/index.rst | 17 + doc/developers/maintainer.rst | 50 + doc/developers/performance.rst | 473 ++ doc/developers/utilities.rst | 292 + doc/documentation.rst | 98 + doc/faq.rst | 334 + doc/images/cds-logo.png | Bin 0 -> 6501 bytes doc/images/inria-logo.jpg | Bin 0 -> 21107 bytes doc/images/iris.pdf | Bin 0 -> 27033 bytes doc/images/iris.svg | 239 + doc/images/last_digit.png | Bin 0 -> 3037 bytes doc/images/lda_model_graph.png | Bin 0 -> 19063 bytes doc/images/ml_map.png | Bin 0 -> 761071 bytes doc/images/multilayerperceptron_network.png | Bin 0 -> 89381 bytes doc/images/no_image.png | Bin 0 -> 4315 bytes doc/images/nyu_short_color.png | Bin 0 -> 5485 bytes doc/images/plot_digits_classification.png | 
Bin 0 -> 31108 bytes doc/images/plot_face_recognition_1.png | Bin 0 -> 124459 bytes doc/images/plot_face_recognition_2.png | Bin 0 -> 86623 bytes doc/images/rbm_graph.png | Bin 0 -> 15495 bytes doc/images/scikit-learn-logo-notext.png | Bin 0 -> 8053 bytes doc/includes/big_toc_css.rst | 44 + doc/includes/bigger_toc_css.rst | 60 + doc/index.rst | 361 + doc/install.rst | 97 + doc/logos/favicon.ico | Bin 0 -> 2238 bytes doc/logos/identity.pdf | Bin 0 -> 120865 bytes doc/logos/scikit-learn-logo-notext.png | Bin 0 -> 8053 bytes doc/logos/scikit-learn-logo-small.png | Bin 0 -> 5468 bytes doc/logos/scikit-learn-logo-thumb.png | Bin 0 -> 7069 bytes doc/logos/scikit-learn-logo.bmp | Bin 0 -> 37902 bytes doc/logos/scikit-learn-logo.png | Bin 0 -> 10879 bytes doc/logos/scikit-learn-logo.svg | 110 + doc/make.bat | 113 + doc/model_selection.rst | 14 + doc/modules/biclustering.rst | 306 + doc/modules/calibration.rst | 202 + doc/modules/classes.rst | 1350 ++++ doc/modules/clustering.rst | 1570 ++++ doc/modules/computational_performance.rst | 342 + doc/modules/covariance.rst | 332 + doc/modules/cross_decomposition.rst | 42 + doc/modules/cross_validation.rst | 646 ++ doc/modules/decomposition.rst | 838 +++ doc/modules/density.rst | 180 + doc/modules/dp-derivation.rst | 501 ++ doc/modules/ensemble.rst | 1066 +++ doc/modules/feature_extraction.rst | 1011 +++ doc/modules/feature_selection.rst | 346 + doc/modules/gaussian_process.rst | 895 +++ .../lasso_enet_coordinate_descent.png | Bin 0 -> 28954 bytes doc/modules/grid_search.rst | 282 + doc/modules/isotonic.rst | 23 + doc/modules/kernel_approximation.rst | 207 + doc/modules/kernel_ridge.rst | 57 + doc/modules/label_propagation.rst | 99 + doc/modules/lda_qda.rst | 186 + doc/modules/learning_curve.rst | 158 + doc/modules/linear_model.rst | 1251 ++++ doc/modules/manifold.rst | 643 ++ doc/modules/metrics.rst | 204 + doc/modules/mixture.rst | 326 + doc/modules/model_evaluation.rst | 1551 ++++ doc/modules/model_persistence.rst | 85 + doc/modules/multiclass.rst | 351 + doc/modules/naive_bayes.rst | 203 + doc/modules/neighbors.rst | 690 ++ doc/modules/neural_networks_supervised.rst | 383 + doc/modules/neural_networks_unsupervised.rst | 161 + doc/modules/outlier_detection.rst | 243 + doc/modules/pipeline.rst | 194 + doc/modules/preprocessing.rst | 544 ++ doc/modules/preprocessing_targets.rst | 65 + doc/modules/random_projection.rst | 162 + doc/modules/scaling_strategies.rst | 126 + doc/modules/sgd.rst | 430 ++ doc/modules/svm.rst | 711 ++ doc/modules/tree.rst | 509 ++ doc/modules/unsupervised_reduction.rst | 60 + doc/preface.rst | 32 + doc/presentations.rst | 78 + doc/related_projects.rst | 170 + doc/sphinxext/LICENSE.txt | 97 + doc/sphinxext/MANIFEST.in | 2 + doc/sphinxext/README.txt | 52 + doc/sphinxext/github_link.py | 84 + doc/sphinxext/numpy_ext/__init__.py | 0 doc/sphinxext/numpy_ext/docscrape.py | 511 ++ doc/sphinxext/numpy_ext/docscrape_sphinx.py | 240 + doc/sphinxext/numpy_ext/numpydoc.py | 192 + doc/sphinxext/sphinx_gallery/__init__.py | 13 + .../sphinx_gallery/_static/broken_example.png | Bin 0 -> 21404 bytes .../sphinx_gallery/_static/gallery.css | 127 + .../sphinx_gallery/_static/no_image.png | Bin 0 -> 4315 bytes .../sphinx_gallery/backreferences.py | 180 + doc/sphinxext/sphinx_gallery/docs_resolv.py | 436 ++ doc/sphinxext/sphinx_gallery/gen_gallery.py | 148 + doc/sphinxext/sphinx_gallery/gen_rst.py | 615 ++ doc/sphinxext/sphinx_gallery/notebook.py | 123 + doc/supervised_learning.rst | 26 + doc/support.rst | 101 + doc/templates/class.rst | 16 + 
doc/templates/class_with_call.rst | 17 + doc/templates/class_without_init.rst | 12 + doc/templates/function.rst | 12 + doc/testimonials/README.txt | 8 + doc/testimonials/images/Makefile | 0 doc/testimonials/images/aweber.png | Bin 0 -> 41412 bytes doc/testimonials/images/bestofmedia-logo.png | Bin 0 -> 3321 bytes doc/testimonials/images/betaworks.png | Bin 0 -> 4891 bytes doc/testimonials/images/birchbox.jpg | Bin 0 -> 14595 bytes doc/testimonials/images/booking.png | Bin 0 -> 5937 bytes doc/testimonials/images/change-logo.png | Bin 0 -> 3294 bytes doc/testimonials/images/dataiku_logo.png | Bin 0 -> 10684 bytes doc/testimonials/images/datapublica.png | Bin 0 -> 5177 bytes doc/testimonials/images/datarobot.png | Bin 0 -> 19895 bytes doc/testimonials/images/evernote.png | Bin 0 -> 2629 bytes doc/testimonials/images/howaboutwe.png | Bin 0 -> 24772 bytes doc/testimonials/images/infonea.jpg | Bin 0 -> 85087 bytes doc/testimonials/images/inria.png | Bin 0 -> 23903 bytes doc/testimonials/images/lovely.png | Bin 0 -> 3307 bytes doc/testimonials/images/machinalis.png | Bin 0 -> 12363 bytes doc/testimonials/images/okcupid.png | Bin 0 -> 10246 bytes doc/testimonials/images/ottogroup_logo.png | Bin 0 -> 8603 bytes doc/testimonials/images/peerindex.png | Bin 0 -> 4689 bytes doc/testimonials/images/phimeca.png | Bin 0 -> 2571 bytes doc/testimonials/images/rangespan.png | Bin 0 -> 11944 bytes doc/testimonials/images/solido_logo.png | Bin 0 -> 6569 bytes doc/testimonials/images/spotify.png | Bin 0 -> 12293 bytes doc/testimonials/images/telecomparistech.jpg | Bin 0 -> 11473 bytes doc/testimonials/images/yhat.png | Bin 0 -> 6350 bytes doc/testimonials/testimonials.rst | 836 +++ doc/themes/scikit-learn/layout.html | 364 + .../scikit-learn/static/ML_MAPS_README.rst | 93 + .../static/css/bootstrap-responsive.css | 1109 +++ .../static/css/bootstrap-responsive.min.css | 9 + .../scikit-learn/static/css/bootstrap.css | 6315 +++++++++++++++++ .../scikit-learn/static/css/bootstrap.min.css | 857 +++ .../scikit-learn/static/css/examples.css | 0 doc/themes/scikit-learn/static/gallery.css | 73 + .../scikit-learn/static/img/FNRS-logo.png | Bin 0 -> 7835 bytes doc/themes/scikit-learn/static/img/forkme.png | Bin 0 -> 8676 bytes .../static/img/glyphicons-halflings-white.png | Bin 0 -> 8777 bytes .../static/img/glyphicons-halflings.png | Bin 0 -> 12764 bytes doc/themes/scikit-learn/static/img/google.png | Bin 0 -> 6982 bytes .../scikit-learn/static/img/inria-small.jpg | Bin 0 -> 11762 bytes .../scikit-learn/static/img/inria-small.png | Bin 0 -> 10055 bytes .../static/img/nyu_short_color.png | Bin 0 -> 5485 bytes .../img/plot_classifier_comparison_1.png | Bin 0 -> 402605 bytes .../static/img/plot_manifold_sphere_1.png | Bin 0 -> 551524 bytes .../static/img/scikit-learn-logo-notext.png | Bin 0 -> 8053 bytes .../static/img/scikit-learn-logo-small.png | Bin 0 -> 3538 bytes .../static/img/scikit-learn-logo.png | Bin 0 -> 11986 bytes .../static/img/scikit-learn-logo.svg | 1050 +++ .../scikit-learn/static/img/telecom.png | Bin 0 -> 35103 bytes .../scikit-learn/static/jquery.maphilight.js | 362 + .../scikit-learn/static/js/bootstrap.js | 2280 ++++++ .../scikit-learn/static/js/copybutton.js | 59 + doc/themes/scikit-learn/static/nature.css_t | 1338 ++++ doc/themes/scikit-learn/theme.conf | 11 + doc/tune_toc.rst | 138 + doc/tutorial/basic/tutorial.rst | 348 + doc/tutorial/common_includes/info.txt | 3 + doc/tutorial/index.rst | 35 + .../machine_learning_map/ML_MAPS_README.txt | 93 + doc/tutorial/machine_learning_map/index.rst | 114 + 
.../machine_learning_map/parse_path.py | 164 + .../machine_learning_map/pyparsing.py | 3381 +++++++++ .../machine_learning_map/svg2imagemap.py | 90 + .../statistical_inference/finding_help.rst | 32 + doc/tutorial/statistical_inference/index.rst | 37 + .../statistical_inference/model_selection.rst | 292 + .../putting_together.rst | 76 + .../statistical_inference/settings.rst | 91 + .../supervised_learning.rst | 575 ++ .../unsupervised_learning.rst | 323 + .../unsupervised_learning_fixture.py | 10 + doc/tutorial/text_analytics/.gitignore | 25 + .../data/languages/fetch_data.py | 106 + .../data/movie_reviews/fetch_data.py | 30 + .../data/twenty_newsgroups/fetch_data.py | 31 + .../exercise_01_language_train_model.py | 62 + .../skeletons/exercise_02_sentiment.py | 63 + .../exercise_01_language_train_model.py | 70 + .../solutions/exercise_02_sentiment.py | 79 + .../solutions/generate_skeletons.py | 38 + .../text_analytics/working_with_text_data.rst | 566 ++ .../working_with_text_data_fixture.py | 15 + doc/unsupervised_learning.rst | 18 + doc/user_guide.rst | 24 + doc/whats_new.rst | 4787 +++++++++++++ examples/README.txt | 6 + examples/applications/README.txt | 7 + examples/applications/face_recognition.py | 162 + .../plot_model_complexity_influence.py | 169 + .../plot_out_of_core_classification.py | 416 ++ .../plot_outlier_detection_housing.py | 133 + .../applications/plot_prediction_latency.py | 310 + .../plot_species_distribution_modeling.py | 208 + examples/applications/plot_stock_market.py | 259 + .../plot_tomography_l1_reconstruction.py | 149 + examples/applications/svm_gui.py | 335 + .../topics_extraction_with_nmf_lda.py | 101 + .../wikipedia_principal_eigenvector.py | 230 + examples/bicluster/README.txt | 6 + examples/bicluster/bicluster_newsgroups.py | 184 + .../bicluster/plot_spectral_biclustering.py | 62 + .../bicluster/plot_spectral_coclustering.py | 54 + examples/calibration/README.txt | 6 + examples/calibration/plot_calibration.py | 116 + .../calibration/plot_calibration_curve.py | 134 + .../plot_calibration_multiclass.py | 168 + .../calibration/plot_compare_calibration.py | 122 + examples/classification/README.txt | 6 + .../plot_classification_probability.py | 85 + .../plot_classifier_comparison.py | 141 + .../plot_digits_classification.py | 66 + examples/classification/plot_lda.py | 71 + examples/classification/plot_lda_qda.py | 147 + examples/cluster/README.txt | 6 + .../plot_adjusted_for_chance_measures.py | 122 + examples/cluster/plot_affinity_propagation.py | 62 + .../cluster/plot_agglomerative_clustering.py | 76 + .../plot_agglomerative_clustering_metrics.py | 129 + .../cluster/plot_birch_vs_minibatchkmeans.py | 103 + examples/cluster/plot_cluster_comparison.py | 123 + examples/cluster/plot_cluster_iris.py | 92 + examples/cluster/plot_color_quantization.py | 105 + examples/cluster/plot_dbscan.py | 72 + examples/cluster/plot_dict_face_patches.py | 84 + examples/cluster/plot_digits_agglomeration.py | 61 + examples/cluster/plot_digits_linkage.py | 91 + examples/cluster/plot_face_compress.py | 87 + examples/cluster/plot_face_segmentation.py | 87 + .../cluster/plot_face_ward_segmentation.py | 74 + ...e_agglomeration_vs_univariate_selection.py | 108 + examples/cluster/plot_kmeans_assumptions.py | 63 + examples/cluster/plot_kmeans_digits.py | 127 + .../plot_kmeans_silhouette_analysis.py | 141 + .../plot_kmeans_stability_low_dim_dense.py | 119 + examples/cluster/plot_mean_shift.py | 56 + examples/cluster/plot_mini_batch_kmeans.py | 115 + examples/cluster/plot_segmentation_toy.py | 
102 + .../plot_ward_structured_vs_unstructured.py | 91 + examples/covariance/README.txt | 6 + .../covariance/plot_covariance_estimation.py | 131 + examples/covariance/plot_lw_vs_oas.py | 83 + .../covariance/plot_mahalanobis_distances.py | 144 + examples/covariance/plot_outlier_detection.py | 107 + .../plot_robust_vs_empirical_covariance.py | 153 + examples/covariance/plot_sparse_cov.py | 135 + examples/cross_decomposition/README.txt | 7 + .../plot_compare_cross_decomposition.py | 148 + examples/datasets/README.txt | 6 + examples/datasets/plot_digits_last_image.py | 35 + examples/datasets/plot_iris_dataset.py | 67 + examples/datasets/plot_random_dataset.py | 61 + .../plot_random_multilabel_dataset.py | 95 + examples/decomposition/README.txt | 7 + .../decomposition/plot_faces_decomposition.py | 134 + .../plot_ica_blind_source_separation.py | 73 + examples/decomposition/plot_ica_vs_pca.py | 105 + .../decomposition/plot_image_denoising.py | 173 + .../decomposition/plot_incremental_pca.py | 60 + examples/decomposition/plot_kernel_pca.py | 73 + examples/decomposition/plot_pca_3d.py | 99 + examples/decomposition/plot_pca_iris.py | 59 + .../plot_pca_vs_fa_model_selection.py | 125 + examples/decomposition/plot_pca_vs_lda.py | 60 + examples/decomposition/plot_sparse_coding.py | 100 + examples/ensemble/README.txt | 6 + .../ensemble/plot_adaboost_hastie_10_2.py | 112 + examples/ensemble/plot_adaboost_multiclass.py | 120 + examples/ensemble/plot_adaboost_regression.py | 54 + examples/ensemble/plot_adaboost_twoclass.py | 101 + examples/ensemble/plot_bias_variance.py | 184 + examples/ensemble/plot_ensemble_oob.py | 86 + .../ensemble/plot_feature_transformation.py | 121 + examples/ensemble/plot_forest_importances.py | 54 + .../ensemble/plot_forest_importances_faces.py | 49 + examples/ensemble/plot_forest_iris.py | 152 + .../ensemble/plot_gradient_boosting_oob.py | 136 + .../plot_gradient_boosting_quantile.py | 79 + .../plot_gradient_boosting_regression.py | 76 + .../plot_gradient_boosting_regularization.py | 79 + examples/ensemble/plot_isolation_forest.py | 69 + examples/ensemble/plot_partial_dependence.py | 114 + .../ensemble/plot_random_forest_embedding.py | 105 + ...ot_random_forest_regression_multioutput.py | 76 + .../ensemble/plot_voting_decision_regions.py | 72 + examples/ensemble/plot_voting_probas.py | 78 + examples/exercises/README.txt | 4 + .../digits_classification_exercise.py | 33 + examples/exercises/plot_cv_diabetes.py | 86 + examples/exercises/plot_cv_digits.py | 45 + examples/exercises/plot_iris_exercise.py | 65 + examples/feature_selection/README.txt | 6 + .../feature_selection_pipeline.py | 29 + .../feature_selection/plot_f_test_vs_mi.py | 49 + .../plot_feature_selection.py | 85 + ...lot_permutation_test_for_classification.py | 68 + examples/feature_selection/plot_rfe_digits.py | 36 + .../plot_rfe_with_cross_validation.py | 37 + .../plot_select_from_model_boston.py | 51 + examples/feature_stacker.py | 58 + examples/gaussian_process/README.txt | 7 + .../gaussian_process/plot_compare_gpr_krr.py | 121 + examples/gaussian_process/plot_gpc.py | 100 + examples/gaussian_process/plot_gpc_iris.py | 62 + .../plot_gpc_isoprobability.py | 102 + examples/gaussian_process/plot_gpc_xor.py | 57 + examples/gaussian_process/plot_gpr_co2.py | 131 + examples/gaussian_process/plot_gpr_noisy.py | 97 + .../plot_gpr_noisy_targets.py | 113 + .../plot_gpr_prior_posterior.py | 78 + examples/hetero_feature_union.py | 181 + examples/linear_model/README.txt | 6 + .../lasso_dense_vs_sparse_data.py | 66 + 
examples/linear_model/plot_ard.py | 84 + examples/linear_model/plot_bayesian_ridge.py | 80 + examples/linear_model/plot_huber_vs_ridge.py | 65 + examples/linear_model/plot_iris_logistic.py | 59 + .../linear_model/plot_lasso_and_elasticnet.py | 69 + .../plot_lasso_coordinate_descent_path.py | 93 + examples/linear_model/plot_lasso_lars.py | 42 + .../plot_lasso_model_selection.py | 155 + examples/linear_model/plot_logistic.py | 65 + .../plot_logistic_l1_l2_sparsity.py | 79 + .../linear_model/plot_logistic_multinomial.py | 70 + examples/linear_model/plot_logistic_path.py | 55 + .../plot_multi_task_lasso_support.py | 69 + examples/linear_model/plot_ols.py | 67 + examples/linear_model/plot_ols_3d.py | 74 + .../linear_model/plot_ols_ridge_variance.py | 71 + examples/linear_model/plot_omp.py | 82 + .../plot_polynomial_interpolation.py | 72 + examples/linear_model/plot_ransac.py | 58 + examples/linear_model/plot_ridge_coeffs.py | 89 + examples/linear_model/plot_ridge_path.py | 67 + examples/linear_model/plot_robust_fit.py | 97 + examples/linear_model/plot_sgd_comparison.py | 58 + examples/linear_model/plot_sgd_iris.py | 80 + .../linear_model/plot_sgd_loss_functions.py | 42 + examples/linear_model/plot_sgd_penalties.py | 72 + .../plot_sgd_separating_hyperplane.py | 42 + .../linear_model/plot_sgd_weighted_samples.py | 48 + examples/linear_model/plot_sparse_recovery.py | 174 + examples/linear_model/plot_theilsen.py | 111 + examples/manifold/README.txt | 7 + examples/manifold/plot_compare_methods.py | 123 + examples/manifold/plot_lle_digits.py | 230 + examples/manifold/plot_manifold_sphere.py | 155 + examples/manifold/plot_mds.py | 88 + examples/manifold/plot_swissroll.py | 50 + examples/missing_values.py | 72 + examples/mixture/README.txt | 6 + examples/mixture/plot_concentration_prior.py | 135 + examples/mixture/plot_gmm.py | 88 + examples/mixture/plot_gmm_covariances.py | 134 + examples/mixture/plot_gmm_pdf.py | 50 + examples/mixture/plot_gmm_selection.py | 98 + examples/mixture/plot_gmm_sin.py | 154 + examples/model_selection/README.txt | 6 + .../model_selection/grid_search_digits.py | 80 + .../grid_search_text_feature_extraction.py | 130 + .../model_selection/plot_confusion_matrix.py | 99 + .../model_selection/plot_learning_curve.py | 115 + .../plot_nested_cross_validation_iris.py | 115 + .../model_selection/plot_precision_recall.py | 158 + examples/model_selection/plot_roc.py | 148 + examples/model_selection/plot_roc_crossval.py | 98 + .../plot_train_error_vs_test_error.py | 75 + .../plot_underfitting_overfitting.py | 68 + .../model_selection/plot_validation_curve.py | 51 + examples/model_selection/randomized_search.py | 88 + examples/neighbors/README.txt | 6 + ...imate_nearest_neighbors_hyperparameters.py | 132 + ...proximate_nearest_neighbors_scalability.py | 156 + examples/neighbors/plot_classification.py | 55 + .../neighbors/plot_digits_kde_sampling.py | 62 + examples/neighbors/plot_kde_1d.py | 144 + examples/neighbors/plot_nearest_centroid.py | 56 + examples/neighbors/plot_regression.py | 49 + examples/neighbors/plot_species_kde.py | 115 + examples/neural_networks/README.txt | 6 + examples/neural_networks/plot_mlp_alpha.py | 113 + .../plot_mlp_training_curves.py | 89 + .../neural_networks/plot_mnist_filters.py | 54 + .../plot_rbm_logistic_classification.py | 141 + examples/plot_compare_reduction.py | 75 + examples/plot_cv_predict.py | 28 + examples/plot_digits_pipe.py | 67 + examples/plot_isotonic_regression.py | 58 + examples/plot_johnson_lindenstrauss_bound.py | 199 + 
examples/plot_kernel_approximation.py | 210 + examples/plot_kernel_ridge_regression.py | 171 + examples/plot_multilabel.py | 114 + examples/plot_multioutput_face_completion.py | 98 + examples/preprocessing/README.txt | 6 + .../plot_function_transformer.py | 72 + examples/preprocessing/plot_robust_scaling.py | 84 + examples/semi_supervised/README.txt | 6 + .../plot_label_propagation_digits.py | 89 + ...abel_propagation_digits_active_learning.py | 99 + .../plot_label_propagation_structure.py | 62 + .../plot_label_propagation_versus_svm_iris.py | 79 + examples/svm/README.txt | 6 + examples/svm/plot_custom_kernel.py | 57 + examples/svm/plot_iris.py | 93 + examples/svm/plot_oneclass.py | 64 + examples/svm/plot_rbf_parameters.py | 196 + examples/svm/plot_separating_hyperplane.py | 48 + .../plot_separating_hyperplane_unbalanced.py | 67 + examples/svm/plot_svm_anova.py | 58 + examples/svm/plot_svm_kernels.py | 84 + examples/svm/plot_svm_margin.py | 87 + examples/svm/plot_svm_nonlinear.py | 41 + examples/svm/plot_svm_regression.py | 45 + examples/svm/plot_svm_scale_c.py | 151 + examples/svm/plot_weighted_samples.py | 63 + examples/text/README.txt | 6 + .../document_classification_20newsgroups.py | 314 + examples/text/document_clustering.py | 221 + examples/text/hashing_vs_dict_vectorizer.py | 111 + .../mlcomp_sparse_document_classification.py | 145 + examples/tree/README.txt | 6 + examples/tree/plot_iris.py | 66 + examples/tree/plot_tree_regression.py | 49 + .../tree/plot_tree_regression_multioutput.py | 56 + examples/tree/plot_unveil_tree_structure.py | 134 + setup.cfg | 41 + setup.py | 305 + setup32.cfg | 22 + site.cfg | 6 + sklearn/__check_build/__init__.py | 46 + sklearn/__check_build/_check_build.pyx | 2 + sklearn/__check_build/setup.py | 18 + sklearn/__init__.py | 87 + sklearn/_build_utils/__init__.py | 35 + sklearn/_isotonic.pyx | 118 + sklearn/base.py | 533 ++ sklearn/calibration.py | 551 ++ sklearn/cluster/__init__.py | 36 + sklearn/cluster/_dbscan_inner.pyx | 54 + sklearn/cluster/_feature_agglomeration.py | 72 + sklearn/cluster/_hierarchical.pyx | 334 + sklearn/cluster/_k_means.pyx | 401 ++ sklearn/cluster/_k_means_elkan.pyx | 255 + sklearn/cluster/affinity_propagation_.py | 324 + sklearn/cluster/bicluster.py | 501 ++ sklearn/cluster/birch.py | 611 ++ sklearn/cluster/dbscan_.py | 297 + sklearn/cluster/hierarchical.py | 841 +++ sklearn/cluster/k_means_.py | 1543 ++++ sklearn/cluster/mean_shift_.py | 410 ++ sklearn/cluster/setup.py | 54 + sklearn/cluster/spectral.py | 468 ++ sklearn/cluster/tests/__init__.py | 0 sklearn/cluster/tests/common.py | 28 + .../tests/test_affinity_propagation.py | 80 + sklearn/cluster/tests/test_bicluster.py | 264 + sklearn/cluster/tests/test_birch.py | 160 + sklearn/cluster/tests/test_dbscan.py | 324 + sklearn/cluster/tests/test_hierarchical.py | 509 ++ sklearn/cluster/tests/test_k_means.py | 826 +++ sklearn/cluster/tests/test_mean_shift.py | 114 + sklearn/cluster/tests/test_spectral.py | 196 + sklearn/covariance/__init__.py | 34 + sklearn/covariance/empirical_covariance_.py | 287 + sklearn/covariance/graph_lasso_.py | 693 ++ sklearn/covariance/outlier_detection.py | 182 + sklearn/covariance/robust_covariance.py | 707 ++ sklearn/covariance/shrunk_covariance_.py | 564 ++ sklearn/covariance/tests/__init__.py | 0 sklearn/covariance/tests/test_covariance.py | 306 + sklearn/covariance/tests/test_graph_lasso.py | 133 + .../tests/test_robust_covariance.py | 111 + sklearn/cross_decomposition/__init__.py | 2 + sklearn/cross_decomposition/cca_.py | 107 + 
sklearn/cross_decomposition/pls_.py | 857 +++ sklearn/cross_decomposition/tests/__init__.py | 0 sklearn/cross_decomposition/tests/test_pls.py | 362 + sklearn/cross_validation.py | 1991 ++++++ sklearn/datasets/__init__.py | 105 + sklearn/datasets/_svmlight_format.pyx | 109 + sklearn/datasets/base.py | 764 ++ sklearn/datasets/california_housing.py | 128 + sklearn/datasets/covtype.py | 116 + sklearn/datasets/data/boston_house_prices.csv | 508 ++ sklearn/datasets/data/breast_cancer.csv | 570 ++ sklearn/datasets/data/diabetes_data.csv.gz | Bin 0 -> 23803 bytes sklearn/datasets/data/diabetes_target.csv.gz | Bin 0 -> 1050 bytes sklearn/datasets/data/digits.csv.gz | Bin 0 -> 57523 bytes sklearn/datasets/data/iris.csv | 151 + sklearn/datasets/data/linnerud_exercise.csv | 21 + .../datasets/data/linnerud_physiological.csv | 21 + .../datasets/descr/boston_house_prices.rst | 53 + sklearn/datasets/descr/breast_cancer.rst | 117 + sklearn/datasets/descr/diabetes.rst | 39 + sklearn/datasets/descr/digits.rst | 45 + sklearn/datasets/descr/iris.rst | 62 + sklearn/datasets/descr/linnerud.rst | 21 + sklearn/datasets/images/README.txt | 21 + sklearn/datasets/images/china.jpg | Bin 0 -> 196653 bytes sklearn/datasets/images/flower.jpg | Bin 0 -> 142987 bytes sklearn/datasets/kddcup99.py | 369 + sklearn/datasets/lfw.py | 514 ++ sklearn/datasets/mlcomp.py | 107 + sklearn/datasets/mldata.py | 240 + sklearn/datasets/olivetti_faces.py | 140 + sklearn/datasets/rcv1.py | 238 + sklearn/datasets/samples_generator.py | 1644 +++++ sklearn/datasets/setup.py | 22 + sklearn/datasets/species_distributions.py | 256 + sklearn/datasets/svmlight_format.py | 440 ++ sklearn/datasets/tests/__init__.py | 0 .../tests/data/svmlight_classification.txt | 9 + .../datasets/tests/data/svmlight_invalid.txt | 3 + .../tests/data/svmlight_invalid_order.txt | 1 + .../tests/data/svmlight_multilabel.txt | 5 + sklearn/datasets/tests/test_20news.py | 79 + sklearn/datasets/tests/test_base.py | 272 + sklearn/datasets/tests/test_covtype.py | 32 + sklearn/datasets/tests/test_kddcup99.py | 41 + sklearn/datasets/tests/test_lfw.py | 199 + sklearn/datasets/tests/test_mldata.py | 169 + sklearn/datasets/tests/test_rcv1.py | 71 + .../datasets/tests/test_samples_generator.py | 362 + .../datasets/tests/test_svmlight_format.py | 404 ++ sklearn/datasets/twenty_newsgroups.py | 377 + sklearn/decomposition/__init__.py | 41 + sklearn/decomposition/_online_lda.pyx | 116 + sklearn/decomposition/base.py | 162 + sklearn/decomposition/cdnmf_fast.pyx | 41 + sklearn/decomposition/dict_learning.py | 1290 ++++ sklearn/decomposition/factor_analysis.py | 344 + sklearn/decomposition/fastica_.py | 572 ++ sklearn/decomposition/incremental_pca.py | 264 + sklearn/decomposition/kernel_pca.py | 307 + sklearn/decomposition/nmf.py | 1310 ++++ sklearn/decomposition/online_lda.py | 721 ++ sklearn/decomposition/pca.py | 743 ++ sklearn/decomposition/setup.py | 29 + sklearn/decomposition/sparse_pca.py | 284 + sklearn/decomposition/tests/__init__.py | 0 .../decomposition/tests/test_dict_learning.py | 246 + .../tests/test_factor_analysis.py | 85 + sklearn/decomposition/tests/test_fastica.py | 236 + .../tests/test_incremental_pca.py | 224 + .../decomposition/tests/test_kernel_pca.py | 222 + sklearn/decomposition/tests/test_nmf.py | 260 + .../decomposition/tests/test_online_lda.py | 339 + sklearn/decomposition/tests/test_pca.py | 510 ++ .../decomposition/tests/test_sparse_pca.py | 162 + .../decomposition/tests/test_truncated_svd.py | 164 + sklearn/decomposition/truncated_svd.py | 221 + 
sklearn/discriminant_analysis.py | 797 +++ sklearn/dummy.py | 476 ++ sklearn/ensemble/__init__.py | 35 + sklearn/ensemble/_gradient_boosting.pyx | 302 + sklearn/ensemble/bagging.py | 994 +++ sklearn/ensemble/base.py | 150 + sklearn/ensemble/forest.py | 1714 +++++ sklearn/ensemble/gradient_boosting.py | 1892 +++++ sklearn/ensemble/iforest.py | 311 + sklearn/ensemble/partial_dependence.py | 393 + sklearn/ensemble/setup.py | 17 + sklearn/ensemble/tests/__init__.py | 0 sklearn/ensemble/tests/test_bagging.py | 725 ++ sklearn/ensemble/tests/test_base.py | 108 + sklearn/ensemble/tests/test_forest.py | 1184 +++ .../ensemble/tests/test_gradient_boosting.py | 1077 +++ .../test_gradient_boosting_loss_functions.py | 176 + sklearn/ensemble/tests/test_iforest.py | 202 + .../ensemble/tests/test_partial_dependence.py | 206 + .../ensemble/tests/test_voting_classifier.py | 260 + .../ensemble/tests/test_weight_boosting.py | 477 ++ sklearn/ensemble/voting_classifier.py | 268 + sklearn/ensemble/weight_boosting.py | 1118 +++ sklearn/exceptions.py | 146 + sklearn/externals/README | 7 + sklearn/externals/__init__.py | 5 + sklearn/externals/copy_joblib.sh | 28 + sklearn/externals/funcsigs.py | 818 +++ sklearn/externals/odict.py | 266 + sklearn/externals/setup.py | 9 + sklearn/externals/six.py | 577 ++ sklearn/externals/test_externals_setup.py | 10 + sklearn/feature_extraction/__init__.py | 13 + sklearn/feature_extraction/_hashing.pyx | 85 + sklearn/feature_extraction/dict_vectorizer.py | 366 + sklearn/feature_extraction/hashing.py | 153 + sklearn/feature_extraction/image.py | 511 ++ sklearn/feature_extraction/setup.py | 19 + sklearn/feature_extraction/stop_words.py | 45 + sklearn/feature_extraction/tests/__init__.py | 0 .../tests/test_dict_vectorizer.py | 113 + .../tests/test_feature_hasher.py | 109 + .../feature_extraction/tests/test_image.py | 313 + sklearn/feature_extraction/tests/test_text.py | 969 +++ sklearn/feature_extraction/text.py | 1361 ++++ sklearn/feature_selection/__init__.py | 43 + sklearn/feature_selection/base.py | 122 + sklearn/feature_selection/from_model.py | 258 + sklearn/feature_selection/mutual_info_.py | 438 ++ sklearn/feature_selection/rfe.py | 446 ++ sklearn/feature_selection/tests/__init__.py | 0 sklearn/feature_selection/tests/test_base.py | 115 + sklearn/feature_selection/tests/test_chi2.py | 97 + .../tests/test_feature_select.py | 621 ++ .../tests/test_from_model.py | 178 + .../tests/test_mutual_info.py | 193 + sklearn/feature_selection/tests/test_rfe.py | 305 + .../tests/test_variance_threshold.py | 28 + .../feature_selection/univariate_selection.py | 735 ++ .../feature_selection/variance_threshold.py | 82 + sklearn/gaussian_process/__init__.py | 23 + .../gaussian_process/correlation_models.py | 284 + sklearn/gaussian_process/gaussian_process.py | 897 +++ sklearn/gaussian_process/gpc.py | 734 ++ sklearn/gaussian_process/gpr.py | 434 ++ sklearn/gaussian_process/kernels.py | 1863 +++++ sklearn/gaussian_process/regression_models.py | 89 + sklearn/gaussian_process/tests/__init__.py | 0 .../tests/test_gaussian_process.py | 170 + sklearn/gaussian_process/tests/test_gpc.py | 163 + sklearn/gaussian_process/tests/test_gpr.py | 316 + .../gaussian_process/tests/test_kernels.py | 315 + sklearn/grid_search.py | 1006 +++ sklearn/isotonic.py | 421 ++ sklearn/kernel_approximation.py | 521 ++ sklearn/kernel_ridge.py | 183 + sklearn/lda.py | 6 + sklearn/learning_curve.py | 345 + sklearn/linear_model/__init__.py | 83 + sklearn/linear_model/base.py | 600 ++ sklearn/linear_model/bayes.py | 436 ++ 
sklearn/linear_model/cd_fast.pyx | 866 +++ sklearn/linear_model/coordinate_descent.py | 2168 ++++++ sklearn/linear_model/huber.py | 286 + sklearn/linear_model/least_angle.py | 1488 ++++ sklearn/linear_model/logistic.py | 1724 +++++ sklearn/linear_model/omp.py | 882 +++ sklearn/linear_model/passive_aggressive.py | 299 + sklearn/linear_model/perceptron.py | 105 + sklearn/linear_model/randomized_l1.py | 655 ++ sklearn/linear_model/ransac.py | 450 ++ sklearn/linear_model/ridge.py | 1322 ++++ sklearn/linear_model/sag.py | 301 + sklearn/linear_model/sag_fast.pyx | 563 ++ sklearn/linear_model/setup.py | 48 + sklearn/linear_model/sgd_fast.pxd | 26 + sklearn/linear_model/sgd_fast.pyx | 743 ++ sklearn/linear_model/sgd_fast_helpers.h | 9 + sklearn/linear_model/stochastic_gradient.py | 1246 ++++ sklearn/linear_model/tests/__init__.py | 0 sklearn/linear_model/tests/test_base.py | 409 ++ sklearn/linear_model/tests/test_bayes.py | 58 + .../tests/test_coordinate_descent.py | 714 ++ sklearn/linear_model/tests/test_huber.py | 196 + .../linear_model/tests/test_least_angle.py | 544 ++ sklearn/linear_model/tests/test_logistic.py | 963 +++ sklearn/linear_model/tests/test_omp.py | 207 + .../tests/test_passive_aggressive.py | 252 + sklearn/linear_model/tests/test_perceptron.py | 70 + .../linear_model/tests/test_randomized_l1.py | 129 + sklearn/linear_model/tests/test_ransac.py | 448 ++ sklearn/linear_model/tests/test_ridge.py | 784 ++ sklearn/linear_model/tests/test_sag.py | 780 ++ sklearn/linear_model/tests/test_sgd.py | 1277 ++++ .../tests/test_sparse_coordinate_descent.py | 293 + sklearn/linear_model/tests/test_theil_sen.py | 285 + sklearn/linear_model/theil_sen.py | 389 + sklearn/manifold/__init__.py | 12 + sklearn/manifold/_barnes_hut_tsne.pyx | 845 +++ sklearn/manifold/_utils.pyx | 138 + sklearn/manifold/isomap.py | 217 + sklearn/manifold/locally_linear.py | 696 ++ sklearn/manifold/mds.py | 416 ++ sklearn/manifold/setup.py | 36 + sklearn/manifold/spectral_embedding_.py | 511 ++ sklearn/manifold/t_sne.py | 896 +++ sklearn/manifold/tests/__init__.py | 0 sklearn/manifold/tests/test_isomap.py | 124 + sklearn/manifold/tests/test_locally_linear.py | 136 + sklearn/manifold/tests/test_mds.py | 61 + .../manifold/tests/test_spectral_embedding.py | 268 + sklearn/manifold/tests/test_t_sne.py | 562 ++ sklearn/metrics/__init__.py | 114 + sklearn/metrics/base.py | 133 + sklearn/metrics/classification.py | 1887 +++++ sklearn/metrics/cluster/__init__.py | 30 + sklearn/metrics/cluster/bicluster.py | 86 + .../cluster/expected_mutual_info_fast.pyx | 71 + sklearn/metrics/cluster/setup.py | 23 + sklearn/metrics/cluster/supervised.py | 874 +++ sklearn/metrics/cluster/tests/__init__.py | 0 .../metrics/cluster/tests/test_bicluster.py | 50 + .../metrics/cluster/tests/test_supervised.py | 239 + .../cluster/tests/test_unsupervised.py | 148 + sklearn/metrics/cluster/unsupervised.py | 257 + sklearn/metrics/pairwise.py | 1394 ++++ sklearn/metrics/pairwise_fast.pyx | 79 + sklearn/metrics/ranking.py | 763 ++ sklearn/metrics/regression.py | 491 ++ sklearn/metrics/scorer.py | 414 ++ sklearn/metrics/setup.py | 32 + sklearn/metrics/tests/__init__.py | 0 sklearn/metrics/tests/test_classification.py | 1455 ++++ sklearn/metrics/tests/test_common.py | 1093 +++ sklearn/metrics/tests/test_pairwise.py | 684 ++ sklearn/metrics/tests/test_ranking.py | 980 +++ sklearn/metrics/tests/test_regression.py | 143 + sklearn/metrics/tests/test_score_objects.py | 448 ++ sklearn/mixture/__init__.py | 22 + sklearn/mixture/base.py | 498 ++ 
sklearn/mixture/bayesian_mixture.py | 781 ++ sklearn/mixture/dpgmm.py | 854 +++ sklearn/mixture/gaussian_mixture.py | 747 ++ sklearn/mixture/gmm.py | 839 +++ sklearn/mixture/tests/__init__.py | 0 .../mixture/tests/test_bayesian_mixture.py | 421 ++ sklearn/mixture/tests/test_dpgmm.py | 237 + .../mixture/tests/test_gaussian_mixture.py | 973 +++ sklearn/mixture/tests/test_gmm.py | 535 ++ sklearn/model_selection/__init__.py | 53 + sklearn/model_selection/_search.py | 1185 ++++ sklearn/model_selection/_split.py | 1710 +++++ sklearn/model_selection/_validation.py | 961 +++ sklearn/model_selection/tests/__init__.py | 0 sklearn/model_selection/tests/test_search.py | 1142 +++ sklearn/model_selection/tests/test_split.py | 1061 +++ .../model_selection/tests/test_validation.py | 814 +++ sklearn/multiclass.py | 744 ++ sklearn/multioutput.py | 255 + sklearn/naive_bayes.py | 825 +++ sklearn/neighbors/__init__.py | 29 + sklearn/neighbors/approximate.py | 550 ++ sklearn/neighbors/ball_tree.pyx | 175 + sklearn/neighbors/base.py | 799 +++ sklearn/neighbors/binary_tree.pxi | 2526 +++++++ sklearn/neighbors/classification.py | 390 + sklearn/neighbors/dist_metrics.pxd | 77 + sklearn/neighbors/dist_metrics.pyx | 1118 +++ sklearn/neighbors/graph.py | 178 + sklearn/neighbors/kd_tree.pyx | 254 + sklearn/neighbors/kde.py | 216 + sklearn/neighbors/nearest_centroid.py | 191 + sklearn/neighbors/regression.py | 301 + sklearn/neighbors/setup.py | 37 + sklearn/neighbors/tests/__init__.py | 0 sklearn/neighbors/tests/test_approximate.py | 478 ++ sklearn/neighbors/tests/test_ball_tree.py | 311 + sklearn/neighbors/tests/test_dist_metrics.py | 171 + sklearn/neighbors/tests/test_kd_tree.py | 237 + sklearn/neighbors/tests/test_kde.py | 141 + .../neighbors/tests/test_nearest_centroid.py | 136 + sklearn/neighbors/tests/test_neighbors.py | 1222 ++++ sklearn/neighbors/typedefs.pxd | 18 + sklearn/neighbors/typedefs.pyx | 23 + sklearn/neighbors/unsupervised.py | 123 + sklearn/neural_network/__init__.py | 15 + sklearn/neural_network/_base.py | 252 + .../neural_network/_stochastic_optimizers.py | 266 + .../neural_network/multilayer_perceptron.py | 1267 ++++ sklearn/neural_network/rbm.py | 365 + sklearn/neural_network/tests/__init__.py | 0 sklearn/neural_network/tests/test_mlp.py | 558 ++ sklearn/neural_network/tests/test_rbm.py | 194 + .../tests/test_stochastic_optimizers.py | 109 + sklearn/pipeline.py | 813 +++ sklearn/preprocessing/__init__.py | 57 + .../preprocessing/_function_transformer.py | 94 + sklearn/preprocessing/data.py | 1948 +++++ sklearn/preprocessing/imputation.py | 377 + sklearn/preprocessing/label.py | 813 +++ sklearn/preprocessing/tests/__init__.py | 0 sklearn/preprocessing/tests/test_data.py | 1653 +++++ .../tests/test_function_transformer.py | 130 + .../preprocessing/tests/test_imputation.py | 360 + sklearn/preprocessing/tests/test_label.py | 509 ++ sklearn/qda.py | 6 + sklearn/random_projection.py | 623 ++ sklearn/semi_supervised/__init__.py | 10 + sklearn/semi_supervised/label_propagation.py | 463 ++ sklearn/semi_supervised/tests/__init__.py | 0 .../tests/test_label_propagation.py | 55 + sklearn/setup.py | 85 + sklearn/src/cblas/ATL_drefasum.c | 133 + sklearn/src/cblas/ATL_drefcopy.c | 148 + sklearn/src/cblas/ATL_drefgemv.c | 178 + sklearn/src/cblas/ATL_drefgemvN.c | 96 + sklearn/src/cblas/ATL_drefgemvT.c | 96 + sklearn/src/cblas/ATL_drefger.c | 147 + sklearn/src/cblas/ATL_drefrot.c | 170 + sklearn/src/cblas/ATL_drefrotg.c | 146 + sklearn/src/cblas/ATL_dsrefdot.c | 141 + sklearn/src/cblas/ATL_srefasum.c | 133 + 
sklearn/src/cblas/ATL_srefcopy.c | 148 + sklearn/src/cblas/ATL_srefnrm2.c | 204 + sklearn/src/cblas/ATL_srefrot.c | 170 + sklearn/src/cblas/ATL_srefrotg.c | 146 + sklearn/src/cblas/README.txt | 11 + sklearn/src/cblas/atlas_aux.h | 942 +++ sklearn/src/cblas/atlas_dsysinfo.h | 0 sklearn/src/cblas/atlas_enum.h | 55 + sklearn/src/cblas/atlas_level1.h | 129 + sklearn/src/cblas/atlas_level2.h | 338 + sklearn/src/cblas/atlas_misc.h | 477 ++ sklearn/src/cblas/atlas_refalias1.h | 59 + sklearn/src/cblas/atlas_refalias2.h | 85 + sklearn/src/cblas/atlas_reflevel1.h | 423 ++ sklearn/src/cblas/atlas_reflevel2.h | 792 +++ sklearn/src/cblas/atlas_reflvl2.h | 3187 +++++++++ sklearn/src/cblas/atlas_refmisc.h | 370 + sklearn/src/cblas/atlas_ssysinfo.h | 0 sklearn/src/cblas/atlas_type.h | 0 sklearn/src/cblas/cblas.h | 596 ++ sklearn/src/cblas/cblas_dasum.c | 44 + sklearn/src/cblas/cblas_daxpy.c | 159 + sklearn/src/cblas/cblas_dcopy.c | 52 + sklearn/src/cblas/cblas_ddot.c | 135 + sklearn/src/cblas/cblas_dgemv.c | 102 + sklearn/src/cblas/cblas_dger.c | 85 + sklearn/src/cblas/cblas_dnrm2.c | 206 + sklearn/src/cblas/cblas_drot.c | 60 + sklearn/src/cblas/cblas_drotg.c | 42 + sklearn/src/cblas/cblas_dscal.c | 183 + sklearn/src/cblas/cblas_errprn.c | 50 + sklearn/src/cblas/cblas_sasum.c | 44 + sklearn/src/cblas/cblas_saxpy.c | 156 + sklearn/src/cblas/cblas_scopy.c | 52 + sklearn/src/cblas/cblas_sdot.c | 132 + sklearn/src/cblas/cblas_snrm2.c | 44 + sklearn/src/cblas/cblas_srot.c | 60 + sklearn/src/cblas/cblas_srotg.c | 42 + sklearn/src/cblas/cblas_xerbla.c | 53 + sklearn/svm/__init__.py | 28 + sklearn/svm/base.py | 929 +++ sklearn/svm/bounds.py | 75 + sklearn/svm/classes.py | 1052 +++ sklearn/svm/liblinear.pxd | 33 + sklearn/svm/liblinear.pyx | 90 + sklearn/svm/libsvm.pxd | 69 + sklearn/svm/libsvm.pyx | 576 ++ sklearn/svm/libsvm_sparse.pyx | 414 ++ sklearn/svm/setup.py | 81 + sklearn/svm/src/liblinear/COPYRIGHT | 31 + sklearn/svm/src/liblinear/liblinear_helper.c | 244 + sklearn/svm/src/liblinear/linear.cpp | 2983 ++++++++ sklearn/svm/src/liblinear/linear.h | 83 + sklearn/svm/src/liblinear/tron.cpp | 226 + sklearn/svm/src/liblinear/tron.h | 34 + sklearn/svm/src/libsvm/LIBSVM_CHANGES | 8 + sklearn/svm/src/libsvm/libsvm_helper.c | 401 ++ sklearn/svm/src/libsvm/libsvm_sparse_helper.c | 451 ++ sklearn/svm/src/libsvm/libsvm_template.cpp | 8 + sklearn/svm/src/libsvm/svm.cpp | 3113 ++++++++ sklearn/svm/src/libsvm/svm.h | 172 + sklearn/svm/tests/__init__.py | 0 sklearn/svm/tests/test_bounds.py | 78 + sklearn/svm/tests/test_sparse.py | 343 + sklearn/svm/tests/test_svm.py | 951 +++ sklearn/tests/__init__.py | 0 sklearn/tests/test_base.py | 366 + sklearn/tests/test_calibration.py | 277 + sklearn/tests/test_check_build.py | 14 + sklearn/tests/test_common.py | 233 + sklearn/tests/test_cross_validation.py | 1252 ++++ sklearn/tests/test_discriminant_analysis.py | 358 + sklearn/tests/test_dummy.py | 599 ++ sklearn/tests/test_grid_search.py | 787 ++ sklearn/tests/test_init.py | 20 + sklearn/tests/test_isotonic.py | 431 ++ sklearn/tests/test_kernel_approximation.py | 216 + sklearn/tests/test_kernel_ridge.py | 85 + sklearn/tests/test_learning_curve.py | 259 + sklearn/tests/test_metaestimators.py | 138 + sklearn/tests/test_multiclass.py | 651 ++ sklearn/tests/test_multioutput.py | 184 + sklearn/tests/test_naive_bayes.py | 538 ++ sklearn/tests/test_pipeline.py | 729 ++ sklearn/tests/test_random_projection.py | 354 + sklearn/tree/__init__.py | 13 + sklearn/tree/_criterion.pxd | 66 + sklearn/tree/_criterion.pyx | 1322 ++++ 
sklearn/tree/_splitter.pxd | 98 + sklearn/tree/_splitter.pyx | 1610 +++++ sklearn/tree/_tree.pxd | 104 + sklearn/tree/_tree.pyx | 1109 +++ sklearn/tree/_utils.pxd | 159 + sklearn/tree/_utils.pyx | 673 ++ sklearn/tree/export.py | 447 ++ sklearn/tree/setup.py | 39 + sklearn/tree/tests/__init__.py | 0 sklearn/tree/tests/test_export.py | 239 + sklearn/tree/tests/test_tree.py | 1508 ++++ sklearn/tree/tree.py | 1065 +++ sklearn/utils/__init__.py | 445 ++ sklearn/utils/_logistic_sigmoid.pyx | 27 + sklearn/utils/_random.pxd | 14 + sklearn/utils/_random.pyx | 302 + sklearn/utils/_scipy_sparse_lsqr_backport.py | 508 ++ sklearn/utils/arpack.py | 1859 +++++ sklearn/utils/arrayfuncs.pyx | 64 + sklearn/utils/bench.py | 17 + sklearn/utils/class_weight.py | 185 + sklearn/utils/deprecation.py | 85 + sklearn/utils/estimator_checks.py | 1556 ++++ sklearn/utils/extmath.py | 864 +++ sklearn/utils/fast_dict.pxd | 24 + sklearn/utils/fast_dict.pyx | 155 + sklearn/utils/fixes.py | 402 ++ sklearn/utils/graph.py | 183 + sklearn/utils/graph_shortest_path.pyx | 610 ++ sklearn/utils/lgamma.pxd | 1 + sklearn/utils/lgamma.pyx | 8 + sklearn/utils/linear_assignment_.py | 284 + sklearn/utils/metaestimators.py | 115 + sklearn/utils/mocking.py | 74 + sklearn/utils/multiclass.py | 435 ++ sklearn/utils/murmurhash.pxd | 21 + sklearn/utils/murmurhash.pyx | 131 + sklearn/utils/optimize.py | 204 + sklearn/utils/random.py | 288 + sklearn/utils/seq_dataset.pxd | 51 + sklearn/utils/seq_dataset.pyx | 300 + sklearn/utils/setup.py | 84 + sklearn/utils/sparsefuncs.py | 470 ++ sklearn/utils/sparsefuncs_fast.pyx | 423 ++ sklearn/utils/sparsetools/README | 1 + sklearn/utils/sparsetools/__init__.py | 5 + sklearn/utils/sparsetools/_graph_tools.pyx | 460 ++ .../utils/sparsetools/_graph_validation.py | 58 + sklearn/utils/sparsetools/_traversal.pyx | 748 ++ sklearn/utils/sparsetools/setup.py | 26 + sklearn/utils/sparsetools/tests/__init__.py | 0 .../utils/sparsetools/tests/test_traversal.py | 56 + sklearn/utils/src/MurmurHash3.cpp | 346 + sklearn/utils/src/MurmurHash3.h | 45 + sklearn/utils/src/cholesky_delete.h | 76 + sklearn/utils/src/gamma.c | 155 + sklearn/utils/src/gamma.h | 8 + sklearn/utils/stats.py | 59 + sklearn/utils/testing.py | 826 +++ sklearn/utils/tests/__init__.py | 0 sklearn/utils/tests/test_bench.py | 11 + sklearn/utils/tests/test_class_weight.py | 281 + sklearn/utils/tests/test_estimator_checks.py | 109 + sklearn/utils/tests/test_extmath.py | 660 ++ sklearn/utils/tests/test_fast_dict.py | 32 + sklearn/utils/tests/test_fixes.py | 55 + sklearn/utils/tests/test_graph.py | 24 + sklearn/utils/tests/test_linear_assignment.py | 60 + sklearn/utils/tests/test_metaestimators.py | 80 + sklearn/utils/tests/test_multiclass.py | 374 + sklearn/utils/tests/test_murmurhash.py | 80 + sklearn/utils/tests/test_optimize.py | 32 + sklearn/utils/tests/test_random.py | 182 + sklearn/utils/tests/test_seq_dataset.py | 84 + sklearn/utils/tests/test_shortest_path.py | 95 + sklearn/utils/tests/test_sparsefuncs.py | 485 ++ sklearn/utils/tests/test_stats.py | 23 + sklearn/utils/tests/test_testing.py | 233 + sklearn/utils/tests/test_utils.py | 267 + sklearn/utils/tests/test_validation.py | 471 ++ sklearn/utils/validation.py | 707 ++ sklearn/utils/weight_vector.pxd | 29 + sklearn/utils/weight_vector.pyx | 199 + 1043 files changed, 279252 insertions(+) create mode 100644 .coveragerc create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 .landscape.yml create mode 100644 .mailmap create mode 100644 .travis.yml create mode 100644 
AUTHORS.rst create mode 100644 CONTRIBUTING.md create mode 100644 COPYING create mode 100644 ISSUE_TEMPLATE.md create mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 PULL_REQUEST_TEMPLATE.md create mode 100644 README.rst create mode 100644 appveyor.yml create mode 100644 benchmarks/bench_20newsgroups.py create mode 100644 benchmarks/bench_covertype.py create mode 100644 benchmarks/bench_glm.py create mode 100644 benchmarks/bench_glmnet.py create mode 100644 benchmarks/bench_isolation_forest.py create mode 100644 benchmarks/bench_isotonic.py create mode 100644 benchmarks/bench_lasso.py create mode 100644 benchmarks/bench_mnist.py create mode 100755 benchmarks/bench_multilabel_metrics.py create mode 100644 benchmarks/bench_plot_approximate_neighbors.py create mode 100644 benchmarks/bench_plot_fastkmeans.py create mode 100644 benchmarks/bench_plot_incremental_pca.py create mode 100644 benchmarks/bench_plot_lasso_path.py create mode 100644 benchmarks/bench_plot_neighbors.py create mode 100644 benchmarks/bench_plot_nmf.py create mode 100644 benchmarks/bench_plot_omp_lars.py create mode 100644 benchmarks/bench_plot_parallel_pairwise.py create mode 100644 benchmarks/bench_plot_randomized_svd.py create mode 100644 benchmarks/bench_plot_svd.py create mode 100644 benchmarks/bench_plot_ward.py create mode 100644 benchmarks/bench_random_projections.py create mode 100644 benchmarks/bench_rcv1_logreg_convergence.py create mode 100644 benchmarks/bench_sample_without_replacement.py create mode 100644 benchmarks/bench_sgd_regression.py create mode 100644 benchmarks/bench_sparsify.py create mode 100644 benchmarks/bench_tree.py create mode 100644 build_tools/appveyor/install.ps1 create mode 100644 build_tools/appveyor/requirements.txt create mode 100644 build_tools/appveyor/run_with_env.cmd create mode 100755 build_tools/circle/build_doc.sh create mode 100644 build_tools/circle/check_build_doc.py create mode 100755 build_tools/circle/push_doc.sh create mode 100755 build_tools/cythonize.py create mode 100755 build_tools/travis/after_success.sh create mode 100755 build_tools/travis/flake8_diff.sh create mode 100755 build_tools/travis/install.sh create mode 100755 build_tools/travis/test_script.sh create mode 100644 build_tools/windows/windows_testing_downloader.ps1 create mode 100644 circle.yml create mode 100644 doc/Makefile create mode 100644 doc/README.md create mode 100644 doc/about.rst create mode 100644 doc/conf.py create mode 100644 doc/data_transforms.rst create mode 100644 doc/datasets/covtype.rst create mode 100644 doc/datasets/index.rst create mode 100644 doc/datasets/kddcup99.rst create mode 100644 doc/datasets/labeled_faces.rst create mode 100644 doc/datasets/labeled_faces_fixture.py create mode 100644 doc/datasets/mldata.rst create mode 100644 doc/datasets/mldata_fixture.py create mode 100644 doc/datasets/olivetti_faces.rst create mode 100644 doc/datasets/rcv1.rst create mode 100644 doc/datasets/rcv1_fixture.py create mode 100644 doc/datasets/twenty_newsgroups.rst create mode 100644 doc/datasets/twenty_newsgroups_fixture.py create mode 100644 doc/developers/advanced_installation.rst create mode 100644 doc/developers/contributing.rst create mode 100644 doc/developers/debugging.rst create mode 100644 doc/developers/index.rst create mode 100644 doc/developers/maintainer.rst create mode 100644 doc/developers/performance.rst create mode 100644 doc/developers/utilities.rst create mode 100644 doc/documentation.rst create mode 100644 doc/faq.rst create mode 100644 
doc/images/cds-logo.png create mode 100644 doc/images/inria-logo.jpg create mode 100644 doc/images/iris.pdf create mode 100644 doc/images/iris.svg create mode 100644 doc/images/last_digit.png create mode 100644 doc/images/lda_model_graph.png create mode 100644 doc/images/ml_map.png create mode 100644 doc/images/multilayerperceptron_network.png create mode 100644 doc/images/no_image.png create mode 100644 doc/images/nyu_short_color.png create mode 100644 doc/images/plot_digits_classification.png create mode 100644 doc/images/plot_face_recognition_1.png create mode 100644 doc/images/plot_face_recognition_2.png create mode 100644 doc/images/rbm_graph.png create mode 100644 doc/images/scikit-learn-logo-notext.png create mode 100644 doc/includes/big_toc_css.rst create mode 100644 doc/includes/bigger_toc_css.rst create mode 100644 doc/index.rst create mode 100644 doc/install.rst create mode 100755 doc/logos/favicon.ico create mode 100755 doc/logos/identity.pdf create mode 100644 doc/logos/scikit-learn-logo-notext.png create mode 100644 doc/logos/scikit-learn-logo-small.png create mode 100644 doc/logos/scikit-learn-logo-thumb.png create mode 100644 doc/logos/scikit-learn-logo.bmp create mode 100644 doc/logos/scikit-learn-logo.png create mode 100644 doc/logos/scikit-learn-logo.svg create mode 100644 doc/make.bat create mode 100644 doc/model_selection.rst create mode 100644 doc/modules/biclustering.rst create mode 100644 doc/modules/calibration.rst create mode 100644 doc/modules/classes.rst create mode 100644 doc/modules/clustering.rst create mode 100644 doc/modules/computational_performance.rst create mode 100644 doc/modules/covariance.rst create mode 100644 doc/modules/cross_decomposition.rst create mode 100644 doc/modules/cross_validation.rst create mode 100644 doc/modules/decomposition.rst create mode 100644 doc/modules/density.rst create mode 100644 doc/modules/dp-derivation.rst create mode 100644 doc/modules/ensemble.rst create mode 100644 doc/modules/feature_extraction.rst create mode 100644 doc/modules/feature_selection.rst create mode 100644 doc/modules/gaussian_process.rst create mode 100644 doc/modules/glm_data/lasso_enet_coordinate_descent.png create mode 100644 doc/modules/grid_search.rst create mode 100644 doc/modules/isotonic.rst create mode 100644 doc/modules/kernel_approximation.rst create mode 100644 doc/modules/kernel_ridge.rst create mode 100644 doc/modules/label_propagation.rst create mode 100644 doc/modules/lda_qda.rst create mode 100644 doc/modules/learning_curve.rst create mode 100644 doc/modules/linear_model.rst create mode 100644 doc/modules/manifold.rst create mode 100644 doc/modules/metrics.rst create mode 100644 doc/modules/mixture.rst create mode 100644 doc/modules/model_evaluation.rst create mode 100644 doc/modules/model_persistence.rst create mode 100644 doc/modules/multiclass.rst create mode 100644 doc/modules/naive_bayes.rst create mode 100644 doc/modules/neighbors.rst create mode 100644 doc/modules/neural_networks_supervised.rst create mode 100644 doc/modules/neural_networks_unsupervised.rst create mode 100644 doc/modules/outlier_detection.rst create mode 100644 doc/modules/pipeline.rst create mode 100644 doc/modules/preprocessing.rst create mode 100644 doc/modules/preprocessing_targets.rst create mode 100644 doc/modules/random_projection.rst create mode 100644 doc/modules/scaling_strategies.rst create mode 100644 doc/modules/sgd.rst create mode 100644 doc/modules/svm.rst create mode 100644 doc/modules/tree.rst create mode 100644 
doc/modules/unsupervised_reduction.rst create mode 100644 doc/preface.rst create mode 100644 doc/presentations.rst create mode 100644 doc/related_projects.rst create mode 100644 doc/sphinxext/LICENSE.txt create mode 100644 doc/sphinxext/MANIFEST.in create mode 100644 doc/sphinxext/README.txt create mode 100644 doc/sphinxext/github_link.py create mode 100644 doc/sphinxext/numpy_ext/__init__.py create mode 100644 doc/sphinxext/numpy_ext/docscrape.py create mode 100644 doc/sphinxext/numpy_ext/docscrape_sphinx.py create mode 100644 doc/sphinxext/numpy_ext/numpydoc.py create mode 100644 doc/sphinxext/sphinx_gallery/__init__.py create mode 100644 doc/sphinxext/sphinx_gallery/_static/broken_example.png create mode 100644 doc/sphinxext/sphinx_gallery/_static/gallery.css create mode 100644 doc/sphinxext/sphinx_gallery/_static/no_image.png create mode 100644 doc/sphinxext/sphinx_gallery/backreferences.py create mode 100644 doc/sphinxext/sphinx_gallery/docs_resolv.py create mode 100644 doc/sphinxext/sphinx_gallery/gen_gallery.py create mode 100644 doc/sphinxext/sphinx_gallery/gen_rst.py create mode 100644 doc/sphinxext/sphinx_gallery/notebook.py create mode 100644 doc/supervised_learning.rst create mode 100644 doc/support.rst create mode 100644 doc/templates/class.rst create mode 100644 doc/templates/class_with_call.rst create mode 100644 doc/templates/class_without_init.rst create mode 100644 doc/templates/function.rst create mode 100644 doc/testimonials/README.txt create mode 100644 doc/testimonials/images/Makefile create mode 100644 doc/testimonials/images/aweber.png create mode 100644 doc/testimonials/images/bestofmedia-logo.png create mode 100644 doc/testimonials/images/betaworks.png create mode 100644 doc/testimonials/images/birchbox.jpg create mode 100644 doc/testimonials/images/booking.png create mode 100644 doc/testimonials/images/change-logo.png create mode 100644 doc/testimonials/images/dataiku_logo.png create mode 100644 doc/testimonials/images/datapublica.png create mode 100644 doc/testimonials/images/datarobot.png create mode 100644 doc/testimonials/images/evernote.png create mode 100644 doc/testimonials/images/howaboutwe.png create mode 100644 doc/testimonials/images/infonea.jpg create mode 100644 doc/testimonials/images/inria.png create mode 100644 doc/testimonials/images/lovely.png create mode 100644 doc/testimonials/images/machinalis.png create mode 100644 doc/testimonials/images/okcupid.png create mode 100644 doc/testimonials/images/ottogroup_logo.png create mode 100644 doc/testimonials/images/peerindex.png create mode 100644 doc/testimonials/images/phimeca.png create mode 100644 doc/testimonials/images/rangespan.png create mode 100644 doc/testimonials/images/solido_logo.png create mode 100644 doc/testimonials/images/spotify.png create mode 100644 doc/testimonials/images/telecomparistech.jpg create mode 100644 doc/testimonials/images/yhat.png create mode 100644 doc/testimonials/testimonials.rst create mode 100644 doc/themes/scikit-learn/layout.html create mode 100644 doc/themes/scikit-learn/static/ML_MAPS_README.rst create mode 100644 doc/themes/scikit-learn/static/css/bootstrap-responsive.css create mode 100644 doc/themes/scikit-learn/static/css/bootstrap-responsive.min.css create mode 100644 doc/themes/scikit-learn/static/css/bootstrap.css create mode 100644 doc/themes/scikit-learn/static/css/bootstrap.min.css create mode 100644 doc/themes/scikit-learn/static/css/examples.css create mode 100644 doc/themes/scikit-learn/static/gallery.css create mode 100644 
doc/themes/scikit-learn/static/img/FNRS-logo.png create mode 100644 doc/themes/scikit-learn/static/img/forkme.png create mode 100644 doc/themes/scikit-learn/static/img/glyphicons-halflings-white.png create mode 100644 doc/themes/scikit-learn/static/img/glyphicons-halflings.png create mode 100644 doc/themes/scikit-learn/static/img/google.png create mode 100644 doc/themes/scikit-learn/static/img/inria-small.jpg create mode 100644 doc/themes/scikit-learn/static/img/inria-small.png create mode 100644 doc/themes/scikit-learn/static/img/nyu_short_color.png create mode 100644 doc/themes/scikit-learn/static/img/plot_classifier_comparison_1.png create mode 100644 doc/themes/scikit-learn/static/img/plot_manifold_sphere_1.png create mode 100644 doc/themes/scikit-learn/static/img/scikit-learn-logo-notext.png create mode 100644 doc/themes/scikit-learn/static/img/scikit-learn-logo-small.png create mode 100644 doc/themes/scikit-learn/static/img/scikit-learn-logo.png create mode 100644 doc/themes/scikit-learn/static/img/scikit-learn-logo.svg create mode 100644 doc/themes/scikit-learn/static/img/telecom.png create mode 100644 doc/themes/scikit-learn/static/jquery.maphilight.js create mode 100644 doc/themes/scikit-learn/static/js/bootstrap.js create mode 100644 doc/themes/scikit-learn/static/js/copybutton.js create mode 100644 doc/themes/scikit-learn/static/nature.css_t create mode 100644 doc/themes/scikit-learn/theme.conf create mode 100644 doc/tune_toc.rst create mode 100644 doc/tutorial/basic/tutorial.rst create mode 100644 doc/tutorial/common_includes/info.txt create mode 100644 doc/tutorial/index.rst create mode 100644 doc/tutorial/machine_learning_map/ML_MAPS_README.txt create mode 100644 doc/tutorial/machine_learning_map/index.rst create mode 100644 doc/tutorial/machine_learning_map/parse_path.py create mode 100644 doc/tutorial/machine_learning_map/pyparsing.py create mode 100644 doc/tutorial/machine_learning_map/svg2imagemap.py create mode 100644 doc/tutorial/statistical_inference/finding_help.rst create mode 100644 doc/tutorial/statistical_inference/index.rst create mode 100644 doc/tutorial/statistical_inference/model_selection.rst create mode 100644 doc/tutorial/statistical_inference/putting_together.rst create mode 100644 doc/tutorial/statistical_inference/settings.rst create mode 100644 doc/tutorial/statistical_inference/supervised_learning.rst create mode 100644 doc/tutorial/statistical_inference/unsupervised_learning.rst create mode 100644 doc/tutorial/statistical_inference/unsupervised_learning_fixture.py create mode 100644 doc/tutorial/text_analytics/.gitignore create mode 100644 doc/tutorial/text_analytics/data/languages/fetch_data.py create mode 100644 doc/tutorial/text_analytics/data/movie_reviews/fetch_data.py create mode 100644 doc/tutorial/text_analytics/data/twenty_newsgroups/fetch_data.py create mode 100644 doc/tutorial/text_analytics/skeletons/exercise_01_language_train_model.py create mode 100644 doc/tutorial/text_analytics/skeletons/exercise_02_sentiment.py create mode 100644 doc/tutorial/text_analytics/solutions/exercise_01_language_train_model.py create mode 100644 doc/tutorial/text_analytics/solutions/exercise_02_sentiment.py create mode 100644 doc/tutorial/text_analytics/solutions/generate_skeletons.py create mode 100644 doc/tutorial/text_analytics/working_with_text_data.rst create mode 100644 doc/tutorial/text_analytics/working_with_text_data_fixture.py create mode 100644 doc/unsupervised_learning.rst create mode 100644 doc/user_guide.rst create mode 100644 doc/whats_new.rst 
create mode 100644 examples/README.txt create mode 100644 examples/applications/README.txt create mode 100644 examples/applications/face_recognition.py create mode 100644 examples/applications/plot_model_complexity_influence.py create mode 100644 examples/applications/plot_out_of_core_classification.py create mode 100644 examples/applications/plot_outlier_detection_housing.py create mode 100644 examples/applications/plot_prediction_latency.py create mode 100644 examples/applications/plot_species_distribution_modeling.py create mode 100644 examples/applications/plot_stock_market.py create mode 100644 examples/applications/plot_tomography_l1_reconstruction.py create mode 100644 examples/applications/svm_gui.py create mode 100644 examples/applications/topics_extraction_with_nmf_lda.py create mode 100644 examples/applications/wikipedia_principal_eigenvector.py create mode 100644 examples/bicluster/README.txt create mode 100644 examples/bicluster/bicluster_newsgroups.py create mode 100644 examples/bicluster/plot_spectral_biclustering.py create mode 100644 examples/bicluster/plot_spectral_coclustering.py create mode 100644 examples/calibration/README.txt create mode 100644 examples/calibration/plot_calibration.py create mode 100644 examples/calibration/plot_calibration_curve.py create mode 100644 examples/calibration/plot_calibration_multiclass.py create mode 100644 examples/calibration/plot_compare_calibration.py create mode 100644 examples/classification/README.txt create mode 100644 examples/classification/plot_classification_probability.py create mode 100644 examples/classification/plot_classifier_comparison.py create mode 100644 examples/classification/plot_digits_classification.py create mode 100644 examples/classification/plot_lda.py create mode 100644 examples/classification/plot_lda_qda.py create mode 100644 examples/cluster/README.txt create mode 100644 examples/cluster/plot_adjusted_for_chance_measures.py create mode 100644 examples/cluster/plot_affinity_propagation.py create mode 100644 examples/cluster/plot_agglomerative_clustering.py create mode 100644 examples/cluster/plot_agglomerative_clustering_metrics.py create mode 100644 examples/cluster/plot_birch_vs_minibatchkmeans.py create mode 100644 examples/cluster/plot_cluster_comparison.py create mode 100644 examples/cluster/plot_cluster_iris.py create mode 100644 examples/cluster/plot_color_quantization.py create mode 100644 examples/cluster/plot_dbscan.py create mode 100644 examples/cluster/plot_dict_face_patches.py create mode 100644 examples/cluster/plot_digits_agglomeration.py create mode 100644 examples/cluster/plot_digits_linkage.py create mode 100644 examples/cluster/plot_face_compress.py create mode 100644 examples/cluster/plot_face_segmentation.py create mode 100644 examples/cluster/plot_face_ward_segmentation.py create mode 100644 examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py create mode 100644 examples/cluster/plot_kmeans_assumptions.py create mode 100644 examples/cluster/plot_kmeans_digits.py create mode 100644 examples/cluster/plot_kmeans_silhouette_analysis.py create mode 100644 examples/cluster/plot_kmeans_stability_low_dim_dense.py create mode 100644 examples/cluster/plot_mean_shift.py create mode 100644 examples/cluster/plot_mini_batch_kmeans.py create mode 100644 examples/cluster/plot_segmentation_toy.py create mode 100644 examples/cluster/plot_ward_structured_vs_unstructured.py create mode 100644 examples/covariance/README.txt create mode 100644 
examples/covariance/plot_covariance_estimation.py create mode 100644 examples/covariance/plot_lw_vs_oas.py create mode 100644 examples/covariance/plot_mahalanobis_distances.py create mode 100644 examples/covariance/plot_outlier_detection.py create mode 100644 examples/covariance/plot_robust_vs_empirical_covariance.py create mode 100644 examples/covariance/plot_sparse_cov.py create mode 100644 examples/cross_decomposition/README.txt create mode 100644 examples/cross_decomposition/plot_compare_cross_decomposition.py create mode 100644 examples/datasets/README.txt create mode 100644 examples/datasets/plot_digits_last_image.py create mode 100644 examples/datasets/plot_iris_dataset.py create mode 100644 examples/datasets/plot_random_dataset.py create mode 100644 examples/datasets/plot_random_multilabel_dataset.py create mode 100644 examples/decomposition/README.txt create mode 100644 examples/decomposition/plot_faces_decomposition.py create mode 100644 examples/decomposition/plot_ica_blind_source_separation.py create mode 100644 examples/decomposition/plot_ica_vs_pca.py create mode 100644 examples/decomposition/plot_image_denoising.py create mode 100644 examples/decomposition/plot_incremental_pca.py create mode 100644 examples/decomposition/plot_kernel_pca.py create mode 100644 examples/decomposition/plot_pca_3d.py create mode 100644 examples/decomposition/plot_pca_iris.py create mode 100644 examples/decomposition/plot_pca_vs_fa_model_selection.py create mode 100644 examples/decomposition/plot_pca_vs_lda.py create mode 100644 examples/decomposition/plot_sparse_coding.py create mode 100644 examples/ensemble/README.txt create mode 100644 examples/ensemble/plot_adaboost_hastie_10_2.py create mode 100644 examples/ensemble/plot_adaboost_multiclass.py create mode 100644 examples/ensemble/plot_adaboost_regression.py create mode 100644 examples/ensemble/plot_adaboost_twoclass.py create mode 100644 examples/ensemble/plot_bias_variance.py create mode 100644 examples/ensemble/plot_ensemble_oob.py create mode 100644 examples/ensemble/plot_feature_transformation.py create mode 100644 examples/ensemble/plot_forest_importances.py create mode 100644 examples/ensemble/plot_forest_importances_faces.py create mode 100644 examples/ensemble/plot_forest_iris.py create mode 100644 examples/ensemble/plot_gradient_boosting_oob.py create mode 100644 examples/ensemble/plot_gradient_boosting_quantile.py create mode 100644 examples/ensemble/plot_gradient_boosting_regression.py create mode 100644 examples/ensemble/plot_gradient_boosting_regularization.py create mode 100644 examples/ensemble/plot_isolation_forest.py create mode 100644 examples/ensemble/plot_partial_dependence.py create mode 100644 examples/ensemble/plot_random_forest_embedding.py create mode 100644 examples/ensemble/plot_random_forest_regression_multioutput.py create mode 100644 examples/ensemble/plot_voting_decision_regions.py create mode 100644 examples/ensemble/plot_voting_probas.py create mode 100644 examples/exercises/README.txt create mode 100644 examples/exercises/digits_classification_exercise.py create mode 100644 examples/exercises/plot_cv_diabetes.py create mode 100644 examples/exercises/plot_cv_digits.py create mode 100644 examples/exercises/plot_iris_exercise.py create mode 100644 examples/feature_selection/README.txt create mode 100644 examples/feature_selection/feature_selection_pipeline.py create mode 100644 examples/feature_selection/plot_f_test_vs_mi.py create mode 100644 examples/feature_selection/plot_feature_selection.py create mode 
100644 examples/feature_selection/plot_permutation_test_for_classification.py create mode 100644 examples/feature_selection/plot_rfe_digits.py create mode 100644 examples/feature_selection/plot_rfe_with_cross_validation.py create mode 100644 examples/feature_selection/plot_select_from_model_boston.py create mode 100644 examples/feature_stacker.py create mode 100644 examples/gaussian_process/README.txt create mode 100644 examples/gaussian_process/plot_compare_gpr_krr.py create mode 100644 examples/gaussian_process/plot_gpc.py create mode 100644 examples/gaussian_process/plot_gpc_iris.py create mode 100644 examples/gaussian_process/plot_gpc_isoprobability.py create mode 100644 examples/gaussian_process/plot_gpc_xor.py create mode 100644 examples/gaussian_process/plot_gpr_co2.py create mode 100644 examples/gaussian_process/plot_gpr_noisy.py create mode 100644 examples/gaussian_process/plot_gpr_noisy_targets.py create mode 100644 examples/gaussian_process/plot_gpr_prior_posterior.py create mode 100644 examples/hetero_feature_union.py create mode 100644 examples/linear_model/README.txt create mode 100644 examples/linear_model/lasso_dense_vs_sparse_data.py create mode 100644 examples/linear_model/plot_ard.py create mode 100644 examples/linear_model/plot_bayesian_ridge.py create mode 100644 examples/linear_model/plot_huber_vs_ridge.py create mode 100644 examples/linear_model/plot_iris_logistic.py create mode 100644 examples/linear_model/plot_lasso_and_elasticnet.py create mode 100644 examples/linear_model/plot_lasso_coordinate_descent_path.py create mode 100644 examples/linear_model/plot_lasso_lars.py create mode 100644 examples/linear_model/plot_lasso_model_selection.py create mode 100644 examples/linear_model/plot_logistic.py create mode 100644 examples/linear_model/plot_logistic_l1_l2_sparsity.py create mode 100644 examples/linear_model/plot_logistic_multinomial.py create mode 100644 examples/linear_model/plot_logistic_path.py create mode 100644 examples/linear_model/plot_multi_task_lasso_support.py create mode 100644 examples/linear_model/plot_ols.py create mode 100644 examples/linear_model/plot_ols_3d.py create mode 100644 examples/linear_model/plot_ols_ridge_variance.py create mode 100644 examples/linear_model/plot_omp.py create mode 100644 examples/linear_model/plot_polynomial_interpolation.py create mode 100644 examples/linear_model/plot_ransac.py create mode 100644 examples/linear_model/plot_ridge_coeffs.py create mode 100644 examples/linear_model/plot_ridge_path.py create mode 100644 examples/linear_model/plot_robust_fit.py create mode 100644 examples/linear_model/plot_sgd_comparison.py create mode 100644 examples/linear_model/plot_sgd_iris.py create mode 100644 examples/linear_model/plot_sgd_loss_functions.py create mode 100644 examples/linear_model/plot_sgd_penalties.py create mode 100644 examples/linear_model/plot_sgd_separating_hyperplane.py create mode 100644 examples/linear_model/plot_sgd_weighted_samples.py create mode 100644 examples/linear_model/plot_sparse_recovery.py create mode 100644 examples/linear_model/plot_theilsen.py create mode 100644 examples/manifold/README.txt create mode 100644 examples/manifold/plot_compare_methods.py create mode 100644 examples/manifold/plot_lle_digits.py create mode 100644 examples/manifold/plot_manifold_sphere.py create mode 100644 examples/manifold/plot_mds.py create mode 100644 examples/manifold/plot_swissroll.py create mode 100644 examples/missing_values.py create mode 100644 examples/mixture/README.txt create mode 100644 
examples/mixture/plot_concentration_prior.py create mode 100644 examples/mixture/plot_gmm.py create mode 100644 examples/mixture/plot_gmm_covariances.py create mode 100644 examples/mixture/plot_gmm_pdf.py create mode 100644 examples/mixture/plot_gmm_selection.py create mode 100644 examples/mixture/plot_gmm_sin.py create mode 100644 examples/model_selection/README.txt create mode 100644 examples/model_selection/grid_search_digits.py create mode 100644 examples/model_selection/grid_search_text_feature_extraction.py create mode 100644 examples/model_selection/plot_confusion_matrix.py create mode 100644 examples/model_selection/plot_learning_curve.py create mode 100644 examples/model_selection/plot_nested_cross_validation_iris.py create mode 100644 examples/model_selection/plot_precision_recall.py create mode 100644 examples/model_selection/plot_roc.py create mode 100644 examples/model_selection/plot_roc_crossval.py create mode 100644 examples/model_selection/plot_train_error_vs_test_error.py create mode 100644 examples/model_selection/plot_underfitting_overfitting.py create mode 100644 examples/model_selection/plot_validation_curve.py create mode 100644 examples/model_selection/randomized_search.py create mode 100644 examples/neighbors/README.txt create mode 100644 examples/neighbors/plot_approximate_nearest_neighbors_hyperparameters.py create mode 100644 examples/neighbors/plot_approximate_nearest_neighbors_scalability.py create mode 100644 examples/neighbors/plot_classification.py create mode 100644 examples/neighbors/plot_digits_kde_sampling.py create mode 100644 examples/neighbors/plot_kde_1d.py create mode 100644 examples/neighbors/plot_nearest_centroid.py create mode 100644 examples/neighbors/plot_regression.py create mode 100644 examples/neighbors/plot_species_kde.py create mode 100644 examples/neural_networks/README.txt create mode 100644 examples/neural_networks/plot_mlp_alpha.py create mode 100644 examples/neural_networks/plot_mlp_training_curves.py create mode 100644 examples/neural_networks/plot_mnist_filters.py create mode 100644 examples/neural_networks/plot_rbm_logistic_classification.py create mode 100644 examples/plot_compare_reduction.py create mode 100644 examples/plot_cv_predict.py create mode 100644 examples/plot_digits_pipe.py create mode 100644 examples/plot_isotonic_regression.py create mode 100644 examples/plot_johnson_lindenstrauss_bound.py create mode 100644 examples/plot_kernel_approximation.py create mode 100644 examples/plot_kernel_ridge_regression.py create mode 100644 examples/plot_multilabel.py create mode 100644 examples/plot_multioutput_face_completion.py create mode 100644 examples/preprocessing/README.txt create mode 100644 examples/preprocessing/plot_function_transformer.py create mode 100644 examples/preprocessing/plot_robust_scaling.py create mode 100644 examples/semi_supervised/README.txt create mode 100644 examples/semi_supervised/plot_label_propagation_digits.py create mode 100644 examples/semi_supervised/plot_label_propagation_digits_active_learning.py create mode 100644 examples/semi_supervised/plot_label_propagation_structure.py create mode 100644 examples/semi_supervised/plot_label_propagation_versus_svm_iris.py create mode 100644 examples/svm/README.txt create mode 100644 examples/svm/plot_custom_kernel.py create mode 100644 examples/svm/plot_iris.py create mode 100644 examples/svm/plot_oneclass.py create mode 100644 examples/svm/plot_rbf_parameters.py create mode 100644 examples/svm/plot_separating_hyperplane.py create mode 100644 
examples/svm/plot_separating_hyperplane_unbalanced.py create mode 100644 examples/svm/plot_svm_anova.py create mode 100644 examples/svm/plot_svm_kernels.py create mode 100644 examples/svm/plot_svm_margin.py create mode 100644 examples/svm/plot_svm_nonlinear.py create mode 100644 examples/svm/plot_svm_regression.py create mode 100644 examples/svm/plot_svm_scale_c.py create mode 100644 examples/svm/plot_weighted_samples.py create mode 100644 examples/text/README.txt create mode 100644 examples/text/document_classification_20newsgroups.py create mode 100644 examples/text/document_clustering.py create mode 100644 examples/text/hashing_vs_dict_vectorizer.py create mode 100644 examples/text/mlcomp_sparse_document_classification.py create mode 100644 examples/tree/README.txt create mode 100644 examples/tree/plot_iris.py create mode 100644 examples/tree/plot_tree_regression.py create mode 100644 examples/tree/plot_tree_regression_multioutput.py create mode 100644 examples/tree/plot_unveil_tree_structure.py create mode 100644 setup.cfg create mode 100755 setup.py create mode 100644 setup32.cfg create mode 100644 site.cfg create mode 100644 sklearn/__check_build/__init__.py create mode 100644 sklearn/__check_build/_check_build.pyx create mode 100644 sklearn/__check_build/setup.py create mode 100644 sklearn/__init__.py create mode 100644 sklearn/_build_utils/__init__.py create mode 100644 sklearn/_isotonic.pyx create mode 100644 sklearn/base.py create mode 100644 sklearn/calibration.py create mode 100644 sklearn/cluster/__init__.py create mode 100644 sklearn/cluster/_dbscan_inner.pyx create mode 100644 sklearn/cluster/_feature_agglomeration.py create mode 100644 sklearn/cluster/_hierarchical.pyx create mode 100644 sklearn/cluster/_k_means.pyx create mode 100644 sklearn/cluster/_k_means_elkan.pyx create mode 100644 sklearn/cluster/affinity_propagation_.py create mode 100644 sklearn/cluster/bicluster.py create mode 100644 sklearn/cluster/birch.py create mode 100644 sklearn/cluster/dbscan_.py create mode 100644 sklearn/cluster/hierarchical.py create mode 100644 sklearn/cluster/k_means_.py create mode 100644 sklearn/cluster/mean_shift_.py create mode 100644 sklearn/cluster/setup.py create mode 100644 sklearn/cluster/spectral.py create mode 100644 sklearn/cluster/tests/__init__.py create mode 100644 sklearn/cluster/tests/common.py create mode 100644 sklearn/cluster/tests/test_affinity_propagation.py create mode 100644 sklearn/cluster/tests/test_bicluster.py create mode 100644 sklearn/cluster/tests/test_birch.py create mode 100644 sklearn/cluster/tests/test_dbscan.py create mode 100644 sklearn/cluster/tests/test_hierarchical.py create mode 100644 sklearn/cluster/tests/test_k_means.py create mode 100644 sklearn/cluster/tests/test_mean_shift.py create mode 100644 sklearn/cluster/tests/test_spectral.py create mode 100644 sklearn/covariance/__init__.py create mode 100644 sklearn/covariance/empirical_covariance_.py create mode 100644 sklearn/covariance/graph_lasso_.py create mode 100644 sklearn/covariance/outlier_detection.py create mode 100644 sklearn/covariance/robust_covariance.py create mode 100644 sklearn/covariance/shrunk_covariance_.py create mode 100644 sklearn/covariance/tests/__init__.py create mode 100644 sklearn/covariance/tests/test_covariance.py create mode 100644 sklearn/covariance/tests/test_graph_lasso.py create mode 100644 sklearn/covariance/tests/test_robust_covariance.py create mode 100644 sklearn/cross_decomposition/__init__.py create mode 100644 sklearn/cross_decomposition/cca_.py create 
mode 100644 sklearn/cross_decomposition/pls_.py create mode 100644 sklearn/cross_decomposition/tests/__init__.py create mode 100644 sklearn/cross_decomposition/tests/test_pls.py create mode 100644 sklearn/cross_validation.py create mode 100644 sklearn/datasets/__init__.py create mode 100644 sklearn/datasets/_svmlight_format.pyx create mode 100644 sklearn/datasets/base.py create mode 100644 sklearn/datasets/california_housing.py create mode 100644 sklearn/datasets/covtype.py create mode 100644 sklearn/datasets/data/boston_house_prices.csv create mode 100644 sklearn/datasets/data/breast_cancer.csv create mode 100644 sklearn/datasets/data/diabetes_data.csv.gz create mode 100644 sklearn/datasets/data/diabetes_target.csv.gz create mode 100644 sklearn/datasets/data/digits.csv.gz create mode 100644 sklearn/datasets/data/iris.csv create mode 100644 sklearn/datasets/data/linnerud_exercise.csv create mode 100644 sklearn/datasets/data/linnerud_physiological.csv create mode 100644 sklearn/datasets/descr/boston_house_prices.rst create mode 100644 sklearn/datasets/descr/breast_cancer.rst create mode 100644 sklearn/datasets/descr/diabetes.rst create mode 100644 sklearn/datasets/descr/digits.rst create mode 100644 sklearn/datasets/descr/iris.rst create mode 100644 sklearn/datasets/descr/linnerud.rst create mode 100644 sklearn/datasets/images/README.txt create mode 100644 sklearn/datasets/images/china.jpg create mode 100644 sklearn/datasets/images/flower.jpg create mode 100644 sklearn/datasets/kddcup99.py create mode 100644 sklearn/datasets/lfw.py create mode 100644 sklearn/datasets/mlcomp.py create mode 100644 sklearn/datasets/mldata.py create mode 100644 sklearn/datasets/olivetti_faces.py create mode 100644 sklearn/datasets/rcv1.py create mode 100644 sklearn/datasets/samples_generator.py create mode 100644 sklearn/datasets/setup.py create mode 100644 sklearn/datasets/species_distributions.py create mode 100644 sklearn/datasets/svmlight_format.py create mode 100644 sklearn/datasets/tests/__init__.py create mode 100644 sklearn/datasets/tests/data/svmlight_classification.txt create mode 100644 sklearn/datasets/tests/data/svmlight_invalid.txt create mode 100644 sklearn/datasets/tests/data/svmlight_invalid_order.txt create mode 100644 sklearn/datasets/tests/data/svmlight_multilabel.txt create mode 100644 sklearn/datasets/tests/test_20news.py create mode 100644 sklearn/datasets/tests/test_base.py create mode 100644 sklearn/datasets/tests/test_covtype.py create mode 100644 sklearn/datasets/tests/test_kddcup99.py create mode 100644 sklearn/datasets/tests/test_lfw.py create mode 100644 sklearn/datasets/tests/test_mldata.py create mode 100644 sklearn/datasets/tests/test_rcv1.py create mode 100644 sklearn/datasets/tests/test_samples_generator.py create mode 100644 sklearn/datasets/tests/test_svmlight_format.py create mode 100644 sklearn/datasets/twenty_newsgroups.py create mode 100644 sklearn/decomposition/__init__.py create mode 100644 sklearn/decomposition/_online_lda.pyx create mode 100644 sklearn/decomposition/base.py create mode 100644 sklearn/decomposition/cdnmf_fast.pyx create mode 100644 sklearn/decomposition/dict_learning.py create mode 100644 sklearn/decomposition/factor_analysis.py create mode 100644 sklearn/decomposition/fastica_.py create mode 100644 sklearn/decomposition/incremental_pca.py create mode 100644 sklearn/decomposition/kernel_pca.py create mode 100644 sklearn/decomposition/nmf.py create mode 100644 sklearn/decomposition/online_lda.py create mode 100644 sklearn/decomposition/pca.py create 
mode 100644 sklearn/decomposition/setup.py create mode 100644 sklearn/decomposition/sparse_pca.py create mode 100644 sklearn/decomposition/tests/__init__.py create mode 100644 sklearn/decomposition/tests/test_dict_learning.py create mode 100644 sklearn/decomposition/tests/test_factor_analysis.py create mode 100644 sklearn/decomposition/tests/test_fastica.py create mode 100644 sklearn/decomposition/tests/test_incremental_pca.py create mode 100644 sklearn/decomposition/tests/test_kernel_pca.py create mode 100644 sklearn/decomposition/tests/test_nmf.py create mode 100644 sklearn/decomposition/tests/test_online_lda.py create mode 100644 sklearn/decomposition/tests/test_pca.py create mode 100644 sklearn/decomposition/tests/test_sparse_pca.py create mode 100644 sklearn/decomposition/tests/test_truncated_svd.py create mode 100644 sklearn/decomposition/truncated_svd.py create mode 100644 sklearn/discriminant_analysis.py create mode 100644 sklearn/dummy.py create mode 100644 sklearn/ensemble/__init__.py create mode 100644 sklearn/ensemble/_gradient_boosting.pyx create mode 100644 sklearn/ensemble/bagging.py create mode 100644 sklearn/ensemble/base.py create mode 100644 sklearn/ensemble/forest.py create mode 100644 sklearn/ensemble/gradient_boosting.py create mode 100644 sklearn/ensemble/iforest.py create mode 100644 sklearn/ensemble/partial_dependence.py create mode 100644 sklearn/ensemble/setup.py create mode 100644 sklearn/ensemble/tests/__init__.py create mode 100644 sklearn/ensemble/tests/test_bagging.py create mode 100644 sklearn/ensemble/tests/test_base.py create mode 100644 sklearn/ensemble/tests/test_forest.py create mode 100644 sklearn/ensemble/tests/test_gradient_boosting.py create mode 100644 sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py create mode 100644 sklearn/ensemble/tests/test_iforest.py create mode 100644 sklearn/ensemble/tests/test_partial_dependence.py create mode 100644 sklearn/ensemble/tests/test_voting_classifier.py create mode 100755 sklearn/ensemble/tests/test_weight_boosting.py create mode 100644 sklearn/ensemble/voting_classifier.py create mode 100644 sklearn/ensemble/weight_boosting.py create mode 100644 sklearn/exceptions.py create mode 100644 sklearn/externals/README create mode 100644 sklearn/externals/__init__.py create mode 100755 sklearn/externals/copy_joblib.sh create mode 100644 sklearn/externals/funcsigs.py create mode 100644 sklearn/externals/odict.py create mode 100644 sklearn/externals/setup.py create mode 100644 sklearn/externals/six.py create mode 100644 sklearn/externals/test_externals_setup.py create mode 100644 sklearn/feature_extraction/__init__.py create mode 100644 sklearn/feature_extraction/_hashing.pyx create mode 100644 sklearn/feature_extraction/dict_vectorizer.py create mode 100644 sklearn/feature_extraction/hashing.py create mode 100644 sklearn/feature_extraction/image.py create mode 100644 sklearn/feature_extraction/setup.py create mode 100644 sklearn/feature_extraction/stop_words.py create mode 100644 sklearn/feature_extraction/tests/__init__.py create mode 100644 sklearn/feature_extraction/tests/test_dict_vectorizer.py create mode 100644 sklearn/feature_extraction/tests/test_feature_hasher.py create mode 100644 sklearn/feature_extraction/tests/test_image.py create mode 100644 sklearn/feature_extraction/tests/test_text.py create mode 100644 sklearn/feature_extraction/text.py create mode 100644 sklearn/feature_selection/__init__.py create mode 100644 sklearn/feature_selection/base.py create mode 100644 
sklearn/feature_selection/from_model.py create mode 100644 sklearn/feature_selection/mutual_info_.py create mode 100644 sklearn/feature_selection/rfe.py create mode 100644 sklearn/feature_selection/tests/__init__.py create mode 100644 sklearn/feature_selection/tests/test_base.py create mode 100644 sklearn/feature_selection/tests/test_chi2.py create mode 100644 sklearn/feature_selection/tests/test_feature_select.py create mode 100644 sklearn/feature_selection/tests/test_from_model.py create mode 100644 sklearn/feature_selection/tests/test_mutual_info.py create mode 100644 sklearn/feature_selection/tests/test_rfe.py create mode 100644 sklearn/feature_selection/tests/test_variance_threshold.py create mode 100644 sklearn/feature_selection/univariate_selection.py create mode 100644 sklearn/feature_selection/variance_threshold.py create mode 100644 sklearn/gaussian_process/__init__.py create mode 100644 sklearn/gaussian_process/correlation_models.py create mode 100644 sklearn/gaussian_process/gaussian_process.py create mode 100644 sklearn/gaussian_process/gpc.py create mode 100644 sklearn/gaussian_process/gpr.py create mode 100644 sklearn/gaussian_process/kernels.py create mode 100644 sklearn/gaussian_process/regression_models.py create mode 100644 sklearn/gaussian_process/tests/__init__.py create mode 100644 sklearn/gaussian_process/tests/test_gaussian_process.py create mode 100644 sklearn/gaussian_process/tests/test_gpc.py create mode 100644 sklearn/gaussian_process/tests/test_gpr.py create mode 100644 sklearn/gaussian_process/tests/test_kernels.py create mode 100644 sklearn/grid_search.py create mode 100644 sklearn/isotonic.py create mode 100644 sklearn/kernel_approximation.py create mode 100644 sklearn/kernel_ridge.py create mode 100644 sklearn/lda.py create mode 100644 sklearn/learning_curve.py create mode 100644 sklearn/linear_model/__init__.py create mode 100644 sklearn/linear_model/base.py create mode 100644 sklearn/linear_model/bayes.py create mode 100644 sklearn/linear_model/cd_fast.pyx create mode 100644 sklearn/linear_model/coordinate_descent.py create mode 100644 sklearn/linear_model/huber.py create mode 100644 sklearn/linear_model/least_angle.py create mode 100644 sklearn/linear_model/logistic.py create mode 100644 sklearn/linear_model/omp.py create mode 100644 sklearn/linear_model/passive_aggressive.py create mode 100644 sklearn/linear_model/perceptron.py create mode 100644 sklearn/linear_model/randomized_l1.py create mode 100644 sklearn/linear_model/ransac.py create mode 100644 sklearn/linear_model/ridge.py create mode 100644 sklearn/linear_model/sag.py create mode 100644 sklearn/linear_model/sag_fast.pyx create mode 100644 sklearn/linear_model/setup.py create mode 100644 sklearn/linear_model/sgd_fast.pxd create mode 100644 sklearn/linear_model/sgd_fast.pyx create mode 100644 sklearn/linear_model/sgd_fast_helpers.h create mode 100644 sklearn/linear_model/stochastic_gradient.py create mode 100644 sklearn/linear_model/tests/__init__.py create mode 100644 sklearn/linear_model/tests/test_base.py create mode 100644 sklearn/linear_model/tests/test_bayes.py create mode 100644 sklearn/linear_model/tests/test_coordinate_descent.py create mode 100644 sklearn/linear_model/tests/test_huber.py create mode 100644 sklearn/linear_model/tests/test_least_angle.py create mode 100644 sklearn/linear_model/tests/test_logistic.py create mode 100644 sklearn/linear_model/tests/test_omp.py create mode 100644 sklearn/linear_model/tests/test_passive_aggressive.py create mode 100644 
sklearn/linear_model/tests/test_perceptron.py create mode 100644 sklearn/linear_model/tests/test_randomized_l1.py create mode 100644 sklearn/linear_model/tests/test_ransac.py create mode 100644 sklearn/linear_model/tests/test_ridge.py create mode 100644 sklearn/linear_model/tests/test_sag.py create mode 100644 sklearn/linear_model/tests/test_sgd.py create mode 100644 sklearn/linear_model/tests/test_sparse_coordinate_descent.py create mode 100644 sklearn/linear_model/tests/test_theil_sen.py create mode 100644 sklearn/linear_model/theil_sen.py create mode 100644 sklearn/manifold/__init__.py create mode 100644 sklearn/manifold/_barnes_hut_tsne.pyx create mode 100644 sklearn/manifold/_utils.pyx create mode 100644 sklearn/manifold/isomap.py create mode 100644 sklearn/manifold/locally_linear.py create mode 100644 sklearn/manifold/mds.py create mode 100644 sklearn/manifold/setup.py create mode 100644 sklearn/manifold/spectral_embedding_.py create mode 100644 sklearn/manifold/t_sne.py create mode 100644 sklearn/manifold/tests/__init__.py create mode 100644 sklearn/manifold/tests/test_isomap.py create mode 100644 sklearn/manifold/tests/test_locally_linear.py create mode 100644 sklearn/manifold/tests/test_mds.py create mode 100644 sklearn/manifold/tests/test_spectral_embedding.py create mode 100644 sklearn/manifold/tests/test_t_sne.py create mode 100644 sklearn/metrics/__init__.py create mode 100644 sklearn/metrics/base.py create mode 100644 sklearn/metrics/classification.py create mode 100644 sklearn/metrics/cluster/__init__.py create mode 100644 sklearn/metrics/cluster/bicluster.py create mode 100644 sklearn/metrics/cluster/expected_mutual_info_fast.pyx create mode 100644 sklearn/metrics/cluster/setup.py create mode 100644 sklearn/metrics/cluster/supervised.py create mode 100644 sklearn/metrics/cluster/tests/__init__.py create mode 100644 sklearn/metrics/cluster/tests/test_bicluster.py create mode 100644 sklearn/metrics/cluster/tests/test_supervised.py create mode 100644 sklearn/metrics/cluster/tests/test_unsupervised.py create mode 100644 sklearn/metrics/cluster/unsupervised.py create mode 100644 sklearn/metrics/pairwise.py create mode 100644 sklearn/metrics/pairwise_fast.pyx create mode 100644 sklearn/metrics/ranking.py create mode 100644 sklearn/metrics/regression.py create mode 100644 sklearn/metrics/scorer.py create mode 100644 sklearn/metrics/setup.py create mode 100644 sklearn/metrics/tests/__init__.py create mode 100644 sklearn/metrics/tests/test_classification.py create mode 100644 sklearn/metrics/tests/test_common.py create mode 100644 sklearn/metrics/tests/test_pairwise.py create mode 100644 sklearn/metrics/tests/test_ranking.py create mode 100644 sklearn/metrics/tests/test_regression.py create mode 100644 sklearn/metrics/tests/test_score_objects.py create mode 100644 sklearn/mixture/__init__.py create mode 100644 sklearn/mixture/base.py create mode 100644 sklearn/mixture/bayesian_mixture.py create mode 100644 sklearn/mixture/dpgmm.py create mode 100644 sklearn/mixture/gaussian_mixture.py create mode 100644 sklearn/mixture/gmm.py create mode 100644 sklearn/mixture/tests/__init__.py create mode 100644 sklearn/mixture/tests/test_bayesian_mixture.py create mode 100644 sklearn/mixture/tests/test_dpgmm.py create mode 100644 sklearn/mixture/tests/test_gaussian_mixture.py create mode 100644 sklearn/mixture/tests/test_gmm.py create mode 100644 sklearn/model_selection/__init__.py create mode 100644 sklearn/model_selection/_search.py create mode 100644 sklearn/model_selection/_split.py create 
mode 100644 sklearn/model_selection/_validation.py create mode 100644 sklearn/model_selection/tests/__init__.py create mode 100644 sklearn/model_selection/tests/test_search.py create mode 100644 sklearn/model_selection/tests/test_split.py create mode 100644 sklearn/model_selection/tests/test_validation.py create mode 100644 sklearn/multiclass.py create mode 100644 sklearn/multioutput.py create mode 100644 sklearn/naive_bayes.py create mode 100644 sklearn/neighbors/__init__.py create mode 100644 sklearn/neighbors/approximate.py create mode 100644 sklearn/neighbors/ball_tree.pyx create mode 100644 sklearn/neighbors/base.py create mode 100755 sklearn/neighbors/binary_tree.pxi create mode 100644 sklearn/neighbors/classification.py create mode 100644 sklearn/neighbors/dist_metrics.pxd create mode 100644 sklearn/neighbors/dist_metrics.pyx create mode 100644 sklearn/neighbors/graph.py create mode 100644 sklearn/neighbors/kd_tree.pyx create mode 100644 sklearn/neighbors/kde.py create mode 100644 sklearn/neighbors/nearest_centroid.py create mode 100644 sklearn/neighbors/regression.py create mode 100644 sklearn/neighbors/setup.py create mode 100644 sklearn/neighbors/tests/__init__.py create mode 100644 sklearn/neighbors/tests/test_approximate.py create mode 100644 sklearn/neighbors/tests/test_ball_tree.py create mode 100644 sklearn/neighbors/tests/test_dist_metrics.py create mode 100644 sklearn/neighbors/tests/test_kd_tree.py create mode 100644 sklearn/neighbors/tests/test_kde.py create mode 100644 sklearn/neighbors/tests/test_nearest_centroid.py create mode 100644 sklearn/neighbors/tests/test_neighbors.py create mode 100644 sklearn/neighbors/typedefs.pxd create mode 100644 sklearn/neighbors/typedefs.pyx create mode 100644 sklearn/neighbors/unsupervised.py create mode 100644 sklearn/neural_network/__init__.py create mode 100644 sklearn/neural_network/_base.py create mode 100644 sklearn/neural_network/_stochastic_optimizers.py create mode 100644 sklearn/neural_network/multilayer_perceptron.py create mode 100644 sklearn/neural_network/rbm.py create mode 100644 sklearn/neural_network/tests/__init__.py create mode 100644 sklearn/neural_network/tests/test_mlp.py create mode 100644 sklearn/neural_network/tests/test_rbm.py create mode 100644 sklearn/neural_network/tests/test_stochastic_optimizers.py create mode 100644 sklearn/pipeline.py create mode 100644 sklearn/preprocessing/__init__.py create mode 100644 sklearn/preprocessing/_function_transformer.py create mode 100644 sklearn/preprocessing/data.py create mode 100644 sklearn/preprocessing/imputation.py create mode 100644 sklearn/preprocessing/label.py create mode 100644 sklearn/preprocessing/tests/__init__.py create mode 100644 sklearn/preprocessing/tests/test_data.py create mode 100644 sklearn/preprocessing/tests/test_function_transformer.py create mode 100644 sklearn/preprocessing/tests/test_imputation.py create mode 100644 sklearn/preprocessing/tests/test_label.py create mode 100644 sklearn/qda.py create mode 100644 sklearn/random_projection.py create mode 100644 sklearn/semi_supervised/__init__.py create mode 100644 sklearn/semi_supervised/label_propagation.py create mode 100644 sklearn/semi_supervised/tests/__init__.py create mode 100644 sklearn/semi_supervised/tests/test_label_propagation.py create mode 100644 sklearn/setup.py create mode 100644 sklearn/src/cblas/ATL_drefasum.c create mode 100644 sklearn/src/cblas/ATL_drefcopy.c create mode 100644 sklearn/src/cblas/ATL_drefgemv.c create mode 100644 sklearn/src/cblas/ATL_drefgemvN.c create mode 
100644 sklearn/src/cblas/ATL_drefgemvT.c create mode 100644 sklearn/src/cblas/ATL_drefger.c create mode 100644 sklearn/src/cblas/ATL_drefrot.c create mode 100644 sklearn/src/cblas/ATL_drefrotg.c create mode 100644 sklearn/src/cblas/ATL_dsrefdot.c create mode 100644 sklearn/src/cblas/ATL_srefasum.c create mode 100644 sklearn/src/cblas/ATL_srefcopy.c create mode 100644 sklearn/src/cblas/ATL_srefnrm2.c create mode 100644 sklearn/src/cblas/ATL_srefrot.c create mode 100644 sklearn/src/cblas/ATL_srefrotg.c create mode 100644 sklearn/src/cblas/README.txt create mode 100644 sklearn/src/cblas/atlas_aux.h create mode 100644 sklearn/src/cblas/atlas_dsysinfo.h create mode 100644 sklearn/src/cblas/atlas_enum.h create mode 100644 sklearn/src/cblas/atlas_level1.h create mode 100644 sklearn/src/cblas/atlas_level2.h create mode 100644 sklearn/src/cblas/atlas_misc.h create mode 100644 sklearn/src/cblas/atlas_refalias1.h create mode 100644 sklearn/src/cblas/atlas_refalias2.h create mode 100644 sklearn/src/cblas/atlas_reflevel1.h create mode 100644 sklearn/src/cblas/atlas_reflevel2.h create mode 100644 sklearn/src/cblas/atlas_reflvl2.h create mode 100644 sklearn/src/cblas/atlas_refmisc.h create mode 100644 sklearn/src/cblas/atlas_ssysinfo.h create mode 100644 sklearn/src/cblas/atlas_type.h create mode 100644 sklearn/src/cblas/cblas.h create mode 100644 sklearn/src/cblas/cblas_dasum.c create mode 100644 sklearn/src/cblas/cblas_daxpy.c create mode 100644 sklearn/src/cblas/cblas_dcopy.c create mode 100644 sklearn/src/cblas/cblas_ddot.c create mode 100644 sklearn/src/cblas/cblas_dgemv.c create mode 100644 sklearn/src/cblas/cblas_dger.c create mode 100644 sklearn/src/cblas/cblas_dnrm2.c create mode 100644 sklearn/src/cblas/cblas_drot.c create mode 100644 sklearn/src/cblas/cblas_drotg.c create mode 100644 sklearn/src/cblas/cblas_dscal.c create mode 100644 sklearn/src/cblas/cblas_errprn.c create mode 100644 sklearn/src/cblas/cblas_sasum.c create mode 100644 sklearn/src/cblas/cblas_saxpy.c create mode 100644 sklearn/src/cblas/cblas_scopy.c create mode 100644 sklearn/src/cblas/cblas_sdot.c create mode 100644 sklearn/src/cblas/cblas_snrm2.c create mode 100644 sklearn/src/cblas/cblas_srot.c create mode 100644 sklearn/src/cblas/cblas_srotg.c create mode 100644 sklearn/src/cblas/cblas_xerbla.c create mode 100644 sklearn/svm/__init__.py create mode 100644 sklearn/svm/base.py create mode 100644 sklearn/svm/bounds.py create mode 100644 sklearn/svm/classes.py create mode 100644 sklearn/svm/liblinear.pxd create mode 100644 sklearn/svm/liblinear.pyx create mode 100644 sklearn/svm/libsvm.pxd create mode 100644 sklearn/svm/libsvm.pyx create mode 100644 sklearn/svm/libsvm_sparse.pyx create mode 100644 sklearn/svm/setup.py create mode 100644 sklearn/svm/src/liblinear/COPYRIGHT create mode 100644 sklearn/svm/src/liblinear/liblinear_helper.c create mode 100644 sklearn/svm/src/liblinear/linear.cpp create mode 100644 sklearn/svm/src/liblinear/linear.h create mode 100644 sklearn/svm/src/liblinear/tron.cpp create mode 100644 sklearn/svm/src/liblinear/tron.h create mode 100644 sklearn/svm/src/libsvm/LIBSVM_CHANGES create mode 100644 sklearn/svm/src/libsvm/libsvm_helper.c create mode 100644 sklearn/svm/src/libsvm/libsvm_sparse_helper.c create mode 100644 sklearn/svm/src/libsvm/libsvm_template.cpp create mode 100644 sklearn/svm/src/libsvm/svm.cpp create mode 100644 sklearn/svm/src/libsvm/svm.h create mode 100644 sklearn/svm/tests/__init__.py create mode 100644 sklearn/svm/tests/test_bounds.py create mode 100644 
sklearn/svm/tests/test_sparse.py create mode 100644 sklearn/svm/tests/test_svm.py create mode 100644 sklearn/tests/__init__.py create mode 100644 sklearn/tests/test_base.py create mode 100644 sklearn/tests/test_calibration.py create mode 100644 sklearn/tests/test_check_build.py create mode 100644 sklearn/tests/test_common.py create mode 100644 sklearn/tests/test_cross_validation.py create mode 100644 sklearn/tests/test_discriminant_analysis.py create mode 100644 sklearn/tests/test_dummy.py create mode 100644 sklearn/tests/test_grid_search.py create mode 100644 sklearn/tests/test_init.py create mode 100644 sklearn/tests/test_isotonic.py create mode 100644 sklearn/tests/test_kernel_approximation.py create mode 100644 sklearn/tests/test_kernel_ridge.py create mode 100644 sklearn/tests/test_learning_curve.py create mode 100644 sklearn/tests/test_metaestimators.py create mode 100644 sklearn/tests/test_multiclass.py create mode 100644 sklearn/tests/test_multioutput.py create mode 100644 sklearn/tests/test_naive_bayes.py create mode 100644 sklearn/tests/test_pipeline.py create mode 100644 sklearn/tests/test_random_projection.py create mode 100644 sklearn/tree/__init__.py create mode 100644 sklearn/tree/_criterion.pxd create mode 100644 sklearn/tree/_criterion.pyx create mode 100644 sklearn/tree/_splitter.pxd create mode 100644 sklearn/tree/_splitter.pyx create mode 100644 sklearn/tree/_tree.pxd create mode 100644 sklearn/tree/_tree.pyx create mode 100644 sklearn/tree/_utils.pxd create mode 100644 sklearn/tree/_utils.pyx create mode 100644 sklearn/tree/export.py create mode 100644 sklearn/tree/setup.py create mode 100644 sklearn/tree/tests/__init__.py create mode 100644 sklearn/tree/tests/test_export.py create mode 100644 sklearn/tree/tests/test_tree.py create mode 100644 sklearn/tree/tree.py create mode 100644 sklearn/utils/__init__.py create mode 100644 sklearn/utils/_logistic_sigmoid.pyx create mode 100644 sklearn/utils/_random.pxd create mode 100644 sklearn/utils/_random.pyx create mode 100644 sklearn/utils/_scipy_sparse_lsqr_backport.py create mode 100644 sklearn/utils/arpack.py create mode 100644 sklearn/utils/arrayfuncs.pyx create mode 100644 sklearn/utils/bench.py create mode 100644 sklearn/utils/class_weight.py create mode 100644 sklearn/utils/deprecation.py create mode 100644 sklearn/utils/estimator_checks.py create mode 100644 sklearn/utils/extmath.py create mode 100644 sklearn/utils/fast_dict.pxd create mode 100644 sklearn/utils/fast_dict.pyx create mode 100644 sklearn/utils/fixes.py create mode 100644 sklearn/utils/graph.py create mode 100644 sklearn/utils/graph_shortest_path.pyx create mode 100644 sklearn/utils/lgamma.pxd create mode 100644 sklearn/utils/lgamma.pyx create mode 100644 sklearn/utils/linear_assignment_.py create mode 100644 sklearn/utils/metaestimators.py create mode 100644 sklearn/utils/mocking.py create mode 100644 sklearn/utils/multiclass.py create mode 100644 sklearn/utils/murmurhash.pxd create mode 100644 sklearn/utils/murmurhash.pyx create mode 100644 sklearn/utils/optimize.py create mode 100644 sklearn/utils/random.py create mode 100644 sklearn/utils/seq_dataset.pxd create mode 100644 sklearn/utils/seq_dataset.pyx create mode 100644 sklearn/utils/setup.py create mode 100644 sklearn/utils/sparsefuncs.py create mode 100644 sklearn/utils/sparsefuncs_fast.pyx create mode 100644 sklearn/utils/sparsetools/README create mode 100644 sklearn/utils/sparsetools/__init__.py create mode 100644 sklearn/utils/sparsetools/_graph_tools.pyx create mode 100644 
sklearn/utils/sparsetools/_graph_validation.py create mode 100644 sklearn/utils/sparsetools/_traversal.pyx create mode 100644 sklearn/utils/sparsetools/setup.py create mode 100644 sklearn/utils/sparsetools/tests/__init__.py create mode 100644 sklearn/utils/sparsetools/tests/test_traversal.py create mode 100644 sklearn/utils/src/MurmurHash3.cpp create mode 100644 sklearn/utils/src/MurmurHash3.h create mode 100644 sklearn/utils/src/cholesky_delete.h create mode 100644 sklearn/utils/src/gamma.c create mode 100644 sklearn/utils/src/gamma.h create mode 100644 sklearn/utils/stats.py create mode 100644 sklearn/utils/testing.py create mode 100644 sklearn/utils/tests/__init__.py create mode 100644 sklearn/utils/tests/test_bench.py create mode 100644 sklearn/utils/tests/test_class_weight.py create mode 100644 sklearn/utils/tests/test_estimator_checks.py create mode 100644 sklearn/utils/tests/test_extmath.py create mode 100644 sklearn/utils/tests/test_fast_dict.py create mode 100644 sklearn/utils/tests/test_fixes.py create mode 100644 sklearn/utils/tests/test_graph.py create mode 100644 sklearn/utils/tests/test_linear_assignment.py create mode 100644 sklearn/utils/tests/test_metaestimators.py create mode 100644 sklearn/utils/tests/test_multiclass.py create mode 100644 sklearn/utils/tests/test_murmurhash.py create mode 100644 sklearn/utils/tests/test_optimize.py create mode 100644 sklearn/utils/tests/test_random.py create mode 100644 sklearn/utils/tests/test_seq_dataset.py create mode 100644 sklearn/utils/tests/test_shortest_path.py create mode 100644 sklearn/utils/tests/test_sparsefuncs.py create mode 100644 sklearn/utils/tests/test_stats.py create mode 100644 sklearn/utils/tests/test_testing.py create mode 100644 sklearn/utils/tests/test_utils.py create mode 100644 sklearn/utils/tests/test_validation.py create mode 100644 sklearn/utils/validation.py create mode 100644 sklearn/utils/weight_vector.pxd create mode 100644 sklearn/utils/weight_vector.pyx diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..6d76a5b --- /dev/null +++ b/.coveragerc @@ -0,0 +1,8 @@ +[run] +branch = True +source = sklearn +include = */sklearn/* +omit = + */sklearn/externals/* + */benchmarks/* + */setup.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..30cb453 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,30 @@ +/sklearn/__check_build/_check_build.c -diff +/sklearn/_isotonic.c -diff +/sklearn/cluster/_dbscan_inner.cpp -diff +/sklearn/cluster/_hierarchical.cpp -diff +/sklearn/cluster/_k_means.c -diff +/sklearn/cluster/_k_means_elkan.c -diff +/sklearn/datasets/_svmlight_format.c -diff +/sklearn/decomposition/_online_lda.c -diff +/sklearn/decomposition/cdnmf_fast.c -diff +/sklearn/ensemble/_gradient_boosting.c -diff +/sklearn/feature_extraction/_hashing.c -diff +/sklearn/linear_model/cd_fast.c -diff +/sklearn/linear_model/sgd_fast.c -diff +/sklearn/linear_model/sag_fast.c -diff +/sklearn/metrics/pairwise_fast.c -diff +/sklearn/neighbors/ball_tree.c -diff +/sklearn/neighbors/kd_tree.c -diff +/sklearn/svm/liblinear.c -diff +/sklearn/svm/libsvm.c -diff +/sklearn/svm/libsvm_sparse.c -diff +/sklearn/tree/_tree.c -diff +/sklearn/tree/_utils.c -diff +/sklearn/utils/arrayfuncs.c -diff +/sklearn/utils/graph_shortest_path.c -diff +/sklearn/utils/lgamma.c -diff +/sklearn/utils/_logistic_sigmoid.c -diff +/sklearn/utils/murmurhash.c -diff +/sklearn/utils/seq_dataset.c -diff +/sklearn/utils/sparsefuncs_fast.c -diff +/sklearn/utils/weight_vector.c -diff diff --git a/.gitignore b/.gitignore 
new file mode 100644 index 0000000..422c158 --- /dev/null +++ b/.gitignore @@ -0,0 +1,65 @@ +*.pyc +*.so +*.pyd +*~ +.#* +*.lprof +*.swp +*.swo +.DS_Store +build +sklearn/datasets/__config__.py +sklearn/**/*.html + +dist/ +MANIFEST +doc/_build/ +doc/auto_examples/ +doc/modules/generated/ +doc/datasets/generated/ +*.pdf +pip-log.txt +scikit_learn.egg-info/ +.coverage +coverage +*.py,cover +.tags +tags +covtype.data.gz +20news-18828/ +20news-18828.tar.gz +coverages.zip +samples.zip +doc/coverages.zip +doc/samples.zip +coverages +samples +doc/coverages +doc/samples +*.prof +.tox/ +.coverage + +lfw_preprocessed/ +nips2010_pdf/ + +*.nt.bz2 +*.tar.gz +*.tgz + +examples/cluster/joblib +reuters/ +benchmarks/bench_covertype_data/ + +*.prefs +.pydevproject +.idea + +cythonize.dat +*.c +*.cpp + +!*/src/*.c +!*/src/*.cpp +*.sln +*.pyproj diff --git a/.landscape.yml b/.landscape.yml new file mode 100644 index 0000000..4774bdc --- /dev/null +++ b/.landscape.yml @@ -0,0 +1,5 @@ +pylint: + disable: + - unpacking-non-sequence +ignore-paths: + - sklearn/externals diff --git a/.mailmap b/.mailmap new file mode 100644 index 0000000..e269536 --- /dev/null +++ b/.mailmap @@ -0,0 +1,125 @@ +Alexandre Gramfort +Alexandre Gramfort +Alexandre Gramfort +Alexandre Saint +Andreas Mueller +Andreas Mueller +Andreas Mueller +Andreas Mueller +Andreas Mueller +Andreas Mueller +Arnaud Joly +Arnaud Joly +Arnaud Joly +Anne-Laure Fouque +Ariel Rokem arokem +Bala Subrahmanyam Varanasi +Bertrand Thirion +Brandyn A. White +Brian Cheung +Brian Cheung +Brian Cheung +Brian Holt +Christian Osendorfer +Clay Woolam +Danny Sullivan +Denis Engemann +Denis Engemann +Denis Engemann +Denis Engemann +Diego Molla +DraXus draxus +Edouard DUCHESNAY +Edouard DUCHESNAY +Edouard DUCHESNAY +Emmanuelle Gouillart +Emmanuelle Gouillart +Eustache Diemert +Fabian Pedregosa +Fabian Pedregosa +Fabian Pedregosa +Federico Vaggi +Federico Vaggi +Gael Varoquaux +Gael Varoquaux +Gael Varoquaux +Giorgio Patrini +Giorgio Patrini +Gilles Louppe +Hamzeh Alsalhi <93hamsal@gmail.com> +Harikrishnan S +Hendrik Heuer +Henry Lin +Hrishikesh Huilgolkar +Hugo Bowne-Anderson +Imaculate +Immanuel Bayer +Jacob Schreiber +Jacob Schreiber +Jake VanderPlas +Jake VanderPlas +Jake VanderPlas +James Bergstra +Jaques Grobler +Jan Schlüter +Jean Kossaifi +Jean Kossaifi +Jean Kossaifi +Joel Nothman +Kyle Kastner +Lars Buitinck +Lars Buitinck +Lars Buitinck +Lars Buitinck +Lars Buitinck +Loic Esteve +Manoj Kumar +Matthieu Perrot +Maheshakya Wijewardena +Michael Bommarito +Michael Eickenberg +Michael Eickenberg +Samuel Charron +Sergio Medina +Nelle Varoquaux +Nelle Varoquaux +Nelle Varoquaux +Nicolas Goix +Nicolas Pinto +Noel Dawe +Noel Dawe +Olivier Grisel +Olivier Grisel +Olivier Hervieu +Paul Butler +Peter Prettenhofer +Raghav R V +Robert Layton +Roman Sinayev +Roman Sinayev +Ronald Phlypo +Satrajit Ghosh +Sebastian Raschka +Sebastian Raschka +Shiqiao Du +Shiqiao Du +Thomas Unterthiner +Tim Sheerman-Chase +Vincent Dubourg +Vincent Dubourg +Vincent Michel +Vincent Michel +Vincent Michel +Vincent Michel +Vincent Michel +Vincent Schut +Virgile Fritsch +Virgile Fritsch +Vlad Niculae +Wei Li +Wei Li +X006 +Xinfan Meng +Yannick Schwartz +Yannick Schwartz +Yannick Schwartz diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..5b5733f --- /dev/null +++ b/.travis.yml @@ -0,0 +1,82 @@ +# make it explicit that we favor the new container-based travis workers +sudo: false + +language: python + +# Pre-install packages for the ubuntu distribution +cache: + apt: true + # We use 
three different cache directory + # to work around a Travis bug with multi-platform cache + directories: + - $HOME/sklearn_build_ubuntu + - $HOME/sklearn_build_oldest + - $HOME/sklearn_build_latest + - $HOME/sklearn_build_numpy_dev + - $HOME/.cache/pip + - $HOME/download +addons: + apt: + packages: + - libatlas3gf-base + - libatlas-dev + # only required by the DISTRIB="ubuntu" build: + - python-scipy + +env: + global: + # Directory where tests are run from + - TEST_DIR=/tmp/sklearn + - OMP_NUM_THREADS=4 + - OPENBLAS_NUM_THREADS=4 + matrix: + # This environment tests that scikit-learn can be built against + # versions of numpy, scipy with ATLAS that comes with Ubuntu Precise 12.04 + - DISTRIB="ubuntu" PYTHON_VERSION="2.7" CYTHON_VERSION="0.23.4" + CACHED_BUILD_DIR="$HOME/sklearn_build_ubuntu" COVERAGE=true + # This environment tests the oldest supported anaconda env + - DISTRIB="conda" PYTHON_VERSION="2.6" INSTALL_MKL="false" + NUMPY_VERSION="1.6.2" SCIPY_VERSION="0.11.0" CYTHON_VERSION="0.21" + CACHED_BUILD_DIR="$HOME/sklearn_build_oldest" + # This environment tests the newest supported anaconda env + - DISTRIB="conda" PYTHON_VERSION="3.5" INSTALL_MKL="true" + NUMPY_VERSION="1.10.4" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" + CACHED_BUILD_DIR="$HOME/sklearn_build_latest" + # flake8 linting on diff wrt common ancestor with upstream/master + - RUN_FLAKE8="true" SKIP_TESTS="true" + DISTRIB="conda" PYTHON_VERSION="3.5" INSTALL_MKL="true" + NUMPY_VERSION="1.10.4" SCIPY_VERSION="0.17.0" CYTHON_VERSION="0.23.4" + CACHED_BUILD_DIR="$HOME/dummy" + + +matrix: + allow_failures: + # allow_failures seems to be keyed on the python version + # We are using this to allow failures for DISTRIB=scipy-dev-wheels + - python: 3.5 + + include: + # This environment tests scikit-learn against numpy and scipy master + # installed from their CI wheels in a virtualenv with the Python + # interpreter provided by travis. + # Note: libatlas3gf-base is not allowed yet so we need 'sudo': + # https://github.com/travis-ci/apt-package-whitelist/issues/2407 + # Once libatlas3gf-base is on the whitelist it will be possible to replace + # the before_install step with and addons/apt/packages declaration. + - python: 3.5 + env: DISTRIB="scipy-dev-wheels" + CACHED_BUILD_DIR="$HOME/sklearn_build_numpy_dev" + sudo: True + before_install: sudo apt-get install -yqq libatlas3gf-base libatlas-dev + + +install: source build_tools/travis/install.sh +script: bash build_tools/travis/test_script.sh +after_success: source build_tools/travis/after_success.sh +notifications: + webhooks: + urls: + - https://webhooks.gitter.im/e/4ffabb4df010b70cd624 + on_success: change # options: [always|never|change] default: always + on_failure: always # options: [always|never|change] default: always + on_start: never # options: [always|never|change] default: always diff --git a/AUTHORS.rst b/AUTHORS.rst new file mode 100644 index 0000000..7001276 --- /dev/null +++ b/AUTHORS.rst @@ -0,0 +1,71 @@ +.. -*- mode: rst -*- + + +This is a community effort, and as such many people have contributed +to it over the years. + +History +------- + +This project was started in 2007 as a Google Summer of Code project by +David Cournapeau. Later that year, Matthieu Brucher started work on +this project as part of his thesis. + +In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent +Michel of INRIA took leadership of the project and made the first public +release, February the 1st 2010. 
Since then, several releases have appeared +following a ~3 month cycle, and a thriving international community has +been leading the development. + +People +------ + +The following people have been core contributors to scikit-learn's development and maintenance: + +.. hlist:: + + * `Mathieu Blondel `_ + * `Matthieu Brucher `_ + * Lars Buitinck + * David Cournapeau + * `Noel Dawe `_ + * Vincent Dubourg + * Edouard Duchesnay + * `Tom Dupré la Tour `_ + * Alexander Fabisch + * `Virgile Fritsch `_ + * `Satra Ghosh `_ + * `Angel Soler Gollonet `_ + * Chris Filo Gorgolewski + * `Alexandre Gramfort `_ + * `Olivier Grisel `_ + * `Jaques Grobler `_ + * `Yaroslav Halchenko `_ + * `Brian Holt `_ + * `Arnaud Joly `_ + * Thouis (Ray) Jones + * `Kyle Kastner `_ + * `Manoj Kumar `_ + * Robert Layton + * `Wei Li `_ + * Paolo Losi + * `Gilles Louppe `_ + * `Jan Hendrik Metzen `_ + * Vincent Michel + * Jarrod Millman + * `Andreas Müller `_ (release manager) + * `Vlad Niculae `_ + * `Joel Nothman `_ + * `Alexandre Passos `_ + * `Fabian Pedregosa `_ + * `Peter Prettenhofer `_ + * Bertrand Thirion + * `Jake VanderPlas `_ + * Nelle Varoquaux + * `Gael Varoquaux `_ + * Ron Weiss + +Please do not email the authors directly to ask for assistance or report issues. +Instead, please see `What's the best way to ask questions about scikit-learn +`_ +in the FAQ. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..5f6115e --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,232 @@ + +Contributing to scikit-learn +============================ + +**Note: This document is a 'getting started' summary for contributing code, +documentation, testing, and filing issues.** Visit the [**Contributing +page**](http://scikit-learn.org/stable/developers/index.html) +for the full contributor's guide. Please read it carefully to help make +the code review process go as smoothly as possible and maximize the +likelihood of your contribution being merged.** + +How to contribute +----------------- + +The preferred workflow for contributing to scikit-learn is to fork the +[main repository](https://github.com/scikit-learn/scikit-learn) on +GitHub, clone, and develop on a branch. Steps: + +1. Fork the [project repository](https://github.com/scikit-learn/scikit-learn) + by clicking on the 'Fork' button near the top right of the page. This creates + a copy of the code under your GitHub user account. + +2. Clone your fork of the scikit-learn repo from your GitHub account to your local disk: + + ```bash + $ git clone git@github.com:YourLogin/scikit-learn.git + $ cd scikit-learn + ``` + +3. Create a ``feature`` branch to hold your development changes: + + ```bash + $ git checkout -b my-feature + ``` + + Always use a ``feature`` branch. It's good practice to never work on the ``master`` branch! + +4. Develop the feature on your feature branch. Add changed files using ``git add`` and then ``git commit`` files: + + ```bash + $ git add modified_files + $ git commit + ``` + + to record your changes in Git, then push the changes to your GitHub account with: + + ```bash + $ git push -u origin my-feature + ``` + +5. Go to the GitHub web page of your fork of the scikit-learn repo. +Click the 'Pull request' button to send your changes to the project's maintainers for +review. This will send an email to the committers. + +(If any of the above seems like magic to you, please look up the +[Git documentation](https://git-scm.com/documentation) on the web, or ask a friend or another contributor for help.) 
+ +Pull Request Checklist +---------------------- + +We recommend that your contribution complies with the +following rules before you submit a pull request: + +- Follow the + [coding-guidelines](http://scikit-learn.org/dev/developers/contributing.html#coding-guidelines). + +- Use, when applicable, the validation tools and scripts in the + `sklearn.utils` submodule. A list of utility routines available + for developers can be found in the + [Utilities for Developers](http://scikit-learn.org/dev/developers/utilities.html#developers-utils) + page. + +- If your pull request addresses an issue, please use the pull request title + to describe the issue and mention the issue number in the pull request description. This will make sure a link back to the original issue is + created. + +- All public methods should have informative docstrings with sample + usage presented as doctests when appropriate. + +- Please prefix the title of your pull request with `[MRG]` (Ready for + Merge), if the contribution is complete and ready for a detailed review. + Incomplete contributions should be prefixed `[WIP]` (to indicate a work + in progress) and changed to `[MRG]` when they mature. WIPs may be useful + to: indicate you are working on something to avoid duplicated work, + request broad review of functionality or API, or seek collaborators. + WIPs often benefit from the inclusion of a + [task list](https://github.com/blog/1375-task-lists-in-gfm-issues-pulls-comments) + in the PR description. + +- All other tests pass when everything is rebuilt from scratch. On + Unix-like systems, check with (from the toplevel source folder): + + ```bash + $ make + ``` + +- When adding additional functionality, provide at least one + example script in the ``examples/`` folder. Have a look at other + examples for reference. Examples should demonstrate why the new + functionality is useful in practice and, if possible, compare it + to other methods available in scikit-learn. + +- Documentation and high-coverage tests are necessary for enhancements + to be accepted. + +- At least one paragraph of narrative documentation with links to + references in the literature (with PDF links when possible) and + the example. + +- The documentation should also include expected time and space + complexity of the algorithm and scalability, e.g. "this algorithm + can scale to a large number of samples > 100000, but does not + scale in dimensionality: n_features is expected to be lower than + 100". + +You can also check for common programming errors with the following +tools: + +- Code with good unittest **coverage** (at least 80%), check with: + + ```bash + $ pip install nose coverage + $ nosetests --with-coverage path/to/tests_for_package + ``` + +- No pyflakes warnings, check with: + + ```bash + $ pip install pyflakes + $ pyflakes path/to/module.py + ``` + +- No PEP8 warnings, check with: + + ```bash + $ pip install pep8 + $ pep8 path/to/module.py + ``` + +- AutoPEP8 can help you fix some of the easy redundant errors: + + ```bash + $ pip install autopep8 + $ autopep8 path/to/pep8.py + ``` + +Bonus points for contributions that include a performance analysis with +a benchmark script and profiling output (please report on the mailing +list or on the GitHub issue). + +Filing bugs +----------- +We use GitHub issues to track all bugs and feature requests; feel free to +open an issue if you have found a bug or wish to see a feature implemented. 
+
+It is recommended to check that your issue complies with the
+following rules before submitting:
+
+- Verify that your issue is not being currently addressed by other
+  [issues](https://github.com/scikit-learn/scikit-learn/issues?q=)
+  or [pull requests](https://github.com/scikit-learn/scikit-learn/pulls?q=).
+
+- If you are submitting an algorithm or feature request, please verify that
+  the algorithm fulfills our
+  [new algorithm requirements](http://scikit-learn.org/stable/faq.html#can-i-add-this-new-algorithm-that-i-or-someone-else-just-published).
+
+- Please ensure all code snippets and error messages are formatted in
+  appropriate code blocks.
+  See [Creating and highlighting code blocks](https://help.github.com/articles/creating-and-highlighting-code-blocks).
+
+- Please include your operating system type and version number, as well
+  as your Python, scikit-learn, numpy, and scipy versions. This information
+  can be found by running the following code snippet:
+
+  ```python
+  import platform; print(platform.platform())
+  import sys; print("Python", sys.version)
+  import numpy; print("NumPy", numpy.__version__)
+  import scipy; print("SciPy", scipy.__version__)
+  import sklearn; print("Scikit-Learn", sklearn.__version__)
+  ```
+
+- Please be specific about which estimators and/or functions are involved
+  and the shape of the data, as appropriate; please include a
+  [reproducible](http://stackoverflow.com/help/mcve) code snippet
+  or link to a [gist](https://gist.github.com). If an exception is raised,
+  please provide the traceback.
+
+New contributor tips
+--------------------
+
+A great way to start contributing to scikit-learn is to pick an item
+from the list of [Easy issues](https://github.com/scikit-learn/scikit-learn/issues?labels=Easy)
+in the issue tracker. Resolving these issues allows you to start
+contributing to the project without much prior knowledge. Your
+assistance in this area will be greatly appreciated by the more
+experienced developers, as it helps free up their time to concentrate on
+other issues.
+
+Documentation
+-------------
+
+We are glad to accept any sort of documentation: function docstrings,
+reStructuredText documents, tutorials, etc.
+reStructuredText documents live in the source code repository under the
+doc/ directory.
+
+You can edit the documentation using any text editor and then generate
+the HTML output by typing ``make html`` from the doc/ directory.
+Alternatively, ``make`` can be used to quickly generate the
+documentation without the example gallery. The resulting HTML files will
+be placed in ``_build/html/`` and are viewable in a web browser. See the
+``README`` file in the ``doc/`` directory for more information.
+
+For building the documentation, you will need
+[sphinx](http://sphinx.pocoo.org/),
+[matplotlib](http://matplotlib.sourceforge.net/), and
+[pillow](http://pillow.readthedocs.io/en/latest/).
+
+When you are writing documentation, it is important to strike a good
+balance between mathematical and algorithmic details, and to give the
+reader some intuition about what the algorithm does. It is best to always
+start with a small paragraph giving a hand-waving explanation of what the
+method does to the data, together with a figure (taken from an example)
+illustrating it.
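+
+To illustrate the docstring-with-doctest convention mentioned above, here is
+how a small, hypothetical helper function could be documented in the NumPy
+docstring style that scikit-learn follows (the function itself is only an
+example and does not exist in the library):
+
+```python
+import numpy as np
+
+
+def normalize_rows(X):
+    """Scale each row of X so that it sums to one.
+
+    Parameters
+    ----------
+    X : array-like, shape (n_samples, n_features)
+        Input data; every row must have a non-zero sum.
+
+    Returns
+    -------
+    X_normalized : ndarray, shape (n_samples, n_features)
+        Row-normalized copy of the input.
+
+    Examples
+    --------
+    >>> X_norm = normalize_rows([[1.0, 3.0], [2.0, 2.0]])
+    >>> float(X_norm[0, 0]), float(X_norm[0, 1])
+    (0.25, 0.75)
+    """
+    X = np.asarray(X, dtype=float)
+    return X / X.sum(axis=1, keepdims=True)
+```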
+ +Further Information +------------------- + +Visit the [Contributing Code](http://scikit-learn.org/stable/developers/index.html#coding-guidelines) +section of the website for more information including conforming to the +API spec and profiling contributed code. diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..e8f7e14 --- /dev/null +++ b/COPYING @@ -0,0 +1,32 @@ +New BSD License + +Copyright (c) 2007–2016 The scikit-learn developers. +All rights reserved. + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + a. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + c. Neither the name of the Scikit-learn Developers nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH +DAMAGE. 
+ diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..9d40827 --- /dev/null +++ b/ISSUE_TEMPLATE.md @@ -0,0 +1,54 @@ + + + + +#### Description + + +#### Steps/Code to Reproduce + + +#### Expected Results + + +#### Actual Results + + +#### Versions + + + + diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..ed0ca0e --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,9 @@ +include *.rst +recursive-include doc * +recursive-include examples * +recursive-include sklearn *.c *.h *.pyx *.pxd *.pxi +recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt +include COPYING +include AUTHORS.rst +include README.rst + diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..20bc4f5 --- /dev/null +++ b/Makefile @@ -0,0 +1,73 @@ +# simple makefile to simplify repetitive build env management tasks under posix + +# caution: testing won't work on windows, see README + +PYTHON ?= python +CYTHON ?= cython +NOSETESTS ?= nosetests +CTAGS ?= ctags + +# skip doctests on 32bit python +BITS := $(shell python -c 'import struct; print(8 * struct.calcsize("P"))') + +ifeq ($(BITS),32) + NOSETESTS:=$(NOSETESTS) -c setup32.cfg +endif + + +all: clean inplace test + +clean-ctags: + rm -f tags + +clean: clean-ctags + $(PYTHON) setup.py clean + rm -rf dist + +in: inplace # just a shortcut +inplace: + # to avoid errors in 0.15 upgrade + rm -f sklearn/utils/sparsefuncs*.so + rm -f sklearn/utils/random*.so + $(PYTHON) setup.py build_ext -i + +test-code: in + $(NOSETESTS) -s -v sklearn +test-sphinxext: + $(NOSETESTS) -s -v doc/sphinxext/ +test-doc: +ifeq ($(BITS),64) + $(NOSETESTS) -s -v doc/*.rst doc/modules/ doc/datasets/ \ + doc/developers doc/tutorial/basic doc/tutorial/statistical_inference \ + doc/tutorial/text_analytics +endif + +test-coverage: + rm -rf coverage .coverage + $(NOSETESTS) -s -v --with-coverage sklearn + +test: test-code test-sphinxext test-doc + +trailing-spaces: + find sklearn -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \; + +cython: + python build_tools/cythonize.py sklearn + +ctags: + # make tags for symbol based navigation in emacs and vim + # Install with: sudo apt-get install exuberant-ctags + $(CTAGS) --python-kinds=-i -R sklearn + +doc: inplace + $(MAKE) -C doc html + +doc-noplot: inplace + $(MAKE) -C doc html-noplot + +code-analysis: + flake8 sklearn | grep -v __init__ | grep -v external + pylint -E -i y sklearn/ -d E1103,E0611,E1101 + +flake8-diff: + ./build_tools/travis/flake8_diff.sh diff --git a/PULL_REQUEST_TEMPLATE.md b/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..3321b70 --- /dev/null +++ b/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,26 @@ + +#### Reference Issue + + + +#### What does this implement/fix? Explain your changes. + + +#### Any other comments? + + + diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..8d67f20 --- /dev/null +++ b/README.rst @@ -0,0 +1,161 @@ +.. -*- mode: rst -*- +|Travis|_ |AppVeyor|_ |Coveralls|_ |CircleCI|_ |Python27|_ |Python35|_ |PyPi|_ |DOI|_ + +.. |Travis| image:: https://api.travis-ci.org/scikit-learn/scikit-learn.svg?branch=master +.. _Travis: https://travis-ci.org/scikit-learn/scikit-learn + +.. |AppVeyor| image:: https://ci.appveyor.com/api/projects/status/github/scikit-learn/scikit-learn?branch=master&svg=true +.. _AppVeyor: https://ci.appveyor.com/project/sklearn-ci/scikit-learn/history + +.. |Coveralls| image:: https://coveralls.io/repos/scikit-learn/scikit-learn/badge.svg?branch=master&service=github +.. 
_Coveralls: https://coveralls.io/r/scikit-learn/scikit-learn + +.. |CircleCI| image:: https://circleci.com/gh/scikit-learn/scikit-learn/tree/master.svg?style=shield&circle-token=:circle-token +.. _CircleCI: https://circleci.com/gh/scikit-learn/scikit-learn + +.. |Python27| image:: https://img.shields.io/badge/python-2.7-blue.svg +.. _Python27: https://badge.fury.io/py/scikit-learn + +.. |Python35| image:: https://img.shields.io/badge/python-3.5-blue.svg +.. _Python35: https://badge.fury.io/py/scikit-learn + +.. |PyPi| image:: https://badge.fury.io/py/scikit-learn.svg +.. _PyPi: https://badge.fury.io/py/scikit-learn + +.. |DOI| image:: https://zenodo.org/badge/21369/scikit-learn/scikit-learn.svg +.. _DOI: https://zenodo.org/badge/latestdoi/21369/scikit-learn/scikit-learn + +scikit-learn +============ + +scikit-learn is a Python module for machine learning built on top of +SciPy and distributed under the 3-Clause BSD license. + +The project was started in 2007 by David Cournapeau as a Google Summer +of Code project, and since then many volunteers have contributed. See +the `AUTHORS.rst `_ file for a complete list of contributors. + +It is currently maintained by a team of volunteers. + +Website: http://scikit-learn.org + +Installation +------------ + +Dependencies +~~~~~~~~~~~~ + +Scikit-learn requires:: + +- Python (>= 2.6 or >= 3.3), +- NumPy (>= 1.6.1), +- SciPy (>= 0.9). + +scikit-learn also uses CBLAS, the C interface to the Basic Linear Algebra +Subprograms library. scikit-learn comes with a reference implementation, but +the system CBLAS will be detected by the build system and used if present. +CBLAS exists in many implementations; see `Linear algebra libraries +`_ +for known issues. + +User installation +~~~~~~~~~~~~~~~~~ + +If you already have a working installation of numpy and scipy, +the easiest way to install scikit-learn is using ``pip`` :: + + pip install -U scikit-learn + +or ``conda``:: + + conda install scikit-learn + +The documentation includes more detailed `installation instructions `_. + + +Development +----------- + +We welcome new contributors of all experience levels. The scikit-learn +community goals are to be helpful, welcoming, and effective. The +`Contributor's Guide `_ +has detailed information about contributing code, documentation, tests, and +more. We've included some basic information in this README. 
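+
+Once scikit-learn and its dependencies are installed, a quick smoke test can
+confirm that the package imports and runs. The snippet below is only an
+illustration and is not part of the official installation instructions::
+
+    from sklearn.datasets import load_iris
+    from sklearn.linear_model import LogisticRegression
+
+    # Fit a small classifier on the bundled iris dataset and report its
+    # training accuracy as a basic sanity check of the installation.
+    iris = load_iris()
+    clf = LogisticRegression().fit(iris.data, iris.target)
+    print("Training accuracy: %.3f" % clf.score(iris.data, iris.target))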
+ +Important links +~~~~~~~~~~~~~~~ + +- Official source code repo: https://github.com/scikit-learn/scikit-learn +- Download releases: http://sourceforge.net/projects/scikit-learn/files/ +- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues + +Source code +~~~~~~~~~~~ + +You can check the latest sources with the command:: + + git clone https://github.com/scikit-learn/scikit-learn.git + +Setting up a development environment +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Quick tutorial on how to go about setting up your environment to +contribute to scikit-learn: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md + +Testing +~~~~~~~ + +After installation, you can launch the test suite from outside the +source directory (you will need to have the ``nose`` package installed):: + + $ nosetests -v sklearn + +Under Windows, it is recommended to use the following command (adjust the path +to the ``python.exe`` program) as using the ``nosetests.exe`` program can badly +interact with tests that use ``multiprocessing``:: + + C:\Python34\python.exe -c "import nose; nose.main()" -v sklearn + +See the web page http://scikit-learn.org/stable/install.html#testing +for more information. + + Random number generation can be controlled during testing by setting + the ``SKLEARN_SEED`` environment variable. + +Submitting a Pull Request +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Before opening a Pull Request, have a look at the +full Contributing page to make sure your code complies +with our guidelines: http://scikit-learn.org/stable/developers/index.html + + +Project history +--------------- + +The project was started in 2007 by David Cournapeau as a Google Summer +of Code project, and since then many volunteers have contributed. See +the AUTHORS.rst file for a complete list of contributors. + +The project is currently maintained by a team of volunteers. + +**Note** `scikit-learn` was previously referred to as `scikits.learn`. 
+ + +Help and Support +---------------- + +Documentation +~~~~~~~~~~~~~ + +- HTML documentation (stable release): http://scikit-learn.org +- HTML documentation (development version): http://scikit-learn.org/dev/ +- FAQ: http://scikit-learn.org/stable/faq.html + +Communication +~~~~~~~~~~~~~ + +- Mailing list: https://mail.python.org/mailman/listinfo/scikit-learn +- IRC channel: ``#scikit-learn`` at ``irc.freenode.net`` +- Stack Overflow: http://stackoverflow.com/questions/tagged/scikit-learn +- Website: http://scikit-learn.org \ No newline at end of file diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000..205018f --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,93 @@ +# AppVeyor.com is a Continuous Integration service to build and run tests under +# Windows +# https://ci.appveyor.com/project/sklearn-ci/scikit-learn + +environment: + global: + # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the + # /E:ON and /V:ON options are not enabled in the batch script interpreter + # See: http://stackoverflow.com/a/13751649/163740 + CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\build_tools\\appveyor\\run_with_env.cmd" + WHEELHOUSE_UPLOADER_USERNAME: sklearn-appveyor + WHEELHOUSE_UPLOADER_SECRET: + secure: BQm8KfEj6v2Y+dQxb2syQvTFxDnHXvaNktkLcYSq7jfbTOO6eH9n09tfQzFUVcWZ + + # Make sure we don't download large datasets when running the test on + # continuous integration platform + SKLEARN_SKIP_NETWORK_TESTS: 1 + + matrix: + - PYTHON: "C:\\Python27" + PYTHON_VERSION: "2.7.8" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python27-x64" + PYTHON_VERSION: "2.7.8" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python35" + PYTHON_VERSION: "3.5.0" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python35-x64" + PYTHON_VERSION: "3.5.0" + PYTHON_ARCH: "64" + + + +install: + # Install Python (from the official .msi of http://python.org) and pip when + # not already installed. + - "powershell ./build_tools/appveyor/install.ps1" + - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" + - "python -m pip install -U pip" + + # Check that we have the expected version and architecture for Python + - "python --version" + - "python -c \"import struct; print(struct.calcsize('P') * 8)\"" + - "pip --version" + + # Install the build and runtime dependencies of the project. + - "%CMD_IN_ENV% pip install --timeout=60 --trusted-host 28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com -r build_tools/appveyor/requirements.txt" + - "%CMD_IN_ENV% python setup.py bdist_wheel bdist_wininst -b doc/logos/scikit-learn-logo.bmp" + - ps: "ls dist" + + # Install the generated wheel package to test it + - "pip install --pre --no-index --find-links dist/ scikit-learn" + +# Not a .NET project, we build scikit-learn in the install step instead +build: false + +test_script: + # Change to a non-source folder to make sure we run the tests on the + # installed library. + - "mkdir empty_folder" + - "cd empty_folder" + + - "python -c \"import nose; nose.main()\" --with-timer --timer-top-n 20 -s -v sklearn" + + # Move back to the project folder + - "cd .." + +artifacts: + # Archive the generated wheel package in the ci.appveyor.com build report. + - path: dist\* + +on_success: + # Upload the generated wheel package to Rackspace + # On Windows, Apache Libcloud cannot find a standard CA cert bundle so we + # disable the ssl checks. 
+ - "python -m wheelhouse_uploader upload --no-ssl-check --local-folder=dist sklearn-windows-wheels" + +notifications: + - provider: Webhook + url: https://webhooks.gitter.im/e/0dc8e57cd38105aeb1b4 + on_build_success: false + on_build_failure: True + +cache: + # Use the appveyor cache to avoid re-downloading large archives such + # the MKL numpy and scipy wheels mirrored on a rackspace cloud + # container, speed up the appveyor jobs and reduce bandwidth + # usage on our rackspace account. + - '%APPDATA%\pip\Cache' diff --git a/benchmarks/bench_20newsgroups.py b/benchmarks/bench_20newsgroups.py new file mode 100644 index 0000000..5bd56f3 --- /dev/null +++ b/benchmarks/bench_20newsgroups.py @@ -0,0 +1,97 @@ +from __future__ import print_function, division +from time import time +import argparse +import numpy as np + +from sklearn.dummy import DummyClassifier + +from sklearn.datasets import fetch_20newsgroups_vectorized +from sklearn.metrics import accuracy_score +from sklearn.utils.validation import check_array + +from sklearn.ensemble import RandomForestClassifier +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.ensemble import AdaBoostClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.naive_bayes import MultinomialNB + +ESTIMATORS = { + "dummy": DummyClassifier(), + "random_forest": RandomForestClassifier(n_estimators=100, + max_features="sqrt", + min_samples_split=10), + "extra_trees": ExtraTreesClassifier(n_estimators=100, + max_features="sqrt", + min_samples_split=10), + "logistic_regression": LogisticRegression(), + "naive_bayes": MultinomialNB(), + "adaboost": AdaBoostClassifier(n_estimators=10), +} + + +############################################################################### +# Data + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('-e', '--estimators', nargs="+", required=True, + choices=ESTIMATORS) + args = vars(parser.parse_args()) + + data_train = fetch_20newsgroups_vectorized(subset="train") + data_test = fetch_20newsgroups_vectorized(subset="test") + X_train = check_array(data_train.data, dtype=np.float32, + accept_sparse="csc") + X_test = check_array(data_test.data, dtype=np.float32, accept_sparse="csr") + y_train = data_train.target + y_test = data_test.target + + print("20 newsgroups") + print("=============") + print("X_train.shape = {0}".format(X_train.shape)) + print("X_train.format = {0}".format(X_train.format)) + print("X_train.dtype = {0}".format(X_train.dtype)) + print("X_train density = {0}" + "".format(X_train.nnz / np.product(X_train.shape))) + print("y_train {0}".format(y_train.shape)) + print("X_test {0}".format(X_test.shape)) + print("X_test.format = {0}".format(X_test.format)) + print("X_test.dtype = {0}".format(X_test.dtype)) + print("y_test {0}".format(y_test.shape)) + print() + + print("Classifier Training") + print("===================") + accuracy, train_time, test_time = {}, {}, {} + for name in sorted(args["estimators"]): + clf = ESTIMATORS[name] + try: + clf.set_params(random_state=0) + except (TypeError, ValueError): + pass + + print("Training %s ... 
" % name, end="") + t0 = time() + clf.fit(X_train, y_train) + train_time[name] = time() - t0 + t0 = time() + y_pred = clf.predict(X_test) + test_time[name] = time() - t0 + accuracy[name] = accuracy_score(y_test, y_pred) + print("done") + + print() + print("Classification performance:") + print("===========================") + print() + print("%s %s %s %s" % ("Classifier ", "train-time", "test-time", + "Accuracy")) + print("-" * 44) + for name in sorted(accuracy, key=accuracy.get): + print("%s %s %s %s" % (name.ljust(16), + ("%.4fs" % train_time[name]).center(10), + ("%.4fs" % test_time[name]).center(10), + ("%.4f" % accuracy[name]).center(10))) + + print() diff --git a/benchmarks/bench_covertype.py b/benchmarks/bench_covertype.py new file mode 100644 index 0000000..5d995c7 --- /dev/null +++ b/benchmarks/bench_covertype.py @@ -0,0 +1,190 @@ +""" +=========================== +Covertype dataset benchmark +=========================== + +Benchmark stochastic gradient descent (SGD), Liblinear, and Naive Bayes, CART +(decision tree), RandomForest and Extra-Trees on the forest covertype dataset +of Blackard, Jock, and Dean [1]. The dataset comprises 581,012 samples. It is +low dimensional with 54 features and a sparsity of approx. 23%. Here, we +consider the task of predicting class 1 (spruce/fir). The classification +performance of SGD is competitive with Liblinear while being two orders of +magnitude faster to train:: + + [..] + Classification performance: + =========================== + Classifier train-time test-time error-rate + -------------------------------------------- + liblinear 15.9744s 0.0705s 0.2305 + GaussianNB 3.0666s 0.3884s 0.4841 + SGD 1.0558s 0.1152s 0.2300 + CART 79.4296s 0.0523s 0.0469 + RandomForest 1190.1620s 0.5881s 0.0243 + ExtraTrees 640.3194s 0.6495s 0.0198 + +The same task has been used in a number of papers including: + + * `"SVM Optimization: Inverse Dependence on Training Set Size" + `_ + S. Shalev-Shwartz, N. Srebro - In Proceedings of ICML '08. + + * `"Pegasos: Primal estimated sub-gradient solver for svm" + `_ + S. Shalev-Shwartz, Y. Singer, N. Srebro - In Proceedings of ICML '07. + + * `"Training Linear SVMs in Linear Time" + `_ + T. 
Joachims - In SIGKDD '06 + +[1] http://archive.ics.uci.edu/ml/datasets/Covertype + +""" +from __future__ import division, print_function + +# Author: Peter Prettenhofer +# Arnaud Joly +# License: BSD 3 clause + +import os +from time import time +import argparse +import numpy as np + +from sklearn.datasets import fetch_covtype, get_data_home +from sklearn.svm import LinearSVC +from sklearn.linear_model import SGDClassifier, LogisticRegression +from sklearn.naive_bayes import GaussianNB +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.metrics import zero_one_loss +from sklearn.externals.joblib import Memory +from sklearn.utils import check_array + +# Memoize the data extraction and memory map the resulting +# train / test splits in readonly mode +memory = Memory(os.path.join(get_data_home(), 'covertype_benchmark_data'), + mmap_mode='r') + + +@memory.cache +def load_data(dtype=np.float32, order='C', random_state=13): + """Load the data, then cache and memmap the train/test split""" + ###################################################################### + # Load dataset + print("Loading dataset...") + data = fetch_covtype(download_if_missing=True, shuffle=True, + random_state=random_state) + X = check_array(data['data'], dtype=dtype, order=order) + y = (data['target'] != 1).astype(np.int) + + # Create train-test split (as [Joachims, 2006]) + print("Creating train-test split...") + n_train = 522911 + X_train = X[:n_train] + y_train = y[:n_train] + X_test = X[n_train:] + y_test = y[n_train:] + + # Standardize first 10 features (the numerical ones) + mean = X_train.mean(axis=0) + std = X_train.std(axis=0) + mean[10:] = 0.0 + std[10:] = 1.0 + X_train = (X_train - mean) / std + X_test = (X_test - mean) / std + return X_train, X_test, y_train, y_test + + +ESTIMATORS = { + 'GBRT': GradientBoostingClassifier(n_estimators=250), + 'ExtraTrees': ExtraTreesClassifier(n_estimators=20), + 'RandomForest': RandomForestClassifier(n_estimators=20), + 'CART': DecisionTreeClassifier(min_samples_split=5), + 'SGD': SGDClassifier(alpha=0.001, n_iter=2), + 'GaussianNB': GaussianNB(), + 'liblinear': LinearSVC(loss="l2", penalty="l2", C=1000, dual=False, + tol=1e-3), + 'SAG': LogisticRegression(solver='sag', max_iter=2, C=1000) +} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--classifiers', nargs="+", + choices=ESTIMATORS, type=str, + default=['liblinear', 'GaussianNB', 'SGD', 'CART'], + help="list of classifiers to benchmark.") + parser.add_argument('--n-jobs', nargs="?", default=1, type=int, + help="Number of concurrently running workers for " + "models that support parallelism.") + parser.add_argument('--order', nargs="?", default="C", type=str, + choices=["F", "C"], + help="Allow to choose between fortran and C ordered " + "data") + parser.add_argument('--random-seed', nargs="?", default=13, type=int, + help="Common seed used by random number generator.") + args = vars(parser.parse_args()) + + print(__doc__) + + X_train, X_test, y_train, y_test = load_data( + order=args["order"], random_state=args["random_seed"]) + + print("") + print("Dataset statistics:") + print("===================") + print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) + print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) + print("%s %s" % ("data type:".ljust(25), X_train.dtype)) + print("%s %d (pos=%d, neg=%d, 
size=%dMB)" + % ("number of train samples:".ljust(25), + X_train.shape[0], np.sum(y_train == 1), + np.sum(y_train == 0), int(X_train.nbytes / 1e6))) + print("%s %d (pos=%d, neg=%d, size=%dMB)" + % ("number of test samples:".ljust(25), + X_test.shape[0], np.sum(y_test == 1), + np.sum(y_test == 0), int(X_test.nbytes / 1e6))) + + print() + print("Training Classifiers") + print("====================") + error, train_time, test_time = {}, {}, {} + for name in sorted(args["classifiers"]): + print("Training %s ... " % name, end="") + estimator = ESTIMATORS[name] + estimator_params = estimator.get_params() + + estimator.set_params(**{p: args["random_seed"] + for p in estimator_params + if p.endswith("random_state")}) + + if "n_jobs" in estimator_params: + estimator.set_params(n_jobs=args["n_jobs"]) + + time_start = time() + estimator.fit(X_train, y_train) + train_time[name] = time() - time_start + + time_start = time() + y_pred = estimator.predict(X_test) + test_time[name] = time() - time_start + + error[name] = zero_one_loss(y_test, y_pred) + + print("done") + + print() + print("Classification performance:") + print("===========================") + print("%s %s %s %s" + % ("Classifier ", "train-time", "test-time", "error-rate")) + print("-" * 44) + for name in sorted(args["classifiers"], key=error.get): + print("%s %s %s %s" % (name.ljust(12), + ("%.4fs" % train_time[name]).center(10), + ("%.4fs" % test_time[name]).center(10), + ("%.4f" % error[name]).center(10))) + + print() diff --git a/benchmarks/bench_glm.py b/benchmarks/bench_glm.py new file mode 100644 index 0000000..660c208 --- /dev/null +++ b/benchmarks/bench_glm.py @@ -0,0 +1,58 @@ +""" +A comparison of different methods in GLM + +Data comes from a random square matrix. + +""" +from datetime import datetime +import numpy as np +from sklearn import linear_model +from sklearn.utils.bench import total_seconds + + +if __name__ == '__main__': + + import matplotlib.pyplot as plt + + n_iter = 40 + + time_ridge = np.empty(n_iter) + time_ols = np.empty(n_iter) + time_lasso = np.empty(n_iter) + + dimensions = 500 * np.arange(1, n_iter + 1) + + for i in range(n_iter): + + print('Iteration %s of %s' % (i, n_iter)) + + n_samples, n_features = 10 * i + 3, 10 * i + 3 + + X = np.random.randn(n_samples, n_features) + Y = np.random.randn(n_samples) + + start = datetime.now() + ridge = linear_model.Ridge(alpha=1.) + ridge.fit(X, Y) + time_ridge[i] = total_seconds(datetime.now() - start) + + start = datetime.now() + ols = linear_model.LinearRegression() + ols.fit(X, Y) + time_ols[i] = total_seconds(datetime.now() - start) + + start = datetime.now() + lasso = linear_model.LassoLars() + lasso.fit(X, Y) + time_lasso[i] = total_seconds(datetime.now() - start) + + plt.figure('scikit-learn GLM benchmark results') + plt.xlabel('Dimensions') + plt.ylabel('Time (s)') + plt.plot(dimensions, time_ridge, color='r') + plt.plot(dimensions, time_ols, color='g') + plt.plot(dimensions, time_lasso, color='b') + + plt.legend(['Ridge', 'OLS', 'LassoLars'], loc='upper left') + plt.axis('tight') + plt.show() diff --git a/benchmarks/bench_glmnet.py b/benchmarks/bench_glmnet.py new file mode 100644 index 0000000..b05971b --- /dev/null +++ b/benchmarks/bench_glmnet.py @@ -0,0 +1,128 @@ +""" +To run this, you'll need to have installed. + + * glmnet-python + * scikit-learn (of course) + +Does two benchmarks + +First, we fix a training set and increase the number of +samples. Then we plot the computation time as function of +the number of samples. 
+ +In the second benchmark, we increase the number of dimensions of the +training set. Then we plot the computation time as function of +the number of dimensions. + +In both cases, only 10% of the features are informative. +""" +import numpy as np +import gc +from time import time +from sklearn.datasets.samples_generator import make_regression + +alpha = 0.1 +# alpha = 0.01 + + +def rmse(a, b): + return np.sqrt(np.mean((a - b) ** 2)) + + +def bench(factory, X, Y, X_test, Y_test, ref_coef): + gc.collect() + + # start time + tstart = time() + clf = factory(alpha=alpha).fit(X, Y) + delta = (time() - tstart) + # stop time + + print("duration: %0.3fs" % delta) + print("rmse: %f" % rmse(Y_test, clf.predict(X_test))) + print("mean coef abs diff: %f" % abs(ref_coef - clf.coef_.ravel()).mean()) + return delta + + +if __name__ == '__main__': + from glmnet.elastic_net import Lasso as GlmnetLasso + from sklearn.linear_model import Lasso as ScikitLasso + # Delayed import of matplotlib.pyplot + import matplotlib.pyplot as plt + + scikit_results = [] + glmnet_results = [] + n = 20 + step = 500 + n_features = 1000 + n_informative = n_features / 10 + n_test_samples = 1000 + for i in range(1, n + 1): + print('==================') + print('Iteration %s of %s' % (i, n)) + print('==================') + + X, Y, coef_ = make_regression( + n_samples=(i * step) + n_test_samples, n_features=n_features, + noise=0.1, n_informative=n_informative, coef=True) + + X_test = X[-n_test_samples:] + Y_test = Y[-n_test_samples:] + X = X[:(i * step)] + Y = Y[:(i * step)] + + print("benchmarking scikit-learn: ") + scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_)) + print("benchmarking glmnet: ") + glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) + + plt.clf() + xx = range(0, n * step, step) + plt.title('Lasso regression on sample dataset (%d features)' % n_features) + plt.plot(xx, scikit_results, 'b-', label='scikit-learn') + plt.plot(xx, glmnet_results, 'r-', label='glmnet') + plt.legend() + plt.xlabel('number of samples to classify') + plt.ylabel('Time (s)') + plt.show() + + # now do a benchmark where the number of points is fixed + # and the variable is the number of features + + scikit_results = [] + glmnet_results = [] + n = 20 + step = 100 + n_samples = 500 + + for i in range(1, n + 1): + print('==================') + print('Iteration %02d of %02d' % (i, n)) + print('==================') + n_features = i * step + n_informative = n_features / 10 + + X, Y, coef_ = make_regression( + n_samples=(i * step) + n_test_samples, n_features=n_features, + noise=0.1, n_informative=n_informative, coef=True) + + X_test = X[-n_test_samples:] + Y_test = Y[-n_test_samples:] + X = X[:n_samples] + Y = Y[:n_samples] + + print("benchmarking scikit-learn: ") + scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_)) + print("benchmarking glmnet: ") + glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) + + xx = np.arange(100, 100 + n * step, step) + plt.figure('scikit-learn vs. 
glmnet benchmark results') + plt.title('Regression in high dimensional spaces (%d samples)' % n_samples) + plt.plot(xx, scikit_results, 'b-', label='scikit-learn') + plt.plot(xx, glmnet_results, 'r-', label='glmnet') + plt.legend() + plt.xlabel('number of features') + plt.ylabel('Time (s)') + plt.axis('tight') + plt.show() diff --git a/benchmarks/bench_isolation_forest.py b/benchmarks/bench_isolation_forest.py new file mode 100644 index 0000000..322f2ad --- /dev/null +++ b/benchmarks/bench_isolation_forest.py @@ -0,0 +1,128 @@ +""" +========================================== +IsolationForest benchmark +========================================== + +A test of IsolationForest on classical anomaly detection datasets. + +""" +print(__doc__) + +from time import time +import numpy as np +import matplotlib.pyplot as plt +from sklearn.ensemble import IsolationForest +from sklearn.metrics import roc_curve, auc +from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_mldata +from sklearn.preprocessing import LabelBinarizer +from sklearn.utils import shuffle as sh + +np.random.seed(1) + +datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover'] + +fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5)) + + +for dat in datasets: + # loading and vectorization + print('loading data') + if dat in ['http', 'smtp', 'SA', 'SF']: + dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True) + X = dataset.data + y = dataset.target + + if dat == 'shuttle': + dataset = fetch_mldata('shuttle') + X = dataset.data + y = dataset.target + X, y = sh(X, y) + # we remove data with label 4 + # normal data are then those of class 1 + s = (y != 4) + X = X[s, :] + y = y[s] + y = (y != 1).astype(int) + + if dat == 'forestcover': + dataset = fetch_covtype(shuffle=True) + X = dataset.data + y = dataset.target + # normal data are those with attribute 2 + # abnormal those with attribute 4 + s = (y == 2) + (y == 4) + X = X[s, :] + y = y[s] + y = (y != 2).astype(int) + + print('vectorizing data') + + if dat == 'SF': + lb = LabelBinarizer() + lb.fit(X[:, 1]) + x1 = lb.transform(X[:, 1]) + X = np.c_[X[:, :1], x1, X[:, 2:]] + y = (y != 'normal.').astype(int) + + if dat == 'SA': + lb = LabelBinarizer() + lb.fit(X[:, 1]) + x1 = lb.transform(X[:, 1]) + lb.fit(X[:, 2]) + x2 = lb.transform(X[:, 2]) + lb.fit(X[:, 3]) + x3 = lb.transform(X[:, 3]) + X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] + y = (y != 'normal.').astype(int) + + if dat == 'http' or dat == 'smtp': + y = (y != 'normal.').astype(int) + + n_samples, n_features = X.shape + n_samples_train = n_samples // 2 + + X = X.astype(float) + X_train = X[:n_samples_train, :] + X_test = X[n_samples_train:, :] + y_train = y[:n_samples_train] + y_test = y[n_samples_train:] + + print('IsolationForest processing...') + model = IsolationForest(n_jobs=-1) + tstart = time() + model.fit(X_train) + fit_time = time() - tstart + tstart = time() + + scoring = - model.decision_function(X_test) # the lower, the more normal + + # Show score histograms + fig, ax = plt.subplots(3, sharex=True, sharey=True) + bins = np.linspace(-0.5, 0.5, 200) + ax[0].hist(scoring, bins, color='black') + ax[0].set_title('decision function for %s dataset' % dat) + ax[0].legend(loc="lower right") + ax[1].hist(scoring[y_test == 0], bins, color='b', + label='normal data') + ax[1].legend(loc="lower right") + ax[2].hist(scoring[y_test == 1], bins, color='r', + label='outliers') + ax[2].legend(loc="lower right") + + # Show ROC Curves + predict_time = time() - tstart + fpr, tpr, thresholds = roc_curve(y_test, 
scoring) + AUC = auc(fpr, tpr) + label = ('%s (area: %0.3f, train-time: %0.2fs, ' + 'test-time: %0.2fs)' % (dat, AUC, fit_time, predict_time)) + ax_roc.plot(fpr, tpr, lw=1, label=label) + + +ax_roc.set_xlim([-0.05, 1.05]) +ax_roc.set_ylim([-0.05, 1.05]) +ax_roc.set_xlabel('False Positive Rate') +ax_roc.set_ylabel('True Positive Rate') +ax_roc.set_title('Receiver operating characteristic (ROC) curves') +ax_roc.legend(loc="lower right") +fig_roc.tight_layout() +plt.show() diff --git a/benchmarks/bench_isotonic.py b/benchmarks/bench_isotonic.py new file mode 100644 index 0000000..0a4fb6d --- /dev/null +++ b/benchmarks/bench_isotonic.py @@ -0,0 +1,103 @@ +""" +Benchmarks of isotonic regression performance. + +We generate a synthetic dataset of size 10^n, for n in [min, max], and +examine the time taken to run isotonic regression over the dataset. + +The timings are then output to stdout, or visualized on a log-log scale +with matplotlib. + +This allows the scaling of the algorithm with the problem size to be +visualized and understood. +""" +from __future__ import print_function + +import numpy as np +import gc +from datetime import datetime +from sklearn.isotonic import isotonic_regression +from sklearn.utils.bench import total_seconds +import matplotlib.pyplot as plt +import argparse + + +def generate_perturbed_logarithm_dataset(size): + return (np.random.randint(-50, 50, size=size) + + 50. * np.log(1 + np.arange(size))) + + +def generate_logistic_dataset(size): + X = np.sort(np.random.normal(size=size)) + return np.random.random(size=size) < 1.0 / (1.0 + np.exp(-X)) + + +def generate_pathological_dataset(size): + # Triggers O(n^2) complexity on the original implementation. + return np.r_[np.arange(size), + np.arange(-(size - 1), size), + np.arange(-(size - 1), 1)] + + +DATASET_GENERATORS = { + 'perturbed_logarithm': generate_perturbed_logarithm_dataset, + 'logistic': generate_logistic_dataset, + 'pathological': generate_pathological_dataset, +} + + +def bench_isotonic_regression(Y): + """ + Runs a single iteration of isotonic regression on the input data, + and reports the total time taken (in seconds). 
+ """ + gc.collect() + + tstart = datetime.now() + isotonic_regression(Y) + delta = datetime.now() - tstart + return total_seconds(delta) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Isotonic Regression benchmark tool") + parser.add_argument('--seed', type=int, + help="RNG seed") + parser.add_argument('--iterations', type=int, required=True, + help="Number of iterations to average timings over " + "for each problem size") + parser.add_argument('--log_min_problem_size', type=int, required=True, + help="Base 10 logarithm of the minimum problem size") + parser.add_argument('--log_max_problem_size', type=int, required=True, + help="Base 10 logarithm of the maximum problem size") + parser.add_argument('--show_plot', action='store_true', + help="Plot timing output with matplotlib") + parser.add_argument('--dataset', choices=DATASET_GENERATORS.keys(), + required=True) + + args = parser.parse_args() + + np.random.seed(args.seed) + + timings = [] + for exponent in range(args.log_min_problem_size, + args.log_max_problem_size): + n = 10 ** exponent + Y = DATASET_GENERATORS[args.dataset](n) + time_per_iteration = \ + [bench_isotonic_regression(Y) for i in range(args.iterations)] + timing = (n, np.mean(time_per_iteration)) + timings.append(timing) + + # If we're not plotting, dump the timing to stdout + if not args.show_plot: + print(n, np.mean(time_per_iteration)) + + if args.show_plot: + plt.plot(*zip(*timings)) + plt.title("Average time taken running isotonic regression") + plt.xlabel('Number of observations') + plt.ylabel('Time (s)') + plt.axis('tight') + plt.loglog() + plt.show() diff --git a/benchmarks/bench_lasso.py b/benchmarks/bench_lasso.py new file mode 100644 index 0000000..7ed774a --- /dev/null +++ b/benchmarks/bench_lasso.py @@ -0,0 +1,96 @@ +""" +Benchmarks of Lasso vs LassoLars + +First, we fix a training set and increase the number of +samples. Then we plot the computation time as function of +the number of samples. + +In the second benchmark, we increase the number of dimensions of the +training set. Then we plot the computation time as function of +the number of dimensions. + +In both cases, only 10% of the features are informative. 
+""" +import gc +from time import time +import numpy as np + +from sklearn.datasets.samples_generator import make_regression + + +def compute_bench(alpha, n_samples, n_features, precompute): + lasso_results = [] + lars_lasso_results = [] + + it = 0 + + for ns in n_samples: + for nf in n_features: + it += 1 + print('==================') + print('Iteration %s of %s' % (it, max(len(n_samples), + len(n_features)))) + print('==================') + n_informative = nf // 10 + X, Y, coef_ = make_regression(n_samples=ns, n_features=nf, + n_informative=n_informative, + noise=0.1, coef=True) + + X /= np.sqrt(np.sum(X ** 2, axis=0)) # Normalize data + + gc.collect() + print("- benchmarking Lasso") + clf = Lasso(alpha=alpha, fit_intercept=False, + precompute=precompute) + tstart = time() + clf.fit(X, Y) + lasso_results.append(time() - tstart) + + gc.collect() + print("- benchmarking LassoLars") + clf = LassoLars(alpha=alpha, fit_intercept=False, + normalize=False, precompute=precompute) + tstart = time() + clf.fit(X, Y) + lars_lasso_results.append(time() - tstart) + + return lasso_results, lars_lasso_results + + +if __name__ == '__main__': + from sklearn.linear_model import Lasso, LassoLars + import matplotlib.pyplot as plt + + alpha = 0.01 # regularization parameter + + n_features = 10 + list_n_samples = np.linspace(100, 1000000, 5).astype(np.int) + lasso_results, lars_lasso_results = compute_bench(alpha, list_n_samples, + [n_features], precompute=True) + + plt.figure('scikit-learn LASSO benchmark results') + plt.subplot(211) + plt.plot(list_n_samples, lasso_results, 'b-', + label='Lasso') + plt.plot(list_n_samples, lars_lasso_results, 'r-', + label='LassoLars') + plt.title('precomputed Gram matrix, %d features, alpha=%s' % (n_features, + alpha)) + plt.legend(loc='upper left') + plt.xlabel('number of samples') + plt.ylabel('Time (s)') + plt.axis('tight') + + n_samples = 2000 + list_n_features = np.linspace(500, 3000, 5).astype(np.int) + lasso_results, lars_lasso_results = compute_bench(alpha, [n_samples], + list_n_features, precompute=False) + plt.subplot(212) + plt.plot(list_n_features, lasso_results, 'b-', label='Lasso') + plt.plot(list_n_features, lars_lasso_results, 'r-', label='LassoLars') + plt.title('%d samples, alpha=%s' % (n_samples, alpha)) + plt.legend(loc='upper left') + plt.xlabel('number of features') + plt.ylabel('Time (s)') + plt.axis('tight') + plt.show() diff --git a/benchmarks/bench_mnist.py b/benchmarks/bench_mnist.py new file mode 100644 index 0000000..01e0e1b --- /dev/null +++ b/benchmarks/bench_mnist.py @@ -0,0 +1,177 @@ +""" +======================= +MNIST dataset benchmark +======================= + +Benchmark on the MNIST dataset. The dataset comprises 70,000 samples +and 784 features. Here, we consider the task of predicting +10 classes - digits from 0 to 9 from their raw images. By contrast to the +covertype dataset, the feature space is homogenous. + +Example of output : + [..] + + Classification performance: + =========================== + Classifier train-time test-time error-rate + ------------------------------------------------------------ + MLP_adam 53.46s 0.11s 0.0224 + Nystroem-SVM 112.97s 0.92s 0.0228 + MultilayerPerceptron 24.33s 0.14s 0.0287 + ExtraTrees 42.99s 0.57s 0.0294 + RandomForest 42.70s 0.49s 0.0318 + SampledRBF-SVM 135.81s 0.56s 0.0486 + LinearRegression-SAG 16.67s 0.06s 0.0824 + CART 20.69s 0.02s 0.1219 + dummy 0.00s 0.01s 0.8973 +""" +from __future__ import division, print_function + +# Author: Issam H. 
Laradji +# Arnaud Joly +# License: BSD 3 clause + +import os +from time import time +import argparse +import numpy as np + +from sklearn.datasets import fetch_mldata +from sklearn.datasets import get_data_home +from sklearn.ensemble import ExtraTreesClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.dummy import DummyClassifier +from sklearn.externals.joblib import Memory +from sklearn.kernel_approximation import Nystroem +from sklearn.kernel_approximation import RBFSampler +from sklearn.metrics import zero_one_loss +from sklearn.pipeline import make_pipeline +from sklearn.svm import LinearSVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils import check_array +from sklearn.linear_model import LogisticRegression +from sklearn.neural_network import MLPClassifier + +# Memoize the data extraction and memory map the resulting +# train / test splits in readonly mode +memory = Memory(os.path.join(get_data_home(), 'mnist_benchmark_data'), + mmap_mode='r') + + +@memory.cache +def load_data(dtype=np.float32, order='F'): + """Load the data, then cache and memmap the train/test split""" + ###################################################################### + # Load dataset + print("Loading dataset...") + data = fetch_mldata('MNIST original') + X = check_array(data['data'], dtype=dtype, order=order) + y = data["target"] + + # Normalize features + X = X / 255 + + # Create train-test split (as [Joachims, 2006]) + print("Creating train-test split...") + n_train = 60000 + X_train = X[:n_train] + y_train = y[:n_train] + X_test = X[n_train:] + y_test = y[n_train:] + + return X_train, X_test, y_train, y_test + + +ESTIMATORS = { + "dummy": DummyClassifier(), + 'CART': DecisionTreeClassifier(), + 'ExtraTrees': ExtraTreesClassifier(n_estimators=100), + 'RandomForest': RandomForestClassifier(n_estimators=100), + 'Nystroem-SVM': make_pipeline( + Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)), + 'SampledRBF-SVM': make_pipeline( + RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)), + 'LinearRegression-SAG': LogisticRegression(solver='sag', tol=1e-1, C=1e4), + 'MultilayerPerceptron': MLPClassifier( + hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, + algorithm='sgd', learning_rate_init=0.2, momentum=0.9, verbose=1, + tol=1e-4, random_state=1), + 'MLP-adam': MLPClassifier( + hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4, + algorithm='adam', learning_rate_init=0.001, verbose=1, + tol=1e-4, random_state=1) +} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--classifiers', nargs="+", + choices=ESTIMATORS, type=str, + default=['ExtraTrees', 'Nystroem-SVM'], + help="list of classifiers to benchmark.") + parser.add_argument('--n-jobs', nargs="?", default=1, type=int, + help="Number of concurrently running workers for " + "models that support parallelism.") + parser.add_argument('--order', nargs="?", default="C", type=str, + choices=["F", "C"], + help="Allow to choose between fortran and C ordered " + "data") + parser.add_argument('--random-seed', nargs="?", default=0, type=int, + help="Common seed used by random number generator.") + args = vars(parser.parse_args()) + + print(__doc__) + + X_train, X_test, y_train, y_test = load_data(order=args["order"]) + + print("") + print("Dataset statistics:") + print("===================") + print("%s %d" % ("number of features:".ljust(25), X_train.shape[1])) + print("%s %d" % ("number of classes:".ljust(25), np.unique(y_train).size)) + print("%s %s" % 
("data type:".ljust(25), X_train.dtype)) + print("%s %d (size=%dMB)" % ("number of train samples:".ljust(25), + X_train.shape[0], int(X_train.nbytes / 1e6))) + print("%s %d (size=%dMB)" % ("number of test samples:".ljust(25), + X_test.shape[0], int(X_test.nbytes / 1e6))) + + print() + print("Training Classifiers") + print("====================") + error, train_time, test_time = {}, {}, {} + for name in sorted(args["classifiers"]): + print("Training %s ... " % name, end="") + estimator = ESTIMATORS[name] + estimator_params = estimator.get_params() + + estimator.set_params(**{p: args["random_seed"] + for p in estimator_params + if p.endswith("random_state")}) + + if "n_jobs" in estimator_params: + estimator.set_params(n_jobs=args["n_jobs"]) + + time_start = time() + estimator.fit(X_train, y_train) + train_time[name] = time() - time_start + + time_start = time() + y_pred = estimator.predict(X_test) + test_time[name] = time() - time_start + + error[name] = zero_one_loss(y_test, y_pred) + + print("done") + + print() + print("Classification performance:") + print("===========================") + print("{0: <24} {1: >10} {2: >11} {3: >12}" + "".format("Classifier ", "train-time", "test-time", "error-rate")) + print("-" * 60) + for name in sorted(args["classifiers"], key=error.get): + + print("{0: <23} {1: >10.2f}s {2: >10.2f}s {3: >12.4f}" + "".format(name, train_time[name], test_time[name], error[name])) + + print() diff --git a/benchmarks/bench_multilabel_metrics.py b/benchmarks/bench_multilabel_metrics.py new file mode 100755 index 0000000..a7b9374 --- /dev/null +++ b/benchmarks/bench_multilabel_metrics.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +""" +A comparison of multilabel target formats and metrics over them +""" +from __future__ import division +from __future__ import print_function + +from timeit import timeit +from functools import partial +import itertools +import argparse +import sys + +import matplotlib.pyplot as plt +import scipy.sparse as sp +import numpy as np + +from sklearn.datasets import make_multilabel_classification +from sklearn.metrics import (f1_score, accuracy_score, hamming_loss, + jaccard_similarity_score) +from sklearn.utils.testing import ignore_warnings + + +METRICS = { + 'f1': partial(f1_score, average='micro'), + 'f1-by-sample': partial(f1_score, average='samples'), + 'accuracy': accuracy_score, + 'hamming': hamming_loss, + 'jaccard': jaccard_similarity_score, +} + +FORMATS = { + 'sequences': lambda y: [list(np.flatnonzero(s)) for s in y], + 'dense': lambda y: y, + 'csr': lambda y: sp.csr_matrix(y), + 'csc': lambda y: sp.csc_matrix(y), +} + + +@ignore_warnings +def benchmark(metrics=tuple(v for k, v in sorted(METRICS.items())), + formats=tuple(v for k, v in sorted(FORMATS.items())), + samples=1000, classes=4, density=.2, + n_times=5): + """Times metric calculations for a number of inputs + + Parameters + ---------- + metrics : array-like of callables (1d or 0d) + The metric functions to time. + + formats : array-like of callables (1d or 0d) + These may transform a dense indicator matrix into multilabel + representation. + + samples : array-like of ints (1d or 0d) + The number of samples to generate as input. + + classes : array-like of ints (1d or 0d) + The number of classes in the input. + + density : array-like of ints (1d or 0d) + The density of positive labels in the input. + + n_times : int + Time calling the metric n_times times. + + Returns + ------- + array of floats shaped like (metrics, formats, samples, classes, density) + Time in seconds. 
+ """ + metrics = np.atleast_1d(metrics) + samples = np.atleast_1d(samples) + classes = np.atleast_1d(classes) + density = np.atleast_1d(density) + formats = np.atleast_1d(formats) + out = np.zeros((len(metrics), len(formats), len(samples), len(classes), + len(density)), dtype=float) + it = itertools.product(samples, classes, density) + for i, (s, c, d) in enumerate(it): + _, y_true = make_multilabel_classification(n_samples=s, n_features=1, + n_classes=c, n_labels=d * c, + random_state=42) + _, y_pred = make_multilabel_classification(n_samples=s, n_features=1, + n_classes=c, n_labels=d * c, + random_state=84) + for j, f in enumerate(formats): + f_true = f(y_true) + f_pred = f(y_pred) + for k, metric in enumerate(metrics): + t = timeit(partial(metric, f_true, f_pred), number=n_times) + + out[k, j].flat[i] = t + return out + + +def _tabulate(results, metrics, formats): + """Prints results by metric and format + + Uses the last ([-1]) value of other fields + """ + column_width = max(max(len(k) for k in formats) + 1, 8) + first_width = max(len(k) for k in metrics) + head_fmt = ('{:<{fw}s}' + '{:>{cw}s}' * len(formats)) + row_fmt = ('{:<{fw}s}' + '{:>{cw}.3f}' * len(formats)) + print(head_fmt.format('Metric', *formats, + cw=column_width, fw=first_width)) + for metric, row in zip(metrics, results[:, :, -1, -1, -1]): + print(row_fmt.format(metric, *row, + cw=column_width, fw=first_width)) + + +def _plot(results, metrics, formats, title, x_ticks, x_label, + format_markers=('x', '|', 'o', '+'), + metric_colors=('c', 'm', 'y', 'k', 'g', 'r', 'b')): + """ + Plot the results by metric, format and some other variable given by + x_label + """ + fig = plt.figure('scikit-learn multilabel metrics benchmarks') + plt.title(title) + ax = fig.add_subplot(111) + for i, metric in enumerate(metrics): + for j, format in enumerate(formats): + ax.plot(x_ticks, results[i, j].flat, + label='{}, {}'.format(metric, format), + marker=format_markers[j], + color=metric_colors[i % len(metric_colors)]) + ax.set_xlabel(x_label) + ax.set_ylabel('Time (s)') + ax.legend() + plt.show() + + +if __name__ == "__main__": + ap = argparse.ArgumentParser() + ap.add_argument('metrics', nargs='*', default=sorted(METRICS), + help='Specifies metrics to benchmark, defaults to all. 
' + 'Choices are: {}'.format(sorted(METRICS))) + ap.add_argument('--formats', nargs='+', choices=sorted(FORMATS), + help='Specifies multilabel formats to benchmark ' + '(defaults to all).') + ap.add_argument('--samples', type=int, default=1000, + help='The number of samples to generate') + ap.add_argument('--classes', type=int, default=10, + help='The number of classes') + ap.add_argument('--density', type=float, default=.2, + help='The average density of labels per sample') + ap.add_argument('--plot', choices=['classes', 'density', 'samples'], + default=None, + help='Plot time with respect to this parameter varying ' + 'up to the specified value') + ap.add_argument('--n-steps', default=10, type=int, + help='Plot this many points for each metric') + ap.add_argument('--n-times', + default=5, type=int, + help="Time performance over n_times trials") + args = ap.parse_args() + + if args.plot is not None: + max_val = getattr(args, args.plot) + if args.plot in ('classes', 'samples'): + min_val = 2 + else: + min_val = 0 + steps = np.linspace(min_val, max_val, num=args.n_steps + 1)[1:] + if args.plot in ('classes', 'samples'): + steps = np.unique(np.round(steps).astype(int)) + setattr(args, args.plot, steps) + + if args.metrics is None: + args.metrics = sorted(METRICS) + if args.formats is None: + args.formats = sorted(FORMATS) + + results = benchmark([METRICS[k] for k in args.metrics], + [FORMATS[k] for k in args.formats], + args.samples, args.classes, args.density, + args.n_times) + + _tabulate(results, args.metrics, args.formats) + + if args.plot is not None: + print('Displaying plot', file=sys.stderr) + title = ('Multilabel metrics with %s' % + ', '.join('{0}={1}'.format(field, getattr(args, field)) + for field in ['samples', 'classes', 'density'] + if args.plot != field)) + _plot(results, args.metrics, args.formats, title, steps, args.plot) diff --git a/benchmarks/bench_plot_approximate_neighbors.py b/benchmarks/bench_plot_approximate_neighbors.py new file mode 100644 index 0000000..fc8d394 --- /dev/null +++ b/benchmarks/bench_plot_approximate_neighbors.py @@ -0,0 +1,167 @@ +""" +Benchmark for approximate nearest neighbor search using +locality sensitive hashing forest. + +There are two types of benchmarks. + +First, accuracy of LSHForest queries are measured for various +hyper-parameters and index sizes. + +Second, speed up of LSHForest queries compared to brute force +method in exact nearest neighbors is measures for the +aforementioned settings. In general, speed up is increasing as +the index size grows. 
+""" + +from __future__ import division + +import numpy as np +from tempfile import gettempdir +from time import time + +from sklearn.neighbors import NearestNeighbors +from sklearn.neighbors.approximate import LSHForest +from sklearn.datasets import make_blobs +from sklearn.externals.joblib import Memory + +m = Memory(cachedir=gettempdir()) + + +@m.cache() +def make_data(n_samples, n_features, n_queries, random_state=0): + """Create index and query data.""" + print('Generating random blob-ish data') + X, _ = make_blobs(n_samples=n_samples + n_queries, + n_features=n_features, centers=100, + shuffle=True, random_state=random_state) + + # Keep the last samples as held out query vectors: note since we used + # shuffle=True we have ensured that index and query vectors are + # samples from the same distribution (a mixture of 100 gaussians in this + # case) + return X[:n_samples], X[n_samples:] + + +def calc_exact_neighbors(X, queries, n_queries, n_neighbors): + """Measures average times for exact neighbor queries.""" + print ('Building NearestNeighbors for %d samples in %d dimensions' % + (X.shape[0], X.shape[1])) + nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X) + average_time = 0 + + t0 = time() + neighbors = nbrs.kneighbors(queries, n_neighbors=n_neighbors, + return_distance=False) + average_time = (time() - t0) / n_queries + return neighbors, average_time + + +def calc_accuracy(X, queries, n_queries, n_neighbors, exact_neighbors, + average_time_exact, **lshf_params): + """Calculates accuracy and the speed up of LSHForest.""" + print('Building LSHForest for %d samples in %d dimensions' % + (X.shape[0], X.shape[1])) + lshf = LSHForest(**lshf_params) + t0 = time() + lshf.fit(X) + lshf_build_time = time() - t0 + print('Done in %0.3fs' % lshf_build_time) + + accuracy = 0 + + t0 = time() + approx_neighbors = lshf.kneighbors(queries, n_neighbors=n_neighbors, + return_distance=False) + average_time_approx = (time() - t0) / n_queries + + for i in range(len(queries)): + accuracy += np.in1d(approx_neighbors[i], exact_neighbors[i]).mean() + + accuracy /= n_queries + speed_up = average_time_exact / average_time_approx + + print('Average time for lshf neighbor queries: %0.3fs' % + average_time_approx) + print ('Average time for exact neighbor queries: %0.3fs' % + average_time_exact) + print ('Average Accuracy : %0.2f' % accuracy) + print ('Speed up: %0.1fx' % speed_up) + + return speed_up, accuracy + + +if __name__ == '__main__': + import matplotlib.pyplot as plt + # Initialize index sizes + n_samples = [int(1e3), int(1e4), int(1e5), int(1e6)] + n_features = int(1e2) + n_queries = 100 + n_neighbors = 10 + + X_index, X_query = make_data(np.max(n_samples), n_features, n_queries, + random_state=0) + + params_list = [{'n_estimators': 3, 'n_candidates': 50}, + {'n_estimators': 5, 'n_candidates': 70}, + {'n_estimators': 10, 'n_candidates': 100}] + + accuracies = np.zeros((len(n_samples), len(params_list)), dtype=float) + speed_ups = np.zeros((len(n_samples), len(params_list)), dtype=float) + + for i, sample_size in enumerate(n_samples): + print ('==========================================================') + print ('Sample size: %i' % sample_size) + print ('------------------------') + exact_neighbors, average_time_exact = calc_exact_neighbors( + X_index[:sample_size], X_query, n_queries, n_neighbors) + for j, params in enumerate(params_list): + print ('LSHF parameters: n_estimators = %i, n_candidates = %i' % + (params['n_estimators'], params['n_candidates'])) + speed_ups[i, j], 
accuracies[i, j] = calc_accuracy( + X_index[:sample_size], X_query, n_queries, n_neighbors, + exact_neighbors, average_time_exact, random_state=0, **params) + print ('') + print ('==========================================================') + + # Set labels for LSHForest parameters + colors = ['c', 'm', 'y'] + legend_rects = [plt.Rectangle((0, 0), 0.1, 0.1, fc=color) + for color in colors] + + legend_labels = ['n_estimators={n_estimators}, ' + 'n_candidates={n_candidates}'.format(**p) + for p in params_list] + + # Plot precision + plt.figure() + plt.legend(legend_rects, legend_labels, + loc='upper left') + + for i in range(len(params_list)): + plt.scatter(n_samples, accuracies[:, i], c=colors[i]) + plt.plot(n_samples, accuracies[:, i], c=colors[i]) + plt.ylim([0, 1.3]) + plt.xlim(np.min(n_samples), np.max(n_samples)) + plt.semilogx() + plt.ylabel("Precision@10") + plt.xlabel("Index size") + plt.grid(which='both') + plt.title("Precision of first 10 neighbors with index size") + + # Plot speed up + plt.figure() + plt.legend(legend_rects, legend_labels, + loc='upper left') + + for i in range(len(params_list)): + plt.scatter(n_samples, speed_ups[:, i], c=colors[i]) + plt.plot(n_samples, speed_ups[:, i], c=colors[i]) + plt.ylim(0, np.max(speed_ups)) + plt.xlim(np.min(n_samples), np.max(n_samples)) + plt.semilogx() + plt.ylabel("Speed up") + plt.xlabel("Index size") + plt.grid(which='both') + plt.title("Relationship between Speed up and index size") + + plt.show() diff --git a/benchmarks/bench_plot_fastkmeans.py b/benchmarks/bench_plot_fastkmeans.py new file mode 100644 index 0000000..865cdb5 --- /dev/null +++ b/benchmarks/bench_plot_fastkmeans.py @@ -0,0 +1,138 @@ +from __future__ import print_function + +from collections import defaultdict +from time import time + +import numpy as np +from numpy import random as nr + +from sklearn.cluster.k_means_ import KMeans, MiniBatchKMeans + + +def compute_bench(samples_range, features_range): + + it = 0 + results = defaultdict(lambda: []) + chunk = 100 + + max_it = len(samples_range) * len(features_range) + for n_samples in samples_range: + for n_features in features_range: + it += 1 + print('==============================') + print('Iteration %03d of %03d' % (it, max_it)) + print('==============================') + print() + data = nr.randint(-50, 51, (n_samples, n_features)) + + print('K-Means') + tstart = time() + kmeans = KMeans(init='k-means++', n_clusters=10).fit(data) + + delta = time() - tstart + print("Speed: %0.3fs" % delta) + print("Inertia: %0.5f" % kmeans.inertia_) + print() + + results['kmeans_speed'].append(delta) + results['kmeans_quality'].append(kmeans.inertia_) + + print('Fast K-Means') + # let's prepare the data in small chunks + mbkmeans = MiniBatchKMeans(init='k-means++', + n_clusters=10, + batch_size=chunk) + tstart = time() + mbkmeans.fit(data) + delta = time() - tstart + print("Speed: %0.3fs" % delta) + print("Inertia: %f" % mbkmeans.inertia_) + print() + print() + + results['MiniBatchKMeans Speed'].append(delta) + results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_) + + return results + + +def compute_bench_2(chunks): + results = defaultdict(lambda: []) + n_features = 50000 + means = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], + [0.5, 0.5], [0.75, -0.5], [-1, 0.75], [1, 0]]) + X = np.empty((0, 2)) + for i in range(8): + X = np.r_[X, means[i] + 0.8 * np.random.randn(n_features, 2)] + max_it = len(chunks) + it = 0 + for chunk in chunks: + it += 1 + print('==============================') + print('Iteration %03d of %03d' 
% (it, max_it)) + print('==============================') + print() + + print('Fast K-Means') + tstart = time() + mbkmeans = MiniBatchKMeans(init='k-means++', + n_clusters=8, + batch_size=chunk) + + mbkmeans.fit(X) + delta = time() - tstart + print("Speed: %0.3fs" % delta) + print("Inertia: %0.3fs" % mbkmeans.inertia_) + print() + + results['MiniBatchKMeans Speed'].append(delta) + results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_) + + return results + + +if __name__ == '__main__': + from mpl_toolkits.mplot3d import axes3d # register the 3d projection + import matplotlib.pyplot as plt + + samples_range = np.linspace(50, 150, 5).astype(np.int) + features_range = np.linspace(150, 50000, 5).astype(np.int) + chunks = np.linspace(500, 10000, 15).astype(np.int) + + results = compute_bench(samples_range, features_range) + results_2 = compute_bench_2(chunks) + + max_time = max([max(i) for i in [t for (label, t) in results.iteritems() + if "speed" in label]]) + max_inertia = max([max(i) for i in [ + t for (label, t) in results.iteritems() + if "speed" not in label]]) + + fig = plt.figure('scikit-learn K-Means benchmark results') + for c, (label, timings) in zip('brcy', + sorted(results.iteritems())): + if 'speed' in label: + ax = fig.add_subplot(2, 2, 1, projection='3d') + ax.set_zlim3d(0.0, max_time * 1.1) + else: + ax = fig.add_subplot(2, 2, 2, projection='3d') + ax.set_zlim3d(0.0, max_inertia * 1.1) + + X, Y = np.meshgrid(samples_range, features_range) + Z = np.asarray(timings).reshape(samples_range.shape[0], + features_range.shape[0]) + ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.5) + ax.set_xlabel('n_samples') + ax.set_ylabel('n_features') + + i = 0 + for c, (label, timings) in zip('br', + sorted(results_2.iteritems())): + i += 1 + ax = fig.add_subplot(2, 2, i + 2) + y = np.asarray(timings) + ax.plot(chunks, y, color=c, alpha=0.8) + ax.set_xlabel('Chunks') + ax.set_ylabel(label) + + plt.show() diff --git a/benchmarks/bench_plot_incremental_pca.py b/benchmarks/bench_plot_incremental_pca.py new file mode 100644 index 0000000..495d58f --- /dev/null +++ b/benchmarks/bench_plot_incremental_pca.py @@ -0,0 +1,156 @@ +""" +======================== +IncrementalPCA benchmark +======================== + +Benchmarks for IncrementalPCA + +""" + +import numpy as np +import gc +from time import time +from collections import defaultdict +import matplotlib.pyplot as plt +from sklearn.datasets import fetch_lfw_people +from sklearn.decomposition import IncrementalPCA, RandomizedPCA, PCA + + +def plot_results(X, y, label): + plt.plot(X, y, label=label, marker='o') + + +def benchmark(estimator, data): + gc.collect() + print("Benching %s" % estimator) + t0 = time() + estimator.fit(data) + training_time = time() - t0 + data_t = estimator.transform(data) + data_r = estimator.inverse_transform(data_t) + reconstruction_error = np.mean(np.abs(data - data_r)) + return {'time': training_time, 'error': reconstruction_error} + + +def plot_feature_times(all_times, batch_size, all_components, data): + plt.figure() + plot_results(all_components, all_times['pca'], label="PCA") + plot_results(all_components, all_times['ipca'], + label="IncrementalPCA, bsize=%i" % batch_size) + plot_results(all_components, all_times['rpca'], label="RandomizedPCA") + plt.legend(loc="upper left") + plt.suptitle("Algorithm runtime vs. 
n_components\n \ + LFW, size %i x %i" % data.shape) + plt.xlabel("Number of components (out of max %i)" % data.shape[1]) + plt.ylabel("Time (seconds)") + + +def plot_feature_errors(all_errors, batch_size, all_components, data): + plt.figure() + plot_results(all_components, all_errors['pca'], label="PCA") + plot_results(all_components, all_errors['ipca'], + label="IncrementalPCA, bsize=%i" % batch_size) + plot_results(all_components, all_errors['rpca'], label="RandomizedPCA") + plt.legend(loc="lower left") + plt.suptitle("Algorithm error vs. n_components\n" + "LFW, size %i x %i" % data.shape) + plt.xlabel("Number of components (out of max %i)" % data.shape[1]) + plt.ylabel("Mean absolute error") + + +def plot_batch_times(all_times, n_features, all_batch_sizes, data): + plt.figure() + plot_results(all_batch_sizes, all_times['pca'], label="PCA") + plot_results(all_batch_sizes, all_times['rpca'], label="RandomizedPCA") + plot_results(all_batch_sizes, all_times['ipca'], label="IncrementalPCA") + plt.legend(loc="lower left") + plt.suptitle("Algorithm runtime vs. batch_size for n_components %i\n \ + LFW, size %i x %i" % ( + n_features, data.shape[0], data.shape[1])) + plt.xlabel("Batch size") + plt.ylabel("Time (seconds)") + + +def plot_batch_errors(all_errors, n_features, all_batch_sizes, data): + plt.figure() + plot_results(all_batch_sizes, all_errors['pca'], label="PCA") + plot_results(all_batch_sizes, all_errors['ipca'], label="IncrementalPCA") + plt.legend(loc="lower left") + plt.suptitle("Algorithm error vs. batch_size for n_components %i\n \ + LFW, size %i x %i" % ( + n_features, data.shape[0], data.shape[1])) + plt.xlabel("Batch size") + plt.ylabel("Mean absolute error") + + +def fixed_batch_size_comparison(data): + all_features = [i.astype(int) for i in np.linspace(data.shape[1] // 10, + data.shape[1], num=5)] + batch_size = 1000 + # Compare runtimes and error for fixed batch size + all_times = defaultdict(list) + all_errors = defaultdict(list) + for n_components in all_features: + pca = PCA(n_components=n_components) + rpca = RandomizedPCA(n_components=n_components, random_state=1999) + ipca = IncrementalPCA(n_components=n_components, batch_size=batch_size) + results_dict = {k: benchmark(est, data) for k, est in [('pca', pca), + ('ipca', ipca), + ('rpca', rpca)]} + + for k in sorted(results_dict.keys()): + all_times[k].append(results_dict[k]['time']) + all_errors[k].append(results_dict[k]['error']) + + plot_feature_times(all_times, batch_size, all_features, data) + plot_feature_errors(all_errors, batch_size, all_features, data) + + +def variable_batch_size_comparison(data): + batch_sizes = [i.astype(int) for i in np.linspace(data.shape[0] // 10, + data.shape[0], num=10)] + + for n_components in [i.astype(int) for i in + np.linspace(data.shape[1] // 10, + data.shape[1], num=4)]: + all_times = defaultdict(list) + all_errors = defaultdict(list) + pca = PCA(n_components=n_components) + rpca = RandomizedPCA(n_components=n_components, random_state=1999) + results_dict = {k: benchmark(est, data) for k, est in [('pca', pca), + ('rpca', rpca)]} + + # Create flat baselines to compare the variation over batch size + all_times['pca'].extend([results_dict['pca']['time']] * + len(batch_sizes)) + all_errors['pca'].extend([results_dict['pca']['error']] * + len(batch_sizes)) + all_times['rpca'].extend([results_dict['rpca']['time']] * + len(batch_sizes)) + all_errors['rpca'].extend([results_dict['rpca']['error']] * + len(batch_sizes)) + for batch_size in batch_sizes: + ipca = 
IncrementalPCA(n_components=n_components, + batch_size=batch_size) + results_dict = {k: benchmark(est, data) for k, est in [('ipca', + ipca)]} + all_times['ipca'].append(results_dict['ipca']['time']) + all_errors['ipca'].append(results_dict['ipca']['error']) + + plot_batch_times(all_times, n_components, batch_sizes, data) + # RandomizedPCA error is always worse (approx 100x) than other PCA + # tests + plot_batch_errors(all_errors, n_components, batch_sizes, data) + +faces = fetch_lfw_people(resize=.2, min_faces_per_person=5) +# limit dataset to 5000 people (don't care who they are!) +X = faces.data[:5000] +n_samples, h, w = faces.images.shape +n_features = X.shape[1] + +X -= X.mean(axis=0) +X /= X.std(axis=0) + +fixed_batch_size_comparison(X) +variable_batch_size_comparison(X) +plt.show() diff --git a/benchmarks/bench_plot_lasso_path.py b/benchmarks/bench_plot_lasso_path.py new file mode 100644 index 0000000..60a7601 --- /dev/null +++ b/benchmarks/bench_plot_lasso_path.py @@ -0,0 +1,117 @@ +"""Benchmarks of Lasso regularization path computation using Lars and CD + +The input data is mostly low rank but is a fat infinite tail. +""" +from __future__ import print_function + +from collections import defaultdict +import gc +import sys +from time import time + +import numpy as np + +from sklearn.linear_model import lars_path +from sklearn.linear_model import lasso_path +from sklearn.datasets.samples_generator import make_regression + + +def compute_bench(samples_range, features_range): + + it = 0 + + results = defaultdict(lambda: []) + + max_it = len(samples_range) * len(features_range) + for n_samples in samples_range: + for n_features in features_range: + it += 1 + print('====================') + print('Iteration %03d of %03d' % (it, max_it)) + print('====================') + dataset_kwargs = { + 'n_samples': n_samples, + 'n_features': n_features, + 'n_informative': n_features / 10, + 'effective_rank': min(n_samples, n_features) / 10, + #'effective_rank': None, + 'bias': 0.0, + } + print("n_samples: %d" % n_samples) + print("n_features: %d" % n_features) + X, y = make_regression(**dataset_kwargs) + + gc.collect() + print("benchmarking lars_path (with Gram):", end='') + sys.stdout.flush() + tstart = time() + G = np.dot(X.T, X) # precomputed Gram matrix + Xy = np.dot(X.T, y) + lars_path(X, y, Xy=Xy, Gram=G, method='lasso') + delta = time() - tstart + print("%0.3fs" % delta) + results['lars_path (with Gram)'].append(delta) + + gc.collect() + print("benchmarking lars_path (without Gram):", end='') + sys.stdout.flush() + tstart = time() + lars_path(X, y, method='lasso') + delta = time() - tstart + print("%0.3fs" % delta) + results['lars_path (without Gram)'].append(delta) + + gc.collect() + print("benchmarking lasso_path (with Gram):", end='') + sys.stdout.flush() + tstart = time() + lasso_path(X, y, precompute=True) + delta = time() - tstart + print("%0.3fs" % delta) + results['lasso_path (with Gram)'].append(delta) + + gc.collect() + print("benchmarking lasso_path (without Gram):", end='') + sys.stdout.flush() + tstart = time() + lasso_path(X, y, precompute=False) + delta = time() - tstart + print("%0.3fs" % delta) + results['lasso_path (without Gram)'].append(delta) + + return results + + +if __name__ == '__main__': + from mpl_toolkits.mplot3d import axes3d # register the 3d projection + import matplotlib.pyplot as plt + + samples_range = np.linspace(10, 2000, 5).astype(np.int) + features_range = np.linspace(10, 2000, 5).astype(np.int) + results = compute_bench(samples_range, features_range) + + 
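# ----------------------------------------------------------------------
# Editorial sketch (not part of the imported benchmark file): the
# "with Gram" / "without Gram" timings collected above differ only in
# whether the Gram matrix G = X.T.dot(X) and the correlation vector
# Xy = X.T.dot(y) are precomputed and passed to lars_path, so the
# solver can skip those products internally.  The helper below, under
# those assumptions and on a small hypothetical problem, checks that
# both call styles recover the same coefficient path.  It is defined
# here for illustration only and is never called by the benchmark.
def _editorial_gram_equivalence_demo():
    import numpy as np
    from sklearn.linear_model import lars_path
    from sklearn.datasets.samples_generator import make_regression

    # Small illustrative regression problem.
    X_demo, y_demo = make_regression(n_samples=200, n_features=50,
                                     n_informative=5, random_state=0)
    G_demo = np.dot(X_demo.T, X_demo)
    Xy_demo = np.dot(X_demo.T, y_demo)
    _, _, coefs_plain = lars_path(X_demo, y_demo, method='lasso')
    _, _, coefs_gram = lars_path(X_demo, y_demo, Xy=Xy_demo, Gram=G_demo,
                                 method='lasso')
    # The two coefficient paths should agree up to numerical precision.
    assert np.allclose(coefs_plain, coefs_gram)
# ----------------------------------------------------------------------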
max_time = max(max(t) for t in results.values()) + + fig = plt.figure('scikit-learn Lasso path benchmark results') + i = 1 + for c, (label, timings) in zip('bcry', sorted(results.items())): + ax = fig.add_subplot(2, 2, i, projection='3d') + X, Y = np.meshgrid(samples_range, features_range) + Z = np.asarray(timings).reshape(samples_range.shape[0], + features_range.shape[0]) + + # plot the actual surface + ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.8) + + # dummy point plot to stick the legend to since surface plot do not + # support legends (yet?) + # ax.plot([1], [1], [1], color=c, label=label) + + ax.set_xlabel('n_samples') + ax.set_ylabel('n_features') + ax.set_zlabel('Time (s)') + ax.set_zlim3d(0.0, max_time * 1.1) + ax.set_title(label) + # ax.legend() + i += 1 + plt.show() diff --git a/benchmarks/bench_plot_neighbors.py b/benchmarks/bench_plot_neighbors.py new file mode 100644 index 0000000..19bd22a --- /dev/null +++ b/benchmarks/bench_plot_neighbors.py @@ -0,0 +1,185 @@ +""" +Plot the scaling of the nearest neighbors algorithms with k, D, and N +""" +from time import time + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import ticker + +from sklearn import neighbors, datasets + + +def get_data(N, D, dataset='dense'): + if dataset == 'dense': + np.random.seed(0) + return np.random.random((N, D)) + elif dataset == 'digits': + X = datasets.load_digits().data + i = np.argsort(X[0])[::-1] + X = X[:, i] + return X[:N, :D] + else: + raise ValueError("invalid dataset: %s" % dataset) + + +def barplot_neighbors(Nrange=2 ** np.arange(1, 11), + Drange=2 ** np.arange(7), + krange=2 ** np.arange(10), + N=1000, + D=64, + k=5, + leaf_size=30, + dataset='digits'): + algorithms = ('kd_tree', 'brute', 'ball_tree') + fiducial_values = {'N': N, + 'D': D, + 'k': k} + + #------------------------------------------------------------ + # varying N + N_results_build = dict([(alg, np.zeros(len(Nrange))) + for alg in algorithms]) + N_results_query = dict([(alg, np.zeros(len(Nrange))) + for alg in algorithms]) + + for i, NN in enumerate(Nrange): + print("N = %i (%i out of %i)" % (NN, i + 1, len(Nrange))) + X = get_data(NN, D, dataset) + for algorithm in algorithms: + nbrs = neighbors.NearestNeighbors(n_neighbors=min(NN, k), + algorithm=algorithm, + leaf_size=leaf_size) + t0 = time() + nbrs.fit(X) + t1 = time() + nbrs.kneighbors(X) + t2 = time() + + N_results_build[algorithm][i] = (t1 - t0) + N_results_query[algorithm][i] = (t2 - t1) + + #------------------------------------------------------------ + # varying D + D_results_build = dict([(alg, np.zeros(len(Drange))) + for alg in algorithms]) + D_results_query = dict([(alg, np.zeros(len(Drange))) + for alg in algorithms]) + + for i, DD in enumerate(Drange): + print("D = %i (%i out of %i)" % (DD, i + 1, len(Drange))) + X = get_data(N, DD, dataset) + for algorithm in algorithms: + nbrs = neighbors.NearestNeighbors(n_neighbors=k, + algorithm=algorithm, + leaf_size=leaf_size) + t0 = time() + nbrs.fit(X) + t1 = time() + nbrs.kneighbors(X) + t2 = time() + + D_results_build[algorithm][i] = (t1 - t0) + D_results_query[algorithm][i] = (t2 - t1) + + #------------------------------------------------------------ + # varying k + k_results_build = dict([(alg, np.zeros(len(krange))) + for alg in algorithms]) + k_results_query = dict([(alg, np.zeros(len(krange))) + for alg in algorithms]) + + X = get_data(N, DD, dataset) + + for i, kk in enumerate(krange): + print("k = %i (%i out of %i)" % (kk, i + 1, len(krange))) + for algorithm in 
algorithms: + nbrs = neighbors.NearestNeighbors(n_neighbors=kk, + algorithm=algorithm, + leaf_size=leaf_size) + t0 = time() + nbrs.fit(X) + t1 = time() + nbrs.kneighbors(X) + t2 = time() + + k_results_build[algorithm][i] = (t1 - t0) + k_results_query[algorithm][i] = (t2 - t1) + + plt.figure(figsize=(8, 11)) + + for (sbplt, vals, quantity, + build_time, query_time) in [(311, Nrange, 'N', + N_results_build, + N_results_query), + (312, Drange, 'D', + D_results_build, + D_results_query), + (313, krange, 'k', + k_results_build, + k_results_query)]: + ax = plt.subplot(sbplt, yscale='log') + plt.grid(True) + + tick_vals = [] + tick_labels = [] + + bottom = 10 ** np.min([min(np.floor(np.log10(build_time[alg]))) + for alg in algorithms]) + + for i, alg in enumerate(algorithms): + xvals = 0.1 + i * (1 + len(vals)) + np.arange(len(vals)) + width = 0.8 + + c_bar = plt.bar(xvals, build_time[alg] - bottom, + width, bottom, color='r') + q_bar = plt.bar(xvals, query_time[alg], + width, build_time[alg], color='b') + + tick_vals += list(xvals + 0.5 * width) + tick_labels += ['%i' % val for val in vals] + + plt.text((i + 0.02) / len(algorithms), 0.98, alg, + transform=ax.transAxes, + ha='left', + va='top', + bbox=dict(facecolor='w', edgecolor='w', alpha=0.5)) + + plt.ylabel('Time (s)') + + ax.xaxis.set_major_locator(ticker.FixedLocator(tick_vals)) + ax.xaxis.set_major_formatter(ticker.FixedFormatter(tick_labels)) + + for label in ax.get_xticklabels(): + label.set_rotation(-90) + label.set_fontsize(10) + + title_string = 'Varying %s' % quantity + + descr_string = '' + + for s in 'NDk': + if s == quantity: + pass + else: + descr_string += '%s = %i, ' % (s, fiducial_values[s]) + + descr_string = descr_string[:-2] + + plt.text(1.01, 0.5, title_string, + transform=ax.transAxes, rotation=-90, + ha='left', va='center', fontsize=20) + + plt.text(0.99, 0.5, descr_string, + transform=ax.transAxes, rotation=-90, + ha='right', va='center') + + plt.gcf().suptitle("%s data set" % dataset.capitalize(), fontsize=16) + + plt.figlegend((c_bar, q_bar), ('construction', 'N-point query'), + 'upper right') + +if __name__ == '__main__': + barplot_neighbors(dataset='digits') + barplot_neighbors(dataset='dense') + plt.show() diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py new file mode 100644 index 0000000..5d185dd --- /dev/null +++ b/benchmarks/bench_plot_nmf.py @@ -0,0 +1,166 @@ +""" +Benchmarks of Non-Negative Matrix Factorization +""" + +from __future__ import print_function + +from collections import defaultdict +import gc +from time import time + +import numpy as np +from scipy.linalg import norm + +from sklearn.decomposition.nmf import NMF, _initialize_nmf +from sklearn.datasets.samples_generator import make_low_rank_matrix +from sklearn.externals.six.moves import xrange + + +def alt_nnmf(V, r, max_iter=1000, tol=1e-3, init='random'): + """ + A, S = nnmf(X, r, tol=1e-3, R=None) + + Implement Lee & Seung's algorithm + + Parameters + ---------- + V : 2-ndarray, [n_samples, n_features] + input matrix + r : integer + number of latent features + max_iter : integer, optional + maximum number of iterations (default: 1000) + tol : double + tolerance threshold for early exit (when the update factor is within + tol of 1., the function exits) + init : string + Method used to initialize the procedure. 
+ + Returns + ------- + A : 2-ndarray, [n_samples, r] + Component part of the factorization + + S : 2-ndarray, [r, n_features] + Data part of the factorization + Reference + --------- + "Algorithms for Non-negative Matrix Factorization" + by Daniel D Lee, Sebastian H Seung + (available at http://citeseer.ist.psu.edu/lee01algorithms.html) + """ + # Nomenclature in the function follows Lee & Seung + eps = 1e-5 + n, m = V.shape + W, H = _initialize_nmf(V, r, init, random_state=0) + + for i in xrange(max_iter): + updateH = np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps) + H *= updateH + updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps) + W *= updateW + if i % 10 == 0: + max_update = max(updateW.max(), updateH.max()) + if abs(1. - max_update) < tol: + break + return W, H + + +def report(error, time): + print("Frobenius loss: %.5f" % error) + print("Took: %.2fs" % time) + print() + + +def benchmark(samples_range, features_range, rank=50, tolerance=1e-5): + timeset = defaultdict(lambda: []) + err = defaultdict(lambda: []) + + for n_samples in samples_range: + for n_features in features_range: + print("%2d samples, %2d features" % (n_samples, n_features)) + print('=======================') + X = np.abs(make_low_rank_matrix(n_samples, n_features, + effective_rank=rank, tail_strength=0.2)) + + gc.collect() + print("benchmarking nndsvd-nmf: ") + tstart = time() + m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X) + tend = time() - tstart + timeset['nndsvd-nmf'].append(tend) + err['nndsvd-nmf'].append(m.reconstruction_err_) + report(m.reconstruction_err_, tend) + + gc.collect() + print("benchmarking nndsvda-nmf: ") + tstart = time() + m = NMF(n_components=30, init='nndsvda', + tol=tolerance).fit(X) + tend = time() - tstart + timeset['nndsvda-nmf'].append(tend) + err['nndsvda-nmf'].append(m.reconstruction_err_) + report(m.reconstruction_err_, tend) + + gc.collect() + print("benchmarking nndsvdar-nmf: ") + tstart = time() + m = NMF(n_components=30, init='nndsvdar', + tol=tolerance).fit(X) + tend = time() - tstart + timeset['nndsvdar-nmf'].append(tend) + err['nndsvdar-nmf'].append(m.reconstruction_err_) + report(m.reconstruction_err_, tend) + + gc.collect() + print("benchmarking random-nmf") + tstart = time() + m = NMF(n_components=30, init='random', max_iter=1000, + tol=tolerance).fit(X) + tend = time() - tstart + timeset['random-nmf'].append(tend) + err['random-nmf'].append(m.reconstruction_err_) + report(m.reconstruction_err_, tend) + + gc.collect() + print("benchmarking alt-random-nmf") + tstart = time() + W, H = alt_nnmf(X, r=30, init='random', tol=tolerance) + tend = time() - tstart + timeset['alt-random-nmf'].append(tend) + err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H))) + report(norm(X - np.dot(W, H)), tend) + + return timeset, err + + +if __name__ == '__main__': + from mpl_toolkits.mplot3d import axes3d # register the 3d projection + axes3d + import matplotlib.pyplot as plt + + samples_range = np.linspace(50, 500, 3).astype(np.int) + features_range = np.linspace(50, 500, 3).astype(np.int) + timeset, err = benchmark(samples_range, features_range) + + for i, results in enumerate((timeset, err)): + fig = plt.figure('scikit-learn Non-Negative Matrix Factorization' + 'benchmark results') + ax = fig.gca(projection='3d') + for c, (label, timings) in zip('rbgcm', sorted(results.iteritems())): + X, Y = np.meshgrid(samples_range, features_range) + Z = np.asarray(timings).reshape(samples_range.shape[0], + features_range.shape[0]) + # plot the actual surface + 
ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3, + color=c) + # dummy point plot to stick the legend to since surface plot do not + # support legends (yet?) + ax.plot([1], [1], [1], color=c, label=label) + + ax.set_xlabel('n_samples') + ax.set_ylabel('n_features') + zlabel = 'Time (s)' if i == 0 else 'reconstruction error' + ax.set_zlabel(zlabel) + ax.legend() + plt.show() diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py new file mode 100644 index 0000000..58d350a --- /dev/null +++ b/benchmarks/bench_plot_omp_lars.py @@ -0,0 +1,123 @@ +"""Benchmarks of orthogonal matching pursuit (:ref:`OMP`) versus least angle +regression (:ref:`least_angle_regression`) + +The input data is mostly low rank but is a fat infinite tail. +""" +from __future__ import print_function + +import gc +import sys +from time import time + +import numpy as np + +from sklearn.linear_model import lars_path, orthogonal_mp +from sklearn.datasets.samples_generator import make_sparse_coded_signal + + +def compute_bench(samples_range, features_range): + + it = 0 + + results = dict() + lars = np.empty((len(features_range), len(samples_range))) + lars_gram = lars.copy() + omp = lars.copy() + omp_gram = lars.copy() + + max_it = len(samples_range) * len(features_range) + for i_s, n_samples in enumerate(samples_range): + for i_f, n_features in enumerate(features_range): + it += 1 + n_informative = n_features / 10 + print('====================') + print('Iteration %03d of %03d' % (it, max_it)) + print('====================') + # dataset_kwargs = { + # 'n_train_samples': n_samples, + # 'n_test_samples': 2, + # 'n_features': n_features, + # 'n_informative': n_informative, + # 'effective_rank': min(n_samples, n_features) / 10, + # #'effective_rank': None, + # 'bias': 0.0, + # } + dataset_kwargs = { + 'n_samples': 1, + 'n_components': n_features, + 'n_features': n_samples, + 'n_nonzero_coefs': n_informative, + 'random_state': 0 + } + print("n_samples: %d" % n_samples) + print("n_features: %d" % n_features) + y, X, _ = make_sparse_coded_signal(**dataset_kwargs) + X = np.asfortranarray(X) + + gc.collect() + print("benchmarking lars_path (with Gram):", end='') + sys.stdout.flush() + tstart = time() + G = np.dot(X.T, X) # precomputed Gram matrix + Xy = np.dot(X.T, y) + lars_path(X, y, Xy=Xy, Gram=G, max_iter=n_informative) + delta = time() - tstart + print("%0.3fs" % delta) + lars_gram[i_f, i_s] = delta + + gc.collect() + print("benchmarking lars_path (without Gram):", end='') + sys.stdout.flush() + tstart = time() + lars_path(X, y, Gram=None, max_iter=n_informative) + delta = time() - tstart + print("%0.3fs" % delta) + lars[i_f, i_s] = delta + + gc.collect() + print("benchmarking orthogonal_mp (with Gram):", end='') + sys.stdout.flush() + tstart = time() + orthogonal_mp(X, y, precompute=True, + n_nonzero_coefs=n_informative) + delta = time() - tstart + print("%0.3fs" % delta) + omp_gram[i_f, i_s] = delta + + gc.collect() + print("benchmarking orthogonal_mp (without Gram):", end='') + sys.stdout.flush() + tstart = time() + orthogonal_mp(X, y, precompute=False, + n_nonzero_coefs=n_informative) + delta = time() - tstart + print("%0.3fs" % delta) + omp[i_f, i_s] = delta + + results['time(LARS) / time(OMP)\n (w/ Gram)'] = (lars_gram / omp_gram) + results['time(LARS) / time(OMP)\n (w/o Gram)'] = (lars / omp) + return results + + +if __name__ == '__main__': + samples_range = np.linspace(1000, 5000, 5).astype(np.int) + features_range = np.linspace(1000, 5000, 5).astype(np.int) + results = 
compute_bench(samples_range, features_range) + max_time = max(np.max(t) for t in results.values()) + + import matplotlib.pyplot as plt + fig = plt.figure('scikit-learn OMP vs. LARS benchmark results') + for i, (label, timings) in enumerate(sorted(results.iteritems())): + ax = fig.add_subplot(1, 2, i+1) + vmax = max(1 - timings.min(), -1 + timings.max()) + plt.matshow(timings, fignum=False, vmin=1 - vmax, vmax=1 + vmax) + ax.set_xticklabels([''] + map(str, samples_range)) + ax.set_yticklabels([''] + map(str, features_range)) + plt.xlabel('n_samples') + plt.ylabel('n_features') + plt.title(label) + + plt.subplots_adjust(0.1, 0.08, 0.96, 0.98, 0.4, 0.63) + ax = plt.axes([0.1, 0.08, 0.8, 0.06]) + plt.colorbar(cax=ax, orientation='horizontal') + plt.show() diff --git a/benchmarks/bench_plot_parallel_pairwise.py b/benchmarks/bench_plot_parallel_pairwise.py new file mode 100644 index 0000000..0fed069 --- /dev/null +++ b/benchmarks/bench_plot_parallel_pairwise.py @@ -0,0 +1,46 @@ +# Author: Mathieu Blondel +# License: BSD 3 clause +import time + +import matplotlib.pyplot as plt + +from sklearn.utils import check_random_state +from sklearn.metrics.pairwise import pairwise_distances +from sklearn.metrics.pairwise import pairwise_kernels + +def plot(func): + random_state = check_random_state(0) + one_core = [] + multi_core = [] + sample_sizes = range(1000, 6000, 1000) + + for n_samples in sample_sizes: + X = random_state.rand(n_samples, 300) + + start = time.time() + func(X, n_jobs=1) + one_core.append(time.time() - start) + + start = time.time() + func(X, n_jobs=-1) + multi_core.append(time.time() - start) + + plt.figure('scikit-learn parallel %s benchmark results' % func.__name__) + plt.plot(sample_sizes, one_core, label="one core") + plt.plot(sample_sizes, multi_core, label="multi core") + plt.xlabel('n_samples') + plt.ylabel('Time (s)') + plt.title('Parallel %s' % func.__name__) + plt.legend() + + +def euclidean_distances(X, n_jobs): + return pairwise_distances(X, metric="euclidean", n_jobs=n_jobs) + + +def rbf_kernels(X, n_jobs): + return pairwise_kernels(X, metric="rbf", n_jobs=n_jobs, gamma=0.1) + +plot(euclidean_distances) +plot(rbf_kernels) +plt.show() diff --git a/benchmarks/bench_plot_randomized_svd.py b/benchmarks/bench_plot_randomized_svd.py new file mode 100644 index 0000000..e4c2f63 --- /dev/null +++ b/benchmarks/bench_plot_randomized_svd.py @@ -0,0 +1,455 @@ +""" +Benchmarks on the power iterations phase in randomized SVD. + +We test on various synthetic and real datasets the effect of increasing +the number of power iterations in terms of quality of approximation +and running time. A number greater than 0 should help with noisy matrices, +which are characterized by a slow spectral decay. + +We test several policy for normalizing the power iterations. Normalization +is crucial to avoid numerical issues. + +The quality of the approximation is measured by the spectral norm discrepancy +between the original input matrix and the reconstructed one (by multiplying +the randomized_svd's outputs). The spectral norm is always equivalent to the +largest singular value of a matrix. (3) justifies this choice. However, one can +notice in these experiments that Frobenius and spectral norms behave +very similarly in a qualitative sense. Therefore, we suggest to run these +benchmarks with `enable_spectral_norm = False`, as Frobenius' is MUCH faster to +compute. + +The benchmarks follow. 
+ +(a) plot: time vs norm, varying number of power iterations + data: many datasets + goal: compare normalization policies and study how the number of power + iterations affect time and norm + +(b) plot: n_iter vs norm, varying rank of data and number of components for + randomized_SVD + data: low-rank matrices on which we control the rank + goal: study whether the rank of the matrix and the number of components + extracted by randomized SVD affect "the optimal" number of power iterations + +(c) plot: time vs norm, varing datasets + data: many datasets + goal: compare default configurations + +We compare the following algorithms: +- randomized_svd(..., power_iteration_normalizer='none') +- randomized_svd(..., power_iteration_normalizer='LU') +- randomized_svd(..., power_iteration_normalizer='QR') +- randomized_svd(..., power_iteration_normalizer='auto') +- fbpca.pca() from https://github.com/facebook/fbpca (if installed) + +Conclusion +---------- +- n_iter=2 appears to be a good default value +- power_iteration_normalizer='none' is OK if n_iter is small, otherwise LU + gives similar errors to QR but is cheaper. That's what 'auto' implements. + +References +---------- +(1) Finding structure with randomness: Stochastic algorithms for constructing + approximate matrix decompositions + Halko, et al., 2009 http://arxiv.org/abs/arXiv:0909.4061 + +(2) A randomized algorithm for the decomposition of matrices + Per-Gunnar Martinsson, Vladimir Rokhlin and Mark Tygert + +(3) An implementation of a randomized algorithm for principal component + analysis + A. Szlam et al. 2014 +""" + +# Author: Giorgio Patrini + +import numpy as np +import scipy as sp +import matplotlib.pyplot as plt + +import gc +import pickle +from time import time +from collections import defaultdict +import os.path + +from sklearn.utils import gen_batches +from sklearn.utils.validation import check_random_state +from sklearn.utils.extmath import randomized_svd +from sklearn.datasets.samples_generator import (make_low_rank_matrix, + make_sparse_uncorrelated) +from sklearn.datasets import (fetch_lfw_people, + fetch_mldata, + fetch_20newsgroups_vectorized, + fetch_olivetti_faces, + fetch_rcv1) + +try: + import fbpca + fbpca_available = True +except ImportError: + fbpca_available = False + +# If this is enabled, tests are much slower and will crash with the large data +enable_spectral_norm = False + +# TODO: compute approximate spectral norms with the power method as in +# Estimating the largest eigenvalues by the power and Lanczos methods with +# a random start, Jacek Kuczynski and Henryk Wozniakowski, SIAM Journal on +# Matrix Analysis and Applications, 13 (4): 1094-1122, 1992. +# This approximation is a very fast estimate of the spectral norm, but depends +# on starting random vectors. 
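# ----------------------------------------------------------------------
# Editorial sketch (not part of the imported file): a minimal,
# self-contained illustration of the quantity this benchmark studies,
# namely the normalized Frobenius discrepancy of the low-rank
# reconstruction returned by randomized_svd, as a function of the
# number of power iterations and of the normalization policy.  Sizes
# and parameter values below are illustrative only; the function is
# defined here for reference and is never called by the benchmark.
def _editorial_power_iteration_demo():
    import numpy as np
    from sklearn.utils.extmath import randomized_svd
    from sklearn.datasets.samples_generator import make_low_rank_matrix

    # A matrix with a controlled effective rank and a noisy tail.
    X = make_low_rank_matrix(n_samples=300, n_features=400,
                             effective_rank=30, tail_strength=.5,
                             random_state=0)
    x_norm = np.linalg.norm(X, 'fro')
    for n_iter in (0, 2, 5):
        for normalizer in ('none', 'LU', 'QR'):
            U, s, V = randomized_svd(X, 20, n_iter=n_iter,
                                     power_iteration_normalizer=normalizer,
                                     random_state=0)
            # Frobenius discrepancy of the rank-20 reconstruction.
            err = np.linalg.norm(X - U.dot(np.diag(s)).dot(V), 'fro')
            print('n_iter=%d, normalizer=%-4s: relative error %.4f'
                  % (n_iter, normalizer, err / x_norm))
# ----------------------------------------------------------------------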
+ +# Determine when to switch to batch computation for matrix norms, +# in case the reconstructed (dense) matrix is too large +MAX_MEMORY = np.int(2e9) + +# The following datasets can be dowloaded manually from: +# CIFAR 10: http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz +# SVHN: http://ufldl.stanford.edu/housenumbers/train_32x32.mat +CIFAR_FOLDER = "./cifar-10-batches-py/" +SVHN_FOLDER = "./SVHN/" + +datasets = ['low rank matrix', 'lfw_people', 'olivetti_faces', '20newsgroups', + 'MNIST original', 'CIFAR', 'a1a', 'SVHN', 'uncorrelated matrix'] + +big_sparse_datasets = ['big sparse matrix', 'rcv1'] + + +def unpickle(file_name): + with open(file_name, 'rb') as fo: + return pickle.load(fo, encoding='latin1')["data"] + + +def handle_missing_dataset(file_folder): + if not os.path.isdir(file_folder): + print("%s file folder not found. Test skipped." % file_folder) + return 0 + + +def get_data(dataset_name): + print("Getting dataset: %s" % dataset_name) + + if dataset_name == 'lfw_people': + X = fetch_lfw_people().data + elif dataset_name == '20newsgroups': + X = fetch_20newsgroups_vectorized().data[:, :100000] + elif dataset_name == 'olivetti_faces': + X = fetch_olivetti_faces().data + elif dataset_name == 'rcv1': + X = fetch_rcv1().data + elif dataset_name == 'CIFAR': + if handle_missing_dataset(CIFAR_FOLDER) == "skip": + return + X1 = [unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, i + 1)) + for i in range(5)] + X = np.vstack(X1) + del X1 + elif dataset_name == 'SVHN': + if handle_missing_dataset(SVHN_FOLDER) == 0: + return + X1 = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)['X'] + X2 = [X1[:, :, :, i].reshape(32 * 32 * 3) for i in range(X1.shape[3])] + X = np.vstack(X2) + del X1 + del X2 + elif dataset_name == 'low rank matrix': + X = make_low_rank_matrix(n_samples=500, n_features=np.int(1e4), + effective_rank=100, tail_strength=.5, + random_state=random_state) + elif dataset_name == 'uncorrelated matrix': + X, _ = make_sparse_uncorrelated(n_samples=500, n_features=10000, + random_state=random_state) + elif dataset_name == 'big sparse matrix': + sparsity = np.int(1e6) + size = np.int(1e6) + small_size = np.int(1e4) + data = np.random.normal(0, 1, np.int(sparsity/10)) + data = np.repeat(data, 10) + row = np.random.uniform(0, small_size, sparsity) + col = np.random.uniform(0, small_size, sparsity) + X = sp.sparse.csr_matrix((data, (row, col)), shape=(size, small_size)) + del data + del row + del col + else: + X = fetch_mldata(dataset_name).data + return X + + +def plot_time_vs_s(time, norm, point_labels, title): + plt.figure() + colors = ['g', 'b', 'y'] + for i, l in enumerate(sorted(norm.keys())): + if l is not "fbpca": + plt.plot(time[l], norm[l], label=l, marker='o', c=colors.pop()) + else: + plt.plot(time[l], norm[l], label=l, marker='^', c='red') + + for label, x, y in zip(point_labels, list(time[l]), list(norm[l])): + plt.annotate(label, xy=(x, y), xytext=(0, -20), + textcoords='offset points', ha='right', va='bottom') + plt.legend(loc="upper right") + plt.suptitle(title) + plt.ylabel("norm discrepancy") + plt.xlabel("running time [s]") + + +def scatter_time_vs_s(time, norm, point_labels, title): + plt.figure() + size = 100 + for i, l in enumerate(sorted(norm.keys())): + if l is not "fbpca": + plt.scatter(time[l], norm[l], label=l, marker='o', c='b', s=size) + for label, x, y in zip(point_labels, list(time[l]), list(norm[l])): + plt.annotate(label, xy=(x, y), xytext=(0, -80), + textcoords='offset points', ha='right', + arrowprops=dict(arrowstyle="->", + connectionstyle="arc3"), 
+ va='bottom', size=11, rotation=90) + else: + plt.scatter(time[l], norm[l], label=l, marker='^', c='red', s=size) + for label, x, y in zip(point_labels, list(time[l]), list(norm[l])): + plt.annotate(label, xy=(x, y), xytext=(0, 30), + textcoords='offset points', ha='right', + arrowprops=dict(arrowstyle="->", + connectionstyle="arc3"), + va='bottom', size=11, rotation=90) + + plt.legend(loc="best") + plt.suptitle(title) + plt.ylabel("norm discrepancy") + plt.xlabel("running time [s]") + + +def plot_power_iter_vs_s(power_iter, s, title): + plt.figure() + for l in sorted(s.keys()): + plt.plot(power_iter, s[l], label=l, marker='o') + plt.legend(loc="lower right", prop={'size': 10}) + plt.suptitle(title) + plt.ylabel("norm discrepancy") + plt.xlabel("n_iter") + + +def svd_timing(X, n_comps, n_iter, n_oversamples, + power_iteration_normalizer='auto', method=None): + """ + Measure time for decomposition + """ + print("... running SVD ...") + if method is not 'fbpca': + gc.collect() + t0 = time() + U, mu, V = randomized_svd(X, n_comps, n_oversamples, n_iter, + power_iteration_normalizer, + random_state=random_state, transpose=False) + call_time = time() - t0 + else: + gc.collect() + t0 = time() + # There is a different convention for l here + U, mu, V = fbpca.pca(X, n_comps, raw=True, n_iter=n_iter, + l=n_oversamples+n_comps) + call_time = time() - t0 + + return U, mu, V, call_time + + +def norm_diff(A, norm=2, msg=True): + """ + Compute the norm diff with the original matrix, when randomized + SVD is called with *params. + + norm: 2 => spectral; 'fro' => Frobenius + """ + + if msg: + print("... computing %s norm ..." % norm) + if norm == 2: + # s = sp.linalg.norm(A, ord=2) # slow + value = sp.sparse.linalg.svds(A, k=1, return_singular_vectors=False) + else: + if sp.sparse.issparse(A): + value = sp.sparse.linalg.norm(A, ord=norm) + else: + value = sp.linalg.norm(A, ord=norm) + return value + + +def scalable_frobenius_norm_discrepancy(X, U, s, V): + # if the input is not too big, just call scipy + if X.shape[0] * X.shape[1] < MAX_MEMORY: + A = X - U.dot(np.diag(s).dot(V)) + return norm_diff(A, norm='fro') + + print("... 
computing fro norm by batches...") + batch_size = 1000 + Vhat = np.diag(s).dot(V) + cum_norm = .0 + for batch in gen_batches(X.shape[0], batch_size): + M = X[batch, :] - U[batch, :].dot(Vhat) + cum_norm += norm_diff(M, norm='fro', msg=False) + return np.sqrt(cum_norm) + + +def bench_a(X, dataset_name, power_iter, n_oversamples, n_comps): + + all_time = defaultdict(list) + if enable_spectral_norm: + all_spectral = defaultdict(list) + X_spectral_norm = norm_diff(X, norm=2, msg=False) + all_frobenius = defaultdict(list) + X_fro_norm = norm_diff(X, norm='fro', msg=False) + + for pi in power_iter: + for pm in ['none', 'LU', 'QR']: + print("n_iter = %d on sklearn - %s" % (pi, pm)) + U, s, V, time = svd_timing(X, n_comps, n_iter=pi, + power_iteration_normalizer=pm, + n_oversamples=n_oversamples) + label = "sklearn - %s" % pm + all_time[label].append(time) + if enable_spectral_norm: + A = U.dot(np.diag(s).dot(V)) + all_spectral[label].append(norm_diff(X - A, norm=2) / + X_spectral_norm) + f = scalable_frobenius_norm_discrepancy(X, U, s, V) + all_frobenius[label].append(f / X_fro_norm) + + if fbpca_available: + print("n_iter = %d on fbca" % (pi)) + U, s, V, time = svd_timing(X, n_comps, n_iter=pi, + power_iteration_normalizer=pm, + n_oversamples=n_oversamples, + method='fbpca') + label = "fbpca" + all_time[label].append(time) + if enable_spectral_norm: + A = U.dot(np.diag(s).dot(V)) + all_spectral[label].append(norm_diff(X - A, norm=2) / + X_spectral_norm) + f = scalable_frobenius_norm_discrepancy(X, U, s, V) + all_frobenius[label].append(f / X_fro_norm) + + if enable_spectral_norm: + title = "%s: spectral norm diff vs running time" % (dataset_name) + plot_time_vs_s(all_time, all_spectral, power_iter, title) + title = "%s: Frobenius norm diff vs running time" % (dataset_name) + plot_time_vs_s(all_time, all_frobenius, power_iter, title) + + +def bench_b(power_list): + + n_samples, n_features = 1000, 10000 + data_params = {'n_samples': n_samples, 'n_features': n_features, + 'tail_strength': .7, 'random_state': random_state} + dataset_name = "low rank matrix %d x %d" % (n_samples, n_features) + ranks = [10, 50, 100] + + if enable_spectral_norm: + all_spectral = defaultdict(list) + all_frobenius = defaultdict(list) + for rank in ranks: + X = make_low_rank_matrix(effective_rank=rank, **data_params) + if enable_spectral_norm: + X_spectral_norm = norm_diff(X, norm=2, msg=False) + X_fro_norm = norm_diff(X, norm='fro', msg=False) + + for n_comp in [np.int(rank/2), rank, rank*2]: + label = "rank=%d, n_comp=%d" % (rank, n_comp) + print(label) + for pi in power_list: + U, s, V, _ = svd_timing(X, n_comp, n_iter=pi, n_oversamples=2, + power_iteration_normalizer='LU') + if enable_spectral_norm: + A = U.dot(np.diag(s).dot(V)) + all_spectral[label].append(norm_diff(X - A, norm=2) / + X_spectral_norm) + f = scalable_frobenius_norm_discrepancy(X, U, s, V) + all_frobenius[label].append(f / X_fro_norm) + + if enable_spectral_norm: + title = "%s: spectral norm diff vs n power iteration" % (dataset_name) + plot_power_iter_vs_s(power_iter, all_spectral, title) + title = "%s: Frobenius norm diff vs n power iteration" % (dataset_name) + plot_power_iter_vs_s(power_iter, all_frobenius, title) + + +def bench_c(datasets, n_comps): + all_time = defaultdict(list) + if enable_spectral_norm: + all_spectral = defaultdict(list) + all_frobenius = defaultdict(list) + + for dataset_name in datasets: + X = get_data(dataset_name) + if X is None: + continue + + if enable_spectral_norm: + X_spectral_norm = norm_diff(X, norm=2, msg=False) + 
X_fro_norm = norm_diff(X, norm='fro', msg=False) + n_comps = np.minimum(n_comps, np.min(X.shape)) + + label = "sklearn" + print("%s %d x %d - %s" % + (dataset_name, X.shape[0], X.shape[1], label)) + U, s, V, time = svd_timing(X, n_comps, n_iter=2, n_oversamples=10, + method=label) + + all_time[label].append(time) + if enable_spectral_norm: + A = U.dot(np.diag(s).dot(V)) + all_spectral[label].append(norm_diff(X - A, norm=2) / + X_spectral_norm) + f = scalable_frobenius_norm_discrepancy(X, U, s, V) + all_frobenius[label].append(f / X_fro_norm) + + if fbpca_available: + label = "fbpca" + print("%s %d x %d - %s" % + (dataset_name, X.shape[0], X.shape[1], label)) + U, s, V, time = svd_timing(X, n_comps, n_iter=2, n_oversamples=2, + method=label) + all_time[label].append(time) + if enable_spectral_norm: + A = U.dot(np.diag(s).dot(V)) + all_spectral[label].append(norm_diff(X - A, norm=2) / + X_spectral_norm) + f = scalable_frobenius_norm_discrepancy(X, U, s, V) + all_frobenius[label].append(f / X_fro_norm) + + if len(all_time) == 0: + raise ValueError("No tests ran. Aborting.") + + if enable_spectral_norm: + title = "normalized spectral norm diff vs running time" + scatter_time_vs_s(all_time, all_spectral, datasets, title) + title = "normalized Frobenius norm diff vs running time" + scatter_time_vs_s(all_time, all_frobenius, datasets, title) + + +if __name__ == '__main__': + random_state = check_random_state(1234) + + power_iter = np.linspace(0, 6, 7, dtype=int) + n_comps = 50 + + for dataset_name in datasets: + X = get_data(dataset_name) + if X is None: + continue + print(" >>>>>> Benching sklearn and fbpca on %s %d x %d" % + (dataset_name, X.shape[0], X.shape[1])) + bench_a(X, dataset_name, power_iter, n_oversamples=2, + n_comps=np.minimum(n_comps, np.min(X.shape))) + + print(" >>>>>> Benching on simulated low rank matrix with variable rank") + bench_b(power_iter) + + print(" >>>>>> Benching sklearn and fbpca default configurations") + bench_c(datasets + big_sparse_datasets, n_comps) + + plt.show() diff --git a/benchmarks/bench_plot_svd.py b/benchmarks/bench_plot_svd.py new file mode 100644 index 0000000..ce5cba9 --- /dev/null +++ b/benchmarks/bench_plot_svd.py @@ -0,0 +1,82 @@ +"""Benchmarks of Singular Value Decomposition (Exact and Approximate) + +The data is mostly low rank but is a fat infinite tail. 
+""" +import gc +from time import time +import numpy as np +from collections import defaultdict + +from scipy.linalg import svd +from sklearn.utils.extmath import randomized_svd +from sklearn.datasets.samples_generator import make_low_rank_matrix + + +def compute_bench(samples_range, features_range, n_iter=3, rank=50): + + it = 0 + + results = defaultdict(lambda: []) + + max_it = len(samples_range) * len(features_range) + for n_samples in samples_range: + for n_features in features_range: + it += 1 + print('====================') + print('Iteration %03d of %03d' % (it, max_it)) + print('====================') + X = make_low_rank_matrix(n_samples, n_features, + effective_rank=rank, + tail_strength=0.2) + + gc.collect() + print("benchmarking scipy svd: ") + tstart = time() + svd(X, full_matrices=False) + results['scipy svd'].append(time() - tstart) + + gc.collect() + print("benchmarking scikit-learn randomized_svd: n_iter=0") + tstart = time() + randomized_svd(X, rank, n_iter=0) + results['scikit-learn randomized_svd (n_iter=0)'].append( + time() - tstart) + + gc.collect() + print("benchmarking scikit-learn randomized_svd: n_iter=%d " + % n_iter) + tstart = time() + randomized_svd(X, rank, n_iter=n_iter) + results['scikit-learn randomized_svd (n_iter=%d)' + % n_iter].append(time() - tstart) + + return results + + +if __name__ == '__main__': + from mpl_toolkits.mplot3d import axes3d # register the 3d projection + import matplotlib.pyplot as plt + + samples_range = np.linspace(2, 1000, 4).astype(np.int) + features_range = np.linspace(2, 1000, 4).astype(np.int) + results = compute_bench(samples_range, features_range) + + label = 'scikit-learn singular value decomposition benchmark results' + fig = plt.figure(label) + ax = fig.gca(projection='3d') + for c, (label, timings) in zip('rbg', sorted(results.iteritems())): + X, Y = np.meshgrid(samples_range, features_range) + Z = np.asarray(timings).reshape(samples_range.shape[0], + features_range.shape[0]) + # plot the actual surface + ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3, + color=c) + # dummy point plot to stick the legend to since surface plot do not + # support legends (yet?) 
+ ax.plot([1], [1], [1], color=c, label=label) + + ax.set_xlabel('n_samples') + ax.set_ylabel('n_features') + ax.set_zlabel('Time (s)') + ax.legend() + plt.show() diff --git a/benchmarks/bench_plot_ward.py b/benchmarks/bench_plot_ward.py new file mode 100644 index 0000000..be93d6d --- /dev/null +++ b/benchmarks/bench_plot_ward.py @@ -0,0 +1,43 @@ +""" +Benchmark scikit-learn's Ward implement compared to SciPy's +""" + +import time + +import numpy as np +from scipy.cluster import hierarchy +import matplotlib.pyplot as plt + +from sklearn.cluster import AgglomerativeClustering + +ward = AgglomerativeClustering(n_clusters=3, linkage='ward') + +n_samples = np.logspace(.5, 3, 9) +n_features = np.logspace(1, 3.5, 7) +N_samples, N_features = np.meshgrid(n_samples, + n_features) +scikits_time = np.zeros(N_samples.shape) +scipy_time = np.zeros(N_samples.shape) + +for i, n in enumerate(n_samples): + for j, p in enumerate(n_features): + X = np.random.normal(size=(n, p)) + t0 = time.time() + ward.fit(X) + scikits_time[j, i] = time.time() - t0 + t0 = time.time() + hierarchy.ward(X) + scipy_time[j, i] = time.time() - t0 + +ratio = scikits_time / scipy_time + +plt.figure("scikit-learn Ward's method benchmark results") +plt.imshow(np.log(ratio), aspect='auto', origin="lower") +plt.colorbar() +plt.contour(ratio, levels=[1, ], colors='k') +plt.yticks(range(len(n_features)), n_features.astype(np.int)) +plt.ylabel('N features') +plt.xticks(range(len(n_samples)), n_samples.astype(np.int)) +plt.xlabel('N samples') +plt.title("Scikit's time, in units of scipy time (log)") +plt.show() diff --git a/benchmarks/bench_random_projections.py b/benchmarks/bench_random_projections.py new file mode 100644 index 0000000..4379e50 --- /dev/null +++ b/benchmarks/bench_random_projections.py @@ -0,0 +1,254 @@ +""" +=========================== +Random projection benchmark +=========================== + +Benchmarks for random projections. 
+ +""" +from __future__ import division +from __future__ import print_function + +import gc +import sys +import optparse +from datetime import datetime +import collections + +import numpy as np +import scipy.sparse as sp + +from sklearn import clone +from sklearn.externals.six.moves import xrange +from sklearn.random_projection import (SparseRandomProjection, + GaussianRandomProjection, + johnson_lindenstrauss_min_dim) + + +def type_auto_or_float(val): + if val == "auto": + return "auto" + else: + return float(val) + + +def type_auto_or_int(val): + if val == "auto": + return "auto" + else: + return int(val) + + +def compute_time(t_start, delta): + mu_second = 0.0 + 10 ** 6 # number of microseconds in a second + + return delta.seconds + delta.microseconds / mu_second + + +def bench_scikit_transformer(X, transfomer): + gc.collect() + + clf = clone(transfomer) + + # start time + t_start = datetime.now() + clf.fit(X) + delta = (datetime.now() - t_start) + # stop time + time_to_fit = compute_time(t_start, delta) + + # start time + t_start = datetime.now() + clf.transform(X) + delta = (datetime.now() - t_start) + # stop time + time_to_transform = compute_time(t_start, delta) + + return time_to_fit, time_to_transform + + +# Make some random data with uniformly located non zero entries with +# Gaussian distributed values +def make_sparse_random_data(n_samples, n_features, n_nonzeros, + random_state=None): + rng = np.random.RandomState(random_state) + data_coo = sp.coo_matrix( + (rng.randn(n_nonzeros), + (rng.randint(n_samples, size=n_nonzeros), + rng.randint(n_features, size=n_nonzeros))), + shape=(n_samples, n_features)) + return data_coo.toarray(), data_coo.tocsr() + + +def print_row(clf_type, time_fit, time_transform): + print("%s | %s | %s" % (clf_type.ljust(30), + ("%.4fs" % time_fit).center(12), + ("%.4fs" % time_transform).center(12))) + + +if __name__ == "__main__": + ########################################################################### + # Option parser + ########################################################################### + op = optparse.OptionParser() + op.add_option("--n-times", + dest="n_times", default=5, type=int, + help="Benchmark results are average over n_times experiments") + + op.add_option("--n-features", + dest="n_features", default=10 ** 4, type=int, + help="Number of features in the benchmarks") + + op.add_option("--n-components", + dest="n_components", default="auto", + help="Size of the random subspace." + " ('auto' or int > 0)") + + op.add_option("--ratio-nonzeros", + dest="ratio_nonzeros", default=10 ** -3, type=float, + help="Number of features in the benchmarks") + + op.add_option("--n-samples", + dest="n_samples", default=500, type=int, + help="Number of samples in the benchmarks") + + op.add_option("--random-seed", + dest="random_seed", default=13, type=int, + help="Seed used by the random number generators.") + + op.add_option("--density", + dest="density", default=1 / 3, + help="Density used by the sparse random projection." + " ('auto' or float (0.0, 1.0]") + + op.add_option("--eps", + dest="eps", default=0.5, type=float, + help="See the documentation of the underlying transformers.") + + op.add_option("--transformers", + dest="selected_transformers", + default='GaussianRandomProjection,SparseRandomProjection', + type=str, + help="Comma-separated list of transformer to benchmark. " + "Default: %default. 
Available: " + "GaussianRandomProjection,SparseRandomProjection") + + op.add_option("--dense", + dest="dense", + default=False, + action="store_true", + help="Set input space as a dense matrix.") + + (opts, args) = op.parse_args() + if len(args) > 0: + op.error("this script takes no arguments.") + sys.exit(1) + opts.n_components = type_auto_or_int(opts.n_components) + opts.density = type_auto_or_float(opts.density) + selected_transformers = opts.selected_transformers.split(',') + + ########################################################################### + # Generate dataset + ########################################################################### + n_nonzeros = int(opts.ratio_nonzeros * opts.n_features) + + print('Dataset statics') + print("===========================") + print('n_samples \t= %s' % opts.n_samples) + print('n_features \t= %s' % opts.n_features) + if opts.n_components == "auto": + print('n_components \t= %s (auto)' % + johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, + eps=opts.eps)) + else: + print('n_components \t= %s' % opts.n_components) + print('n_elements \t= %s' % (opts.n_features * opts.n_samples)) + print('n_nonzeros \t= %s per feature' % n_nonzeros) + print('ratio_nonzeros \t= %s' % opts.ratio_nonzeros) + print('') + + ########################################################################### + # Set transformer input + ########################################################################### + transformers = {} + + ########################################################################### + # Set GaussianRandomProjection input + gaussian_matrix_params = { + "n_components": opts.n_components, + "random_state": opts.random_seed + } + transformers["GaussianRandomProjection"] = \ + GaussianRandomProjection(**gaussian_matrix_params) + + ########################################################################### + # Set SparseRandomProjection input + sparse_matrix_params = { + "n_components": opts.n_components, + "random_state": opts.random_seed, + "density": opts.density, + "eps": opts.eps, + } + + transformers["SparseRandomProjection"] = \ + SparseRandomProjection(**sparse_matrix_params) + + ########################################################################### + # Perform benchmark + ########################################################################### + time_fit = collections.defaultdict(list) + time_transform = collections.defaultdict(list) + + print('Benchmarks') + print("===========================") + print("Generate dataset benchmarks... ", end="") + X_dense, X_sparse = make_sparse_random_data(opts.n_samples, + opts.n_features, + n_nonzeros, + random_state=opts.random_seed) + X = X_dense if opts.dense else X_sparse + print("done") + + for name in selected_transformers: + print("Perform benchmarks for %s..." % name) + + for iteration in xrange(opts.n_times): + print("\titer %s..." 
% iteration, end="") + time_to_fit, time_to_transform = bench_scikit_transformer(X_dense, + transformers[name]) + time_fit[name].append(time_to_fit) + time_transform[name].append(time_to_transform) + print("done") + + print("") + + ########################################################################### + # Print results + ########################################################################### + print("Script arguments") + print("===========================") + arguments = vars(opts) + print("%s \t | %s " % ("Arguments".ljust(16), + "Value".center(12),)) + print(25 * "-" + ("|" + "-" * 14) * 1) + for key, value in arguments.items(): + print("%s \t | %s " % (str(key).ljust(16), + str(value).strip().center(12))) + print("") + + print("Transformer performance:") + print("===========================") + print("Results are averaged over %s repetition(s)." % opts.n_times) + print("") + print("%s | %s | %s" % ("Transformer".ljust(30), + "fit".center(12), + "transform".center(12))) + print(31 * "-" + ("|" + "-" * 14) * 2) + + for name in sorted(selected_transformers): + print_row(name, + np.mean(time_fit[name]), + np.mean(time_transform[name])) + + print("") + print("") diff --git a/benchmarks/bench_rcv1_logreg_convergence.py b/benchmarks/bench_rcv1_logreg_convergence.py new file mode 100644 index 0000000..f452199 --- /dev/null +++ b/benchmarks/bench_rcv1_logreg_convergence.py @@ -0,0 +1,236 @@ +# Authors: Tom Dupre la Tour +# Olivier Grisel +# +# License: BSD 3 clause + +import matplotlib.pyplot as plt +import numpy as np +import gc +import time + +from sklearn.externals.joblib import Memory +from sklearn.linear_model import (LogisticRegression, SGDClassifier) +from sklearn.datasets import fetch_rcv1 +from sklearn.linear_model.sag import get_auto_step_size +from sklearn.linear_model.sag_fast import get_max_squared_sum + + +try: + import lightning.classification as lightning_clf +except ImportError: + lightning_clf = None + +m = Memory(cachedir='.', verbose=0) + + +# compute logistic loss +def get_loss(w, intercept, myX, myy, C): + n_samples = myX.shape[0] + w = w.ravel() + p = np.mean(np.log(1. + np.exp(-myy * (myX.dot(w) + intercept)))) + print("%f + %f" % (p, w.dot(w) / 2. / C / n_samples)) + p += w.dot(w) / 2. / C / n_samples + return p + + +# We use joblib to cache individual fits. Note that we do not pass the dataset +# as argument as the hashing would be too slow, so we assume that the dataset +# never changes. +@m.cache() +def bench_one(name, clf_type, clf_params, n_iter): + clf = clf_type(**clf_params) + try: + clf.set_params(max_iter=n_iter, random_state=42) + except: + clf.set_params(n_iter=n_iter, random_state=42) + + st = time.time() + clf.fit(X, y) + end = time.time() + + try: + C = 1.0 / clf.alpha / n_samples + except: + C = clf.C + + try: + intercept = clf.intercept_ + except: + intercept = 0. 
+ + train_loss = get_loss(clf.coef_, intercept, X, y, C) + train_score = clf.score(X, y) + test_score = clf.score(X_test, y_test) + duration = end - st + + return train_loss, train_score, test_score, duration + + +def bench(clfs): + for (name, clf, iter_range, train_losses, train_scores, + test_scores, durations) in clfs: + print("training %s" % name) + clf_type = type(clf) + clf_params = clf.get_params() + + for n_iter in iter_range: + gc.collect() + + train_loss, train_score, test_score, duration = bench_one( + name, clf_type, clf_params, n_iter) + + train_losses.append(train_loss) + train_scores.append(train_score) + test_scores.append(test_score) + durations.append(duration) + print("classifier: %s" % name) + print("train_loss: %.8f" % train_loss) + print("train_score: %.8f" % train_score) + print("test_score: %.8f" % test_score) + print("time for fit: %.8f seconds" % duration) + print("") + + print("") + return clfs + + +def plot_train_losses(clfs): + plt.figure() + for (name, _, _, train_losses, _, _, durations) in clfs: + plt.plot(durations, train_losses, '-o', label=name) + plt.legend(loc=0) + plt.xlabel("seconds") + plt.ylabel("train loss") + + +def plot_train_scores(clfs): + plt.figure() + for (name, _, _, _, train_scores, _, durations) in clfs: + plt.plot(durations, train_scores, '-o', label=name) + plt.legend(loc=0) + plt.xlabel("seconds") + plt.ylabel("train score") + plt.ylim((0.92, 0.96)) + + +def plot_test_scores(clfs): + plt.figure() + for (name, _, _, _, _, test_scores, durations) in clfs: + plt.plot(durations, test_scores, '-o', label=name) + plt.legend(loc=0) + plt.xlabel("seconds") + plt.ylabel("test score") + plt.ylim((0.92, 0.96)) + + +def plot_dloss(clfs): + plt.figure() + pobj_final = [] + for (name, _, _, train_losses, _, _, durations) in clfs: + pobj_final.append(train_losses[-1]) + + indices = np.argsort(pobj_final) + pobj_best = pobj_final[indices[0]] + + for (name, _, _, train_losses, _, _, durations) in clfs: + log_pobj = np.log(abs(np.array(train_losses) - pobj_best)) / np.log(10) + + plt.plot(durations, log_pobj, '-o', label=name) + plt.legend(loc=0) + plt.xlabel("seconds") + plt.ylabel("log(best - train_loss)") + + +rcv1 = fetch_rcv1() +X = rcv1.data +n_samples, n_features = X.shape + +# consider the binary classification problem 'CCAT' vs the rest +ccat_idx = rcv1.target_names.tolist().index('CCAT') +y = rcv1.target.tocsc()[:, ccat_idx].toarray().ravel().astype(np.float64) +y[y == 0] = -1 + +# parameters +C = 1. 
+fit_intercept = True +tol = 1.0e-14 + +# max_iter range +sgd_iter_range = list(range(1, 121, 10)) +newton_iter_range = list(range(1, 25, 3)) +lbfgs_iter_range = list(range(1, 242, 12)) +liblinear_iter_range = list(range(1, 37, 3)) +liblinear_dual_iter_range = list(range(1, 85, 6)) +sag_iter_range = list(range(1, 37, 3)) + +clfs = [ + ("LR-liblinear", + LogisticRegression(C=C, tol=tol, + solver="liblinear", fit_intercept=fit_intercept, + intercept_scaling=1), + liblinear_iter_range, [], [], [], []), + ("LR-liblinear-dual", + LogisticRegression(C=C, tol=tol, dual=True, + solver="liblinear", fit_intercept=fit_intercept, + intercept_scaling=1), + liblinear_dual_iter_range, [], [], [], []), + ("LR-SAG", + LogisticRegression(C=C, tol=tol, + solver="sag", fit_intercept=fit_intercept), + sag_iter_range, [], [], [], []), + ("LR-newton-cg", + LogisticRegression(C=C, tol=tol, solver="newton-cg", + fit_intercept=fit_intercept), + newton_iter_range, [], [], [], []), + ("LR-lbfgs", + LogisticRegression(C=C, tol=tol, + solver="lbfgs", fit_intercept=fit_intercept), + lbfgs_iter_range, [], [], [], []), + ("SGD", + SGDClassifier(alpha=1.0 / C / n_samples, penalty='l2', loss='log', + fit_intercept=fit_intercept, verbose=0), + sgd_iter_range, [], [], [], [])] + + +if lightning_clf is not None and not fit_intercept: + alpha = 1. / C / n_samples + # compute the same step_size than in LR-sag + max_squared_sum = get_max_squared_sum(X) + step_size = get_auto_step_size(max_squared_sum, alpha, "log", + fit_intercept) + + clfs.append( + ("Lightning-SVRG", + lightning_clf.SVRGClassifier(alpha=alpha, eta=step_size, + tol=tol, loss="log"), + sag_iter_range, [], [], [], [])) + clfs.append( + ("Lightning-SAG", + lightning_clf.SAGClassifier(alpha=alpha, eta=step_size, + tol=tol, loss="log"), + sag_iter_range, [], [], [], [])) + + # We keep only 200 features, to have a dense dataset, + # and compare to lightning SAG, which seems incorrect in the sparse case. + X_csc = X.tocsc() + nnz_in_each_features = X_csc.indptr[1:] - X_csc.indptr[:-1] + X = X_csc[:, np.argsort(nnz_in_each_features)[-200:]] + X = X.toarray() + print("dataset: %.3f MB" % (X.nbytes / 1e6)) + + +# Split training and testing. Switch train and test subset compared to +# LYRL2004 split, to have a larger training dataset. +n = 23149 +X_test = X[:n, :] +y_test = y[:n] +X = X[n:, :] +y = y[n:] + +clfs = bench(clfs) + +plot_train_scores(clfs) +plot_test_scores(clfs) +plot_train_losses(clfs) +plot_dloss(clfs) +plt.show() diff --git a/benchmarks/bench_sample_without_replacement.py b/benchmarks/bench_sample_without_replacement.py new file mode 100644 index 0000000..90c382e --- /dev/null +++ b/benchmarks/bench_sample_without_replacement.py @@ -0,0 +1,207 @@ +""" +Benchmarks for sampling without replacement of integer. 
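+Compares the sample_without_replacement strategies from sklearn.utils.random against Python's random.sample and a numpy permutation baseline.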
+ +""" +from __future__ import division +from __future__ import print_function + +import gc +import sys +import optparse +from datetime import datetime +import operator + +import matplotlib.pyplot as plt +import numpy as np +import random + +from sklearn.externals.six.moves import xrange +from sklearn.utils.random import sample_without_replacement + + +def compute_time(t_start, delta): + mu_second = 0.0 + 10 ** 6 # number of microseconds in a second + + return delta.seconds + delta.microseconds / mu_second + + +def bench_sample(sampling, n_population, n_samples): + gc.collect() + # start time + t_start = datetime.now() + sampling(n_population, n_samples) + delta = (datetime.now() - t_start) + # stop time + time = compute_time(t_start, delta) + return time + +if __name__ == "__main__": + ########################################################################### + # Option parser + ########################################################################### + op = optparse.OptionParser() + op.add_option("--n-times", + dest="n_times", default=5, type=int, + help="Benchmark results are averaged over n_times experiments") + + op.add_option("--n-population", + dest="n_population", default=100000, type=int, + help="Size of the population to sample from.") + + op.add_option("--n-step", + dest="n_steps", default=5, type=int, + help="Number of steps between 0 and n_population.") + + default_algorithms = "custom-tracking-selection,custom-auto," \ + "custom-reservoir-sampling,custom-pool,"\ + "python-core-sample,numpy-permutation" + + op.add_option("--algorithm", + dest="selected_algorithm", + default=default_algorithms, + type=str, + help="Comma-separated list of sampling algorithms to benchmark. " + "Default: %default. \nAvailable: %default") + + # op.add_option("--random-seed", + # dest="random_seed", default=13, type=int, + # help="Seed used by the random number generators.") + + (opts, args) = op.parse_args() + if len(args) > 0: + op.error("this script takes no arguments.") + sys.exit(1) + + selected_algorithm = opts.selected_algorithm.split(',') + for key in selected_algorithm: + if key not in default_algorithms.split(','): + raise ValueError("Unknown sampling algorithm \"%s\" not in (%s)."
+ % (key, default_algorithms)) + + ########################################################################### + # List sampling algorithms + ########################################################################### + # We assume that each sampling algorithm has the following signature: + # sample(n_population, n_sample) + # + sampling_algorithm = {} + + ########################################################################### + # Set Python core input + sampling_algorithm["python-core-sample"] = \ + lambda n_population, n_sample: \ + random.sample(xrange(n_population), n_sample) + + ########################################################################### + # Set custom automatic method selection + sampling_algorithm["custom-auto"] = \ + lambda n_population, n_samples, random_state=None: \ + sample_without_replacement(n_population, + n_samples, + method="auto", + random_state=random_state) + + ########################################################################### + # Set custom tracking based method + sampling_algorithm["custom-tracking-selection"] = \ + lambda n_population, n_samples, random_state=None: \ + sample_without_replacement(n_population, + n_samples, + method="tracking_selection", + random_state=random_state) + + ########################################################################### + # Set custom reservoir based method + sampling_algorithm["custom-reservoir-sampling"] = \ + lambda n_population, n_samples, random_state=None: \ + sample_without_replacement(n_population, + n_samples, + method="reservoir_sampling", + random_state=random_state) + + ########################################################################### + # Set custom pool based method + sampling_algorithm["custom-pool"] = \ + lambda n_population, n_samples, random_state=None: \ + sample_without_replacement(n_population, + n_samples, + method="pool", + random_state=random_state) + + ########################################################################### + # Numpy permutation based + sampling_algorithm["numpy-permutation"] = \ + lambda n_population, n_sample: \ + np.random.permutation(n_population)[:n_sample] + + ########################################################################### + # Remove unspecified algorithms + sampling_algorithm = dict((key, value) + for key, value in sampling_algorithm.items() + if key in selected_algorithm) + + ########################################################################### + # Perform benchmark + ########################################################################### + time = {} + n_samples = np.linspace(start=0, stop=opts.n_population, + num=opts.n_steps).astype(np.int) + + ratio = n_samples / opts.n_population + + print('Benchmarks') + print("===========================") + + for name in sorted(sampling_algorithm): + print("Perform benchmarks for %s..."
% name, end="") + time[name] = np.zeros(shape=(opts.n_steps, opts.n_times)) + + for step in xrange(opts.n_steps): + for it in xrange(opts.n_times): + time[name][step, it] = bench_sample(sampling_algorithm[name], + opts.n_population, + n_samples[step]) + + print("done") + + print("Averaging results...", end="") + for name in sampling_algorithm: + time[name] = np.mean(time[name], axis=1) + print("done\n") + + # Print results + ########################################################################### + print("Script arguments") + print("===========================") + arguments = vars(opts) + print("%s \t | %s " % ("Arguments".ljust(16), + "Value".center(12),)) + print(25 * "-" + ("|" + "-" * 14) * 1) + for key, value in arguments.items(): + print("%s \t | %s " % (str(key).ljust(16), + str(value).strip().center(12))) + print("") + + print("Sampling algorithm performance:") + print("===============================") + print("Results are averaged over %s repetition(s)." % opts.n_times) + print("") + + fig = plt.figure('scikit-learn sample w/o replacement benchmark results') + plt.title("n_population = %s, n_times = %s" % + (opts.n_population, opts.n_times)) + ax = fig.add_subplot(111) + for name in sampling_algorithm: + ax.plot(ratio, time[name], label=name) + + ax.set_xlabel('ratio of n_sample / n_population') + ax.set_ylabel('Time (s)') + ax.legend() + + # Sort legend labels + handles, labels = ax.get_legend_handles_labels() + hl = sorted(zip(handles, labels), key=operator.itemgetter(1)) + handles2, labels2 = zip(*hl) + ax.legend(handles2, labels2, loc=0) + + plt.show() diff --git a/benchmarks/bench_sgd_regression.py b/benchmarks/bench_sgd_regression.py new file mode 100644 index 0000000..e66f656 --- /dev/null +++ b/benchmarks/bench_sgd_regression.py @@ -0,0 +1,151 @@ +""" +Benchmark for SGD regression + +Compares SGD regression against coordinate descent and Ridge +on synthetic data. 
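+For each (n_train, n_features) combination the test RMSE and the training time of each estimator are recorded and plotted at the end of the run.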
+""" + +print(__doc__) + +# Author: Peter Prettenhofer +# License: BSD 3 clause + +import numpy as np +import matplotlib.pyplot as plt + +import gc + +from time import time + +from sklearn.linear_model import Ridge, SGDRegressor, ElasticNet +from sklearn.metrics import mean_squared_error +from sklearn.datasets.samples_generator import make_regression + +if __name__ == "__main__": + list_n_samples = np.linspace(100, 10000, 5).astype(np.int) + list_n_features = [10, 100, 1000] + n_test = 1000 + noise = 0.1 + alpha = 0.01 + sgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) + elnet_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) + ridge_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) + asgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) + for i, n_train in enumerate(list_n_samples): + for j, n_features in enumerate(list_n_features): + X, y, coef = make_regression( + n_samples=n_train + n_test, n_features=n_features, + noise=noise, coef=True) + + X_train = X[:n_train] + y_train = y[:n_train] + X_test = X[n_train:] + y_test = y[n_train:] + + print("=======================") + print("Round %d %d" % (i, j)) + print("n_features:", n_features) + print("n_samples:", n_train) + + # Shuffle data + idx = np.arange(n_train) + np.random.seed(13) + np.random.shuffle(idx) + X_train = X_train[idx] + y_train = y_train[idx] + + std = X_train.std(axis=0) + mean = X_train.mean(axis=0) + X_train = (X_train - mean) / std + X_test = (X_test - mean) / std + + std = y_train.std(axis=0) + mean = y_train.mean(axis=0) + y_train = (y_train - mean) / std + y_test = (y_test - mean) / std + + gc.collect() + print("- benchmarking ElasticNet") + clf = ElasticNet(alpha=alpha, l1_ratio=0.5, fit_intercept=False) + tstart = time() + clf.fit(X_train, y_train) + elnet_results[i, j, 0] = mean_squared_error(clf.predict(X_test), + y_test) + elnet_results[i, j, 1] = time() - tstart + + gc.collect() + print("- benchmarking SGD") + n_iter = np.ceil(10 ** 4.0 / n_train) + clf = SGDRegressor(alpha=alpha / n_train, fit_intercept=False, + n_iter=n_iter, learning_rate="invscaling", + eta0=.01, power_t=0.25) + + tstart = time() + clf.fit(X_train, y_train) + sgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), + y_test) + sgd_results[i, j, 1] = time() - tstart + + gc.collect() + print("n_iter", n_iter) + print("- benchmarking A-SGD") + n_iter = np.ceil(10 ** 4.0 / n_train) + clf = SGDRegressor(alpha=alpha / n_train, fit_intercept=False, + n_iter=n_iter, learning_rate="invscaling", + eta0=.002, power_t=0.05, + average=(n_iter * n_train // 2)) + + tstart = time() + clf.fit(X_train, y_train) + asgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), + y_test) + asgd_results[i, j, 1] = time() - tstart + + gc.collect() + print("- benchmarking RidgeRegression") + clf = Ridge(alpha=alpha, fit_intercept=False) + tstart = time() + clf.fit(X_train, y_train) + ridge_results[i, j, 0] = mean_squared_error(clf.predict(X_test), + y_test) + ridge_results[i, j, 1] = time() - tstart + + # Plot results + i = 0 + m = len(list_n_features) + plt.figure('scikit-learn SGD regression benchmark results', + figsize=(5 * 2, 4 * m)) + for j in range(m): + plt.subplot(m, 2, i + 1) + plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 0]), + label="ElasticNet") + plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 0]), + label="SGDRegressor") + plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 0]), + label="A-SGDRegressor") + plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 
0]), + label="Ridge") + plt.legend(prop={"size": 10}) + plt.xlabel("n_train") + plt.ylabel("RMSE") + plt.title("Test error - %d features" % list_n_features[j]) + i += 1 + + plt.subplot(m, 2, i + 1) + plt.plot(list_n_samples, np.sqrt(elnet_results[:, j, 1]), + label="ElasticNet") + plt.plot(list_n_samples, np.sqrt(sgd_results[:, j, 1]), + label="SGDRegressor") + plt.plot(list_n_samples, np.sqrt(asgd_results[:, j, 1]), + label="A-SGDRegressor") + plt.plot(list_n_samples, np.sqrt(ridge_results[:, j, 1]), + label="Ridge") + plt.legend(prop={"size": 10}) + plt.xlabel("n_train") + plt.ylabel("Time [sec]") + plt.title("Training time - %d features" % list_n_features[j]) + i += 1 + + plt.subplots_adjust(hspace=.30) + + plt.show() diff --git a/benchmarks/bench_sparsify.py b/benchmarks/bench_sparsify.py new file mode 100644 index 0000000..6affa4f --- /dev/null +++ b/benchmarks/bench_sparsify.py @@ -0,0 +1,104 @@ +""" +Benchmark SGD prediction time with dense/sparse coefficients. + +Invoke with +----------- + +$ kernprof.py -l sparsity_benchmark.py +$ python -m line_profiler sparsity_benchmark.py.lprof + +Typical output +-------------- + +input data sparsity: 0.050000 +true coef sparsity: 0.000100 +test data sparsity: 0.027400 +model sparsity: 0.000024 +r^2 on test data (dense model) : 0.233651 +r^2 on test data (sparse model) : 0.233651 +Wrote profile results to sparsity_benchmark.py.lprof +Timer unit: 1e-06 s + +File: sparsity_benchmark.py +Function: benchmark_dense_predict at line 51 +Total time: 0.532979 s + +Line # Hits Time Per Hit % Time Line Contents +============================================================== + 51 @profile + 52 def benchmark_dense_predict(): + 53 301 640 2.1 0.1 for _ in range(300): + 54 300 532339 1774.5 99.9 clf.predict(X_test) + +File: sparsity_benchmark.py +Function: benchmark_sparse_predict at line 56 +Total time: 0.39274 s + +Line # Hits Time Per Hit % Time Line Contents +============================================================== + 56 @profile + 57 def benchmark_sparse_predict(): + 58 1 10854 10854.0 2.8 X_test_sparse = csr_matrix(X_test) + 59 301 477 1.6 0.1 for _ in range(300): + 60 300 381409 1271.4 97.1 clf.predict(X_test_sparse) +""" + +from scipy.sparse.csr import csr_matrix +import numpy as np +from sklearn.linear_model.stochastic_gradient import SGDRegressor +from sklearn.metrics import r2_score + +np.random.seed(42) + + +def sparsity_ratio(X): + return np.count_nonzero(X) / float(n_samples * n_features) + +n_samples, n_features = 5000, 300 +X = np.random.randn(n_samples, n_features) +inds = np.arange(n_samples) +np.random.shuffle(inds) +X[inds[int(n_features / 1.2):]] = 0 # sparsify input +print("input data sparsity: %f" % sparsity_ratio(X)) +coef = 3 * np.random.randn(n_features) +inds = np.arange(n_features) +np.random.shuffle(inds) +coef[inds[n_features/2:]] = 0 # sparsify coef +print("true coef sparsity: %f" % sparsity_ratio(coef)) +y = np.dot(X, coef) + +# add noise +y += 0.01 * np.random.normal((n_samples,)) + +# Split data in train set and test set +n_samples = X.shape[0] +X_train, y_train = X[:n_samples / 2], y[:n_samples / 2] +X_test, y_test = X[n_samples / 2:], y[n_samples / 2:] +print("test data sparsity: %f" % sparsity_ratio(X_test)) + +############################################################################### +clf = SGDRegressor(penalty='l1', alpha=.2, fit_intercept=True, n_iter=2000) +clf.fit(X_train, y_train) +print("model sparsity: %f" % sparsity_ratio(clf.coef_)) + + +def benchmark_dense_predict(): + for _ in range(300): + 
clf.predict(X_test) + + +def benchmark_sparse_predict(): + X_test_sparse = csr_matrix(X_test) + for _ in range(300): + clf.predict(X_test_sparse) + + +def score(y_test, y_pred, case): + r2 = r2_score(y_test, y_pred) + print("r^2 on test data (%s) : %f" % (case, r2)) + +score(y_test, clf.predict(X_test), 'dense model') +benchmark_dense_predict() +clf.sparsify() +score(y_test, clf.predict(X_test), 'sparse model') +benchmark_sparse_predict() diff --git a/benchmarks/bench_tree.py b/benchmarks/bench_tree.py new file mode 100644 index 0000000..8a0af26 --- /dev/null +++ b/benchmarks/bench_tree.py @@ -0,0 +1,124 @@ +""" +To run this, you'll need to have installed. + + * scikit-learn + +Does two benchmarks + +First, we fix a training set, increase the number of +samples to classify and plot number of classified samples as a +function of time. + +In the second benchmark, we increase the number of dimensions of the +training set, classify a sample and plot the time taken as a function +of the number of dimensions. +""" +import numpy as np +import matplotlib.pyplot as plt +import gc +from datetime import datetime + +# to store the results +scikit_classifier_results = [] +scikit_regressor_results = [] + +mu_second = 0.0 + 10 ** 6 # number of microseconds in a second + + +def bench_scikit_tree_classifier(X, Y): + """Benchmark with scikit-learn decision tree classifier""" + + from sklearn.tree import DecisionTreeClassifier + + gc.collect() + + # start time + tstart = datetime.now() + clf = DecisionTreeClassifier() + clf.fit(X, Y).predict(X) + delta = (datetime.now() - tstart) + # stop time + + scikit_classifier_results.append( + delta.seconds + delta.microseconds / mu_second) + + +def bench_scikit_tree_regressor(X, Y): + """Benchmark with scikit-learn decision tree regressor""" + + from sklearn.tree import DecisionTreeRegressor + + gc.collect() + + # start time + tstart = datetime.now() + clf = DecisionTreeRegressor() + clf.fit(X, Y).predict(X) + delta = (datetime.now() - tstart) + # stop time + + scikit_regressor_results.append( + delta.seconds + delta.microseconds / mu_second) + + +if __name__ == '__main__': + + print('============================================') + print('Warning: this is going to take a looong time') + print('============================================') + + n = 10 + step = 10000 + n_samples = 10000 + dim = 10 + n_classes = 10 + for i in range(n): + print('============================================') + print('Entering iteration %s of %s' % (i, n)) + print('============================================') + n_samples += step + X = np.random.randn(n_samples, dim) + Y = np.random.randint(0, n_classes, (n_samples,)) + bench_scikit_tree_classifier(X, Y) + Y = np.random.randn(n_samples) + bench_scikit_tree_regressor(X, Y) + + xx = range(0, n * step, step) + plt.figure('scikit-learn tree benchmark results') + plt.subplot(211) + plt.title('Learning with varying number of samples') + plt.plot(xx, scikit_classifier_results, 'g-', label='classification') + plt.plot(xx, scikit_regressor_results, 'r-', label='regression') + plt.legend(loc='upper left') + plt.xlabel('number of samples') + plt.ylabel('Time (s)') + + scikit_classifier_results = [] + scikit_regressor_results = [] + n = 10 + step = 500 + start_dim = 500 + n_classes = 10 + + dim = start_dim + for i in range(0, n): + print('============================================') + print('Entering iteration %s of %s' % (i, n)) + print('============================================') + dim += step + X = np.random.randn(100, dim) + Y = 
np.random.randint(0, n_classes, (100,)) + bench_scikit_tree_classifier(X, Y) + Y = np.random.randn(100) + bench_scikit_tree_regressor(X, Y) + + xx = np.arange(start_dim, start_dim + n * step, step) + plt.subplot(212) + plt.title('Learning in high dimensional spaces') + plt.plot(xx, scikit_classifier_results, 'g-', label='classification') + plt.plot(xx, scikit_regressor_results, 'r-', label='regression') + plt.legend(loc='upper left') + plt.xlabel('number of dimensions') + plt.ylabel('Time (s)') + plt.axis('tight') + plt.show() diff --git a/build_tools/appveyor/install.ps1 b/build_tools/appveyor/install.ps1 new file mode 100644 index 0000000..160ba55 --- /dev/null +++ b/build_tools/appveyor/install.ps1 @@ -0,0 +1,229 @@ +# Sample script to install Python and pip under Windows +# Authors: Olivier Grisel, Jonathan Helmus, Kyle Kastner, and Alex Willmer +# License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ + +$MINICONDA_URL = "http://repo.continuum.io/miniconda/" +$BASE_URL = "https://www.python.org/ftp/python/" +$GET_PIP_URL = "https://bootstrap.pypa.io/get-pip.py" +$GET_PIP_PATH = "C:\get-pip.py" + +$PYTHON_PRERELEASE_REGEX = @" +(?x) +(?\d+) +\. +(?\d+) +\. +(?\d+) +(?[a-z]{1,2}\d+) +"@ + + +function Download ($filename, $url) { + $webclient = New-Object System.Net.WebClient + + $basedir = $pwd.Path + "\" + $filepath = $basedir + $filename + if (Test-Path $filename) { + Write-Host "Reusing" $filepath + return $filepath + } + + # Download and retry up to 3 times in case of network transient errors. + Write-Host "Downloading" $filename "from" $url + $retry_attempts = 2 + for ($i = 0; $i -lt $retry_attempts; $i++) { + try { + $webclient.DownloadFile($url, $filepath) + break + } + Catch [Exception]{ + Start-Sleep 1 + } + } + if (Test-Path $filepath) { + Write-Host "File saved at" $filepath + } else { + # Retry once to get the error message if any at the last try + $webclient.DownloadFile($url, $filepath) + } + return $filepath +} + + +function ParsePythonVersion ($python_version) { + if ($python_version -match $PYTHON_PRERELEASE_REGEX) { + return ([int]$matches.major, [int]$matches.minor, [int]$matches.micro, + $matches.prerelease) + } + $version_obj = [version]$python_version + return ($version_obj.major, $version_obj.minor, $version_obj.build, "") +} + + +function DownloadPython ($python_version, $platform_suffix) { + $major, $minor, $micro, $prerelease = ParsePythonVersion $python_version + + if (($major -le 2 -and $micro -eq 0) ` + -or ($major -eq 3 -and $minor -le 2 -and $micro -eq 0) ` + ) { + $dir = "$major.$minor" + $python_version = "$major.$minor$prerelease" + } else { + $dir = "$major.$minor.$micro" + } + + if ($prerelease) { + if (($major -le 2) ` + -or ($major -eq 3 -and $minor -eq 1) ` + -or ($major -eq 3 -and $minor -eq 2) ` + -or ($major -eq 3 -and $minor -eq 3) ` + ) { + $dir = "$dir/prev" + } + } + + if (($major -le 2) -or ($major -le 3 -and $minor -le 4)) { + $ext = "msi" + if ($platform_suffix) { + $platform_suffix = ".$platform_suffix" + } + } else { + $ext = "exe" + if ($platform_suffix) { + $platform_suffix = "-$platform_suffix" + } + } + + $filename = "python-$python_version$platform_suffix.$ext" + $url = "$BASE_URL$dir/$filename" + $filepath = Download $filename $url + return $filepath +} + + +function InstallPython ($python_version, $architecture, $python_home) { + Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home + if (Test-Path $python_home) { + Write-Host $python_home "already exists, 
skipping." + return $false + } + if ($architecture -eq "32") { + $platform_suffix = "" + } else { + $platform_suffix = "amd64" + } + $installer_path = DownloadPython $python_version $platform_suffix + $installer_ext = [System.IO.Path]::GetExtension($installer_path) + Write-Host "Installing $installer_path to $python_home" + $install_log = $python_home + ".log" + if ($installer_ext -eq '.msi') { + InstallPythonMSI $installer_path $python_home $install_log + } else { + InstallPythonEXE $installer_path $python_home $install_log + } + if (Test-Path $python_home) { + Write-Host "Python $python_version ($architecture) installation complete" + } else { + Write-Host "Failed to install Python in $python_home" + Get-Content -Path $install_log + Exit 1 + } +} + + +function InstallPythonEXE ($exepath, $python_home, $install_log) { + $install_args = "/quiet InstallAllUsers=1 TargetDir=$python_home" + RunCommand $exepath $install_args +} + + +function InstallPythonMSI ($msipath, $python_home, $install_log) { + $install_args = "/qn /log $install_log /i $msipath TARGETDIR=$python_home" + $uninstall_args = "/qn /x $msipath" + RunCommand "msiexec.exe" $install_args + if (-not(Test-Path $python_home)) { + Write-Host "Python seems to be installed else-where, reinstalling." + RunCommand "msiexec.exe" $uninstall_args + RunCommand "msiexec.exe" $install_args + } +} + +function RunCommand ($command, $command_args) { + Write-Host $command $command_args + Start-Process -FilePath $command -ArgumentList $command_args -Wait -Passthru +} + + +function InstallPip ($python_home) { + $pip_path = $python_home + "\Scripts\pip.exe" + $python_path = $python_home + "\python.exe" + if (-not(Test-Path $pip_path)) { + Write-Host "Installing pip..." + $webclient = New-Object System.Net.WebClient + $webclient.DownloadFile($GET_PIP_URL, $GET_PIP_PATH) + Write-Host "Executing:" $python_path $GET_PIP_PATH + & $python_path $GET_PIP_PATH + } else { + Write-Host "pip already installed." + } +} + + +function DownloadMiniconda ($python_version, $platform_suffix) { + if ($python_version -eq "3.4") { + $filename = "Miniconda3-3.5.5-Windows-" + $platform_suffix + ".exe" + } else { + $filename = "Miniconda-3.5.5-Windows-" + $platform_suffix + ".exe" + } + $url = $MINICONDA_URL + $filename + $filepath = Download $filename $url + return $filepath +} + + +function InstallMiniconda ($python_version, $architecture, $python_home) { + Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home + if (Test-Path $python_home) { + Write-Host $python_home "already exists, skipping." + return $false + } + if ($architecture -eq "32") { + $platform_suffix = "x86" + } else { + $platform_suffix = "x86_64" + } + $filepath = DownloadMiniconda $python_version $platform_suffix + Write-Host "Installing" $filepath "to" $python_home + $install_log = $python_home + ".log" + $args = "/S /D=$python_home" + Write-Host $filepath $args + Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru + if (Test-Path $python_home) { + Write-Host "Python $python_version ($architecture) installation complete" + } else { + Write-Host "Failed to install Python in $python_home" + Get-Content -Path $install_log + Exit 1 + } +} + + +function InstallMinicondaPip ($python_home) { + $pip_path = $python_home + "\Scripts\pip.exe" + $conda_path = $python_home + "\Scripts\conda.exe" + if (-not(Test-Path $pip_path)) { + Write-Host "Installing pip..." 
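+        # Use conda itself to bootstrap pip into the Miniconda environment when it is missing.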
+ $args = "install --yes pip" + Write-Host $conda_path $args + Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru + } else { + Write-Host "pip already installed." + } +} + +function main () { + InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON + InstallPip $env:PYTHON +} + +main diff --git a/build_tools/appveyor/requirements.txt b/build_tools/appveyor/requirements.txt new file mode 100644 index 0000000..0b9c63f --- /dev/null +++ b/build_tools/appveyor/requirements.txt @@ -0,0 +1,16 @@ +# Fetch numpy and scipy wheels from the sklearn rackspace wheelhouse. +# Those wheels were collected from http://www.lfd.uci.edu/~gohlke/pythonlibs/ +# This is a temporary solution. As soon as numpy and scipy provide official +# wheel for windows we ca delete this --find-links line. +--find-links http://28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com/ + +# fix the versions of numpy to force the use of numpy and scipy to use the whl +# of the rackspace folder instead of trying to install from more recent +# source tarball published on PyPI +numpy==1.9.3 +scipy==0.16.0 +cython +nose +nose-timer +wheel +wheelhouse_uploader diff --git a/build_tools/appveyor/run_with_env.cmd b/build_tools/appveyor/run_with_env.cmd new file mode 100644 index 0000000..5da547c --- /dev/null +++ b/build_tools/appveyor/run_with_env.cmd @@ -0,0 +1,88 @@ +:: To build extensions for 64 bit Python 3, we need to configure environment +:: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) +:: +:: To build extensions for 64 bit Python 2, we need to configure environment +:: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) +:: +:: 32 bit builds, and 64-bit builds for 3.5 and beyond, do not require specific +:: environment configurations. +:: +:: Note: this script needs to be run with the /E:ON and /V:ON flags for the +:: cmd interpreter, at least for (SDK v7.0) +:: +:: More details at: +:: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows +:: http://stackoverflow.com/a/13751649/163740 +:: +:: Author: Olivier Grisel +:: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ +:: +:: Notes about batch files for Python people: +:: +:: Quotes in values are literally part of the values: +:: SET FOO="bar" +:: FOO is now five characters long: " b a r " +:: If you don't want quotes, don't include them on the right-hand side. +:: +:: The CALL lines at the end of this file look redundant, but if you move them +:: outside of the IF clauses, they do not run properly in the SET_SDK_64==Y +:: case, I don't know why. +@ECHO OFF + +SET COMMAND_TO_RUN=%* +SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows +SET WIN_WDK=c:\Program Files (x86)\Windows Kits\10\Include\wdf + +:: Extract the major and minor versions, and allow for the minor version to be +:: more than 9. This requires the version number to have two dots in it. +SET MAJOR_PYTHON_VERSION=%PYTHON_VERSION:~0,1% +IF "%PYTHON_VERSION:~3,1%" == "." ( + SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,1% +) ELSE ( + SET MINOR_PYTHON_VERSION=%PYTHON_VERSION:~2,2% +) + +:: Based on the Python version, determine what SDK version to use, and whether +:: to set the SDK for 64-bit. 
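+:: In short: Python 2.x uses SDK v7.0 with the 64 bit environment, Python 3.x up to 3.4 uses SDK v7.1 with the 64 bit environment, and Python 3.5+ (as well as all 32 bit builds) falls through to the default MSVC setup.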
+IF %MAJOR_PYTHON_VERSION% == 2 ( + SET WINDOWS_SDK_VERSION="v7.0" + SET SET_SDK_64=Y +) ELSE ( + IF %MAJOR_PYTHON_VERSION% == 3 ( + SET WINDOWS_SDK_VERSION="v7.1" + IF %MINOR_PYTHON_VERSION% LEQ 4 ( + SET SET_SDK_64=Y + ) ELSE ( + SET SET_SDK_64=N + IF EXIST "%WIN_WDK%" ( + :: See: https://connect.microsoft.com/VisualStudio/feedback/details/1610302/ + REN "%WIN_WDK%" 0wdf + ) + ) + ) ELSE ( + ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" + EXIT 1 + ) +) + +IF %PYTHON_ARCH% == 64 ( + IF %SET_SDK_64% == Y ( + ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture + SET DISTUTILS_USE_SDK=1 + SET MSSdk=1 + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 + ) ELSE ( + ECHO Using default MSVC build environment for 64 bit architecture + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 + ) +) ELSE ( + ECHO Using default MSVC build environment for 32 bit architecture + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 +) diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh new file mode 100755 index 0000000..7f41a6b --- /dev/null +++ b/build_tools/circle/build_doc.sh @@ -0,0 +1,55 @@ +set -x +set -e + +# Introspect the commit to know whether or not we should skip building the +# documentation: a pull request that does not change any file in the doc/ or +# examples/ folders should be skipped unless a "[doc build]" marker is found in the +# commit message. +BUILD_DOC=`python build_tools/circle/check_build_doc.py` +echo -e $BUILD_DOC +if [[ $BUILD_DOC == "SKIP:"* ]]; then + touch ~/log.txt # the "test" segment needs that file + exit 0 +fi + +# Installing required system packages to support the rendering of math +# notation in the HTML documentation +sudo -E apt-get -yq update +sudo -E apt-get -yq remove texlive-binaries --purge +sudo -E apt-get -yq --no-install-suggests --no-install-recommends --force-yes \ + install dvipng texlive-latex-base texlive-latex-extra + +# deactivate circleci virtualenv and setup a miniconda env instead +if [[ `type -t deactivate` ]]; then + deactivate +fi + +# Install dependencies with miniconda +pushd . +cd +mkdir -p download +cd download +echo "Cached in $HOME/download :" +ls -l +if [[ ! -f miniconda.sh ]] +then + wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh \ + -O miniconda.sh +fi +chmod +x miniconda.sh && ./miniconda.sh -b -p $HOME/miniconda +cd .. +export PATH="$HOME/miniconda/bin:$PATH" +conda update --yes --quiet conda +popd + +# Configure the conda environment and put it in the path using the +# provided versions +conda create -n testenv --yes --quiet python numpy scipy \ + cython nose coverage matplotlib sphinx pillow +source activate testenv + +# Build and install scikit-learn in dev mode +python setup.py develop + +# The pipefail is requested to propagate exit codes +set -o pipefail && cd doc && make html 2>&1 | tee ~/log.txt diff --git a/build_tools/circle/check_build_doc.py b/build_tools/circle/check_build_doc.py new file mode 100644 index 0000000..f8e0048 --- /dev/null +++ b/build_tools/circle/check_build_doc.py @@ -0,0 +1,66 @@ +"""Check whether or not we should build the documentation + +If the last commit message has a "[doc skip]" marker, do not build +the doc.
Conversely, if a "[doc build]" marker is found, build the doc +instead of relying on the subsequent rules. + +We always build the documentation for jobs that are not related to a specific +PR (e.g. a merge to master or a maintenance branch). + +If this is a PR, build only if some files changed by the PR are under +the "doc/" or "examples/" folders; otherwise skip. + +If the introspection of the current commit fails for any reason, the default +behavior is to build the documentation. + +""" +import sys +import os +from subprocess import check_output, CalledProcessError + + +def exit(msg="", skip=False): + print("%s: %s" % ("SKIP" if skip else "BUILD", msg)) + sys.exit(0) + +# Introspect the message for the commit that triggered the build +commit = os.environ.get('CIRCLE_SHA1') +if not commit: + exit("undefined CIRCLE_SHA1 variable") +try: + commit_msg = check_output("git log --format=%B -n 1".split() + [commit]) + commit_msg = commit_msg.decode('utf-8') +except CalledProcessError: + exit("failed to introspect commit message for %s" % commit) + +if "[doc skip]" in commit_msg: + exit("[doc skip] marker found", skip=True) +elif "[doc build]" in commit_msg: + exit("[doc build] marker found") + +# Check whether this commit is part of a pull request or not +pr_url = os.environ.get('CI_PULL_REQUEST') +if not pr_url: + # The documentation should always be built when executed from one of the + # main branches + exit("not a pull request") + +# Introspect the list of files changed by all the commits in this PR. +# Hardcode the assumption that this is a PR to origin/master of this repo +# as apparently there is no way to reliably get the target of a PR with +# Circle CI. +git_range = "origin/master...%s" % commit +try: + check_output("git fetch origin master".split()) + filenames = check_output("git diff --name-only".split() + [git_range]) +except CalledProcessError: + exit("git introspection failed.") +filenames = filenames.decode('utf-8').split() +for filename in filenames: + if filename.startswith(u'doc/') or filename.startswith(u'examples/'): + exit("detected doc impacting file modified by PR in range %s: %s" + % (git_range, filename)) + +# This PR does not seem to have any documentation related files changed. +msg = "no doc impacting files detected:\n" + u"\n".join(filenames) +exit(msg, skip=True) diff --git a/build_tools/circle/push_doc.sh b/build_tools/circle/push_doc.sh new file mode 100755 index 0000000..2423929 --- /dev/null +++ b/build_tools/circle/push_doc.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# This script is meant to be called in the "deploy" step defined in +# circle.yml. See https://circleci.com/docs/ for more details. +# The behavior of the script is controlled by environment variables defined +# in the circle.yml in the top level folder of the project. + + +if [ -z $CIRCLE_PROJECT_USERNAME ]; +then USERNAME="sklearn-ci"; +else USERNAME=$CIRCLE_PROJECT_USERNAME; +fi + +DOC_REPO="scikit-learn.github.io" + +MSG="Pushing the docs for revision for branch: $CIRCLE_BRANCH, commit $CIRCLE_SHA1" + +cd $HOME +if [ !
-d $DOC_REPO ]; +then git clone "git@github.com:scikit-learn/"$DOC_REPO".git"; +fi +cd $DOC_REPO +git checkout master +git reset --hard origin/master +git rm -rf dev/ && rm -rf dev/ +cp -R $HOME/scikit-learn/doc/_build/html/stable dev +git config --global user.email "olivier.grisel+sklearn-ci@gmail.com" +git config --global user.name $USERNAME +git config --global push.default matching +git add -f dev/ +git commit -m "$MSG" dev +git push + +echo $MSG diff --git a/build_tools/cythonize.py b/build_tools/cythonize.py new file mode 100755 index 0000000..b01da58 --- /dev/null +++ b/build_tools/cythonize.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +""" cythonize + +Cythonize pyx files into C files as needed. + +Usage: cythonize [root_dir] + +Default [root_dir] is 'sklearn'. + +Checks pyx files to see if they have been changed relative to their +corresponding C files. If they have, then runs cython on these files to +recreate the C files. + +The script detects changes in the pyx/pxd files using checksums +[or hashes] stored in a database file + +Simple script to invoke Cython on all .pyx +files; while waiting for a proper build system. Uses file hashes to +figure out if rebuild is needed. + +It is called by ./setup.py sdist so that sdist package can be installed without +cython + +Originally written by Dag Sverre Seljebotn, and adapted from statsmodel 0.6.1 +(Modified BSD 3-clause) + +We copied it for scikit-learn. + +Note: this script does not check any of the dependent C libraries; it only +operates on the Cython .pyx files or their corresponding Cython header (.pxd) +files. +""" +# Author: Arthur Mensch +# Author: Raghav R V +# +# License: BSD 3 clause + +from __future__ import division, print_function, absolute_import + +import os +import re +import sys +import hashlib +import subprocess + +HASH_FILE = 'cythonize.dat' +DEFAULT_ROOT = 'sklearn' + +# WindowsError is not defined on unix systems +try: + WindowsError +except NameError: + WindowsError = None + + +def cythonize(cython_file, gen_file): + try: + from Cython.Compiler.Version import version as cython_version + from distutils.version import LooseVersion + if LooseVersion(cython_version) < LooseVersion('0.21'): + raise Exception('Building scikit-learn requires Cython >= 0.21') + + except ImportError: + pass + + flags = ['--fast-fail'] + if gen_file.endswith('.cpp'): + flags += ['--cplus'] + + try: + try: + rc = subprocess.call(['cython'] + + flags + ["-o", gen_file, cython_file]) + if rc != 0: + raise Exception('Cythonizing %s failed' % cython_file) + except OSError: + # There are ways of installing Cython that don't result in a cython + # executable on the path, see scipy issue gh-2397. 
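+            # Fall back to invoking the Cython compiler through the current Python interpreter via its setuptools entry point.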
+ rc = subprocess.call([sys.executable, '-c', + 'import sys; from Cython.Compiler.Main ' + 'import setuptools_main as main;' + ' sys.exit(main())'] + flags + + ["-o", gen_file, cython_file]) + if rc != 0: + raise Exception('Cythonizing %s failed' % cython_file) + except OSError: + raise OSError('Cython needs to be installed') + + +def load_hashes(filename): + """Load the hashes dict from the hashfile""" + # { filename : (sha1 of header if available or 'NA', + # sha1 of input, + # sha1 of output) } + + hashes = {} + try: + with open(filename, 'r') as cython_hash_file: + for hash_record in cython_hash_file: + (filename, header_hash, + cython_hash, gen_file_hash) = hash_record.split() + hashes[filename] = (header_hash, cython_hash, gen_file_hash) + except (KeyError, ValueError, AttributeError, IOError): + hashes = {} + return hashes + + +def save_hashes(hashes, filename): + """Save the hashes dict to the hashfile""" + with open(filename, 'w') as cython_hash_file: + for key, value in hashes.items(): + cython_hash_file.write("%s %s %s %s\n" + % (key, value[0], value[1], value[2])) + + +def sha1_of_file(filename): + h = hashlib.sha1() + with open(filename, "rb") as f: + h.update(f.read()) + return h.hexdigest() + + +def clean_path(path): + """Clean the path""" + path = path.replace(os.sep, '/') + if path.startswith('./'): + path = path[2:] + return path + + +def get_hash_tuple(header_path, cython_path, gen_file_path): + """Get the hashes from the given files""" + + header_hash = (sha1_of_file(header_path) + if os.path.exists(header_path) else 'NA') + from_hash = sha1_of_file(cython_path) + to_hash = (sha1_of_file(gen_file_path) + if os.path.exists(gen_file_path) else 'NA') + + return header_hash, from_hash, to_hash + + +def cythonize_if_unchanged(path, cython_file, gen_file, hashes): + full_cython_path = os.path.join(path, cython_file) + full_header_path = full_cython_path.replace('.pyx', '.pxd') + full_gen_file_path = os.path.join(path, gen_file) + + current_hash = get_hash_tuple(full_header_path, full_cython_path, + full_gen_file_path) + + if current_hash == hashes.get(clean_path(full_cython_path)): + print('%s has not changed' % full_cython_path) + return + + print('Processing %s' % full_cython_path) + cythonize(full_cython_path, full_gen_file_path) + + # changed target file, recompute hash + current_hash = get_hash_tuple(full_header_path, full_cython_path, + full_gen_file_path) + + # Update the hashes dict with the new hash + hashes[clean_path(full_cython_path)] = current_hash + + +def check_and_cythonize(root_dir): + print(root_dir) + hashes = load_hashes(HASH_FILE) + + for cur_dir, dirs, files in os.walk(root_dir): + for filename in files: + if filename.endswith('.pyx'): + gen_file_ext = '.c' + # Cython files with libcpp imports should be compiled to cpp + with open(os.path.join(cur_dir, filename), 'rb') as f: + data = f.read() + m = re.search(b"libcpp", data, re.I | re.M) + if m: + gen_file_ext = ".cpp" + cython_file = filename + gen_file = filename.replace('.pyx', gen_file_ext) + cythonize_if_unchanged(cur_dir, cython_file, gen_file, hashes) + + # Save hashes once per module. This prevents cythonizing prev. 
+ # files again when debugging broken code in a single file + save_hashes(hashes, HASH_FILE) + + +def main(root_dir=DEFAULT_ROOT): + check_and_cythonize(root_dir) + + +if __name__ == '__main__': + try: + root_dir_arg = sys.argv[1] + except IndexError: + root_dir_arg = DEFAULT_ROOT + main(root_dir_arg) diff --git a/build_tools/travis/after_success.sh b/build_tools/travis/after_success.sh new file mode 100755 index 0000000..a4613cc --- /dev/null +++ b/build_tools/travis/after_success.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# This script is meant to be called by the "after_success" step defined in +# .travis.yml. See http://docs.travis-ci.com/ for more details. + +# License: 3-clause BSD + +set -e + +if [[ "$COVERAGE" == "true" ]]; then + # Need to run coveralls from a git checkout, so we copy .coverage + # from TEST_DIR where nosetests has been run + cp $TEST_DIR/.coverage $TRAVIS_BUILD_DIR + cd $TRAVIS_BUILD_DIR + # Ignore coveralls failures as the coveralls server is not + # very reliable but we don't want travis to report a failure + # in the github UI just because the coverage report failed to + # be published. + coveralls || echo "Coveralls upload failed" +fi diff --git a/build_tools/travis/flake8_diff.sh b/build_tools/travis/flake8_diff.sh new file mode 100755 index 0000000..59065c5 --- /dev/null +++ b/build_tools/travis/flake8_diff.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# This script is used in Travis to check that PRs do not add obvious +# flake8 violations. It relies on two things: +# - find common ancestor between branch and +# scikit-learn/scikit-learn remote +# - run flake8 --diff on the diff between the branch and the common +# ancestor +# +# Additional features: +# - the line numbers in Travis match the local branch on the PR +# author machine. +# - ./build_tools/travis/flake8_diff.sh can be run locally for quick +# turn-around + +set -e +# pipefail is necessary to propagate exit codes +set -o pipefail + +PROJECT=scikit-learn/scikit-learn +PROJECT_URL=https://github.com/$PROJECT.git + +echo "Remotes:" +git remote --verbose + +# Find the remote with the project name (upstream in most cases) +REMOTE=$(git remote -v | grep $PROJECT | cut -f1 | head -1 || echo '') + +# Add a temporary remote if needed. For example this is necessary when +# Travis is configured to run in a fork. In this case 'origin' is the +# fork and not the reference repo we want to diff against. +if [[ -z "$REMOTE" ]]; then + TMP_REMOTE=tmp_reference_upstream + REMOTE=$TMP_REMOTE + git remote add $REMOTE $PROJECT_URL +fi + +if [[ "$TRAVIS" == "true" ]]; then + if [[ "$TRAVIS_PULL_REQUEST" == "false" ]] + then + # In main repo, using TRAVIS_COMMIT_RANGE to test the commits + # that were pushed into a branch + if [[ "$PROJECT" == "$TRAVIS_REPO_SLUG" ]]; then + if [[ -z "$TRAVIS_COMMIT_RANGE" ]]; then + echo "New branch, no commit range from Travis so passing this test by convention" + exit 0 + fi + COMMIT_RANGE=$TRAVIS_COMMIT_RANGE + else + # Travis does the git clone with a limited depth (50 at the time of + # writing). This may not be enough to find the common ancestor with + # $REMOTE/master so we unshallow the git checkout + git fetch --unshallow || echo "Unshallowing the git checkout failed" + fi + else + # We want to fetch the code as it is in the PR branch and not + # the result of the merge into master. This way line numbers + # reported by Travis will match with the local code. 
+ BRANCH_NAME=travis_pr_$TRAVIS_PULL_REQUEST + git fetch $REMOTE pull/$TRAVIS_PULL_REQUEST/head:$BRANCH_NAME + git checkout $BRANCH_NAME + fi +fi + + +echo -e '\nLast 2 commits:' +echo '--------------------------------------------------------------------------------' +git log -2 --pretty=short + +# If not using the commit range from Travis we need to find the common +# ancestor between HEAD and $REMOTE/master +if [[ -z "$COMMIT_RANGE" ]]; then + REMOTE_MASTER_REF="$REMOTE/master" + # Make sure that $REMOTE_MASTER_REF is a valid reference + git fetch $REMOTE master:refs/$REMOTE_MASTER_REF + + COMMIT=$(git merge-base @ $REMOTE_MASTER_REF) || \ + echo "No common ancestor found for $(git show @ -q) and $(git show $REMOTE_MASTER_REF -q)" + + if [[ -n "$TMP_REMOTE" ]]; then + git remote remove $TMP_REMOTE + fi + + if [ -z "$COMMIT" ]; then + exit 1 + fi + + echo -e "\nCommon ancestor between HEAD and $REMOTE_MASTER_REF is:" + echo '--------------------------------------------------------------------------------' + git show --no-patch $COMMIT + + COMMIT_RANGE="$(git rev-parse --short $COMMIT)..$(git rev-parse --short @)" + +else + echo "Got the commit range from Travis: $COMMIT_RANGE" +fi + +echo -e '\nRunning flake8 on the diff in the range' "$COMMIT_RANGE" \ + "($(git rev-list $COMMIT_RANGE | wc -l) commit(s)):" +echo '--------------------------------------------------------------------------------' + +# We ignore files from sklearn/externals. Unfortunately there is no +# way to do it with flake8 directly (the --exclude does not seem to +# work with --diff). We could use the exclude magic in the git pathspec +# ':!sklearn/externals' but it is only available on git 1.9 and Travis +# uses git 1.8. +# We need the following command to exit with 0 hence the echo in case +# there is no match +MODIFIED_FILES=$(git diff --name-only $COMMIT_RANGE | grep -v 'sklearn/externals' || echo "no_match") + +if [[ "$MODIFIED_FILES" == "no_match" ]]; then + echo "No file outside sklearn/externals has been modified" +else + # Conservative approach: diff without context so that code that + # was not changed does not create failures + git diff --unified=0 $COMMIT -- $MODIFIED_FILES | flake8 --diff --show-source +fi +echo -e "No problem detected by flake8\n" diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh new file mode 100755 index 0000000..b5825d0 --- /dev/null +++ b/build_tools/travis/install.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# This script is meant to be called by the "install" step defined in +# .travis.yml. See http://docs.travis-ci.com/ for more details. +# The behavior of the script is controlled by environment variables defined +# in the .travis.yml in the top level folder of the project. + +# License: 3-clause BSD + +# Travis clones the scikit-learn/scikit-learn repository into a local repository. +# We use a cached directory with three scikit-learn repositories (one for each +# matrix entry) from which we pull from the local Travis repository. This allows +# us to keep build artefacts for gcc + cython, and gain time + +set -e + +# Fix the compilers to work around the Python 3.4 build unexpectedly +# looking up g++44. +export CC=gcc +export CXX=g++ + +echo 'List files from cached directories' +echo 'pip:' +ls $HOME/.cache/pip + + +if [[ "$DISTRIB" == "conda" ]]; then + # Deactivate the travis-provided virtual environment and setup a + # conda-based environment instead + deactivate + + # Use the miniconda installer for faster download / install of conda + # itself + pushd .
+ cd + mkdir -p download + cd download + echo "Cached in $HOME/download :" + ls -l + echo + if [[ ! -f miniconda.sh ]] + then + wget http://repo.continuum.io/miniconda/Miniconda-3.6.0-Linux-x86_64.sh \ + -O miniconda.sh + fi + chmod +x miniconda.sh && ./miniconda.sh -b + cd .. + export PATH=/home/travis/miniconda/bin:$PATH + conda update --yes conda + popd + + # Configure the conda environment and put it in the path using the + # provided versions + if [[ "$INSTALL_MKL" == "true" ]]; then + conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ + numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION numpy scipy \ + cython=$CYTHON_VERSION libgfortran mkl flake8 + else + conda create -n testenv --yes python=$PYTHON_VERSION pip nose \ + numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION cython=$CYTHON_VERSION \ + libgfortran nomkl + fi + source activate testenv + + # Install nose-timer via pip + pip install nose-timer + +elif [[ "$DISTRIB" == "ubuntu" ]]; then + # At the time of writing numpy 1.9.1 is included in the travis + # virtualenv but we want to use the numpy installed through apt-get + # install. + deactivate + # Create a new virtualenv using system site packages for python, numpy + # and scipy + virtualenv --system-site-packages testvenv + source testvenv/bin/activate + pip install nose nose-timer cython + +elif [[ "$DISTRIB" == "scipy-dev-wheels" ]]; then + # Set up our own virtualenv environment to avoid travis' numpy. + # This venv points to the python interpreter of the travis build + # matrix. + virtualenv --python=python ~/testvenv + source ~/testvenv/bin/activate + pip install --upgrade pip setuptools + + # We use the default Python virtualenv provided by travis + echo "Installing numpy master wheel" + pip install --pre --upgrade --no-index --timeout=60 \ + --trusted-host travis-dev-wheels.scipy.org \ + -f https://travis-dev-wheels.scipy.org/ numpy scipy + pip install nose nose-timer cython +fi + +if [[ "$COVERAGE" == "true" ]]; then + pip install coverage coveralls +fi + +if [[ "$SKIP_TESTS" == "true" ]]; then + echo "No need to build scikit-learn when not running the tests" +else + if [ ! -d "$CACHED_BUILD_DIR" ]; then + mkdir -p $CACHED_BUILD_DIR + fi + + rsync -av --exclude '.git/' --exclude='testvenv/' \ + $TRAVIS_BUILD_DIR $CACHED_BUILD_DIR + + cd $CACHED_BUILD_DIR/scikit-learn + + # Build scikit-learn in the install.sh script to collapse the verbose + # build output in the travis output when it succeeds. + python --version + python -c "import numpy; print('numpy %s' % numpy.__version__)" + python -c "import scipy; print('scipy %s' % scipy.__version__)" + python setup.py develop +fi + +if [[ "$RUN_FLAKE8" == "true" ]]; then + conda install flake8 +fi diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh new file mode 100755 index 0000000..f1b8043 --- /dev/null +++ b/build_tools/travis/test_script.sh @@ -0,0 +1,49 @@ +#!/bin/bash +# This script is meant to be called by the "script" step defined in +# .travis.yml. See http://docs.travis-ci.com/ for more details. +# The behavior of the script is controlled by environment variabled defined +# in the .travis.yml in the top level folder of the project. 
+ +# License: 3-clause BSD + +set -e + +python --version +python -c "import numpy; print('numpy %s' % numpy.__version__)" +python -c "import scipy; print('scipy %s' % scipy.__version__)" +python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" + +run_tests() { + # Get into a temp directory to run test from the installed scikit learn and + # check if we do not leave artifacts + mkdir -p $TEST_DIR + # We need the setup.cfg for the nose settings + cp setup.cfg $TEST_DIR + cd $TEST_DIR + + # Skip tests that require large downloads over the network to save bandwidth + # usage as travis workers are stateless and therefore traditional local + # disk caching does not work. + export SKLEARN_SKIP_NETWORK_TESTS=1 + + if [[ "$COVERAGE" == "true" ]]; then + nosetests -s --with-coverage --with-timer --timer-top-n 20 sklearn + else + nosetests -s --with-timer --timer-top-n 20 sklearn + fi + + # Is directory still empty ? + ls -ltra + + # Test doc + cd $CACHED_BUILD_DIR/scikit-learn + make test-doc test-sphinxext +} + +if [[ "$RUN_FLAKE8" == "true" ]]; then + source build_tools/travis/flake8_diff.sh +fi + +if [[ "$SKIP_TESTS" != "true" ]]; then + run_tests +fi diff --git a/build_tools/windows/windows_testing_downloader.ps1 b/build_tools/windows/windows_testing_downloader.ps1 new file mode 100644 index 0000000..d72b678 --- /dev/null +++ b/build_tools/windows/windows_testing_downloader.ps1 @@ -0,0 +1,270 @@ +# Author: Kyle Kastner +# License: BSD 3 clause + +# This script is a helper to download the base python, numpy, and scipy +# packages from their respective websites. +# To quickly execute the script, run the following Powershell command: +# powershell.exe -ExecutionPolicy unrestricted "iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/scikit-learn/scikit-learn/master/continuous_integration/windows/windows_testing_downloader.ps1'))" + +# This is a stopgap solution to make Windows testing easier +# until Windows CI issues are resolved. + +# Rackspace's default Windows VMs have several security features enabled by default. +# The DisableInternetExplorerESC function disables a feature which +# prevents any webpage from opening without explicit permission. +# This is a default setting of Windows VMs on Rackspace, and makes it annoying to +# download other packages to test! + +# Powershell scripts are also disabled by default. One must run the command: +# set-executionpolicy unrestricted +# from a Powershell terminal with administrator rights to enable scripts. +# To start an administrator Powershell terminal, right click second icon from the left on Windows Server 2012's bottom taskbar. + +param ( + [string]$python = "None", + [string]$nogit = "False" +) + +function DisableInternetExplorerESC { + # Disables InternetExplorerESC to enable easier manual downloads of testing packages. + # http://stackoverflow.com/questions/9368305/disable-ie-security-on-windows-server-via-powershell + $AdminKey = "HKLM:\SOFTWARE\Microsoft\Active Setup\Installed Components\{A509B1A7-37EF-4b3f-8CFC-4F3A74704073}" + $UserKey = "HKLM:\SOFTWARE\Microsoft\Active Setup\Installed Components\{A509B1A8-37EF-4b3f-8CFC-4F3A74704073}" + Set-ItemProperty -Path $AdminKey -Name "IsInstalled" -Value 0 + Set-ItemProperty -Path $UserKey -Name "IsInstalled" -Value 0 + Stop-Process -Name Explorer + Write-Host "IE Enhanced Security Configuration (ESC) has been disabled." 
-ForegroundColor Green +} + +function DownloadPackages ($package_dict, $append_string) { + $webclient = New-Object System.Net.WebClient + + ForEach ($key in $package_dict.Keys) { + $url = $package_dict[$key] + $file = $key + $append_string + if ($url -match "(\.*exe$)") { + $file = $file + ".exe" + } elseif ($url -match "(\.*msi$)") { + $file = $file + ".msi" + } else { + $file = $file + ".py" + } + $basedir = $pwd.Path + "\" + $filepath = $basedir + $file + Write-Host "Downloading" $file "from" $url + + # Retry up to 5 times in case of network transient errors. + $retry_attempts = 5 + for($i=0; $i -lt $retry_attempts; $i++){ + try{ + $webclient.DownloadFile($url, $filepath) + break + } + Catch [Exception]{ + Start-Sleep 1 + } + } + Write-Host "File saved at" $filepath + } +} + +function InstallPython($match_string) { + $pkg_regex = "python" + $match_string + "*" + $pkg = Get-ChildItem -Filter $pkg_regex -Name + Invoke-Expression -Command "msiexec /qn /i $pkg" + + Write-Host "Installing Python" + Start-Sleep 25 + Write-Host "Python installation complete" +} + +function InstallPip($match_string, $python_version) { + $pkg_regex = "get-pip" + $match_string + "*" + $py = $python_version -replace "\." + $pkg = Get-ChildItem -Filter $pkg_regex -Name + $python_path = "C:\Python" + $py + "\python.exe" + Invoke-Expression -Command "$python_path $pkg" +} + +function EnsurePip($python_version) { + $py = $python_version -replace "\." + $python_path = "C:\Python" + $py + "\python.exe" + Invoke-Expression -Command "$python_path -m ensurepip" +} + +function GetPythonHome($python_version) { + $py = $python_version -replace "\." + $pypath = "C:\Python" + $py + "\" + return $pypath +} + +function GetPipPath($python_version) { + $py = $python_version -replace "\." + $pypath = GetPythonHome $python_version + if ($py.StartsWith("3")) { + $pip = $pypath + "Scripts\pip3.exe" + } else { + $pip = $pypath + "Scripts\pip.exe" + } + return $pip +} + +function PipInstall($pkg_name, $python_version, $extra_args) { + $pip = GetPipPath $python_version + Invoke-Expression -Command "$pip install $pkg_name" +} + +function InstallNose($python_version) { + PipInstall "nose" $python_version +} + +function WheelInstall($name, $url, $python_version) { + $pip = GetPipPath $python_version + $args = "install --use-wheel --no-index" + Invoke-Expression -Command "$pip $args $url $name" +} + +function InstallWheel($python_version) { + PipInstall "virtualenv" $python_version + PipInstall "wheel" $python_version +} + +function InstallNumpy($package_dict, $python_version) { + #Don't pass name so we can use URL directly. + WheelInstall "" $package_dict["numpy"] $python_version +} + +function InstallScipy($package_dict, $python_version) { + #Don't pass name so we can use URL directly. 
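+    # Passing an empty package name below means WheelInstall hands pip only the
+    # wheel URL, so pip installs that exact wheel file rather than resolving a
+    # package by name.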
+ WheelInstall "" $package_dict["scipy"] $python_version +} + +function InstallGit { + $pkg_regex = "git*" + $pkg = Get-ChildItem -Filter $pkg_regex -Name + $pkg_cmd = $pwd.ToString() + "\" + $pkg + " /verysilent" + Invoke-Expression -Command $pkg_cmd + + Write-Host "Installing Git" + Start-Sleep 20 + # Remove the installer - seems to cause weird issues with Git Bash + Invoke-Expression -Command "rm git.exe" + Write-Host "Git installation complete" +} + +function ReadAndUpdateFromRegistry { + # http://stackoverflow.com/questions/14381650/how-to-update-windows-powershell-session-environment-variables-from-registry + foreach($level in "Machine","User") { + [Environment]::GetEnvironmentVariables($level).GetEnumerator() | % { + # For Path variables, append the new values, if they're not already in there + if($_.Name -match 'Path$') { + $_.Value = ($((Get-Content "Env:$($_.Name)") + ";$($_.Value)") -split ';' | Select -unique) -join ';' + } + $_ + } | Set-Content -Path { "Env:$($_.Name)" } + } +} + +function UpdatePaths($python_version) { + #This function makes local path updates required in order to install Python and supplementary packages in a single shell. + $pypath = GetPythonHome $python_version + $env:PATH = $env:PATH + ";" + $pypath + $env:PYTHONPATH = $pypath + "DLLs;" + $pypath + "Lib;" + $pypath + "Lib\site-packages" + $env:PYTHONHOME = $pypath + Write-Host "PYTHONHOME temporarily set to" $env:PYTHONHOME + Write-Host "PYTHONPATH temporarily set to" $env:PYTHONPATH + Write-Host "PATH temporarily set to" $env:PATH +} + +function Python27URLs { + # Function returns a dictionary of packages to download for Python 2.7. + $urls = @{ + "python" = "https://www.python.org/ftp/python/2.7.7/python-2.7.7.msi" + "numpy" = "http://28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com/numpy-1.8.1-cp27-none-win32.whl" + "scipy" = "http://28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com/scipy-0.14.0-cp27-none-win32.whl" + "get-pip" = "https://bootstrap.pypa.io/get-pip.py" + } + return $urls +} + +function Python34URLs { + # Function returns a dictionary of packages to download for Python 3.4. + $urls = @{ + "python" = "https://www.python.org/ftp/python/3.4.1/python-3.4.1.msi" + "numpy" = "http://28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com/numpy-1.8.1-cp34-none-win32.whl" + "scipy" = "http://28daf2247a33ed269873-7b1aad3fab3cc330e1fd9d109892382a.r6.cf2.rackcdn.com/scipy-0.14.0-cp34-none-win32.whl" + } + return $urls +} + +function GitURLs { + # Function returns a dictionary of packages to download for Git + $urls = @{ + "git" = "https://github.com/msysgit/msysgit/releases/download/Git-1.9.4-preview20140611/Git-1.9.4-preview20140611.exe" + } + return $urls +} + +function main { + $versions = @{ + "2.7" = Python27URLs + "3.4" = Python34URLs + } + + if ($nogit -eq "False") { + Write-Host "Downloading and installing Gitbash" + $urls = GitURLs + DownloadPackages $urls "" + InstallGit ".exe" + } + + if (($python -eq "None")) { + Write-Host "Installing all supported python versions" + Write-Host "Current versions supported are:" + ForEach ($key in $versions.Keys) { + Write-Host $key + $all_python += @($key) + } + } elseif(!($versions.ContainsKey($python))) { + Write-Host "Python version not recognized!" 
+ Write-Host "Pass python version with -python" + Write-Host "Current versions supported are:" + ForEach ($key in $versions.Keys) { + Write-Host $key + } + return + } else { + $all_python += @($python) + } + ForEach ($py in $all_python) { + Write-Host "Installing Python" $py + DisableInternetExplorerESC + $pystring = $py -replace "\." + $pystring = "_py" + $pystring + $package_dict = $versions[$py] + + # This will download the whl packages as well which is + # clunky but makes configuration simpler. + DownloadPackages $package_dict $pystring + UpdatePaths $py + InstallPython $pystring + ReadAndUpdateFromRegistry + if ($package_dict.ContainsKey("get-pip")) { + InstallPip $pystring $py + } else { + EnsurePip $py + } + InstallNose $py + InstallWheel $py + + # The installers below here use wheel packages. + # Wheels were created from CGohlke's installers with + # wheel convert + # These are hosted in Rackspace Cloud Files. + InstallNumpy $package_dict $py + InstallScipy $package_dict $py + } + return +} + +main diff --git a/circle.yml b/circle.yml new file mode 100644 index 0000000..dd26453 --- /dev/null +++ b/circle.yml @@ -0,0 +1,23 @@ +dependencies: + cache_directories: + - "~/scikit_learn_data" + - "~/scikit-learn.github.io" + - "~/download" + # Check whether the doc build is required, install build dependencies and + # run sphinx to build the doc. + override: + - ./build_tools/circle/build_doc.sh +test: + # Grep error on the documentation + override: + - cat ~/log.txt && if grep -q "Traceback (most recent call last):" ~/log.txt; then false; else true; fi +deployment: + push: + branch: master + commands: + - bash build_tools/circle/push_doc.sh +general: + # Open the doc to the API + artifacts: + - "doc/_build/html" + - "~/log.txt" diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..ca5e60a --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,105 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD ?= sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml pickle json latex latexpdf changes linkcheck doctest optipng + +all: html-noplot + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + -rm -rf auto_examples/ + -rm -rf generated/* + -rm -rf modules/generated/* + +html: + # These two lines make the build a bit more lengthy, and the + # the embedding of images more robust + rm -rf $(BUILDDIR)/html/_images + #rm -rf _build/doctrees/ + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html/stable" + +html-noplot: + $(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + @echo + @echo "Build finished. 
The HTML pages are in $(BUILDDIR)/html/stable." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + make -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +download-data: + python -c "from sklearn.datasets.lfw import check_fetch_lfw; check_fetch_lfw()" + +# Optimize PNG files. Needs OptiPNG. Change the -P argument to the number of +# cores you have available, so -P 64 if you have a real computer ;) +optipng: + find _build auto_examples */generated -name '*.png' -print0 \ + | xargs -0 -n 1 -P 4 optipng -o10 + +dist: html latexpdf + cp _build/latex/user_guide.pdf _build/html/stable/_downloads/scikit-learn-docs.pdf diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000..ea22f55 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,37 @@ +#Documentation for scikit-learn + +This section contains the full manual and web page as displayed in +http://scikit-learn.org. To generate the full web page, including +the example gallery (this might take a while): + + make html + +Or, if you'd rather not build the example gallery: + + make html-noplot + +That should create all the doc in directory _build/html + +To build the PDF manual, run + + make latexpdf + + +The website is hosted at github and can be updated manually (for releases) +by pushing to the https://github.com/scikit-learn/scikit-learn.github.io repository. + +It's recommended to run OptiPNG, before uploading the website. +The PNG files generated by Matplotlib tend to be ~20% too big, and they're +costing us bandwidth. You can run OptiPNG with:: + + make optipng + +#Development documentation automated build + +A Rackspace cloud server named 'docbuilder' is continuously building the master branch +to update the http://scikit-learn.org/dev tree of the website. + +The configuration of this server is managed at: + + https://github.com/scikit-learn/sklearn-docbuilder + diff --git a/doc/about.rst b/doc/about.rst new file mode 100644 index 0000000..6d6fc1a --- /dev/null +++ b/doc/about.rst @@ -0,0 +1,229 @@ + + +About us +======== + +.. include:: ../AUTHORS.rst + +.. seealso:: + + :ref:`How you can contribute to the project ` + +.. 
_citing-scikit-learn: + +Citing scikit-learn +------------------- + +If you use scikit-learn in a scientific publication, we would appreciate +citations to the following paper: + + `Scikit-learn: Machine Learning in Python + `_, Pedregosa + *et al.*, JMLR 12, pp. 2825-2830, 2011. + + Bibtex entry:: + + @article{scikit-learn, + title={Scikit-learn: Machine Learning in {P}ython}, + author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. + and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. + and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and + Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, + journal={Journal of Machine Learning Research}, + volume={12}, + pages={2825--2830}, + year={2011} + } + +If you want to cite scikit-learn for its API or design, you may also want to consider the +following paper: + + `API design for machine learning software: experiences from the scikit-learn + project `_, Buitinck *et al.*, 2013. + + Bibtex entry:: + + @inproceedings{sklearn_api, + author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and + Fabian Pedregosa and Andreas Mueller and Olivier Grisel and + Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort + and Jaques Grobler and Robert Layton and Jake VanderPlas and + Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux}, + title = {{API} design for machine learning software: experiences from the scikit-learn + project}, + booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning}, + year = {2013}, + pages = {108--122}, + } + +Artwork +------- + +High quality PNG and SVG logos are available in the `doc/logos/ `_ source directory. + +.. image:: images/scikit-learn-logo-notext.png + :align: center + +Funding +------- + +`INRIA `_ actively supports this project. It has +provided funding for Fabian Pedregosa (2010-2012), Jaques Grobler +(2012-2013) and Olivier Grisel (2013-2015) to work on this project +full-time. It also hosts coding sprints and other events. + +.. image:: images/inria-logo.jpg + :width: 200pt + :align: center + +`Paris-Saclay Center for Data Science `_ +funded one year for a developer to work on the project full-time +(2014-2015). + +.. image:: images/cds-logo.png + :width: 200pt + :align: center + +`NYU Moore-Sloan Data Science Environment `_ +funds Andreas Mueller (2014-2015) to work on this project. The Moore-Sloan Data Science +Environment also funds several students to work on the project part-time. + +.. image:: images/nyu_short_color.png + :width: 200pt + :align: center + +The following students were sponsored by `Google `_ +to work on scikit-learn through the +`Google Summer of Code `_ +program. + +- 2007 - David Cournapeau +- 2011 - `Vlad Niculae`_ +- 2012 - `Vlad Niculae`_, Immanuel Bayer. +- 2013 - Kemal Eren, Nicolas Trésegnie +- 2014 - Hamzeh Alsalhi, Issam Laradji, Maheshakya Wijewardena, Manoj Kumar. +- 2015 - `Raghav R V `_, Wei Xue +- 2016 - `Nelson Liu `_, `YenChen Lin `_ + +It also provided funding for sprints and events around scikit-learn. If +you would like to participate in the next Google Summer of code +program, please see `this page +`_. + +The `NeuroDebian `_ project providing `Debian +`_ packaging and contributions is supported by +`Dr. James V. Haxby `_ (`Dartmouth +College `_). + +The `PSF `_ helped find and manage funding for our +2011 Granada sprint. More information can be found `here +`__ + +`tinyclues `_ funded the 2011 international Granada +sprint. 
+ + +Donating to the project +~~~~~~~~~~~~~~~~~~~~~~~ + +If you are interested in donating to the project or to one of our code-sprints, you can use +the *Paypal* button below or the `NumFOCUS Donations Page `_ (if you use the latter, please indicate that you are donating for the scikit-learn project). + +All donations will be handled by `NumFOCUS +`_, a non-profit-organization which is +managed by a board of `Scipy community members +`_. NumFOCUS's mission is to foster +scientific computing software, in particular in Python. As a fiscal home +of scikit-learn, it ensures that money is available when needed to keep +the project funded and available while in compliance with tax regulations. + +The received donations for the scikit-learn project mostly will go towards covering travel-expenses +for code sprints, as well as towards the organization budget of the project [#f1]_. + +.. raw :: html + +

+   [PayPal donate button form]
+ +.. rubric:: Notes + +.. [#f1] Regarding the organization budget in particular, we might use some of the donated funds to pay for other project expenses such as DNS, hosting or continuous integration services. + + +The 2013 Paris international sprint +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +|center-div| |telecom| |tinyclues| |afpy| |FNRS| + + |end-div| + + + +.. |center-div| raw:: html + +
+ + +.. |telecom| image:: http://f.hypotheses.org/wp-content/blogs.dir/331/files/2011/03/Logo-TPT.jpg + :width: 120pt + :target: http://www.telecom-paristech.fr/ + + +.. |tinyclues| image:: https://www.tinyclues.com/web/wp-content/uploads/2016/06/Tinyclues-PNG-logo.png + :width: 120pt + :target: https://www.tinyclues.com/ + + +.. |afpy| image:: https://www.afpy.org/logo.png + :width: 120pt + :target: https://www.afpy.org + + +.. |SGR| image:: http://www.svi.cnrs-bellevue.fr/wikimedia/images/Logo_svi_inp.png + :width: 120pt + :target: http://www.svi.cnrs-bellevue.fr + +.. |FNRS| image:: http://www.fnrs.be/en/images/FRS-FNRS_rose_transp.png + :width: 120pt + :target: http://www.frs-fnrs.be/ + +.. figure:: http://sites.uclouvain.be/dysco/pmwiki/uploads/Main/dysco.gif + :width: 120pt + :target: http://sites.uclouvain.be/dysco/ + + IAP VII/19 - DYSCO + +.. |end-div| raw:: html + +
+ +*For more information on this sprint, see* `here +`__ + + +Infrastructure support +---------------------- + +- We would like to thank `Rackspace `_ for providing + us with a free `Rackspace Cloud `_ account to + automatically build the documentation and the example gallery from for the + development version of scikit-learn using `this tool + `_. + +- We would also like to thank `Shining Panda + `_ for free CPU time on their Continuous + Integration server. + diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000..9c75922 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,282 @@ +# -*- coding: utf-8 -*- +# +# scikit-learn documentation build configuration file, created by +# sphinx-quickstart on Fri Jan 8 09:13:42 2010. +# +# This file is execfile()d with the current directory set to its containing +# dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +from __future__ import print_function +import sys +import os +from sklearn.externals.six import u + +# If extensions (or modules to document with autodoc) are in another +# directory, add these directories to sys.path here. If the directory +# is relative to the documentation root, use os.path.abspath to make it +# absolute, like shown here. +sys.path.insert(0, os.path.abspath('sphinxext')) + +from github_link import make_linkcode_resolve +import sphinx_gallery + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [ + 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', + 'numpy_ext.numpydoc', + 'sphinx.ext.linkcode', 'sphinx.ext.doctest', + 'sphinx_gallery.gen_gallery', +] + +# pngmath / imgmath compatibility layer for different sphinx versions +import sphinx +from distutils.version import LooseVersion +if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): + extensions.append('sphinx.ext.pngmath') +else: + extensions.append('sphinx.ext.imgmath') + + +autodoc_default_flags = ['members', 'inherited-members'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['templates'] + +# generate autosummary even if no references +autosummary_generate = True + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8' + +# Generate the plots for the gallery +plot_gallery = True + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u('scikit-learn') +copyright = u('2010 - 2016, scikit-learn developers (BSD License)') + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +import sklearn +version = sklearn.__version__ +# The full version, including alpha/beta/rc tags. +release = sklearn.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. 
+#today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +#unused_docs = [] + +# List of directories, relative to source directory, that shouldn't be +# searched for source files. +exclude_trees = ['_build', 'templates', 'includes'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +add_function_parentheses = False + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. +html_theme = 'scikit-learn' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +html_theme_options = {'oldversion': False, 'collapsiblesidebar': True, + 'google_analytics': True, 'surveybanner': False, + 'sprintbanner': True} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = ['themes'] + + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +html_short_title = 'scikit-learn' + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +html_logo = 'logos/scikit-learn-logo-small.png' + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +html_favicon = 'logos/favicon.ico' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['images'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +html_domain_indices = False + +# If false, no index is generated. +html_use_index = False + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. 
+#html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'scikit-learndoc' + + +# -- Options for LaTeX output ------------------------------------------------ + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass +# [howto/manual]). +latex_documents = [('index', 'user_guide.tex', u('scikit-learn user guide'), + u('scikit-learn developers'), 'manual'), ] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +latex_logo = "logos/scikit-learn-logo.png" + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +latex_preamble = r""" +\usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm}\usepackage{morefloats} +\usepackage{enumitem} \setlistdepth{10} +""" + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +latex_domain_indices = False + +trim_doctests_flags = True + + +sphinx_gallery_conf = { + 'doc_module': 'sklearn', + 'reference_url': { + 'sklearn': None, + 'matplotlib': 'http://matplotlib.org', + 'numpy': 'http://docs.scipy.org/doc/numpy-1.6.0', + 'scipy': 'http://docs.scipy.org/doc/scipy-0.11.0/reference', + 'nibabel': 'http://nipy.org/nibabel'} +} + + +# The following dictionary contains the information used to create the +# thumbnails for the front page of the scikit-learn home page. +# key: first image in set +# values: (number of plot in set, height of thumbnail) +carousel_thumbs = {'sphx_glr_plot_classifier_comparison_001.png': 600, + 'sphx_glr_plot_outlier_detection_003.png': 372, + 'sphx_glr_plot_gpr_co2_001.png': 350, + 'sphx_glr_plot_adaboost_twoclass_001.png': 372, + 'sphx_glr_plot_compare_methods_001.png': 349} + + +def make_carousel_thumbs(app, exception): + """produces the final resized carousel images""" + if exception is not None: + return + print('Preparing carousel images') + + image_dir = os.path.join(app.builder.outdir, '_images') + for glr_plot, max_width in carousel_thumbs.items(): + image = os.path.join(image_dir, glr_plot) + if os.path.exists(image): + c_thumb = os.path.join(image_dir, glr_plot[:-4] + '_carousel.png') + sphinx_gallery.gen_rst.scale_image(image, c_thumb, max_width, 190) + + +def setup(app): + # to hide/show the prompt in code examples: + app.add_javascript('js/copybutton.js') + app.connect('build-finished', make_carousel_thumbs) + + +# The following is used by sphinx.ext.linkcode to provide links to github +linkcode_resolve = make_linkcode_resolve('sklearn', + u'https://github.com/scikit-learn/' + 'scikit-learn/blob/{revision}/' + '{package}/{path}#L{lineno}') diff --git a/doc/data_transforms.rst b/doc/data_transforms.rst new file mode 100644 index 0000000..e861762 --- /dev/null +++ b/doc/data_transforms.rst @@ -0,0 +1,35 @@ +.. include:: includes/big_toc_css.rst + +.. 
_data-transforms: + +Dataset transformations +----------------------- + +scikit-learn provides a library of transformers, which may clean (see +:ref:`preprocessing`), reduce (see :ref:`data_reduction`), expand (see +:ref:`kernel_approximation`) or generate (see :ref:`feature_extraction`) +feature representations. + +Like other estimators, these are represented by classes with a ``fit`` method, +which learns model parameters (e.g. mean and standard deviation for +normalization) from a training set, and a ``transform`` method which applies +this transformation model to unseen data. ``fit_transform`` may be more +convenient and efficient for modelling and transforming the training data +simultaneously. + +Combining such transformers, either in parallel or series is covered in +:ref:`combining_estimators`. :ref:`metrics` covers transforming feature +spaces into affinity matrices, while :ref:`preprocessing_targets` considers +transformations of the target space (e.g. categorical labels) for use in +scikit-learn. + +.. toctree:: + + modules/pipeline + modules/feature_extraction + modules/preprocessing + modules/unsupervised_reduction + modules/random_projection + modules/kernel_approximation + modules/metrics + modules/preprocessing_targets diff --git a/doc/datasets/covtype.rst b/doc/datasets/covtype.rst new file mode 100644 index 0000000..c0ed4ea --- /dev/null +++ b/doc/datasets/covtype.rst @@ -0,0 +1,20 @@ + +.. _covtype: + +Forest covertypes +================= + +The samples in this dataset correspond to 30×30m patches of forest in the US, +collected for the task of predicting each patch's cover type, +i.e. the dominant species of tree. +There are seven covertypes, making this a multiclass classification problem. +Each sample has 54 features, described on the +`dataset's homepage `_. +Some of the features are boolean indicators, +while others are discrete or continuous measurements. + +:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset; +it returns a dictionary-like object +with the feature matrix in the ``data`` member +and the target values in ``target``. +The dataset will be downloaded from the web if necessary. diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst new file mode 100644 index 0000000..b3f329e --- /dev/null +++ b/doc/datasets/index.rst @@ -0,0 +1,306 @@ +.. _datasets: + +========================= +Dataset loading utilities +========================= + +.. currentmodule:: sklearn.datasets + +The ``sklearn.datasets`` package embeds some small toy datasets +as introduced in the :ref:`Getting Started ` section. + +To evaluate the impact of the scale of the dataset (``n_samples`` and +``n_features``) while controlling the statistical properties of the data +(typically the correlation and informativeness of the features), it is +also possible to generate synthetic data. + +This package also features helpers to fetch larger datasets commonly +used by the machine learning community to benchmark algorithm on data +that comes from the 'real world'. + +General dataset API +=================== + +There are three distinct kinds of dataset interfaces for different types +of datasets. +The simplest one is the interface for sample images, which is described +below in the :ref:`sample_images` section. + +The dataset generation functions and the svmlight loader share a simplistic +interface, returning a tuple ``(X, y)`` consisting of a ``n_samples`` * +``n_features`` numpy array ``X`` and an array of length ``n_samples`` +containing the targets ``y``. 
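+
+As a minimal sketch of this ``(X, y)`` convention (illustrated here with
+:func:`make_classification`; the parameter values are arbitrary)::
+
+    from sklearn.datasets import make_classification
+
+    # X is an (n_samples, n_features) array, y an array of n_samples targets
+    X, y = make_classification(n_samples=100, n_features=20, random_state=0)
+    print(X.shape, y.shape)   # (100, 20) (100,)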
+ +The toy datasets as well as the 'real world' datasets and the datasets +fetched from mldata.org have more sophisticated structure. +These functions return a dictionary-like object holding at least two items: +an array of shape ``n_samples`` * ``n_features`` with key ``data`` +(except for 20newsgroups) +and a numpy array of length ``n_samples``, containing the target values, +with key ``target``. + +The datasets also contain a description in ``DESCR`` and some contain +``feature_names`` and ``target_names``. +See the dataset descriptions below for details. + + +Toy datasets +============ + +scikit-learn comes with a few small standard datasets that do not +require to download any file from some external website. + +.. autosummary:: + + :toctree: ../modules/generated/ + :template: function.rst + + load_boston + load_iris + load_diabetes + load_digits + load_linnerud + +These datasets are useful to quickly illustrate the behavior of the +various algorithms implemented in the scikit. They are however often too +small to be representative of real world machine learning tasks. + +.. _sample_images: + +Sample images +============= + +The scikit also embed a couple of sample JPEG images published under Creative +Commons license by their authors. Those image can be useful to test algorithms +and pipeline on 2D data. + +.. autosummary:: + + :toctree: ../modules/generated/ + :template: function.rst + + load_sample_images + load_sample_image + +.. image:: ../auto_examples/cluster/images/sphx_glr_plot_color_quantization_001.png + :target: ../auto_examples/cluster/plot_color_quantization.html + :scale: 30 + :align: right + + +.. warning:: + + The default coding of images is based on the ``uint8`` dtype to + spare memory. Often machine learning algorithms work best if the + input is converted to a floating point representation first. Also, + if you plan to use ``matplotlib.pyplpt.imshow`` don't forget to scale to the range + 0 - 1 as done in the following example. + +.. topic:: Examples: + + * :ref:`sphx_glr_auto_examples_cluster_plot_color_quantization.py` + + +.. _sample_generators: + +Sample generators +================= + +In addition, scikit-learn includes various random sample generators that +can be used to build artificial datasets of controlled size and complexity. + +Generators for classification and clustering +-------------------------------------------- + +These generators produce a matrix of features and corresponding discrete +targets. + +Single label +~~~~~~~~~~~~ + +Both :func:`make_blobs` and :func:`make_classification` create multiclass +datasets by allocating each class one or more normally-distributed clusters of +points. :func:`make_blobs` provides greater control regarding the centers and +standard deviations of each cluster, and is used to demonstrate clustering. +:func:`make_classification` specialises in introducing noise by way of: +correlated, redundant and uninformative features; multiple Gaussian clusters +per class; and linear transformations of the feature space. + +:func:`make_gaussian_quantiles` divides a single Gaussian cluster into +near-equal-size classes separated by concentric hyperspheres. +:func:`make_hastie_10_2` generates a similar binary, 10-dimensional problem. + +.. 
image:: ../auto_examples/datasets/images/sphx_glr_plot_random_dataset_001.png + :target: ../auto_examples/datasets/plot_random_dataset.html + :scale: 50 + :align: center + +:func:`make_circles` and :func:`make_moons` generate 2d binary classification +datasets that are challenging to certain algorithms (e.g. centroid-based +clustering or linear classification), including optional Gaussian noise. +They are useful for visualisation. produces Gaussian +data with a spherical decision boundary for binary classification. + +Multilabel +~~~~~~~~~~ + +:func:`make_multilabel_classification` generates random samples with multiple +labels, reflecting a bag of words drawn from a mixture of topics. The number of +topics for each document is drawn from a Poisson distribution, and the topics +themselves are drawn from a fixed random distribution. Similarly, the number of +words is drawn from Poisson, with words drawn from a multinomial, where each +topic defines a probability distribution over words. Simplifications with +respect to true bag-of-words mixtures include: + +* Per-topic word distributions are independently drawn, where in reality all + would be affected by a sparse base distribution, and would be correlated. +* For a document generated from multiple topics, all topics are weighted + equally in generating its bag of words. +* Documents without labels words at random, rather than from a base + distribution. + +.. image:: ../auto_examples/datasets/images/sphx_glr_plot_random_multilabel_dataset_001.png + :target: ../auto_examples/datasets/plot_random_multilabel_dataset.html + :scale: 50 + :align: center + +Biclustering +~~~~~~~~~~~~ + +.. autosummary:: + + :toctree: ../modules/generated/ + :template: function.rst + + make_biclusters + make_checkerboard + + +Generators for regression +------------------------- + +:func:`make_regression` produces regression targets as an optionally-sparse +random linear combination of random features, with noise. Its informative +features may be uncorrelated, or low rank (few features account for most of the +variance). + +Other regression generators generate functions deterministically from +randomized features. :func:`make_sparse_uncorrelated` produces a target as a +linear combination of four features with fixed coefficients. +Others encode explicitly non-linear relations: +:func:`make_friedman1` is related by polynomial and sine transforms; +:func:`make_friedman2` includes feature multiplication and reciprocation; and +:func:`make_friedman3` is similar with an arctan transformation on the target. + +Generators for manifold learning +-------------------------------- + +.. autosummary:: + + :toctree: ../modules/generated/ + :template: function.rst + + make_s_curve + make_swiss_roll + +Generators for decomposition +---------------------------- + +.. autosummary:: + + :toctree: ../modules/generated/ + :template: function.rst + + make_low_rank_matrix + make_sparse_coded_signal + make_spd_matrix + make_sparse_spd_matrix + + +.. _libsvm_loader: + +Datasets in svmlight / libsvm format +==================================== + +scikit-learn includes utility functions for loading +datasets in the svmlight / libsvm format. In this format, each line +takes the form ``