From eb44410ed905640696b3a2e5ccb51d54dfe1d197 Mon Sep 17 00:00:00 2001
From: Yaroslav Halchenko
Date: Thu, 8 Aug 2013 18:06:48 +0000
Subject: [PATCH] Import scikit-learn_0.14.1.orig.tar.gz

[dgit import orig scikit-learn_0.14.1.orig.tar.gz]
---
 .gitattributes | 25 +
 .gitignore | 52 +
 .mailmap | 93 +
 .travis.yml | 10 +
 AUTHORS.rst | 91 +
 CONTRIBUTING.md | 145 +
 COPYING | 32 +
 MANIFEST.in | 5 +
 Makefile | 64 +
 README.rst | 93 +
 benchmarks/bench_covertype.py | 269 +
 benchmarks/bench_glm.py | 58 +
 benchmarks/bench_glmnet.py | 128 +
 benchmarks/bench_lasso.py | 95 +
 benchmarks/bench_plot_fastkmeans.py | 138 +
 benchmarks/bench_plot_lasso_path.py | 117 +
 benchmarks/bench_plot_neighbors.py | 186 +
 benchmarks/bench_plot_nmf.py | 166 +
 benchmarks/bench_plot_omp_lars.py | 123 +
 benchmarks/bench_plot_parallel_pairwise.py | 44 +
 benchmarks/bench_plot_svd.py | 82 +
 benchmarks/bench_plot_ward.py | 43 +
 benchmarks/bench_random_projections.py | 254 +
 .../bench_sample_without_replacement.py | 207 +
 benchmarks/bench_sgd_regression.py | 131 +
 benchmarks/bench_tree.py | 124 +
 doc/Makefile | 109 +
 doc/README | 54 +
 doc/about.rst | 154 +
 doc/conf.py | 226 +
 doc/data_transforms.rst | 14 +
 doc/datasets/covtype.rst | 20 +
 doc/datasets/index.rst | 185 +
 doc/datasets/labeled_faces.rst | 117 +
 doc/datasets/labeled_faces_fixture.py | 15 +
 doc/datasets/mldata.rst | 70 +
 doc/datasets/mldata_fixture.py | 45 +
 doc/datasets/olivetti_faces.rst | 36 +
 doc/datasets/twenty_newsgroups.rst | 217 +
 doc/datasets/twenty_newsgroups_fixture.py | 15 +
 doc/developers/debugging.rst | 51 +
 doc/developers/index.rst | 903 +
 doc/developers/maintainer.rst | 48 +
 doc/developers/performance.rst | 433 +
 doc/developers/utilities.rst | 322 +
 doc/documentation.rst | 100 +
 doc/images/google-logo.png | Bin 0 -> 7087 bytes
 doc/images/inria-logo.jpg | Bin 0 -> 21107 bytes
 doc/images/inria-small.jpg | Bin 0 -> 11762 bytes
 doc/images/iris.pdf | Bin 0 -> 15633 bytes
 doc/images/iris.svg | 220 +
 doc/images/last_digit.png | Bin 0 -> 4789 bytes
 doc/images/minBox.png | Bin 0 -> 245 bytes
 doc/images/minBoxHighlight.png | Bin 0 -> 259 bytes
 doc/images/ml_map.png | Bin 0 -> 789895 bytes
 doc/images/no_image.png | Bin 0 -> 4731 bytes
 doc/images/noneBox.png | Bin 0 -> 200 bytes
 doc/images/plot_digits_classification.png | Bin 0 -> 54025 bytes
 doc/images/plot_face_recognition_1.png | Bin 0 -> 201807 bytes
 doc/images/plot_face_recognition_2.png | Bin 0 -> 143690 bytes
 doc/images/plusBox.png | Bin 0 -> 242 bytes
 doc/images/plusBoxHighlight.png | Bin 0 -> 263 bytes
 doc/images/rbm_graph.png | Bin 0 -> 22927 bytes
 doc/includes/big_toc_css.rst | 44 +
 doc/includes/bigger_toc_css.rst | 60 +
 doc/index.rst | 337 +
 doc/install.rst | 309 +
 doc/logos/favicon.ico | Bin 0 -> 2238 bytes
 doc/logos/identity.pdf | Bin 0 -> 120865 bytes
 doc/logos/scikit-learn-logo-notext.png | Bin 0 -> 11276 bytes
 doc/logos/scikit-learn-logo-small.png | Bin 0 -> 6854 bytes
 doc/logos/scikit-learn-logo-thumb.png | Bin 0 -> 8923 bytes
 doc/logos/scikit-learn-logo.png | Bin 0 -> 16295 bytes
 doc/logos/scikit-learn-logo.svg | 110 +
 doc/make.bat | 113 +
 doc/model_selection.rst | 13 +
 doc/modules/biclustering.rst | 306 +
 doc/modules/classes.rst | 1170 +
 doc/modules/clustering.rst | 1133 +
 doc/modules/covariance.rst | 328 +
 doc/modules/cross_decomposition.rst | 42 +
 doc/modules/cross_validation.rst | 446 +
 doc/modules/decomposition.rst | 657 +
 doc/modules/density.rst | 180 +
 doc/modules/dp-derivation.rst | 502 +
 doc/modules/ensemble.rst | 782 +
 doc/modules/feature_extraction.rst | 890 +
 doc/modules/feature_selection.rst | 236 +
 doc/modules/gaussian_process.rst | 361 +
 .../lasso_enet_coordinate_descent.png | Bin 0 -> 40423 bytes
 doc/modules/grid_search.rst | 234 +
 doc/modules/hmm.rst | 119 +
 doc/modules/isotonic.rst | 23 +
 doc/modules/kernel_approximation.rst | 200 +
 doc/modules/label_propagation.rst | 100 +
 doc/modules/lda_qda.rst | 65 +
 doc/modules/linear_model.rst | 727 +
 doc/modules/manifold.rst | 507 +
 doc/modules/metrics.rst | 114 +
 doc/modules/mixture.rst | 326 +
 doc/modules/model_evaluation.rst | 1123 +
 doc/modules/multiclass.rst | 271 +
 doc/modules/naive_bayes.rst | 197 +
 doc/modules/neighbors.rst | 510 +
 doc/modules/neural_networks.rst | 161 +
 doc/modules/outlier_detection.rst | 189 +
 doc/modules/pipeline.rst | 144 +
 doc/modules/preprocessing.rst | 473 +
 doc/modules/random_projection.rst | 162 +
 doc/modules/sgd.rst | 403 +
 doc/modules/svm.rst | 641 +
 doc/modules/tree.rst | 465 +
 doc/presentations.rst | 72 +
 doc/sphinxext/LICENSE.txt | 97 +
 doc/sphinxext/MANIFEST.in | 2 +
 doc/sphinxext/README.txt | 52 +
 doc/sphinxext/gen_rst.py | 1039 +
 doc/sphinxext/numpy_ext/__init__.py | 0
 doc/sphinxext/numpy_ext/docscrape.py | 507 +
 doc/sphinxext/numpy_ext/docscrape_sphinx.py | 240 +
 doc/sphinxext/numpy_ext/numpydoc.py | 177 +
 doc/supervised_learning.rst | 23 +
 doc/support.rst | 108 +
 doc/templates/class.rst | 12 +
 doc/templates/class_with_call.rst | 13 +
 doc/templates/function.rst | 8 +
 doc/testimonials/README.txt | 8 +
 doc/testimonials/images/Makefile | 0
 doc/testimonials/images/aweber.png | Bin 0 -> 42336 bytes
 doc/testimonials/images/evernote.png | Bin 0 -> 4708 bytes
 doc/testimonials/images/inria.jpg | Bin 0 -> 21107 bytes
 doc/testimonials/images/telecomparistech.jpg | Bin 0 -> 11473 bytes
 doc/testimonials/testimonials.rst | 157 +
 doc/themes/scikit-learn/layout.html | 359 +
 .../scikit-learn/static/ML_MAPS_README.rst | 93 +
 .../static/css/bootstrap-responsive.css | 1109 +
 .../static/css/bootstrap-responsive.min.css | 9 +
 .../scikit-learn/static/css/bootstrap.css | 6315 +++
 .../scikit-learn/static/css/bootstrap.min.css | 857 +
 doc/themes/scikit-learn/static/img/forkme.png | Bin 0 -> 9267 bytes
 .../static/img/glyphicons-halflings-white.png | Bin 0 -> 8777 bytes
 .../static/img/glyphicons-halflings.png | Bin 0 -> 12799 bytes
 doc/themes/scikit-learn/static/img/google.png | Bin 0 -> 7087 bytes
 doc/themes/scikit-learn/static/img/inria.jpg | Bin 0 -> 21107 bytes
 .../img/plot_classifier_comparison_1.png | Bin 0 -> 512692 bytes
 .../static/img/plot_manifold_sphere_1.png | Bin 0 -> 732646 bytes
 .../static/img/scikit-learn-logo-notext.png | Bin 0 -> 11276 bytes
 .../static/img/scikit-learn-logo-small.png | Bin 0 -> 4205 bytes
 .../static/img/scikit-learn-logo.png | Bin 0 -> 16295 bytes
 .../static/img/scikit-learn-logo.svg | 1050 +
 .../scikit-learn/static/img/telecom.jpg | Bin 0 -> 31027 bytes
 doc/themes/scikit-learn/static/jquery.js | 9404 ++++
 .../scikit-learn/static/jquery.maphilight.js | 362 +
 .../static/jquery.maphilight.min.js | 1 +
 .../scikit-learn/static/js/bootstrap.js | 2280 +
 .../scikit-learn/static/js/bootstrap.min.js | 6 +
 doc/themes/scikit-learn/static/nature.css_t | 1186 +
 doc/themes/scikit-learn/static/sidebar.js | 156 +
 doc/themes/scikit-learn/theme.conf | 11 +
 doc/tutorial/basic/tutorial.rst | 228 +
 doc/tutorial/common_includes/info.txt | 3 +
 doc/tutorial/index.rst | 26 +
 .../machine_learning_map/ML_MAPS_README.txt | 93 +
 doc/tutorial/machine_learning_map/index.rst | 109 +
 .../machine_learning_map/parse_path.py | 164 +
 .../machine_learning_map/pyparsing.py | 3381 ++
 .../machine_learning_map/svg2imagemap.py | 90 +
 .../statistical_inference/finding_help.rst | 42 +
 doc/tutorial/statistical_inference/index.rst | 50 +
 .../statistical_inference/model_selection.rst | 219 +
 .../putting_together.rst | 76 +
 .../statistical_inference/settings.rst | 92 +
 .../supervised_learning.rst | 573 +
 .../unsupervised_learning.rst | 323 +
 doc/tutorial/text_analytics/placeholder.txt | 1 +
 doc/unsupervised_learning.rst | 19 +
 doc/user_guide.rst | 64 +
 doc/whats_new.rst | 2222 +
 examples/README.txt | 6 +
 examples/applications/README.txt | 7 +
 examples/applications/face_recognition.py | 159 +
 .../applications/plot_hmm_stock_analysis.py | 100 +
 .../plot_out_of_core_classification.py | 307 +
 .../plot_outlier_detection_housing.py | 126 +
 .../plot_species_distribution_modeling.py | 211 +
 examples/applications/plot_stock_market.py | 257 +
 .../plot_tomography_l1_reconstruction.py | 150 +
 examples/applications/svm_gui.py | 330 +
 .../topics_extraction_with_nmf.py | 78 +
 .../wikipedia_principal_eigenvector.py | 229 +
 examples/bicluster/README.txt | 6 +
 examples/bicluster/bicluster_newsgroups.py | 172 +
 .../bicluster/plot_spectral_biclustering.py | 62 +
 .../bicluster/plot_spectral_coclustering.py | 54 +
 examples/cluster/README.txt | 6 +
 .../plot_adjusted_for_chance_measures.py | 123 +
 examples/cluster/plot_affinity_propagation.py | 62 +
 examples/cluster/plot_cluster_comparison.py | 113 +
 examples/cluster/plot_cluster_iris.py | 92 +
 examples/cluster/plot_color_quantization.py | 104 +
 examples/cluster/plot_dbscan.py | 73 +
 examples/cluster/plot_dict_face_patches.py | 84 +
 examples/cluster/plot_digits_agglomeration.py | 61 +
 ...e_agglomeration_vs_univariate_selection.py | 108 +
 examples/cluster/plot_kmeans_digits.py | 127 +
 .../plot_kmeans_stability_low_dim_dense.py | 119 +
 examples/cluster/plot_lena_compress.py | 80 +
 examples/cluster/plot_lena_segmentation.py | 74 +
 .../cluster/plot_lena_ward_segmentation.py | 55 +
 examples/cluster/plot_mean_shift.py | 56 +
 examples/cluster/plot_mini_batch_kmeans.py | 120 +
 examples/cluster/plot_segmentation_toy.py | 98 +
 .../plot_ward_structured_vs_unstructured.py | 88 +
 examples/covariance/README.txt | 6 +
 .../covariance/plot_covariance_estimation.py | 131 +
 examples/covariance/plot_lw_vs_oas.py | 83 +
 .../covariance/plot_mahalanobis_distances.py | 143 +
 examples/covariance/plot_outlier_detection.py | 97 +
 .../plot_robust_vs_empirical_covariance.py | 148 +
 examples/covariance/plot_sparse_cov.py | 135 +
 examples/cross_decomposition/README.txt | 7 +
 .../plot_compare_cross_decomposition.py | 148 +
 examples/datasets/README.txt | 6 +
 examples/datasets/plot_digits_last_image.py | 35 +
 examples/datasets/plot_iris_dataset.py | 67 +
 examples/datasets/plot_random_dataset.py | 49 +
 examples/decomposition/README.txt | 7 +
 .../decomposition/plot_faces_decomposition.py | 133 +
 .../plot_ica_blind_source_separation.py | 53 +
 examples/decomposition/plot_ica_vs_pca.py | 103 +
 .../decomposition/plot_image_denoising.py | 164 +
 examples/decomposition/plot_kernel_pca.py | 73 +
 examples/decomposition/plot_pca_3d.py | 99 +
 examples/decomposition/plot_pca_iris.py | 70 +
 examples/decomposition/plot_pca_vs_lda.py | 55 +
 examples/decomposition/plot_sparse_coding.py | 97 +
 .../document_classification_20newsgroups.py | 315 +
 examples/document_clustering.py | 196 +
 examples/ensemble/README.txt | 6 +
 .../ensemble/plot_adaboost_hastie_10_2.py | 111 +
 examples/ensemble/plot_adaboost_multiclass.py | 109 +
 examples/ensemble/plot_adaboost_regression.py | 51 +
 examples/ensemble/plot_adaboost_twoclass.py | 93 +
 examples/ensemble/plot_forest_importances.py | 54 +
 .../ensemble/plot_forest_importances_faces.py | 49 +
 examples/ensemble/plot_forest_iris.py | 152 +
 .../ensemble/plot_gradient_boosting_oob.py | 136 +
 .../plot_gradient_boosting_quantile.py | 79 +
 .../plot_gradient_boosting_regression.py | 75 +
 .../plot_gradient_boosting_regularization.py | 79 +
 examples/ensemble/plot_partial_dependence.py | 111 +
 .../ensemble/plot_random_forest_embedding.py | 105 +
 examples/exercises/README.txt | 4 +
 examples/exercises/plot_cv_diabetes.py | 73 +
 examples/exercises/plot_cv_digits.py | 44 +
 .../plot_digits_classification_exercise.py | 33 +
 examples/exercises/plot_iris_exercise.py | 65 +
 examples/feature_selection_pipeline.py | 29 +
 examples/feature_stacker.py | 60 +
 examples/gaussian_process/README.txt | 7 +
 .../gaussian_process/gp_diabetes_dataset.py | 51 +
 ...ilistic_classification_after_regression.py | 111 +
 .../gaussian_process/plot_gp_regression.py | 127 +
 examples/grid_search_digits.py | 78 +
 .../grid_search_text_feature_extraction.py | 129 +
 examples/hashing_vs_dict_vectorizer.py | 111 +
 examples/imputation.py | 66 +
 examples/linear_model/README.txt | 6 +
 .../lasso_dense_vs_sparse_data.py | 66 +
 examples/linear_model/plot_ard.py | 82 +
 examples/linear_model/plot_bayesian_ridge.py | 78 +
 examples/linear_model/plot_iris_logistic.py | 59 +
 .../linear_model/plot_lasso_and_elasticnet.py | 67 +
 .../plot_lasso_coordinate_descent_path.py | 94 +
 examples/linear_model/plot_lasso_lars.py | 42 +
 .../plot_lasso_model_selection.py | 155 +
 examples/linear_model/plot_logistic.py | 65 +
 .../plot_logistic_l1_l2_sparsity.py | 79 +
 examples/linear_model/plot_logistic_path.py | 55 +
 .../plot_multi_task_lasso_support.py | 66 +
 examples/linear_model/plot_ols.py | 68 +
 examples/linear_model/plot_ols_3d.py | 74 +
 .../linear_model/plot_ols_ridge_variance.py | 70 +
 examples/linear_model/plot_omp.py | 82 +
 .../plot_polynomial_interpolation.py | 60 +
 examples/linear_model/plot_ridge_path.py | 59 +
 examples/linear_model/plot_sgd_comparison.py | 51 +
 examples/linear_model/plot_sgd_iris.py | 80 +
 .../linear_model/plot_sgd_loss_functions.py | 53 +
 examples/linear_model/plot_sgd_penalties.py | 67 +
 .../plot_sgd_separating_hyperplane.py | 42 +
 .../linear_model/plot_sgd_weighted_classes.py | 56 +
 .../linear_model/plot_sgd_weighted_samples.py | 48 +
 examples/linear_model/plot_sparse_recovery.py | 172 +
 examples/manifold/README.txt | 7 +
 examples/manifold/plot_compare_methods.py | 111 +
 examples/manifold/plot_lle_digits.py | 214 +
 examples/manifold/plot_manifold_sphere.py | 141 +
 examples/manifold/plot_mds.py | 86 +
 examples/manifold/plot_swissroll.py | 50 +
 examples/mixture/README.txt | 6 +
 examples/mixture/plot_gmm.py | 80 +
 examples/mixture/plot_gmm_classifier.py | 120 +
 examples/mixture/plot_gmm_pdf.py | 38 +
 examples/mixture/plot_gmm_selection.py | 97 +
 examples/mixture/plot_gmm_sin.py | 82 +
 .../mlcomp_sparse_document_classification.py | 145 +
 examples/neighbors/README.txt | 6 +
 examples/neighbors/plot_classification.py | 55 +
 .../neighbors/plot_digits_kde_sampling.py | 62 +
 examples/neighbors/plot_kde_1d.py | 144 +
 examples/neighbors/plot_nearest_centroid.py | 56 +
 examples/neighbors/plot_regression.py | 49 +
 examples/neighbors/plot_species_kde.py | 115 +
 examples/plot_classification_probability.py | 76 +
 examples/plot_classifier_comparison.py | 132 +
 examples/plot_confusion_matrix.py | 46 +
 examples/plot_digits_classification.py | 64 +
 examples/plot_digits_pipe.py | 68 +
 examples/plot_feature_selection.py | 84 +
 examples/plot_hmm_sampling.py | 63 +
 examples/plot_isotonic_regression.py | 58 +
 examples/plot_johnson_lindenstrauss_bound.py | 198 +
 examples/plot_kernel_approximation.py | 210 +
 examples/plot_lda_qda.py | 142 +
 examples/plot_multilabel.py | 116 +
 examples/plot_multioutput_face_completion.py | 98 +
 ...lot_permutation_test_for_classification.py | 67 +
 examples/plot_precision_recall.py | 51 +
 examples/plot_rbm_logistic_classification.py | 141 +
 examples/plot_rfe_digits.py | 36 +
 examples/plot_rfe_with_cross_validation.py | 36 +
 examples/plot_roc.py | 61 +
 examples/plot_roc_crossval.py | 67 +
 examples/plot_train_error_vs_test_error.py | 75 +
 examples/randomized_search.py | 87 +
 examples/semi_supervised/README.txt | 6 +
 .../plot_label_propagation_digits.py | 90 +
 ...abel_propagation_digits_active_learning.py | 99 +
 .../plot_label_propagation_structure.py | 62 +
 .../plot_label_propagation_versus_svm_iris.py | 79 +
 examples/svm/README.txt | 6 +
 examples/svm/plot_custom_kernel.py | 57 +
 examples/svm/plot_iris.py | 61 +
 examples/svm/plot_oneclass.py | 63 +
 examples/svm/plot_rbf_parameters.py | 125 +
 examples/svm/plot_separating_hyperplane.py | 48 +
 .../plot_separating_hyperplane_unbalanced.py | 52 +
 examples/svm/plot_svm_anova.py | 57 +
 examples/svm/plot_svm_iris.py | 60 +
 examples/svm/plot_svm_kernels.py | 84 +
 examples/svm/plot_svm_margin.py | 87 +
 examples/svm/plot_svm_nonlinear.py | 41 +
 examples/svm/plot_svm_regression.py | 46 +
 examples/svm/plot_svm_scale_c.py | 151 +
 examples/svm/plot_weighted_samples.py | 63 +
 examples/tree/README.txt | 6 +
 examples/tree/plot_iris.py | 78 +
 examples/tree/plot_tree_regression.py | 50 +
 .../tree/plot_tree_regression_multioutput.py | 57 +
 setup.cfg | 26 +
 setup.py | 149 +
 site.cfg | 6 +
 sklearn/__check_build/__init__.py | 46 +
 sklearn/__check_build/_check_build.c | 1208 +
 sklearn/__check_build/_check_build.pyx | 2 +
 sklearn/__check_build/setup.py | 18 +
 sklearn/__init__.py | 82 +
 sklearn/_build_utils.py | 32 +
 sklearn/_hmmc.c | 20659 ++++
 sklearn/_hmmc.pyx | 128 +
 sklearn/_isotonic.c | 6381 +++
 sklearn/_isotonic.pyx | 54 +
 sklearn/base.py | 437 +
 sklearn/cluster/__init__.py | 44 +
 sklearn/cluster/_feature_agglomeration.py | 71 +
 sklearn/cluster/_hierarchical.c | 7525 +++
 sklearn/cluster/_hierarchical.pyx | 144 +
 sklearn/cluster/_k_means.c | 10794 +++++
 sklearn/cluster/_k_means.pyx | 394 +
 sklearn/cluster/affinity_propagation_.py | 281 +
 sklearn/cluster/bicluster/__init__.py | 4 +
 sklearn/cluster/bicluster/spectral.py | 494 +
 sklearn/cluster/bicluster/tests/__init__.py | 0
 .../cluster/bicluster/tests/test_spectral.py | 212 +
 sklearn/cluster/bicluster/tests/test_utils.py | 43 +
 sklearn/cluster/bicluster/utils.py | 40 +
 sklearn/cluster/dbscan_.py | 252 +
 sklearn/cluster/hierarchical.py | 471 +
 sklearn/cluster/k_means_.py | 1272 +
 sklearn/cluster/mean_shift_.py | 281 +
 sklearn/cluster/setup.py | 41 +
 sklearn/cluster/spectral.py | 488 +
 sklearn/cluster/tests/__init__.py | 0
 sklearn/cluster/tests/common.py | 28 +
 .../tests/test_affinity_propagation.py | 58 +
 sklearn/cluster/tests/test_dbscan.py | 155 +
 sklearn/cluster/tests/test_hierarchical.py | 205 +
 sklearn/cluster/tests/test_k_means.py | 624 +
 sklearn/cluster/tests/test_mean_shift.py | 76 +
 sklearn/cluster/tests/test_spectral.py | 225 +
 sklearn/covariance/__init__.py | 35 +
 sklearn/covariance/empirical_covariance_.py | 270 +
 sklearn/covariance/graph_lasso_.py | 570 +
 sklearn/covariance/outlier_detection.py | 191 +
 sklearn/covariance/robust_covariance.py | 682 +
 sklearn/covariance/shrunk_covariance_.py | 552 +
 sklearn/covariance/tests/__init__.py | 0
 sklearn/covariance/tests/test_covariance.py | 264 +
 sklearn/covariance/tests/test_graph_lasso.py | 64 +
 .../tests/test_robust_covariance.py | 98 +
 sklearn/cross_decomposition/__init__.py | 2 +
 sklearn/cross_decomposition/cca_.py | 110 +
 sklearn/cross_decomposition/pls_.py | 792 +
 sklearn/cross_decomposition/tests/test_pls.py | 249 +
 sklearn/cross_validation.py | 1407 +
 sklearn/datasets/DATASET_PROPOSAL.txt | 137 +
 sklearn/datasets/__init__.py | 98 +
 sklearn/datasets/_svmlight_format.c | 6580 +++
 sklearn/datasets/_svmlight_format.pyx | 103 +
 sklearn/datasets/base.py | 525 +
 sklearn/datasets/california_housing.py | 104 +
 sklearn/datasets/covtype.py | 104 +
 sklearn/datasets/data/boston_house_prices.csv | 508 +
 sklearn/datasets/data/diabetes_data.csv.gz | Bin 0 -> 23803 bytes
 sklearn/datasets/data/diabetes_target.csv.gz | Bin 0 -> 1050 bytes
 sklearn/datasets/data/digits.csv.gz | Bin 0 -> 57523 bytes
 sklearn/datasets/data/iris.csv | 151 +
 sklearn/datasets/data/linnerud_exercise.csv | 21 +
 .../datasets/data/linnerud_physiological.csv | 21 +
 .../datasets/descr/boston_house_prices.rst | 52 +
 sklearn/datasets/descr/diabetes.rst | 8 +
 sklearn/datasets/descr/digits.rst | 44 +
 sklearn/datasets/descr/iris.rst | 59 +
 sklearn/datasets/descr/linnerud.rst | 21 +
 sklearn/datasets/images/README.txt | 21 +
 sklearn/datasets/images/china.jpg | Bin 0 -> 196653 bytes
 sklearn/datasets/images/flower.jpg | Bin 0 -> 142987 bytes
 sklearn/datasets/lfw.py | 439 +
 sklearn/datasets/mlcomp.py | 103 +
 sklearn/datasets/mldata.py | 239 +
 sklearn/datasets/olivetti_faces.py | 119 +
 sklearn/datasets/samples_generator.py | 1506 +
 sklearn/datasets/setup.py | 23 +
 sklearn/datasets/species_distributions.py | 257 +
 sklearn/datasets/svmlight_format.py | 348 +
 sklearn/datasets/tests/__init__.py | 0
 .../tests/data/svmlight_classification.txt | 9 +
 .../datasets/tests/data/svmlight_invalid.txt | 3 +
 .../tests/data/svmlight_invalid_order.txt | 1 +
 .../tests/data/svmlight_multilabel.txt | 4 +
 sklearn/datasets/tests/test_20news.py | 62 +
 sklearn/datasets/tests/test_base.py | 185 +
 sklearn/datasets/tests/test_covtype.py | 32 +
 sklearn/datasets/tests/test_lfw.py | 181 +
 sklearn/datasets/tests/test_mldata.py | 169 +
 .../datasets/tests/test_samples_generator.py | 244 +
 .../datasets/tests/test_svmlight_format.py | 332 +
 sklearn/datasets/twenty_newsgroups.py | 356 +
 sklearn/decomposition/__init__.py | 35 +
 sklearn/decomposition/dict_learning.py | 1133 +
 sklearn/decomposition/factor_analysis.py | 235 +
 sklearn/decomposition/fastica_.py | 549 +
 sklearn/decomposition/kernel_pca.py | 262 +
 sklearn/decomposition/nmf.py | 587 +
 sklearn/decomposition/pca.py | 606 +
 sklearn/decomposition/sparse_pca.py | 264 +
 sklearn/decomposition/tests/__init__.py | 0
 .../decomposition/tests/test_dict_learning.py | 189 +
 .../tests/test_factor_analysis.py | 52 +
 sklearn/decomposition/tests/test_fastica.py | 244 +
 .../decomposition/tests/test_kernel_pca.py | 214 +
 sklearn/decomposition/tests/test_nmf.py | 186 +
 sklearn/decomposition/tests/test_pca.py | 369 +
 .../decomposition/tests/test_sparse_pca.py | 163 +
 .../decomposition/tests/test_truncated_svd.py | 76 +
 sklearn/decomposition/truncated_svd.py | 180 +
 sklearn/dummy.py | 303 +
 sklearn/ensemble/__init__.py | 27 +
 sklearn/ensemble/_gradient_boosting.c | 21143 ++++
 sklearn/ensemble/_gradient_boosting.pyx | 341 +
 sklearn/ensemble/base.py | 76 +
 sklearn/ensemble/forest.py | 1339 +
 sklearn/ensemble/gradient_boosting.py | 1160 +
 sklearn/ensemble/partial_dependence.py | 388 +
 sklearn/ensemble/setup.py | 17 +
 sklearn/ensemble/tests/__init__.py | 0
 sklearn/ensemble/tests/test_base.py | 42 +
 sklearn/ensemble/tests/test_forest.py | 458 +
 .../ensemble/tests/test_gradient_boosting.py | 581 +
 .../test_gradient_boosting_loss_functions.py | 61 +
 .../ensemble/tests/test_partial_dependence.py | 206 +
 .../ensemble/tests/test_weight_boosting.py | 246 +
 sklearn/ensemble/weight_boosting.py | 1062 +
 sklearn/externals/README | 7 +
 sklearn/externals/__init__.py | 5 +
 sklearn/externals/copy_joblib.sh | 18 +
 sklearn/externals/setup.py | 10 +
 sklearn/externals/six.py | 335 +
 sklearn/externals/test_externals_setup.py | 10 +
 sklearn/feature_extraction/__init__.py | 13 +
 sklearn/feature_extraction/_hashing.c | 6530 +++
 sklearn/feature_extraction/_hashing.pyx | 74 +
 sklearn/feature_extraction/dict_vectorizer.py | 296 +
 sklearn/feature_extraction/hashing.py | 140 +
 sklearn/feature_extraction/image.py | 472 +
 sklearn/feature_extraction/setup.py | 18 +
 sklearn/feature_extraction/stop_words.py | 45 +
 sklearn/feature_extraction/tests/__init__.py | 0
 .../tests/test_dict_vectorizer.py | 100 +
 .../tests/test_feature_hasher.py | 82 +
 .../feature_extraction/tests/test_image.py | 288 +
 sklearn/feature_extraction/tests/test_text.py | 829 +
 sklearn/feature_extraction/text.py | 1238 +
 sklearn/feature_selection/__init__.py | 32 +
 sklearn/feature_selection/base.py | 115 +
 sklearn/feature_selection/from_model.py | 110 +
 sklearn/feature_selection/rfe.py | 377 +
 sklearn/feature_selection/selector_mixin.py | 11 +
 sklearn/feature_selection/tests/__init__.py | 0
 sklearn/feature_selection/tests/test_base.py | 115 +
 sklearn/feature_selection/tests/test_chi2.py | 82 +
 .../tests/test_feature_select.py | 501 +
 .../tests/test_from_model.py | 42 +
 sklearn/feature_selection/tests/test_rfe.py | 107 +
 .../feature_selection/univariate_selection.py | 591 +
 sklearn/gaussian_process/__init__.py | 17 +
 .../gaussian_process/correlation_models.py | 285 +
 sklearn/gaussian_process/gaussian_process.py | 880 +
 sklearn/gaussian_process/regression_models.py | 90 +
 sklearn/gaussian_process/tests/__init__.py | 0
 .../tests/test_gaussian_process.py | 101 +
 sklearn/grid_search.py | 860 +
 sklearn/hmm.py | 1218 +
 sklearn/isotonic.py | 258 +
 sklearn/kernel_approximation.py | 502 +
 sklearn/lda.py | 295 +
 sklearn/linear_model/__init__.py | 72 +
 sklearn/linear_model/base.py | 420 +
 sklearn/linear_model/bayes.py | 420 +
 sklearn/linear_model/cd_fast.c | 12605 +++++
 sklearn/linear_model/cd_fast.pyx | 655 +
 sklearn/linear_model/coordinate_descent.py | 1509 +
 sklearn/linear_model/least_angle.py | 1236 +
 sklearn/linear_model/logistic.py | 140 +
 sklearn/linear_model/omp.py | 865 +
 sklearn/linear_model/passive_aggressive.py | 278 +
 sklearn/linear_model/perceptron.py | 104 +
 sklearn/linear_model/randomized_l1.py | 619 +
 sklearn/linear_model/ridge.py | 1038 +
 sklearn/linear_model/setup.py | 42 +
 sklearn/linear_model/sgd_fast.c | 14975 ++++++
 sklearn/linear_model/sgd_fast.pyx | 554 +
 sklearn/linear_model/stochastic_gradient.py | 1083 +
 sklearn/linear_model/tests/__init__.py | 0
 sklearn/linear_model/tests/test_base.py | 112 +
 sklearn/linear_model/tests/test_bayes.py | 58 +
 .../tests/test_coordinate_descent.py | 339 +
 .../linear_model/tests/test_least_angle.py | 458 +
 sklearn/linear_model/tests/test_logistic.py | 157 +
 sklearn/linear_model/tests/test_omp.py | 235 +
 .../tests/test_passive_aggressive.py | 180 +
 sklearn/linear_model/tests/test_perceptron.py | 70 +
 .../linear_model/tests/test_randomized_l1.py | 128 +
 sklearn/linear_model/tests/test_ridge.py | 473 +
 sklearn/linear_model/tests/test_sgd.py | 803 +
 .../tests/test_sparse_coordinate_descent.py | 248 +
 sklearn/manifold/__init__.py | 11 +
 sklearn/manifold/isomap.py | 203 +
 sklearn/manifold/locally_linear.py | 678 +
 sklearn/manifold/mds.py | 397 +
 sklearn/manifold/spectral_embedding_.py | 488 +
 sklearn/manifold/tests/test_isomap.py | 118 +
 sklearn/manifold/tests/test_locally_linear.py | 126 +
 sklearn/manifold/tests/test_mds.py | 61 +
 .../manifold/tests/test_spectral_embedding.py | 198 +
 sklearn/metrics/__init__.py | 95 +
 sklearn/metrics/cluster/__init__.py | 28 +
 sklearn/metrics/cluster/bicluster/__init__.py | 3 +
 .../cluster/bicluster/bicluster_metrics.py | 80 +
 .../cluster/bicluster/tests/__init__.py | 0
 .../bicluster/tests/test_bicluster_metrics.py | 36 +
 .../cluster/expected_mutual_info_fast.c | 7156 +++
 .../cluster/expected_mutual_info_fast.pyx | 72 +
 sklearn/metrics/cluster/setup.py | 23 +
 sklearn/metrics/cluster/supervised.py | 746 +
 sklearn/metrics/cluster/tests/__init__.py | 0
 .../metrics/cluster/tests/test_supervised.py | 198 +
 .../cluster/tests/test_unsupervised.py | 51 +
 sklearn/metrics/cluster/unsupervised.py | 203 +
 sklearn/metrics/metrics.py | 2259 +
 sklearn/metrics/pairwise.py | 837 +
 sklearn/metrics/pairwise_fast.c | 22700 +++++
 sklearn/metrics/pairwise_fast.pyx | 31 +
 sklearn/metrics/scorer.py | 276 +
 sklearn/metrics/setup.py | 21 +
 sklearn/metrics/tests/__init__.py | 0
 sklearn/metrics/tests/test_metrics.py | 1911 +
 sklearn/metrics/tests/test_pairwise.py | 405 +
 sklearn/metrics/tests/test_score_objects.py | 114 +
 sklearn/mixture/__init__.py | 16 +
 sklearn/mixture/dpgmm.py | 754 +
 sklearn/mixture/gmm.py | 722 +
 sklearn/mixture/tests/__init__.py | 0
 sklearn/mixture/tests/test_dpgmm.py | 102 +
 sklearn/mixture/tests/test_gmm.py | 331 +
 sklearn/multiclass.py | 605 +
 sklearn/naive_bayes.py | 573 +
 sklearn/neighbors/__init__.py | 27 +
 sklearn/neighbors/ball_tree.c | 39142 +++++++++++++++
 sklearn/neighbors/ball_tree.pyx | 177 +
 sklearn/neighbors/base.py | 642 +
 sklearn/neighbors/binary_tree.pxi | 2529 +
 sklearn/neighbors/classification.py | 386 +
 sklearn/neighbors/dist_metrics.c | 29411 ++++++++++++
 sklearn/neighbors/dist_metrics.pxd | 75 +
 sklearn/neighbors/dist_metrics.pyx | 1097 +
 sklearn/neighbors/graph.py | 93 +
 sklearn/neighbors/kd_tree.c | 39778 ++++++++++++++++
 sklearn/neighbors/kd_tree.pyx | 255 +
 sklearn/neighbors/kde.py | 213 +
 sklearn/neighbors/nearest_centroid.py | 159 +
 sklearn/neighbors/regression.py | 297 +
 sklearn/neighbors/setup.py | 35 +
 sklearn/neighbors/tests/__init__.py | 0
 sklearn/neighbors/tests/test_ball_tree.py | 298 +
 sklearn/neighbors/tests/test_dist_metrics.py | 141 +
 sklearn/neighbors/tests/test_kd_tree.py | 243 +
 sklearn/neighbors/tests/test_kde.py | 124 +
 .../neighbors/tests/test_nearest_centroid.py | 103 +
 sklearn/neighbors/tests/test_neighbors.py | 857 +
 sklearn/neighbors/typedefs.c | 5086 ++
 sklearn/neighbors/typedefs.pxd | 18 +
 sklearn/neighbors/typedefs.pyx | 23 +
 sklearn/neighbors/unsupervised.py | 83 +
 sklearn/neural_network/__init__.py | 6 +
 sklearn/neural_network/rbm.py | 317 +
 sklearn/neural_network/tests/test_rbm.py | 123 +
 sklearn/pipeline.py | 348 +
 sklearn/pls.py | 7 +
 sklearn/preprocessing/__init__.py | 41 +
 sklearn/preprocessing/_weights.py | 34 +
 sklearn/preprocessing/data.py | 988 +
 sklearn/preprocessing/imputation.py | 406 +
 sklearn/preprocessing/label.py | 430 +
 sklearn/preprocessing/tests/__init__.py | 0
 sklearn/preprocessing/tests/test_data.py | 709 +
 .../preprocessing/tests/test_imputation.py | 261 +
 sklearn/preprocessing/tests/test_label.py | 229 +
 sklearn/preprocessing/tests/test_weights.py | 13 +
 sklearn/qda.py | 240 +
 sklearn/random_projection.py | 616 +
 sklearn/semi_supervised/__init__.py | 10 +
 sklearn/semi_supervised/label_propagation.py | 396 +
 sklearn/semi_supervised/tests/__init__.py | 0
 .../tests/test_label_propagation.py | 54 +
 sklearn/setup.py | 91 +
 sklearn/src/cblas/ATL_drefcopy.c | 148 +
 sklearn/src/cblas/ATL_drefgemv.c | 178 +
 sklearn/src/cblas/ATL_drefgemvN.c | 96 +
 sklearn/src/cblas/ATL_drefgemvT.c | 96 +
 sklearn/src/cblas/ATL_drefger.c | 147 +
 sklearn/src/cblas/ATL_drefrot.c | 170 +
 sklearn/src/cblas/ATL_drefrotg.c | 146 +
 sklearn/src/cblas/ATL_dreftrsv.c | 213 +
 sklearn/src/cblas/ATL_dreftrsvLNN.c | 94 +
 sklearn/src/cblas/ATL_dreftrsvLNU.c | 94 +
 sklearn/src/cblas/ATL_dreftrsvLTN.c | 96 +
 sklearn/src/cblas/ATL_dreftrsvLTU.c | 96 +
 sklearn/src/cblas/ATL_dreftrsvUNN.c | 95 +
 sklearn/src/cblas/ATL_dreftrsvUNU.c | 95 +
 sklearn/src/cblas/ATL_dreftrsvUTN.c | 95 +
 sklearn/src/cblas/ATL_dreftrsvUTU.c | 95 +
 sklearn/src/cblas/ATL_srefcopy.c | 148 +
 sklearn/src/cblas/ATL_srefrot.c | 170 +
 sklearn/src/cblas/ATL_srefrotg.c | 146 +
 sklearn/src/cblas/ATL_sreftrsv.c | 213 +
 sklearn/src/cblas/ATL_sreftrsvLNN.c | 94 +
 sklearn/src/cblas/ATL_sreftrsvLNU.c | 94 +
 sklearn/src/cblas/ATL_sreftrsvLTN.c | 96 +
 sklearn/src/cblas/ATL_sreftrsvLTU.c | 96 +
 sklearn/src/cblas/ATL_sreftrsvUNN.c | 95 +
 sklearn/src/cblas/ATL_sreftrsvUNU.c | 95 +
 sklearn/src/cblas/ATL_sreftrsvUTN.c | 95 +
 sklearn/src/cblas/ATL_sreftrsvUTU.c | 95 +
 sklearn/src/cblas/README.txt | 13 +
 sklearn/src/cblas/atlas_aux.h | 942 +
 sklearn/src/cblas/atlas_dsysinfo.h | 0
 sklearn/src/cblas/atlas_enum.h | 55 +
 sklearn/src/cblas/atlas_level1.h | 129 +
 sklearn/src/cblas/atlas_level2.h | 338 +
 sklearn/src/cblas/atlas_misc.h | 477 +
 sklearn/src/cblas/atlas_refalias1.h | 59 +
 sklearn/src/cblas/atlas_refalias2.h | 85 +
 sklearn/src/cblas/atlas_reflevel1.h | 423 +
 sklearn/src/cblas/atlas_reflevel2.h | 792 +
 sklearn/src/cblas/atlas_reflvl2.h | 3187 ++
 sklearn/src/cblas/atlas_refmisc.h | 370 +
 sklearn/src/cblas/atlas_ssysinfo.h | 0
 sklearn/src/cblas/atlas_type.h | 0
 sklearn/src/cblas/cblas.h | 596 +
 sklearn/src/cblas/cblas_daxpy.c | 159 +
 sklearn/src/cblas/cblas_dcopy.c | 52 +
 sklearn/src/cblas/cblas_ddot.c | 135 +
 sklearn/src/cblas/cblas_dgemv.c | 102 +
 sklearn/src/cblas/cblas_dger.c | 85 +
 sklearn/src/cblas/cblas_dnrm2.c | 206 +
 sklearn/src/cblas/cblas_drot.c | 60 +
 sklearn/src/cblas/cblas_drotg.c | 42 +
 sklearn/src/cblas/cblas_dscal.c | 183 +
 sklearn/src/cblas/cblas_dtrsv.c | 87 +
 sklearn/src/cblas/cblas_errprn.c | 50 +
 sklearn/src/cblas/cblas_scopy.c | 52 +
 sklearn/src/cblas/cblas_srot.c | 60 +
 sklearn/src/cblas/cblas_srotg.c | 42 +
 sklearn/src/cblas/cblas_strsv.c | 87 +
 sklearn/src/cblas/cblas_xerbla.c | 53 +
 sklearn/svm/__init__.py | 26 +
 sklearn/svm/base.py | 726 +
 sklearn/svm/bounds.py | 85 +
 sklearn/svm/classes.py | 750 +
 sklearn/svm/liblinear.c | 6918 +++
 sklearn/svm/liblinear.pxd | 32 +
 sklearn/svm/liblinear.pyx | 81 +
 sklearn/svm/libsvm.c | 12041 +++++
 sklearn/svm/libsvm.pxd | 70 +
 sklearn/svm/libsvm.pyx | 586 +
 sklearn/svm/libsvm_sparse.c | 9396 ++++
 sklearn/svm/libsvm_sparse.pyx | 357 +
 sklearn/svm/setup.py | 87 +
 sklearn/svm/src/blas/Makefile | 22 +
 sklearn/svm/src/blas/blas.h | 25 +
 sklearn/svm/src/blas/blasp.h | 430 +
 sklearn/svm/src/blas/daxpy.c | 49 +
 sklearn/svm/src/blas/ddot.c | 50 +
 sklearn/svm/src/blas/dnrm2.c | 62 +
 sklearn/svm/src/blas/dscal.c | 44 +
 sklearn/svm/src/liblinear/COPYRIGHT | 31 +
 sklearn/svm/src/liblinear/liblinear_helper.c | 238 +
 sklearn/svm/src/liblinear/linear.cpp | 2862 ++
 sklearn/svm/src/liblinear/linear.h | 74 +
 sklearn/svm/src/liblinear/tron.cpp | 235 +
 sklearn/svm/src/liblinear/tron.h | 34 +
 sklearn/svm/src/libsvm/LIBSVM_CHANGES | 8 +
 sklearn/svm/src/libsvm/libsvm_helper.c | 422 +
 sklearn/svm/src/libsvm/libsvm_sparse_helper.c | 430 +
 sklearn/svm/src/libsvm/libsvm_template.cpp | 8 +
 sklearn/svm/src/libsvm/svm.cpp | 3112 ++
 sklearn/svm/src/libsvm/svm.h | 180 +
 sklearn/svm/tests/__init__.py | 0
 sklearn/svm/tests/test_bounds.py | 70 +
 sklearn/svm/tests/test_sparse.py | 295 +
 sklearn/svm/tests/test_svm.py | 669 +
 sklearn/test_setup.py | 12 +
 sklearn/tests/__init__.py | 0
 sklearn/tests/test_base.py | 204 +
 sklearn/tests/test_check_build.py | 14 +
 sklearn/tests/test_common.py | 1049 +
 sklearn/tests/test_cross_validation.py | 590 +
 sklearn/tests/test_dummy.py | 210 +
 sklearn/tests/test_grid_search.py | 643 +
 sklearn/tests/test_hmm.py | 698 +
 sklearn/tests/test_init.py | 22 +
 sklearn/tests/test_isotonic.py | 50 +
 sklearn/tests/test_kernel_approximation.py | 175 +
 sklearn/tests/test_lda.py | 51 +
 sklearn/tests/test_multiclass.py | 340 +
 sklearn/tests/test_naive_bayes.py | 358 +
 sklearn/tests/test_pipeline.py | 303 +
 sklearn/tests/test_qda.py | 86 +
 sklearn/tests/test_random_projection.py | 345 +
 sklearn/tree/__init__.py | 13 +
 sklearn/tree/_tree.c | 21944 +++++++++
 sklearn/tree/_tree.pxd | 144 +
 sklearn/tree/_tree.pyx | 1760 +
 sklearn/tree/export.py | 134 +
 sklearn/tree/setup.py | 24 +
 sklearn/tree/tests/__init__.py | 0
 sklearn/tree/tests/test_export.py | 88 +
 sklearn/tree/tests/test_tree.py | 607 +
 sklearn/tree/tree.py | 734 +
 sklearn/utils/__init__.py | 364 +
 sklearn/utils/_logistic_sigmoid.c | 6090 +++
 sklearn/utils/_logistic_sigmoid.pyx | 27 +
 sklearn/utils/arpack.py | 1618 +
 sklearn/utils/arraybuilder.c | 5495 +++
 sklearn/utils/arraybuilder.pyx | 49 +
 sklearn/utils/arrayfuncs.c | 5709 +++
 sklearn/utils/arrayfuncs.pyx | 115 +
 sklearn/utils/bench.py | 17 +
 sklearn/utils/class_weight.py | 57 +
 sklearn/utils/extmath.py | 547 +
 sklearn/utils/fixes.py | 239 +
 sklearn/utils/graph.py | 182 +
 sklearn/utils/graph_shortest_path.c | 9120 ++++
 sklearn/utils/graph_shortest_path.pyx | 604 +
 sklearn/utils/lgamma.c | 1676 +
 sklearn/utils/lgamma.pxd | 1 +
 sklearn/utils/lgamma.pyx | 8 +
 sklearn/utils/linear_assignment_.py | 287 +
 sklearn/utils/multiclass.py | 345 +
 sklearn/utils/murmurhash.c | 7438 +++
 sklearn/utils/murmurhash.pxd | 21 +
 sklearn/utils/murmurhash.pyx | 131 +
 sklearn/utils/random.c | 8041 ++++
 sklearn/utils/random.pxd | 14 +
 sklearn/utils/random.pyx | 302 +
 sklearn/utils/seq_dataset.c | 7540 +++
 sklearn/utils/seq_dataset.pxd | 49 +
 sklearn/utils/seq_dataset.pyx | 178 +
 sklearn/utils/setup.py | 76 +
 sklearn/utils/sparsefuncs.c | 9478 ++++
 sklearn/utils/sparsefuncs.pxd | 8 +
 sklearn/utils/sparsefuncs.pyx | 335 +
 sklearn/utils/sparsetools/README | 6 +
 sklearn/utils/sparsetools/__init__.py | 6 +
 sklearn/utils/sparsetools/_graph_tools.c | 9879 ++++
 sklearn/utils/sparsetools/_graph_tools.pyx | 460 +
 .../utils/sparsetools/_graph_validation.py | 58 +
 .../utils/sparsetools/_min_spanning_tree.c | 7003 +++
 .../utils/sparsetools/_min_spanning_tree.pyx | 185 +
 sklearn/utils/sparsetools/_traversal.c | 12205 +++++
 sklearn/utils/sparsetools/_traversal.pyx | 748 +
 sklearn/utils/sparsetools/setup.py | 29 +
 sklearn/utils/sparsetools/tests/__init__.py | 0
 .../sparsetools/tests/test_spanning_tree.py | 65 +
 .../utils/sparsetools/tests/test_traversal.py | 52 +
 sklearn/utils/src/MurmurHash3.cpp | 346 +
 sklearn/utils/src/MurmurHash3.h | 45 +
 sklearn/utils/src/cholesky_delete.h | 76 +
 sklearn/utils/src/gamma.c | 155 +
 sklearn/utils/src/gamma.h | 8 +
 sklearn/utils/testing.py | 348 +
 sklearn/utils/tests/__init__.py | 0
 sklearn/utils/tests/test_bench.py | 11 +
 sklearn/utils/tests/test_class_weight.py | 26 +
 sklearn/utils/tests/test_extmath.py | 284 +
 sklearn/utils/tests/test_fixes.py | 28 +
 sklearn/utils/tests/test_graph.py | 24 +
 sklearn/utils/tests/test_linear_assignment.py | 60 +
 sklearn/utils/tests/test_multiclass.py | 248 +
 sklearn/utils/tests/test_murmurhash.py | 80 +
 sklearn/utils/tests/test_random.py | 101 +
 sklearn/utils/tests/test_shortest_path.py | 100 +
 sklearn/utils/tests/test_sparsefuncs.py | 41 +
 sklearn/utils/tests/test_testing.py | 108 +
 sklearn/utils/tests/test_utils.py | 143 +
 sklearn/utils/tests/test_validation.py | 175 +
 sklearn/utils/validation.py | 298 +
 sklearn/utils/weight_vector.c | 6270 +++
 sklearn/utils/weight_vector.pxd | 27 +
 sklearn/utils/weight_vector.pyx | 126 +
 852 files changed, 590682 insertions(+)

 create mode 100644 .gitattributes
 create mode 100644 .gitignore
 create mode 100644 .mailmap
 create mode 100644 .travis.yml
 create mode 100644 AUTHORS.rst
 create mode 100644 CONTRIBUTING.md
 create mode 100644 COPYING
 create mode 100644 MANIFEST.in
 create mode 100644 Makefile
 create mode 100644 README.rst
 create mode 100644 benchmarks/bench_covertype.py
 create mode 100644 benchmarks/bench_glm.py
 create mode 100644 benchmarks/bench_glmnet.py
 create mode 100644 benchmarks/bench_lasso.py
 create mode 100644 benchmarks/bench_plot_fastkmeans.py
 create mode 100644 benchmarks/bench_plot_lasso_path.py
 create mode 100644 benchmarks/bench_plot_neighbors.py
 create mode 100644 benchmarks/bench_plot_nmf.py
 create mode 100644 benchmarks/bench_plot_omp_lars.py
 create mode 100644 benchmarks/bench_plot_parallel_pairwise.py
 create mode 100644 benchmarks/bench_plot_svd.py
 create mode 100644 benchmarks/bench_plot_ward.py
 create mode 100644 benchmarks/bench_random_projections.py
 create mode 100644 benchmarks/bench_sample_without_replacement.py
 create mode 100644 benchmarks/bench_sgd_regression.py
 create mode 100644 benchmarks/bench_tree.py
 create mode 100644 doc/Makefile
 create mode 100644 doc/README
 create mode 100644 doc/about.rst
 create mode 100644 doc/conf.py
 create mode 100644 doc/data_transforms.rst
 create mode 100644 doc/datasets/covtype.rst
 create mode 100644 doc/datasets/index.rst
 create mode 100644 doc/datasets/labeled_faces.rst
 create mode 100644 doc/datasets/labeled_faces_fixture.py
 create mode 100644 doc/datasets/mldata.rst
 create mode 100644 doc/datasets/mldata_fixture.py
 create mode 100644 doc/datasets/olivetti_faces.rst
 create mode 100644 doc/datasets/twenty_newsgroups.rst
 create mode 100644 doc/datasets/twenty_newsgroups_fixture.py
 create mode 100644 doc/developers/debugging.rst
 create mode 100644 doc/developers/index.rst
 create mode 100644 doc/developers/maintainer.rst
 create mode 100644 doc/developers/performance.rst
 create mode 100644 doc/developers/utilities.rst
 create mode 100644 doc/documentation.rst
 create mode 100644 doc/images/google-logo.png
 create mode 100644 doc/images/inria-logo.jpg
 create mode 100644 doc/images/inria-small.jpg
 create mode 100644 doc/images/iris.pdf
 create mode 100644 doc/images/iris.svg
 create mode 100644 doc/images/last_digit.png
 create mode 100644 doc/images/minBox.png
 create mode 100644 doc/images/minBoxHighlight.png
 create mode 100644 doc/images/ml_map.png
 create mode 100644 doc/images/no_image.png
 create mode 100644 doc/images/noneBox.png
 create mode 100644 doc/images/plot_digits_classification.png
 create mode 100644 doc/images/plot_face_recognition_1.png
 create mode 100644 doc/images/plot_face_recognition_2.png
 create mode 100644 doc/images/plusBox.png
 create mode 100644 doc/images/plusBoxHighlight.png
 create mode 100644 doc/images/rbm_graph.png
 create mode 100644 doc/includes/big_toc_css.rst
 create mode 100644 doc/includes/bigger_toc_css.rst
 create mode 100644 doc/index.rst
 create mode 100644 doc/install.rst
 create mode 100755 doc/logos/favicon.ico
 create mode 100755 doc/logos/identity.pdf
 create mode 100644 doc/logos/scikit-learn-logo-notext.png
 create mode 100644 doc/logos/scikit-learn-logo-small.png
 create mode 100644 doc/logos/scikit-learn-logo-thumb.png
 create mode 100644 doc/logos/scikit-learn-logo.png
 create mode 100644 doc/logos/scikit-learn-logo.svg
 create mode 100644 doc/make.bat
 create mode 100644 doc/model_selection.rst
 create mode 100644 doc/modules/biclustering.rst
 create mode 100644 doc/modules/classes.rst
 create mode 100644 doc/modules/clustering.rst
 create mode 100644 doc/modules/covariance.rst
 create mode 100644 doc/modules/cross_decomposition.rst
 create mode 100644 doc/modules/cross_validation.rst
 create mode 100644 doc/modules/decomposition.rst
 create mode 100644 doc/modules/density.rst
 create mode 100644 doc/modules/dp-derivation.rst
 create mode 100644 doc/modules/ensemble.rst
 create mode 100644 doc/modules/feature_extraction.rst
 create mode 100644 doc/modules/feature_selection.rst
 create mode 100644 doc/modules/gaussian_process.rst
 create mode 100644 doc/modules/glm_data/lasso_enet_coordinate_descent.png
 create mode 100644 doc/modules/grid_search.rst
 create mode 100644 doc/modules/hmm.rst
 create mode 100644 doc/modules/isotonic.rst
 create mode 100644 doc/modules/kernel_approximation.rst
 create mode 100644 doc/modules/label_propagation.rst
 create mode 100644 doc/modules/lda_qda.rst
 create mode 100644 doc/modules/linear_model.rst
 create mode 100644 doc/modules/manifold.rst
 create mode 100644 doc/modules/metrics.rst
 create mode 100644 doc/modules/mixture.rst
 create mode 100644 doc/modules/model_evaluation.rst
 create mode 100644 doc/modules/multiclass.rst
 create mode 100644 doc/modules/naive_bayes.rst
 create mode 100644 doc/modules/neighbors.rst
 create mode 100644 doc/modules/neural_networks.rst
 create mode 100644 doc/modules/outlier_detection.rst
 create mode 100644 doc/modules/pipeline.rst
 create mode 100644 doc/modules/preprocessing.rst
 create mode 100644 doc/modules/random_projection.rst
 create mode 100644 doc/modules/sgd.rst
 create mode 100644 doc/modules/svm.rst
 create mode 100644 doc/modules/tree.rst
 create mode 100644 doc/presentations.rst
 create mode 100644 doc/sphinxext/LICENSE.txt
 create mode 100644 doc/sphinxext/MANIFEST.in
 create mode 100644 doc/sphinxext/README.txt
 create mode 100644 doc/sphinxext/gen_rst.py
 create mode 100644 doc/sphinxext/numpy_ext/__init__.py
 create mode 100644 doc/sphinxext/numpy_ext/docscrape.py
 create mode 100644 doc/sphinxext/numpy_ext/docscrape_sphinx.py
 create mode 100644 doc/sphinxext/numpy_ext/numpydoc.py
 create mode 100644 doc/supervised_learning.rst
 create mode 100644 doc/support.rst
 create mode 100644 doc/templates/class.rst
 create mode 100644 doc/templates/class_with_call.rst
 create mode 100644 doc/templates/function.rst
 create mode 100644 doc/testimonials/README.txt
 create mode 100644 doc/testimonials/images/Makefile
 create mode 100644 doc/testimonials/images/aweber.png
 create mode 100644 doc/testimonials/images/evernote.png
 create mode 100644 doc/testimonials/images/inria.jpg
 create mode 100644 doc/testimonials/images/telecomparistech.jpg
 create mode 100644 doc/testimonials/testimonials.rst
 create mode 100644 doc/themes/scikit-learn/layout.html
 create mode 100644 doc/themes/scikit-learn/static/ML_MAPS_README.rst
 create mode 100644 doc/themes/scikit-learn/static/css/bootstrap-responsive.css
 create mode 100644 doc/themes/scikit-learn/static/css/bootstrap-responsive.min.css
 create mode 100644 doc/themes/scikit-learn/static/css/bootstrap.css
 create mode 100644 doc/themes/scikit-learn/static/css/bootstrap.min.css
 create mode 100644 doc/themes/scikit-learn/static/img/forkme.png
 create mode 100644 doc/themes/scikit-learn/static/img/glyphicons-halflings-white.png
 create mode 100644 doc/themes/scikit-learn/static/img/glyphicons-halflings.png
 create mode 100644 doc/themes/scikit-learn/static/img/google.png
 create mode 100644 doc/themes/scikit-learn/static/img/inria.jpg
 create mode 100644 doc/themes/scikit-learn/static/img/plot_classifier_comparison_1.png
 create mode 100644 doc/themes/scikit-learn/static/img/plot_manifold_sphere_1.png
 create mode 100644 doc/themes/scikit-learn/static/img/scikit-learn-logo-notext.png
 create mode 100644 doc/themes/scikit-learn/static/img/scikit-learn-logo-small.png
 create mode 100644 doc/themes/scikit-learn/static/img/scikit-learn-logo.png
 create mode 100644 doc/themes/scikit-learn/static/img/scikit-learn-logo.svg
 create mode 100644 doc/themes/scikit-learn/static/img/telecom.jpg
 create mode 100644 doc/themes/scikit-learn/static/jquery.js
 create mode 100644 doc/themes/scikit-learn/static/jquery.maphilight.js
 create mode 100644 doc/themes/scikit-learn/static/jquery.maphilight.min.js
 create mode 100644 doc/themes/scikit-learn/static/js/bootstrap.js
 create mode 100644 doc/themes/scikit-learn/static/js/bootstrap.min.js
 create mode 100644 doc/themes/scikit-learn/static/nature.css_t
 create mode 100644 doc/themes/scikit-learn/static/sidebar.js
 create mode 100644 doc/themes/scikit-learn/theme.conf
 create mode 100644 doc/tutorial/basic/tutorial.rst
 create mode 100644 doc/tutorial/common_includes/info.txt
 create mode 100644 doc/tutorial/index.rst
 create mode 100644 doc/tutorial/machine_learning_map/ML_MAPS_README.txt
 create mode 100644 doc/tutorial/machine_learning_map/index.rst
 create mode 100644 doc/tutorial/machine_learning_map/parse_path.py
 create mode 100644 doc/tutorial/machine_learning_map/pyparsing.py
 create mode 100644 doc/tutorial/machine_learning_map/svg2imagemap.py
 create mode 100644 doc/tutorial/statistical_inference/finding_help.rst
 create mode 100644 doc/tutorial/statistical_inference/index.rst
 create mode 100644 doc/tutorial/statistical_inference/model_selection.rst
 create mode 100644 doc/tutorial/statistical_inference/putting_together.rst
 create mode 100644 doc/tutorial/statistical_inference/settings.rst
 create mode 100644 doc/tutorial/statistical_inference/supervised_learning.rst
 create mode 100644 doc/tutorial/statistical_inference/unsupervised_learning.rst
 create mode 100644 doc/tutorial/text_analytics/placeholder.txt
 create mode 100644 doc/unsupervised_learning.rst
 create mode 100644 doc/user_guide.rst
 create mode 100644 doc/whats_new.rst
 create mode 100644 examples/README.txt
 create mode 100644 examples/applications/README.txt
 create mode 100644 examples/applications/face_recognition.py
 create mode 100644 examples/applications/plot_hmm_stock_analysis.py
 create mode 100644 examples/applications/plot_out_of_core_classification.py
 create mode 100644 examples/applications/plot_outlier_detection_housing.py
 create mode 100644 examples/applications/plot_species_distribution_modeling.py
 create mode 100644 examples/applications/plot_stock_market.py
 create mode 100644 examples/applications/plot_tomography_l1_reconstruction.py
 create mode 100644 examples/applications/svm_gui.py
 create mode 100644 examples/applications/topics_extraction_with_nmf.py
 create mode 100644 examples/applications/wikipedia_principal_eigenvector.py
 create mode 100644 examples/bicluster/README.txt
 create mode 100644 examples/bicluster/bicluster_newsgroups.py
 create mode 100644 examples/bicluster/plot_spectral_biclustering.py
 create mode 100644 examples/bicluster/plot_spectral_coclustering.py
 create mode 100644 examples/cluster/README.txt
 create mode 100644 examples/cluster/plot_adjusted_for_chance_measures.py
 create mode 100644 examples/cluster/plot_affinity_propagation.py
 create mode 100644 examples/cluster/plot_cluster_comparison.py
 create mode 100644 examples/cluster/plot_cluster_iris.py
 create mode 100644 examples/cluster/plot_color_quantization.py
 create mode 100644 examples/cluster/plot_dbscan.py
 create mode 100644 examples/cluster/plot_dict_face_patches.py
 create mode 100644 examples/cluster/plot_digits_agglomeration.py
 create mode 100644 examples/cluster/plot_feature_agglomeration_vs_univariate_selection.py
 create mode 100644 examples/cluster/plot_kmeans_digits.py
 create mode 100644 examples/cluster/plot_kmeans_stability_low_dim_dense.py
 create mode 100644 examples/cluster/plot_lena_compress.py
 create mode 100644 examples/cluster/plot_lena_segmentation.py
 create mode 100644 examples/cluster/plot_lena_ward_segmentation.py
 create mode 100644 examples/cluster/plot_mean_shift.py
 create mode 100644 examples/cluster/plot_mini_batch_kmeans.py
 create mode 100644 examples/cluster/plot_segmentation_toy.py
 create mode 100644 examples/cluster/plot_ward_structured_vs_unstructured.py
 create mode 100644 examples/covariance/README.txt
 create mode 100644 examples/covariance/plot_covariance_estimation.py
 create mode 100644 examples/covariance/plot_lw_vs_oas.py
 create mode 100644 examples/covariance/plot_mahalanobis_distances.py
 create mode 100644 examples/covariance/plot_outlier_detection.py
 create mode 100644 examples/covariance/plot_robust_vs_empirical_covariance.py
 create mode 100644 examples/covariance/plot_sparse_cov.py
 create mode 100644 examples/cross_decomposition/README.txt
 create mode 100644 examples/cross_decomposition/plot_compare_cross_decomposition.py
 create mode 100644 examples/datasets/README.txt
 create mode 100644 examples/datasets/plot_digits_last_image.py
 create mode 100644 examples/datasets/plot_iris_dataset.py
 create mode 100644 examples/datasets/plot_random_dataset.py
 create mode 100644 examples/decomposition/README.txt
 create mode 100644 examples/decomposition/plot_faces_decomposition.py
 create mode 100644 examples/decomposition/plot_ica_blind_source_separation.py
 create mode 100644 examples/decomposition/plot_ica_vs_pca.py
 create mode 100644 examples/decomposition/plot_image_denoising.py
 create mode 100644 examples/decomposition/plot_kernel_pca.py
 create mode 100644 examples/decomposition/plot_pca_3d.py
 create mode 100644 examples/decomposition/plot_pca_iris.py
 create mode 100644 examples/decomposition/plot_pca_vs_lda.py
 create mode 100644 examples/decomposition/plot_sparse_coding.py
 create mode 100644 examples/document_classification_20newsgroups.py
 create mode 100644 examples/document_clustering.py
 create mode 100644 examples/ensemble/README.txt
 create mode 100644 examples/ensemble/plot_adaboost_hastie_10_2.py
 create mode 100644 examples/ensemble/plot_adaboost_multiclass.py
 create mode 100644 examples/ensemble/plot_adaboost_regression.py
 create mode 100644 examples/ensemble/plot_adaboost_twoclass.py
 create mode 100644 examples/ensemble/plot_forest_importances.py
 create mode 100644 examples/ensemble/plot_forest_importances_faces.py
 create mode 100644 examples/ensemble/plot_forest_iris.py
 create mode 100644 examples/ensemble/plot_gradient_boosting_oob.py
 create mode 100644 examples/ensemble/plot_gradient_boosting_quantile.py
 create mode 100644 examples/ensemble/plot_gradient_boosting_regression.py
 create mode 100644 examples/ensemble/plot_gradient_boosting_regularization.py
 create mode 100644 examples/ensemble/plot_partial_dependence.py
 create mode 100644 examples/ensemble/plot_random_forest_embedding.py
 create mode 100644 examples/exercises/README.txt
 create mode 100644 examples/exercises/plot_cv_diabetes.py
 create mode 100644 examples/exercises/plot_cv_digits.py
 create mode 100644 examples/exercises/plot_digits_classification_exercise.py
 create mode 100644 examples/exercises/plot_iris_exercise.py
 create mode 100644 examples/feature_selection_pipeline.py
 create mode 100644 examples/feature_stacker.py
 create mode 100644 examples/gaussian_process/README.txt
 create mode 100644 examples/gaussian_process/gp_diabetes_dataset.py
 create mode 100644 examples/gaussian_process/plot_gp_probabilistic_classification_after_regression.py
 create mode 100644 examples/gaussian_process/plot_gp_regression.py
 create mode 100644 examples/grid_search_digits.py
 create mode 100644 examples/grid_search_text_feature_extraction.py
 create mode 100644 examples/hashing_vs_dict_vectorizer.py
 create mode 100644 examples/imputation.py
 create mode 100644 examples/linear_model/README.txt
 create mode 100644 examples/linear_model/lasso_dense_vs_sparse_data.py
 create mode 100644 examples/linear_model/plot_ard.py
 create mode 100644 examples/linear_model/plot_bayesian_ridge.py
 create mode 100644 examples/linear_model/plot_iris_logistic.py
 create mode 100644 examples/linear_model/plot_lasso_and_elasticnet.py
 create mode 100644 examples/linear_model/plot_lasso_coordinate_descent_path.py
 create mode 100644 examples/linear_model/plot_lasso_lars.py
 create mode 100644 examples/linear_model/plot_lasso_model_selection.py
 create mode 100644 examples/linear_model/plot_logistic.py
 create mode 100644 examples/linear_model/plot_logistic_l1_l2_sparsity.py
 create mode 100644 examples/linear_model/plot_logistic_path.py
 create mode 100644 examples/linear_model/plot_multi_task_lasso_support.py
 create mode 100644 examples/linear_model/plot_ols.py
 create mode 100644 examples/linear_model/plot_ols_3d.py
 create mode 100644 examples/linear_model/plot_ols_ridge_variance.py
 create mode 100644 examples/linear_model/plot_omp.py
 create mode 100644 examples/linear_model/plot_polynomial_interpolation.py
 create mode 100644 examples/linear_model/plot_ridge_path.py
 create mode 100644 examples/linear_model/plot_sgd_comparison.py
 create mode 100644 examples/linear_model/plot_sgd_iris.py
 create mode 100644 examples/linear_model/plot_sgd_loss_functions.py
 create mode 100644 examples/linear_model/plot_sgd_penalties.py
 create mode 100644 examples/linear_model/plot_sgd_separating_hyperplane.py
 create mode 100644 examples/linear_model/plot_sgd_weighted_classes.py
 create mode 100644 examples/linear_model/plot_sgd_weighted_samples.py
 create mode 100644 examples/linear_model/plot_sparse_recovery.py
 create mode 100644 examples/manifold/README.txt
 create mode 100644 examples/manifold/plot_compare_methods.py
 create mode 100644 examples/manifold/plot_lle_digits.py
 create mode 100644 examples/manifold/plot_manifold_sphere.py
 create mode 100644 examples/manifold/plot_mds.py
 create mode 100644 examples/manifold/plot_swissroll.py
 create mode 100644 examples/mixture/README.txt
 create mode 100644 examples/mixture/plot_gmm.py
 create mode 100644 examples/mixture/plot_gmm_classifier.py
 create mode 100644 examples/mixture/plot_gmm_pdf.py
 create mode 100644 examples/mixture/plot_gmm_selection.py
 create mode 100644 examples/mixture/plot_gmm_sin.py
 create mode 100644 examples/mlcomp_sparse_document_classification.py
 create mode 100644 examples/neighbors/README.txt
 create mode 100644 examples/neighbors/plot_classification.py
 create mode 100644 examples/neighbors/plot_digits_kde_sampling.py
 create mode 100644 examples/neighbors/plot_kde_1d.py
 create mode 100644 examples/neighbors/plot_nearest_centroid.py
 create mode 100644 examples/neighbors/plot_regression.py
 create mode 100644 examples/neighbors/plot_species_kde.py
 create mode 100644 examples/plot_classification_probability.py
 create mode 100644 examples/plot_classifier_comparison.py
 create mode 100644 examples/plot_confusion_matrix.py
 create mode 100644 examples/plot_digits_classification.py
 create mode 100644 examples/plot_digits_pipe.py
 create mode 100644 examples/plot_feature_selection.py
 create mode 100644 examples/plot_hmm_sampling.py
 create mode 100644 examples/plot_isotonic_regression.py
 create mode 100644 examples/plot_johnson_lindenstrauss_bound.py
 create mode 100644 examples/plot_kernel_approximation.py
 create mode 100644 examples/plot_lda_qda.py
 create mode 100644 examples/plot_multilabel.py
 create mode 100644 examples/plot_multioutput_face_completion.py
 create mode 100644 examples/plot_permutation_test_for_classification.py
 create mode 100644 examples/plot_precision_recall.py
 create mode 100755 examples/plot_rbm_logistic_classification.py
 create mode 100644 examples/plot_rfe_digits.py
 create mode 100644 examples/plot_rfe_with_cross_validation.py
 create mode 100644 examples/plot_roc.py
 create mode 100644 examples/plot_roc_crossval.py
 create mode 100644 examples/plot_train_error_vs_test_error.py
 create mode 100644 examples/randomized_search.py
 create mode 100644 examples/semi_supervised/README.txt
 create mode 100644 examples/semi_supervised/plot_label_propagation_digits.py
 create mode 100644 examples/semi_supervised/plot_label_propagation_digits_active_learning.py
 create mode 100644 examples/semi_supervised/plot_label_propagation_structure.py
 create mode 100644 examples/semi_supervised/plot_label_propagation_versus_svm_iris.py
 create mode 100644 examples/svm/README.txt
 create mode 100644 examples/svm/plot_custom_kernel.py
 create mode 100644 examples/svm/plot_iris.py
 create mode 100644 examples/svm/plot_oneclass.py
 create mode 100644 examples/svm/plot_rbf_parameters.py
 create mode 100644 examples/svm/plot_separating_hyperplane.py
 create mode 100644 examples/svm/plot_separating_hyperplane_unbalanced.py
 create mode 100644 examples/svm/plot_svm_anova.py
 create mode 100644 examples/svm/plot_svm_iris.py
 create mode 100644 examples/svm/plot_svm_kernels.py
 create mode 100644 examples/svm/plot_svm_margin.py
 create mode 100644 examples/svm/plot_svm_nonlinear.py
 create mode 100644 examples/svm/plot_svm_regression.py
 create mode 100644 examples/svm/plot_svm_scale_c.py
 create mode 100644 examples/svm/plot_weighted_samples.py
 create mode 100644 examples/tree/README.txt
 create mode 100644 examples/tree/plot_iris.py
 create mode 100644 examples/tree/plot_tree_regression.py
 create mode 100644 examples/tree/plot_tree_regression_multioutput.py
 create mode 100644 setup.cfg
 create mode 100755 setup.py
 create mode 100644 site.cfg
 create mode 100644 sklearn/__check_build/__init__.py
 create mode 100644 sklearn/__check_build/_check_build.c
 create mode 100644 sklearn/__check_build/_check_build.pyx
 create mode 100644 sklearn/__check_build/setup.py
 create mode 100644 sklearn/__init__.py
 create mode 100644 sklearn/_build_utils.py
 create mode 100644 sklearn/_hmmc.c
 create mode 100644 sklearn/_hmmc.pyx
 create mode 100644 sklearn/_isotonic.c
 create mode 100644 sklearn/_isotonic.pyx
 create mode 100644 sklearn/base.py
 create mode 100644 sklearn/cluster/__init__.py
 create mode 100644 sklearn/cluster/_feature_agglomeration.py
 create mode 100644 sklearn/cluster/_hierarchical.c
 create mode 100644 sklearn/cluster/_hierarchical.pyx
 create mode 100644 sklearn/cluster/_k_means.c
 create mode 100644 sklearn/cluster/_k_means.pyx
 create mode 100644 sklearn/cluster/affinity_propagation_.py
 create mode 100644 sklearn/cluster/bicluster/__init__.py
 create mode 100644 sklearn/cluster/bicluster/spectral.py
 create mode 100644 sklearn/cluster/bicluster/tests/__init__.py
 create mode 100644 sklearn/cluster/bicluster/tests/test_spectral.py
 create mode 100644 sklearn/cluster/bicluster/tests/test_utils.py
 create mode 100644 sklearn/cluster/bicluster/utils.py
 create mode 100644 sklearn/cluster/dbscan_.py
 create mode 100644 sklearn/cluster/hierarchical.py
 create mode 100644 sklearn/cluster/k_means_.py
 create mode 100644 sklearn/cluster/mean_shift_.py
 create mode 100644 sklearn/cluster/setup.py
 create mode 100644 sklearn/cluster/spectral.py
 create mode 100644 sklearn/cluster/tests/__init__.py
 create mode 100644 sklearn/cluster/tests/common.py
 create mode 100644 sklearn/cluster/tests/test_affinity_propagation.py
 create mode 100644 sklearn/cluster/tests/test_dbscan.py
 create mode 100644 sklearn/cluster/tests/test_hierarchical.py
 create mode 100644 sklearn/cluster/tests/test_k_means.py
 create mode 100644 sklearn/cluster/tests/test_mean_shift.py
 create mode 100644 sklearn/cluster/tests/test_spectral.py
 create mode 100644 sklearn/covariance/__init__.py
 create mode 100644 sklearn/covariance/empirical_covariance_.py
 create mode 100644 sklearn/covariance/graph_lasso_.py
 create mode 100644 sklearn/covariance/outlier_detection.py
 create mode 100644 sklearn/covariance/robust_covariance.py
 create mode 100644 sklearn/covariance/shrunk_covariance_.py
 create mode 100644 sklearn/covariance/tests/__init__.py
 create mode 100644 sklearn/covariance/tests/test_covariance.py
 create mode 100644 sklearn/covariance/tests/test_graph_lasso.py
 create mode 100644 sklearn/covariance/tests/test_robust_covariance.py
 create mode 100644 sklearn/cross_decomposition/__init__.py
 create mode 100644 sklearn/cross_decomposition/cca_.py
 create mode 100644 sklearn/cross_decomposition/pls_.py
 create mode 100644 sklearn/cross_decomposition/tests/test_pls.py
 create mode 100644 sklearn/cross_validation.py
 create mode 100644 sklearn/datasets/DATASET_PROPOSAL.txt
 create mode 100644 sklearn/datasets/__init__.py
 create mode 100644 sklearn/datasets/_svmlight_format.c
 create mode 100644 sklearn/datasets/_svmlight_format.pyx
 create mode 100644 sklearn/datasets/base.py
 create mode 100644 sklearn/datasets/california_housing.py
 create mode 100644 sklearn/datasets/covtype.py
 create mode 100644 sklearn/datasets/data/boston_house_prices.csv
 create mode 100644 sklearn/datasets/data/diabetes_data.csv.gz
 create mode 100644 sklearn/datasets/data/diabetes_target.csv.gz
 create mode 100644 sklearn/datasets/data/digits.csv.gz
 create mode 100644 sklearn/datasets/data/iris.csv
 create mode 100644 sklearn/datasets/data/linnerud_exercise.csv
 create mode 100644 sklearn/datasets/data/linnerud_physiological.csv
 create mode 100644 sklearn/datasets/descr/boston_house_prices.rst
 create mode 100644 sklearn/datasets/descr/diabetes.rst
 create mode 100644 sklearn/datasets/descr/digits.rst
 create mode 100644 sklearn/datasets/descr/iris.rst
 create mode 100644 sklearn/datasets/descr/linnerud.rst
 create mode 100644 sklearn/datasets/images/README.txt
 create mode 100644 sklearn/datasets/images/china.jpg
 create mode 100644 sklearn/datasets/images/flower.jpg
 create mode 100644 sklearn/datasets/lfw.py
 create mode 100644 sklearn/datasets/mlcomp.py
 create mode 100644 sklearn/datasets/mldata.py
 create mode 100644 sklearn/datasets/olivetti_faces.py
 create mode 100644 sklearn/datasets/samples_generator.py
 create mode 100644 sklearn/datasets/setup.py
 create mode 100644 sklearn/datasets/species_distributions.py
 create mode 100644 sklearn/datasets/svmlight_format.py
 create mode 100644 sklearn/datasets/tests/__init__.py
 create mode 100644 sklearn/datasets/tests/data/svmlight_classification.txt
 create mode 100644 sklearn/datasets/tests/data/svmlight_invalid.txt
 create mode 100644 sklearn/datasets/tests/data/svmlight_invalid_order.txt
 create mode 100644 sklearn/datasets/tests/data/svmlight_multilabel.txt
 create mode 100644 sklearn/datasets/tests/test_20news.py
 create mode 100644 sklearn/datasets/tests/test_base.py
 create mode 100644 sklearn/datasets/tests/test_covtype.py
 create mode 100644 sklearn/datasets/tests/test_lfw.py
 create mode 100644 sklearn/datasets/tests/test_mldata.py
 create mode 100644 sklearn/datasets/tests/test_samples_generator.py
 create mode 100644 sklearn/datasets/tests/test_svmlight_format.py
 create mode 100644 sklearn/datasets/twenty_newsgroups.py
 create mode 100644 sklearn/decomposition/__init__.py
 create mode 100644 sklearn/decomposition/dict_learning.py
 create mode 100644 sklearn/decomposition/factor_analysis.py
 create mode 100644 sklearn/decomposition/fastica_.py
 create mode 100644 sklearn/decomposition/kernel_pca.py
 create mode 100644 sklearn/decomposition/nmf.py
 create mode 100644 sklearn/decomposition/pca.py
 create mode 100644 sklearn/decomposition/sparse_pca.py
 create mode 100644 sklearn/decomposition/tests/__init__.py
 create mode 100644 sklearn/decomposition/tests/test_dict_learning.py
 create mode 100644 sklearn/decomposition/tests/test_factor_analysis.py
 create mode 100644 sklearn/decomposition/tests/test_fastica.py
 create mode 100644 sklearn/decomposition/tests/test_kernel_pca.py
 create mode 100644 sklearn/decomposition/tests/test_nmf.py
 create mode 100644 sklearn/decomposition/tests/test_pca.py
 create mode 100644 sklearn/decomposition/tests/test_sparse_pca.py
 create mode 100644 sklearn/decomposition/tests/test_truncated_svd.py
 create mode 100644 sklearn/decomposition/truncated_svd.py
 create mode 100644 sklearn/dummy.py
 create mode 100644 sklearn/ensemble/__init__.py
 create mode 100644 sklearn/ensemble/_gradient_boosting.c
 create mode 100644 sklearn/ensemble/_gradient_boosting.pyx
 create mode 100644 sklearn/ensemble/base.py
 create mode 100644 sklearn/ensemble/forest.py
 create mode 100644 sklearn/ensemble/gradient_boosting.py
 create mode 100644 sklearn/ensemble/partial_dependence.py
 create mode 100644 sklearn/ensemble/setup.py
 create mode 100644 sklearn/ensemble/tests/__init__.py
 create mode 100644 sklearn/ensemble/tests/test_base.py
 create mode 100644 sklearn/ensemble/tests/test_forest.py
 create mode 100644 sklearn/ensemble/tests/test_gradient_boosting.py
 create mode 100644 sklearn/ensemble/tests/test_gradient_boosting_loss_functions.py
 create mode 100644 sklearn/ensemble/tests/test_partial_dependence.py
 create mode 100644 sklearn/ensemble/tests/test_weight_boosting.py
 create mode 100644 sklearn/ensemble/weight_boosting.py
 create mode 100644 sklearn/externals/README
 create mode 100644 sklearn/externals/__init__.py
 create mode 100755 sklearn/externals/copy_joblib.sh
 create mode 100644 sklearn/externals/setup.py
 create mode 100644 sklearn/externals/six.py
 create mode 100644 sklearn/externals/test_externals_setup.py
 create mode 100644 sklearn/feature_extraction/__init__.py
 create mode 100644 sklearn/feature_extraction/_hashing.c
 create mode 100644 sklearn/feature_extraction/_hashing.pyx
 create mode 100644 sklearn/feature_extraction/dict_vectorizer.py
 create mode 100644 sklearn/feature_extraction/hashing.py
 create mode 100644 sklearn/feature_extraction/image.py
 create mode 100644 sklearn/feature_extraction/setup.py
 create mode 100644 sklearn/feature_extraction/stop_words.py
 create mode 100644 sklearn/feature_extraction/tests/__init__.py
 create mode 100644 sklearn/feature_extraction/tests/test_dict_vectorizer.py
 create mode 100644 sklearn/feature_extraction/tests/test_feature_hasher.py
 create mode 100644 sklearn/feature_extraction/tests/test_image.py
 create mode 100644 sklearn/feature_extraction/tests/test_text.py
 create mode 100644 sklearn/feature_extraction/text.py
 create mode 100644 sklearn/feature_selection/__init__.py
 create mode 100644 sklearn/feature_selection/base.py
 create mode 100644 sklearn/feature_selection/from_model.py
 create mode 100644 sklearn/feature_selection/rfe.py
 create mode 100644 sklearn/feature_selection/selector_mixin.py
 create mode 100644 sklearn/feature_selection/tests/__init__.py
 create mode 100644 sklearn/feature_selection/tests/test_base.py
 create mode 100644 sklearn/feature_selection/tests/test_chi2.py
 create mode 100644 sklearn/feature_selection/tests/test_feature_select.py
 create mode 100644 sklearn/feature_selection/tests/test_from_model.py
 create mode 100644 sklearn/feature_selection/tests/test_rfe.py
 create mode 100644 sklearn/feature_selection/univariate_selection.py
 create mode 100644 sklearn/gaussian_process/__init__.py
 create mode 100644 sklearn/gaussian_process/correlation_models.py
 create mode 100644 sklearn/gaussian_process/gaussian_process.py
 create mode 100644 sklearn/gaussian_process/regression_models.py
 create mode 100644 sklearn/gaussian_process/tests/__init__.py
 create mode 100644 sklearn/gaussian_process/tests/test_gaussian_process.py
 create mode 100644 sklearn/grid_search.py
 create mode 100644 sklearn/hmm.py
 create mode 100644 sklearn/isotonic.py
 create mode 100644 sklearn/kernel_approximation.py
 create mode 100644 sklearn/lda.py
 create mode 100644 sklearn/linear_model/__init__.py
 create mode 100644 sklearn/linear_model/base.py
 create mode 100644 sklearn/linear_model/bayes.py
 create mode 100644 sklearn/linear_model/cd_fast.c
 create mode 100644 sklearn/linear_model/cd_fast.pyx
 create mode 100644 sklearn/linear_model/coordinate_descent.py
 create mode 100644 sklearn/linear_model/least_angle.py
 create mode 100644 sklearn/linear_model/logistic.py
 create mode 100644
sklearn/linear_model/omp.py create mode 100644 sklearn/linear_model/passive_aggressive.py create mode 100644 sklearn/linear_model/perceptron.py create mode 100644 sklearn/linear_model/randomized_l1.py create mode 100644 sklearn/linear_model/ridge.py create mode 100644 sklearn/linear_model/setup.py create mode 100644 sklearn/linear_model/sgd_fast.c create mode 100644 sklearn/linear_model/sgd_fast.pyx create mode 100644 sklearn/linear_model/stochastic_gradient.py create mode 100644 sklearn/linear_model/tests/__init__.py create mode 100644 sklearn/linear_model/tests/test_base.py create mode 100644 sklearn/linear_model/tests/test_bayes.py create mode 100644 sklearn/linear_model/tests/test_coordinate_descent.py create mode 100644 sklearn/linear_model/tests/test_least_angle.py create mode 100644 sklearn/linear_model/tests/test_logistic.py create mode 100644 sklearn/linear_model/tests/test_omp.py create mode 100644 sklearn/linear_model/tests/test_passive_aggressive.py create mode 100644 sklearn/linear_model/tests/test_perceptron.py create mode 100644 sklearn/linear_model/tests/test_randomized_l1.py create mode 100644 sklearn/linear_model/tests/test_ridge.py create mode 100644 sklearn/linear_model/tests/test_sgd.py create mode 100644 sklearn/linear_model/tests/test_sparse_coordinate_descent.py create mode 100644 sklearn/manifold/__init__.py create mode 100644 sklearn/manifold/isomap.py create mode 100644 sklearn/manifold/locally_linear.py create mode 100644 sklearn/manifold/mds.py create mode 100644 sklearn/manifold/spectral_embedding_.py create mode 100644 sklearn/manifold/tests/test_isomap.py create mode 100644 sklearn/manifold/tests/test_locally_linear.py create mode 100644 sklearn/manifold/tests/test_mds.py create mode 100644 sklearn/manifold/tests/test_spectral_embedding.py create mode 100644 sklearn/metrics/__init__.py create mode 100644 sklearn/metrics/cluster/__init__.py create mode 100644 sklearn/metrics/cluster/bicluster/__init__.py create mode 100644 sklearn/metrics/cluster/bicluster/bicluster_metrics.py create mode 100644 sklearn/metrics/cluster/bicluster/tests/__init__.py create mode 100644 sklearn/metrics/cluster/bicluster/tests/test_bicluster_metrics.py create mode 100644 sklearn/metrics/cluster/expected_mutual_info_fast.c create mode 100644 sklearn/metrics/cluster/expected_mutual_info_fast.pyx create mode 100644 sklearn/metrics/cluster/setup.py create mode 100644 sklearn/metrics/cluster/supervised.py create mode 100644 sklearn/metrics/cluster/tests/__init__.py create mode 100644 sklearn/metrics/cluster/tests/test_supervised.py create mode 100644 sklearn/metrics/cluster/tests/test_unsupervised.py create mode 100644 sklearn/metrics/cluster/unsupervised.py create mode 100644 sklearn/metrics/metrics.py create mode 100644 sklearn/metrics/pairwise.py create mode 100644 sklearn/metrics/pairwise_fast.c create mode 100644 sklearn/metrics/pairwise_fast.pyx create mode 100644 sklearn/metrics/scorer.py create mode 100644 sklearn/metrics/setup.py create mode 100644 sklearn/metrics/tests/__init__.py create mode 100644 sklearn/metrics/tests/test_metrics.py create mode 100644 sklearn/metrics/tests/test_pairwise.py create mode 100644 sklearn/metrics/tests/test_score_objects.py create mode 100644 sklearn/mixture/__init__.py create mode 100644 sklearn/mixture/dpgmm.py create mode 100644 sklearn/mixture/gmm.py create mode 100644 sklearn/mixture/tests/__init__.py create mode 100644 sklearn/mixture/tests/test_dpgmm.py create mode 100644 sklearn/mixture/tests/test_gmm.py create mode 100644 
sklearn/multiclass.py create mode 100644 sklearn/naive_bayes.py create mode 100644 sklearn/neighbors/__init__.py create mode 100644 sklearn/neighbors/ball_tree.c create mode 100644 sklearn/neighbors/ball_tree.pyx create mode 100644 sklearn/neighbors/base.py create mode 100644 sklearn/neighbors/binary_tree.pxi create mode 100644 sklearn/neighbors/classification.py create mode 100644 sklearn/neighbors/dist_metrics.c create mode 100644 sklearn/neighbors/dist_metrics.pxd create mode 100644 sklearn/neighbors/dist_metrics.pyx create mode 100644 sklearn/neighbors/graph.py create mode 100644 sklearn/neighbors/kd_tree.c create mode 100644 sklearn/neighbors/kd_tree.pyx create mode 100644 sklearn/neighbors/kde.py create mode 100644 sklearn/neighbors/nearest_centroid.py create mode 100644 sklearn/neighbors/regression.py create mode 100644 sklearn/neighbors/setup.py create mode 100644 sklearn/neighbors/tests/__init__.py create mode 100644 sklearn/neighbors/tests/test_ball_tree.py create mode 100644 sklearn/neighbors/tests/test_dist_metrics.py create mode 100644 sklearn/neighbors/tests/test_kd_tree.py create mode 100644 sklearn/neighbors/tests/test_kde.py create mode 100644 sklearn/neighbors/tests/test_nearest_centroid.py create mode 100644 sklearn/neighbors/tests/test_neighbors.py create mode 100644 sklearn/neighbors/typedefs.c create mode 100644 sklearn/neighbors/typedefs.pxd create mode 100644 sklearn/neighbors/typedefs.pyx create mode 100644 sklearn/neighbors/unsupervised.py create mode 100644 sklearn/neural_network/__init__.py create mode 100644 sklearn/neural_network/rbm.py create mode 100644 sklearn/neural_network/tests/test_rbm.py create mode 100644 sklearn/pipeline.py create mode 100644 sklearn/pls.py create mode 100644 sklearn/preprocessing/__init__.py create mode 100644 sklearn/preprocessing/_weights.py create mode 100644 sklearn/preprocessing/data.py create mode 100644 sklearn/preprocessing/imputation.py create mode 100644 sklearn/preprocessing/label.py create mode 100644 sklearn/preprocessing/tests/__init__.py create mode 100644 sklearn/preprocessing/tests/test_data.py create mode 100644 sklearn/preprocessing/tests/test_imputation.py create mode 100644 sklearn/preprocessing/tests/test_label.py create mode 100644 sklearn/preprocessing/tests/test_weights.py create mode 100644 sklearn/qda.py create mode 100644 sklearn/random_projection.py create mode 100644 sklearn/semi_supervised/__init__.py create mode 100644 sklearn/semi_supervised/label_propagation.py create mode 100644 sklearn/semi_supervised/tests/__init__.py create mode 100644 sklearn/semi_supervised/tests/test_label_propagation.py create mode 100644 sklearn/setup.py create mode 100644 sklearn/src/cblas/ATL_drefcopy.c create mode 100644 sklearn/src/cblas/ATL_drefgemv.c create mode 100644 sklearn/src/cblas/ATL_drefgemvN.c create mode 100644 sklearn/src/cblas/ATL_drefgemvT.c create mode 100644 sklearn/src/cblas/ATL_drefger.c create mode 100644 sklearn/src/cblas/ATL_drefrot.c create mode 100644 sklearn/src/cblas/ATL_drefrotg.c create mode 100644 sklearn/src/cblas/ATL_dreftrsv.c create mode 100644 sklearn/src/cblas/ATL_dreftrsvLNN.c create mode 100644 sklearn/src/cblas/ATL_dreftrsvLNU.c create mode 100644 sklearn/src/cblas/ATL_dreftrsvLTN.c create mode 100644 sklearn/src/cblas/ATL_dreftrsvLTU.c create mode 100644 sklearn/src/cblas/ATL_dreftrsvUNN.c create mode 100644 sklearn/src/cblas/ATL_dreftrsvUNU.c create mode 100644 sklearn/src/cblas/ATL_dreftrsvUTN.c create mode 100644 sklearn/src/cblas/ATL_dreftrsvUTU.c create mode 100644 
sklearn/src/cblas/ATL_srefcopy.c create mode 100644 sklearn/src/cblas/ATL_srefrot.c create mode 100644 sklearn/src/cblas/ATL_srefrotg.c create mode 100644 sklearn/src/cblas/ATL_sreftrsv.c create mode 100644 sklearn/src/cblas/ATL_sreftrsvLNN.c create mode 100644 sklearn/src/cblas/ATL_sreftrsvLNU.c create mode 100644 sklearn/src/cblas/ATL_sreftrsvLTN.c create mode 100644 sklearn/src/cblas/ATL_sreftrsvLTU.c create mode 100644 sklearn/src/cblas/ATL_sreftrsvUNN.c create mode 100644 sklearn/src/cblas/ATL_sreftrsvUNU.c create mode 100644 sklearn/src/cblas/ATL_sreftrsvUTN.c create mode 100644 sklearn/src/cblas/ATL_sreftrsvUTU.c create mode 100644 sklearn/src/cblas/README.txt create mode 100644 sklearn/src/cblas/atlas_aux.h create mode 100644 sklearn/src/cblas/atlas_dsysinfo.h create mode 100644 sklearn/src/cblas/atlas_enum.h create mode 100644 sklearn/src/cblas/atlas_level1.h create mode 100644 sklearn/src/cblas/atlas_level2.h create mode 100644 sklearn/src/cblas/atlas_misc.h create mode 100644 sklearn/src/cblas/atlas_refalias1.h create mode 100644 sklearn/src/cblas/atlas_refalias2.h create mode 100644 sklearn/src/cblas/atlas_reflevel1.h create mode 100644 sklearn/src/cblas/atlas_reflevel2.h create mode 100644 sklearn/src/cblas/atlas_reflvl2.h create mode 100644 sklearn/src/cblas/atlas_refmisc.h create mode 100644 sklearn/src/cblas/atlas_ssysinfo.h create mode 100644 sklearn/src/cblas/atlas_type.h create mode 100644 sklearn/src/cblas/cblas.h create mode 100644 sklearn/src/cblas/cblas_daxpy.c create mode 100644 sklearn/src/cblas/cblas_dcopy.c create mode 100644 sklearn/src/cblas/cblas_ddot.c create mode 100644 sklearn/src/cblas/cblas_dgemv.c create mode 100644 sklearn/src/cblas/cblas_dger.c create mode 100644 sklearn/src/cblas/cblas_dnrm2.c create mode 100644 sklearn/src/cblas/cblas_drot.c create mode 100644 sklearn/src/cblas/cblas_drotg.c create mode 100644 sklearn/src/cblas/cblas_dscal.c create mode 100644 sklearn/src/cblas/cblas_dtrsv.c create mode 100644 sklearn/src/cblas/cblas_errprn.c create mode 100644 sklearn/src/cblas/cblas_scopy.c create mode 100644 sklearn/src/cblas/cblas_srot.c create mode 100644 sklearn/src/cblas/cblas_srotg.c create mode 100644 sklearn/src/cblas/cblas_strsv.c create mode 100644 sklearn/src/cblas/cblas_xerbla.c create mode 100644 sklearn/svm/__init__.py create mode 100644 sklearn/svm/base.py create mode 100644 sklearn/svm/bounds.py create mode 100644 sklearn/svm/classes.py create mode 100644 sklearn/svm/liblinear.c create mode 100644 sklearn/svm/liblinear.pxd create mode 100644 sklearn/svm/liblinear.pyx create mode 100644 sklearn/svm/libsvm.c create mode 100644 sklearn/svm/libsvm.pxd create mode 100644 sklearn/svm/libsvm.pyx create mode 100644 sklearn/svm/libsvm_sparse.c create mode 100644 sklearn/svm/libsvm_sparse.pyx create mode 100644 sklearn/svm/setup.py create mode 100644 sklearn/svm/src/blas/Makefile create mode 100644 sklearn/svm/src/blas/blas.h create mode 100644 sklearn/svm/src/blas/blasp.h create mode 100644 sklearn/svm/src/blas/daxpy.c create mode 100644 sklearn/svm/src/blas/ddot.c create mode 100644 sklearn/svm/src/blas/dnrm2.c create mode 100644 sklearn/svm/src/blas/dscal.c create mode 100644 sklearn/svm/src/liblinear/COPYRIGHT create mode 100644 sklearn/svm/src/liblinear/liblinear_helper.c create mode 100644 sklearn/svm/src/liblinear/linear.cpp create mode 100644 sklearn/svm/src/liblinear/linear.h create mode 100644 sklearn/svm/src/liblinear/tron.cpp create mode 100644 sklearn/svm/src/liblinear/tron.h create mode 100644 
sklearn/svm/src/libsvm/LIBSVM_CHANGES create mode 100644 sklearn/svm/src/libsvm/libsvm_helper.c create mode 100644 sklearn/svm/src/libsvm/libsvm_sparse_helper.c create mode 100644 sklearn/svm/src/libsvm/libsvm_template.cpp create mode 100644 sklearn/svm/src/libsvm/svm.cpp create mode 100644 sklearn/svm/src/libsvm/svm.h create mode 100644 sklearn/svm/tests/__init__.py create mode 100644 sklearn/svm/tests/test_bounds.py create mode 100644 sklearn/svm/tests/test_sparse.py create mode 100644 sklearn/svm/tests/test_svm.py create mode 100644 sklearn/test_setup.py create mode 100644 sklearn/tests/__init__.py create mode 100644 sklearn/tests/test_base.py create mode 100644 sklearn/tests/test_check_build.py create mode 100644 sklearn/tests/test_common.py create mode 100644 sklearn/tests/test_cross_validation.py create mode 100644 sklearn/tests/test_dummy.py create mode 100644 sklearn/tests/test_grid_search.py create mode 100644 sklearn/tests/test_hmm.py create mode 100644 sklearn/tests/test_init.py create mode 100644 sklearn/tests/test_isotonic.py create mode 100644 sklearn/tests/test_kernel_approximation.py create mode 100644 sklearn/tests/test_lda.py create mode 100644 sklearn/tests/test_multiclass.py create mode 100644 sklearn/tests/test_naive_bayes.py create mode 100644 sklearn/tests/test_pipeline.py create mode 100644 sklearn/tests/test_qda.py create mode 100644 sklearn/tests/test_random_projection.py create mode 100644 sklearn/tree/__init__.py create mode 100644 sklearn/tree/_tree.c create mode 100644 sklearn/tree/_tree.pxd create mode 100644 sklearn/tree/_tree.pyx create mode 100644 sklearn/tree/export.py create mode 100644 sklearn/tree/setup.py create mode 100644 sklearn/tree/tests/__init__.py create mode 100644 sklearn/tree/tests/test_export.py create mode 100644 sklearn/tree/tests/test_tree.py create mode 100644 sklearn/tree/tree.py create mode 100644 sklearn/utils/__init__.py create mode 100644 sklearn/utils/_logistic_sigmoid.c create mode 100644 sklearn/utils/_logistic_sigmoid.pyx create mode 100644 sklearn/utils/arpack.py create mode 100644 sklearn/utils/arraybuilder.c create mode 100644 sklearn/utils/arraybuilder.pyx create mode 100644 sklearn/utils/arrayfuncs.c create mode 100644 sklearn/utils/arrayfuncs.pyx create mode 100644 sklearn/utils/bench.py create mode 100644 sklearn/utils/class_weight.py create mode 100644 sklearn/utils/extmath.py create mode 100644 sklearn/utils/fixes.py create mode 100644 sklearn/utils/graph.py create mode 100644 sklearn/utils/graph_shortest_path.c create mode 100644 sklearn/utils/graph_shortest_path.pyx create mode 100644 sklearn/utils/lgamma.c create mode 100644 sklearn/utils/lgamma.pxd create mode 100644 sklearn/utils/lgamma.pyx create mode 100644 sklearn/utils/linear_assignment_.py create mode 100644 sklearn/utils/multiclass.py create mode 100644 sklearn/utils/murmurhash.c create mode 100644 sklearn/utils/murmurhash.pxd create mode 100644 sklearn/utils/murmurhash.pyx create mode 100644 sklearn/utils/random.c create mode 100644 sklearn/utils/random.pxd create mode 100644 sklearn/utils/random.pyx create mode 100644 sklearn/utils/seq_dataset.c create mode 100644 sklearn/utils/seq_dataset.pxd create mode 100644 sklearn/utils/seq_dataset.pyx create mode 100644 sklearn/utils/setup.py create mode 100644 sklearn/utils/sparsefuncs.c create mode 100644 sklearn/utils/sparsefuncs.pxd create mode 100644 sklearn/utils/sparsefuncs.pyx create mode 100644 sklearn/utils/sparsetools/README create mode 100644 sklearn/utils/sparsetools/__init__.py create mode 100644 
sklearn/utils/sparsetools/_graph_tools.c create mode 100644 sklearn/utils/sparsetools/_graph_tools.pyx create mode 100644 sklearn/utils/sparsetools/_graph_validation.py create mode 100644 sklearn/utils/sparsetools/_min_spanning_tree.c create mode 100644 sklearn/utils/sparsetools/_min_spanning_tree.pyx create mode 100644 sklearn/utils/sparsetools/_traversal.c create mode 100644 sklearn/utils/sparsetools/_traversal.pyx create mode 100644 sklearn/utils/sparsetools/setup.py create mode 100644 sklearn/utils/sparsetools/tests/__init__.py create mode 100644 sklearn/utils/sparsetools/tests/test_spanning_tree.py create mode 100644 sklearn/utils/sparsetools/tests/test_traversal.py create mode 100644 sklearn/utils/src/MurmurHash3.cpp create mode 100644 sklearn/utils/src/MurmurHash3.h create mode 100644 sklearn/utils/src/cholesky_delete.h create mode 100644 sklearn/utils/src/gamma.c create mode 100644 sklearn/utils/src/gamma.h create mode 100644 sklearn/utils/testing.py create mode 100644 sklearn/utils/tests/__init__.py create mode 100644 sklearn/utils/tests/test_bench.py create mode 100644 sklearn/utils/tests/test_class_weight.py create mode 100644 sklearn/utils/tests/test_extmath.py create mode 100644 sklearn/utils/tests/test_fixes.py create mode 100644 sklearn/utils/tests/test_graph.py create mode 100644 sklearn/utils/tests/test_linear_assignment.py create mode 100644 sklearn/utils/tests/test_multiclass.py create mode 100644 sklearn/utils/tests/test_murmurhash.py create mode 100644 sklearn/utils/tests/test_random.py create mode 100644 sklearn/utils/tests/test_shortest_path.py create mode 100644 sklearn/utils/tests/test_sparsefuncs.py create mode 100644 sklearn/utils/tests/test_testing.py create mode 100644 sklearn/utils/tests/test_utils.py create mode 100644 sklearn/utils/tests/test_validation.py create mode 100644 sklearn/utils/validation.py create mode 100644 sklearn/utils/weight_vector.c create mode 100644 sklearn/utils/weight_vector.pxd create mode 100644 sklearn/utils/weight_vector.pyx diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..7985aa5 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,25 @@ +/sklearn/__check_build/_check_build.c -diff +/sklearn/_hmmc.c -diff +/sklearn/_isotonic.c -diff +/sklearn/cluster/_hierarchical.c -diff +/sklearn/cluster/_k_means.c -diff +/sklearn/datasets/_svmlight_format.c -diff +/sklearn/ensemble/_gradient_boosting.c -diff +/sklearn/feature_extraction/_hashing.c -diff +/sklearn/linear_model/cd_fast.c -diff +/sklearn/linear_model/sgd_fast.c -diff +/sklearn/metrics/pairwise_fast.c -diff +/sklearn/neighbors/ball_tree.c -diff +/sklearn/svm/liblinear.c -diff +/sklearn/svm/libsvm.c -diff +/sklearn/svm/libsvm_sparse.c -diff +/sklearn/tree/_tree.c -diff +/sklearn/utils/arraybuilder.c -diff +/sklearn/utils/arrayfuncs.c -diff +/sklearn/utils/graph_shortest_path.c -diff +/sklearn/utils/lgamma.c -diff +/sklearn/utils/_logistic_sigmoid.c -diff +/sklearn/utils/murmurhash.c -diff +/sklearn/utils/seq_dataset.c -diff +/sklearn/utils/sparsefuncs.c -diff +/sklearn/utils/weight_vector.c -diff diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e562b99 --- /dev/null +++ b/.gitignore @@ -0,0 +1,52 @@ +*.pyc +*.so +*.pyd +*~ +.#* +*.lprof +*.swp +*.swo +.DS_Store +build +sklearn/datasets/__config__.py +sklearn/**/*.html + +dist/ +MANIFEST +doc/_build/ +doc/auto_examples/ +doc/modules/generated/ +doc/datasets/generated/ +*.pdf +pip-log.txt +scikit_learn.egg-info/ +.coverage +coverage +tags +covtype.data.gz +20news-18828/ 
+20news-18828.tar.gz
+coverages.zip
+samples.zip
+doc/coverages.zip
+doc/samples.zip
+coverages
+samples
+doc/coverages
+doc/samples
+*.prof
+
+lfw_preprocessed/
+nips2010_pdf/
+
+*.nt.bz2
+*.tar.gz
+*.tgz
+
+examples/cluster/joblib
+reuters/
+benchmarks/bench_covertype_data/
+
+*.prefs
+.pydevproject
+.idea
diff --git a/.mailmap b/.mailmap
new file mode 100644
index 0000000..74d7126
--- /dev/null
+++ b/.mailmap
@@ -0,0 +1,93 @@
+Alexandre Gramfort
+Alexandre Gramfort
+Alexandre Gramfort
+Andreas Mueller
+Andreas Mueller
+Andreas Mueller
+Andreas Mueller
+Andreas Mueller
+Arnaud Joly
+Arnaud Joly
+Arnaud Joly
+Anne-Laure Fouque
+Ariel Rokem arokem
+Bertrand Thirion
+Brian Cheung
+Brian Cheung
+Brian Cheung
+Brian Holt
+Christian Osendorfer
+Clay Woolam
+Denis Engemann
+Denis Engemann
+Denis Engemann
+Denis Engemann
+Diego Molla
+DraXus draxus
+Edouard DUCHESNAY
+Edouard DUCHESNAY
+Edouard DUCHESNAY
+Emmanuelle Gouillart
+Emmanuelle Gouillart
+Fabian Pedregosa
+Fabian Pedregosa
+Fabian Pedregosa
+Federico Vaggi
+Federico Vaggi
+Gael Varoquaux
+Gael Varoquaux
+Gael Varoquaux
+Gilles Louppe
+Harikrishnan S
+Hrishikesh Huilgolkar
+Immanuel Bayer
+Jake VanderPlas
+Jake VanderPlas
+Jake VanderPlas
+James Bergstra
+Jaques Grobler
+Jan Schlüter
+Jean Kossaifi
+Jean Kossaifi
+Jean Kossaifi
+Joel Nothman
+Lars Buitinck
+Lars Buitinck
+Lars Buitinck
+Matthieu Perrot
+Michael Eickenberg
+Michael Eickenberg
+Nelle Varoquaux
+Nelle Varoquaux
+Nicolas Pinto
+Noel Dawe
+Noel Dawe
+Olivier Grisel
+Olivier Grisel
+Olivier Hervieu
+Peter Prettenhofer
+Robert Layton
+Roman Sinayev
+Roman Sinayev
+Satrajit Ghosh
+Shiqiao Du
+Shiqiao Du
+Tim Sheerman-Chase
+Vincent Dubourg
+Vincent Dubourg
+Vincent Michel
+Vincent Michel
+Vincent Michel
+Vincent Michel
+Vincent Michel
+Vincent Schut
+Virgile Fritsch
+Virgile Fritsch
+Vlad Niculae
+Wei Li
+Wei Li
+X006
+Xinfan Meng
+Yannick Schwartz
+Yannick Schwartz
+
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..97a181a
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,10 @@
+language: python
+python:
+  - "2.7"
+virtualenv:
+  system_site_packages: true
+before_install:
+  - sudo apt-get update -qq
+  - sudo apt-get install -qq python-scipy python-nose
+install: python setup.py build_ext --inplace
+script: make test
diff --git a/AUTHORS.rst b/AUTHORS.rst
new file mode 100644
index 0000000..5231773
--- /dev/null
+++ b/AUTHORS.rst
@@ -0,0 +1,91 @@
+.. -*- mode: rst -*-
+
+
+This is a community effort, and as such many people have contributed
+to it over the years.
+
+History
+-------
+
+This project was started in 2007 as a Google Summer of Code project by
+David Cournapeau. Later that year, Matthieu Brucher started work on
+this project as part of his thesis.
+
+In 2010 Fabian Pedregosa, Gael Varoquaux, Alexandre Gramfort and Vincent
+Michel of INRIA took leadership of the project and made the first public
+release on February 1st, 2010. Since then, several releases have appeared
+following a ~3 month cycle, and a thriving international community has
+been leading the development.
+
+People
+------
+
+.. hlist::
+
+  * David Cournapeau
+
+  * Jarrod Millman
+
+  * `Matthieu Brucher `_
+
+  * `Fabian Pedregosa `_
+
+  * `Gael Varoquaux `_
+
+  * `Jake VanderPlas `_
+
+  * `Alexandre Gramfort `_
+
+  * `Olivier Grisel `_
+
+  * Bertrand Thirion
+
+  * Vincent Michel
+
+  * Chris Filo Gorgolewski
+
+  * `Angel Soler Gollonet `_
+
+  * `Yaroslav Halchenko `_
+
+  * Ron Weiss
+
+  * `Virgile Fritsch
+    `_
+
+  * `Mathieu Blondel `_
+
+  * `Peter Prettenhofer
+    `_
+
+  * Vincent Dubourg
+
+  * `Alexandre Passos `_
+
+  * `Vlad Niculae `_
+
+  * Edouard Duchesnay
+
+  * Thouis (Ray) Jones
+
+  * Lars Buitinck
+
+  * Paolo Losi
+
+  * Nelle Varoquaux
+
+  * `Brian Holt `_
+
+  * Robert Layton
+
+  * `Gilles Louppe `_
+
+  * `Andreas Müller `_ (release manager)
+
+  * `Satra Ghosh `_
+
+  * `Wei Li `_
+
+  * `Arnaud Joly `_
+
+  * `Kemal Eren `_
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..f5b2487
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,145 @@
+
+Contributing code
+=================
+
+**Note: This document is just meant to get you started; visit the [**Contributing
+page**](http://scikit-learn.org/stable/developers/index.html#coding-guidelines)
+for the full contributor's guide. Please be sure to read it carefully to make
+the code review process go as smoothly as possible and maximize the
+likelihood of your contribution being merged.**
+
+How to contribute
+-----------------
+
+The preferred way to contribute to scikit-learn is to fork the
+[main repository](http://github.com/scikit-learn/scikit-learn/) on
+GitHub:
+
+1. Fork the [project repository](http://github.com/scikit-learn/scikit-learn):
+   click on the 'Fork' button near the top of the page. This creates
+   a copy of the code under your account on the GitHub server.
+
+2. Clone this copy to your local disk:
+
+        $ git clone git@github.com:YourLogin/scikit-learn.git
+
+3. Create a branch to hold your changes:
+
+        $ git checkout -b my-feature
+
+   and start making changes. Never work in the ``master`` branch!
+
+4. Work on this copy on your computer using Git to do the version
+   control. When you're done editing, do:
+
+        $ git add modified_files
+        $ git commit
+
+   to record your changes in Git, then push them to GitHub with:
+
+        $ git push -u origin my-feature
+
+Finally, go to the web page of your fork of the scikit-learn repo,
+and click 'Pull request' to send your changes to the maintainers for
+review. This will send an email to the committers.
+
+(If any of the above seems like magic to you, then look up the
+[Git documentation](http://git-scm.com/documentation) on the web.)
+
+It is recommended to check that your contribution complies with the
+following rules before submitting a pull request:
+
+- All public methods should have informative docstrings with sample
+  usage presented as doctests when appropriate.
+
+- All other tests pass when everything is rebuilt from scratch. On
+  Unix-like systems, check with (from the toplevel source folder):
+
+        $ make
+
+- When adding additional functionality, provide at least one
+  example script in the ``examples/`` folder. Have a look at other
+  examples for reference. Examples should demonstrate why the new
+  functionality is useful in practice and, if possible, compare it
+  to other methods available in scikit-learn.
+
+- At least one paragraph of narrative documentation with links to
+  references in the literature (with PDF links when possible) and
+  the example.
+
+The documentation should also include expected time and space
+complexity of the algorithm and scalability, e.g. "this algorithm
"this algorithm +can scale to a large number of samples > 100000, but does not +scale in dimensionality: n_features is expected to be lower than +100". + +You can also check for common programming errors with the following +tools: + +- Code with good unittest coverage (at least 80%), check with: + + $ pip install nose coverage + $ nosetests --with-coverage path/to/tests_for_package + +- No pyflakes warnings, check with: + + $ pip install pyflakes + $ pyflakes path/to/module.py + +- No PEP8 warnings, check with: + + $ pip install pep8 + $ pep8 path/to/module.py + +- AutoPEP8 can help you fix some of the easy redundant errors: + + $ pip install autopep8 + $ autopep8 path/to/pep8.py + +Bonus points for contributions that include a performance analysis with +a benchmark script and profiling output (please report on the mailing +list or on the GitHub issue). + +Easy Issues +----------- + +A great way to start contributing to scikit-learn is to pick an item +from the list of [Easy issues](https://github.com/scikit-learn/scikit-learn/issues?labels=Easy) +in the issue tracker. Resolving these issues allow you to start +contributing to the project without much prior knowledge. Your +assistance in this area will be greatly appreciated by the more +experienced developers as it helps free up their time to concentrate on +other issues. + +Documentation +------------- + +We are glad to accept any sort of documentation: function docstrings, +reStructuredText documents (like this one), tutorials, etc. +reStructuredText documents live in the source code repository under the +doc/ directory. + +You can edit the documentation using any text editor and then generate +the HTML output by typing ``make html`` from the doc/ directory. +Alternatively, ``make`` can be used to quickly generate the +documentation without the example gallery. The resulting HTML files will +be placed in _build/html/ and are viewable in a web browser. See the +README file in the doc/ directory for more information. + +For building the documentation, you will need +[sphinx](http://sphinx.pocoo.org/) and +[matplotlib](http://matplotlib.sourceforge.net/). + +When you are writing documentation, it is important to keep a good +compromise between mathematical and algorithmic details, and give +intuition to the reader on what the algorithm does. It is best to always +start with a small paragraph with a hand-waving explanation of what the +method does to the data and a figure (coming from an example) +illustrating it. + +Further Information +------------------- + +Visit the [Contributing Code](http://scikit-learn.org/stable/developers/index.html#coding-guidelines) +section of the website for more information including conforming to the +API spec and profiling contributed code. diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..fb4c305 --- /dev/null +++ b/COPYING @@ -0,0 +1,32 @@ +New BSD License + +Copyright (c) 2007–2013 The scikit-learn developers. +All rights reserved. + + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + a. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + b. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + c. 
+     its contributors may be used to endorse or promote products
+     derived from this software without specific prior written
+     permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
+
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..b2e138f
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,5 @@
+include *.rst
+recursive-include doc *
+recursive-include examples *
+recursive-include sklearn *.c *.h *.pyx *.pxd
+recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..0832cb7
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,64 @@
+# simple makefile to simplify repetitive build env management tasks under posix
+
+# caution: testing won't work on windows, see README
+
+PYTHON ?= python
+CYTHON ?= cython
+NOSETESTS ?= nosetests
+CTAGS ?= ctags
+
+all: clean inplace test
+
+clean-pyc:
+	find sklearn -name "*.pyc" | xargs rm -f
+
+clean-so:
+	find sklearn -name "*.so" | xargs rm -f
+	find sklearn -name "*.pyd" | xargs rm -f
+	find sklearn -name "__pycache__" | xargs rm -rf
+
+clean-build:
+	rm -rf build
+	rm -rf dist
+
+clean-ctags:
+	rm -f tags
+
+clean: clean-build clean-pyc clean-so clean-ctags
+
+in: inplace # just a shortcut
+inplace:
+	$(PYTHON) setup.py build_ext -i
+
+test-code: in
+	$(NOSETESTS) -s -v sklearn
+test-doc:
+	$(NOSETESTS) -s -v doc/ doc/modules/ doc/datasets/ \
+	doc/developers doc/tutorial/basic doc/tutorial/statistical_inference
+
+test-coverage:
+	rm -rf coverage .coverage
+	$(NOSETESTS) -s -v --with-coverage sklearn
+
+test: test-code test-doc
+
+trailing-spaces:
+	find sklearn -name "*.py" | xargs perl -pi -e 's/[ \t]*$$//'
+
+cython:
+	find sklearn -name "*.pyx" | xargs $(CYTHON)
+
+ctags:
+	# make tags for symbol based navigation in emacs and vim
+	# Install with: sudo apt-get install exuberant-ctags
+	$(CTAGS) -R *
+
+doc: inplace
+	make -C doc html
+
+doc-noplot: inplace
+	make -C doc html-noplot
+
+code-analysis:
+	flake8 sklearn | grep -v __init__ | grep -v external
+	pylint -E -i y sklearn/ -d E1103,E0611,E1101
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..6888292
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,93 @@
+.. -*- mode: rst -*-
+
+|Travis|_
+
+.. |Travis| image:: https://api.travis-ci.org/scikit-learn/scikit-learn.png?branch=master
+.. _Travis: https://travis-ci.org/scikit-learn/scikit-learn
+
+scikit-learn
+============
+
+scikit-learn is a Python module for machine learning built on top of
+SciPy and distributed under the 3-Clause BSD license.
+
+The project was started in 2007 by David Cournapeau as a Google Summer
+of Code project, and since then many volunteers have contributed. See
+the AUTHORS.rst file for a complete list of contributors.
+
+It is currently maintained by a team of volunteers.
+
+**Note** `scikit-learn` was previously referred to as `scikits.learn`.
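+
+As a quick sanity check of an installation, any small estimator on one of
+the bundled datasets will do; for example (an illustrative sketch only)::
+
+    >>> from sklearn import datasets, svm
+    >>> iris = datasets.load_iris()
+    >>> clf = svm.SVC().fit(iris.data, iris.target)
+    >>> clf.predict(iris.data[:2])
+    array([0, 0])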
+
+
+Important links
+===============
+
+- Official source code repo: https://github.com/scikit-learn/scikit-learn
+- HTML documentation (stable release): http://scikit-learn.org
+- HTML documentation (development version): http://scikit-learn.org/dev/
+- Download releases: http://sourceforge.net/projects/scikit-learn/files/
+- Issue tracker: https://github.com/scikit-learn/scikit-learn/issues
+- Mailing list: https://lists.sourceforge.net/lists/listinfo/scikit-learn-general
+- IRC channel: ``#scikit-learn`` at ``irc.freenode.net``
+
+Dependencies
+============
+
+scikit-learn is tested to work under Python 2.6+ and Python 3.3+
+(using the same codebase thanks to an embedded copy of `six `_).
+
+The required dependencies to build the software are NumPy >= 1.3,
+SciPy >= 0.7, and a working C/C++ compiler.
+
+For running the examples, Matplotlib >= 0.99.1 is required, and for
+running the tests you need nose >= 0.10.
+
+This configuration matches the Ubuntu 10.04 LTS release from April 2010.
+
+
+Install
+=======
+
+This package uses distutils, which is the default way of installing
+Python modules. To install in your home directory, use::
+
+    python setup.py install --user
+
+To install for all users on Unix/Linux::
+
+    python setup.py build
+    sudo python setup.py install
+
+
+Development
+===========
+
+Code
+----
+
+GIT
+~~~
+
+You can check the latest sources with the command::
+
+    git clone git://github.com/scikit-learn/scikit-learn.git
+
+or if you have write privileges::
+
+    git clone git@github.com:scikit-learn/scikit-learn.git
+
+
+Testing
+-------
+
+After installation, you can launch the test suite from outside the
+source directory (you will need to have nosetests installed)::
+
+    $ nosetests --exe sklearn
+
+See the web page http://scikit-learn.org/stable/install.html#testing
+for more information.
+
+    Random number generation can be controlled during testing by setting
+    the ``SKLEARN_SEED`` environment variable.
diff --git a/benchmarks/bench_covertype.py b/benchmarks/bench_covertype.py
new file mode 100644
index 0000000..4e46b30
--- /dev/null
+++ b/benchmarks/bench_covertype.py
@@ -0,0 +1,269 @@
+"""
+===========================
+Covertype dataset benchmark
+===========================
+
+Benchmark stochastic gradient descent (SGD), Liblinear, Naive Bayes, CART
+(decision tree), RandomForest and Extra-Trees on the forest covertype dataset
+of Blackard and Dean [1]. The dataset comprises 581,012 samples. It is
+low dimensional with 54 features and a sparsity of approx. 23%. Here, we
+consider the task of predicting class 1 (spruce/fir). The classification
+performance of SGD is competitive with Liblinear while being two orders of
+magnitude faster to train::
+
+    [..]
+    Classification performance:
+    ===========================
+    Classifier    train-time  test-time  error-rate
+    --------------------------------------------
+    liblinear      15.9744s    0.0705s     0.2305
+    GaussianNB      3.0666s    0.3884s     0.4841
+    SGD             1.0558s    0.1152s     0.2300
+    CART           79.4296s    0.0523s     0.0469
+    RandomForest 1190.1620s    0.5881s     0.0243
+    ExtraTrees    640.3194s    0.6495s     0.0198
+
+The same task has been used in a number of papers including:
+
+ * `"SVM Optimization: Inverse Dependence on Training Set Size"
+   `_
+   S. Shalev-Shwartz, N. Srebro - In Proceedings of ICML '08.
+
+ * `"Pegasos: Primal estimated sub-gradient solver for svm"
+   `_
+   S. Shalev-Shwartz, Y. Singer, N. Srebro - In Proceedings of ICML '07.
+
+ * `"Training Linear SVMs in Linear Time"
+   `_
+   T. Joachims - In SIGKDD '06
+
+[1] http://archive.ics.uci.edu/ml/datasets/Covertype
+
+"""
+from __future__ import division, print_function
+
+print(__doc__)
+
+# Author: Peter Prettenhofer
+# License: BSD 3 clause
+
+import logging
+import os
+import sys
+from time import time
+from optparse import OptionParser
+
+import numpy as np
+
+from sklearn.datasets import fetch_covtype
+from sklearn.svm import LinearSVC
+from sklearn.linear_model import SGDClassifier
+from sklearn.naive_bayes import GaussianNB
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn import metrics
+from sklearn.externals.joblib import Memory
+
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s %(levelname)s %(message)s')
+logger = logging.getLogger(__name__)
+
+op = OptionParser()
+op.add_option("--classifiers",
+              dest="classifiers", default='liblinear,GaussianNB,SGD,CART',
+              help="comma-separated list of classifiers to benchmark. "
+                   "default: %default. available: "
+                   "liblinear,GaussianNB,SGD,CART,ExtraTrees,RandomForest")
+
+op.add_option("--n-jobs",
+              dest="n_jobs", default=1, type=int,
+              help="Number of concurrently running workers for models that"
+                   " support parallelism.")
+
+# Each random number generator uses the same seed to avoid coupling
+# issues between estimators.
+op.add_option("--random-seed",
+              dest="random_seed", default=13, type=int,
+              help="Common seed used by random number generator.")
+
+op.print_help()
+
+(opts, args) = op.parse_args()
+if len(args) > 0:
+    op.error("this script takes no arguments.")
+    sys.exit(1)
+
+# Memoize the data extraction and memory map the resulting
+# train / test splits in readonly mode
+bench_folder = os.path.dirname(__file__)
+original_archive = os.path.join(bench_folder, 'covtype.data.gz')
+joblib_cache_folder = os.path.join(bench_folder, 'bench_covertype_data')
+m = Memory(joblib_cache_folder, mmap_mode='r')
+
+
+# Load the data, then cache and memmap the train/test split
+@m.cache
+def load_data(dtype=np.float32, order='C'):
+    ######################################################################
+    ## Load dataset
+    print("Loading dataset...")
+    data = fetch_covtype(download_if_missing=True, shuffle=True,
+                         random_state=opts.random_seed)
+    X, y = data['data'], data['target']
+    X = np.asarray(X, dtype=dtype)
+
+    if order.lower() == 'f':
+        X = np.asfortranarray(X)
+
+    # class 1 vs. all others.
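+    # (i.e. binarize the target for the spruce/fir task described above:
+    # class 1 becomes +1 and the six other cover types collapse into -1)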
+ y[np.where(y != 1)] = -1 + + ###################################################################### + ## Create train-test split (as [Joachims, 2006]) + logger.info("Creating train-test split...") + n_train = 522911 + + X_train = X[:n_train] + y_train = y[:n_train] + X_test = X[n_train:] + y_test = y[n_train:] + + ###################################################################### + ## Standardize first 10 features (the numerical ones) + mean = X_train.mean(axis=0) + std = X_train.std(axis=0) + mean[10:] = 0.0 + std[10:] = 1.0 + X_train = (X_train - mean) / std + X_test = (X_test - mean) / std + return X_train, X_test, y_train, y_test + + +X_train, X_test, y_train, y_test = load_data() + +###################################################################### +## Print dataset statistics +print("") +print("Dataset statistics:") +print("===================") +print("%s %d" % ("number of features:".ljust(25), + X_train.shape[1])) +print("%s %d" % ("number of classes:".ljust(25), + np.unique(y_train).shape[0])) +print("%s %s" % ("data type:".ljust(25), X_train.dtype)) +print("%s %d (pos=%d, neg=%d, size=%dMB)" + % ("number of train samples:".ljust(25), + X_train.shape[0], np.sum(y_train == 1), + np.sum(y_train == -1), int(X_train.nbytes / 1e6))) +print("%s %d (pos=%d, neg=%d, size=%dMB)" + % ("number of test samples:".ljust(25), + X_test.shape[0], np.sum(y_test == 1), + np.sum(y_test == -1), int(X_test.nbytes / 1e6))) + + +classifiers = dict() + + +###################################################################### +## Benchmark classifiers +def benchmark(clf): + t0 = time() + clf.fit(X_train, y_train) + train_time = time() - t0 + t0 = time() + pred = clf.predict(X_test) + test_time = time() - t0 + err = metrics.zero_one_loss(y_test, pred, normalize=True) + return err, train_time, test_time + +###################################################################### +## Train Liblinear model +liblinear_parameters = { + 'loss': 'l2', + 'penalty': 'l2', + 'C': 1000, + 'dual': False, + 'tol': 1e-3, + "random_state": opts.random_seed, +} +classifiers['liblinear'] = LinearSVC(**liblinear_parameters) + +###################################################################### +## Train GaussianNB model +classifiers['GaussianNB'] = GaussianNB() + +###################################################################### +## Train SGD model +sgd_parameters = { + 'alpha': 0.001, + 'n_iter': 2, + 'n_jobs': opts.n_jobs, + "random_state": opts.random_seed, +} +classifiers['SGD'] = SGDClassifier(**sgd_parameters) + +###################################################################### +## Train CART model +classifiers['CART'] = DecisionTreeClassifier(min_samples_split=5, + max_depth=None, + random_state=opts.random_seed) + +###################################################################### +## Train RandomForest model +rf_parameters = { + "n_estimators": 20, + "n_jobs": opts.n_jobs, + "random_state": opts.random_seed, +} +classifiers['RandomForest'] = RandomForestClassifier(**rf_parameters) + +###################################################################### +## Train Extra-Trees model +classifiers['ExtraTrees'] = ExtraTreesClassifier(n_estimators=20, + n_jobs=opts.n_jobs, + random_state=opts.random_seed) + +###################################################################### +## Train GBRT model +classifiers['GBRT'] = GradientBoostingClassifier(n_estimators=250, + random_state=opts.random_seed) + + +selected_classifiers = opts.classifiers.split(',') +for name in selected_classifiers: + if name 
not in classifiers:
+        op.error('classifier %r unknown' % name)
+        sys.exit(1)
+
+print()
+print("Training Classifiers")
+print("====================")
+print()
+err, train_time, test_time = {}, {}, {}
+for name in sorted(selected_classifiers):
+    print("Training %s ..." % name)
+    err[name], train_time[name], test_time[name] = benchmark(classifiers[name])
+
+######################################################################
+## Print classification performance
+print()
+print("Classification performance:")
+print("===========================")
+print()
+
+
+def print_row(clf_type, train_time, test_time, err):
+    print("%s %s %s %s" % (clf_type.ljust(12),
+                           ("%.4fs" % train_time).center(10),
+                           ("%.4fs" % test_time).center(10),
+                           ("%.4f" % err).center(10)))
+
+print("%s %s %s %s" % ("Classifier  ", "train-time", "test-time",
+                       "error-rate"))
+print("-" * 44)
+
+for name in sorted(selected_classifiers, key=lambda name: err[name]):
+    print_row(name, train_time[name], test_time[name], err[name])
+print()
+print()
diff --git a/benchmarks/bench_glm.py b/benchmarks/bench_glm.py
new file mode 100644
index 0000000..598ba5f
--- /dev/null
+++ b/benchmarks/bench_glm.py
@@ -0,0 +1,58 @@
+"""
+A comparison of different methods in GLM
+
+Data comes from a random square matrix.
+
+"""
+from datetime import datetime
+import numpy as np
+from sklearn import linear_model
+from sklearn.utils.bench import total_seconds
+
+
+if __name__ == '__main__':
+
+    import pylab as pl
+
+    n_iter = 40
+
+    time_ridge = np.empty(n_iter)
+    time_ols = np.empty(n_iter)
+    time_lasso = np.empty(n_iter)
+
+    # match the problem sizes (10 * i + 3) actually used in the loop below,
+    # so that the x-axis of the plot is labelled correctly
+    dimensions = 10 * np.arange(n_iter) + 3
+
+    for i in range(n_iter):
+
+        print('Iteration %s of %s' % (i + 1, n_iter))
+
+        n_samples, n_features = 10 * i + 3, 10 * i + 3
+
+        X = np.random.randn(n_samples, n_features)
+        Y = np.random.randn(n_samples)
+
+        start = datetime.now()
+        ridge = linear_model.Ridge(alpha=1.)
+        ridge.fit(X, Y)
+        time_ridge[i] = total_seconds(datetime.now() - start)
+
+        start = datetime.now()
+        ols = linear_model.LinearRegression()
+        ols.fit(X, Y)
+        time_ols[i] = total_seconds(datetime.now() - start)
+
+        start = datetime.now()
+        lasso = linear_model.LassoLars()
+        lasso.fit(X, Y)
+        time_lasso[i] = total_seconds(datetime.now() - start)
+
+    pl.figure('scikit-learn GLM benchmark results')
+    pl.xlabel('Dimensions')
+    pl.ylabel('Time (s)')
+    pl.plot(dimensions, time_ridge, color='r')
+    pl.plot(dimensions, time_ols, color='g')
+    pl.plot(dimensions, time_lasso, color='b')
+
+    pl.legend(['Ridge', 'OLS', 'LassoLars'], loc='upper left')
+    pl.axis('tight')
+    pl.show()
diff --git a/benchmarks/bench_glmnet.py b/benchmarks/bench_glmnet.py
new file mode 100644
index 0000000..4b9c350
--- /dev/null
+++ b/benchmarks/bench_glmnet.py
@@ -0,0 +1,128 @@
+"""
+To run this, you'll need to have the following installed:
+
+ * glmnet-python
+ * scikit-learn (of course)
+
+It does two benchmarks.
+
+First, we fix the number of features and increase the number of
+samples. Then we plot the computation time as a function of
+the number of samples.
+
+In the second benchmark, we increase the number of dimensions of the
+training set. Then we plot the computation time as a function of
+the number of dimensions.
+
+In both cases, only 10% of the features are informative.
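+
+A typical invocation, assuming both packages above are importable
+(hypothetical session; timings vary with hardware and BLAS):
+
+    $ python benchmarks/bench_glmnet.py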
+""" +import numpy as np +import gc +from time import time +from sklearn.datasets.samples_generator import make_regression + +alpha = 0.1 +# alpha = 0.01 + + +def rmse(a, b): + return np.sqrt(np.mean((a - b) ** 2)) + + +def bench(factory, X, Y, X_test, Y_test, ref_coef): + gc.collect() + + # start time + tstart = time() + clf = factory(alpha=alpha).fit(X, Y) + delta = (time() - tstart) + # stop time + + print("duration: %0.3fs" % delta) + print("rmse: %f" % rmse(Y_test, clf.predict(X_test))) + print("mean coef abs diff: %f" % abs(ref_coef - clf.coef_.ravel()).mean()) + return delta + + +if __name__ == '__main__': + from glmnet.elastic_net import Lasso as GlmnetLasso + from sklearn.linear_model import Lasso as ScikitLasso + # Delayed import of pylab + import pylab as pl + + scikit_results = [] + glmnet_results = [] + n = 20 + step = 500 + n_features = 1000 + n_informative = n_features / 10 + n_test_samples = 1000 + for i in range(1, n + 1): + print('==================') + print('Iteration %s of %s' % (i, n)) + print('==================') + + X, Y, coef_ = make_regression( + n_samples=(i * step) + n_test_samples, n_features=n_features, + noise=0.1, n_informative=n_informative, coef=True) + + X_test = X[-n_test_samples:] + Y_test = Y[-n_test_samples:] + X = X[:(i * step)] + Y = Y[:(i * step)] + + print("benchmarking scikit-learn: ") + scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_)) + print("benchmarking glmnet: ") + glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) + + pl.clf() + xx = range(0, n * step, step) + pl.title('Lasso regression on sample dataset (%d features)' % n_features) + pl.plot(xx, scikit_results, 'b-', label='scikit-learn') + pl.plot(xx, glmnet_results, 'r-', label='glmnet') + pl.legend() + pl.xlabel('number of samples to classify') + pl.ylabel('Time (s)') + pl.show() + + # now do a benchmark where the number of points is fixed + # and the variable is the number of features + + scikit_results = [] + glmnet_results = [] + n = 20 + step = 100 + n_samples = 500 + + for i in range(1, n + 1): + print('==================') + print('Iteration %02d of %02d' % (i, n)) + print('==================') + n_features = i * step + n_informative = n_features / 10 + + X, Y, coef_ = make_regression( + n_samples=(i * step) + n_test_samples, n_features=n_features, + noise=0.1, n_informative=n_informative, coef=True) + + X_test = X[-n_test_samples:] + Y_test = Y[-n_test_samples:] + X = X[:n_samples] + Y = Y[:n_samples] + + print("benchmarking scikit-learn: ") + scikit_results.append(bench(ScikitLasso, X, Y, X_test, Y_test, coef_)) + print("benchmarking glmnet: ") + glmnet_results.append(bench(GlmnetLasso, X, Y, X_test, Y_test, coef_)) + + xx = np.arange(100, 100 + n * step, step) + pl.figure('scikit-learn vs. glmnet benchmark results') + pl.title('Regression in high dimensional spaces (%d samples)' % n_samples) + pl.plot(xx, scikit_results, 'b-', label='scikit-learn') + pl.plot(xx, glmnet_results, 'r-', label='glmnet') + pl.legend() + pl.xlabel('number of features') + pl.ylabel('Time (s)') + pl.axis('tight') + pl.show() diff --git a/benchmarks/bench_lasso.py b/benchmarks/bench_lasso.py new file mode 100644 index 0000000..7934fe1 --- /dev/null +++ b/benchmarks/bench_lasso.py @@ -0,0 +1,95 @@ +""" +Benchmarks of Lasso vs LassoLars + +First, we fix a training set and increase the number of +samples. Then we plot the computation time as function of +the number of samples. + +In the second benchmark, we increase the number of dimensions of the +training set. 
Then we plot the computation time as function of +the number of dimensions. + +In both cases, only 10% of the features are informative. +""" +import gc +from time import time +import numpy as np + +from sklearn.datasets.samples_generator import make_regression + + +def compute_bench(alpha, n_samples, n_features, precompute): + lasso_results = [] + lars_lasso_results = [] + + it = 0 + + for ns in n_samples: + for nf in n_features: + it += 1 + print('==================') + print('Iteration %s of %s' % (it, max(len(n_samples), + len(n_features)))) + print('==================') + n_informative = nf // 10 + X, Y, coef_ = make_regression(n_samples=ns, n_features=nf, + n_informative=n_informative, + noise=0.1, coef=True) + + X /= np.sqrt(np.sum(X ** 2, axis=0)) # Normalize data + + gc.collect() + print("- benchmarking Lasso") + clf = Lasso(alpha=alpha, fit_intercept=False, + precompute=precompute) + tstart = time() + clf.fit(X, Y) + lasso_results.append(time() - tstart) + + gc.collect() + print("- benchmarking LassoLars") + clf = LassoLars(alpha=alpha, fit_intercept=False, + normalize=False, precompute=precompute) + tstart = time() + clf.fit(X, Y) + lars_lasso_results.append(time() - tstart) + + return lasso_results, lars_lasso_results + + +if __name__ == '__main__': + from sklearn.linear_model import Lasso, LassoLars + import pylab as pl + + alpha = 0.01 # regularization parameter + + n_features = 10 + list_n_samples = np.linspace(100, 1000000, 5).astype(np.int) + lasso_results, lars_lasso_results = compute_bench(alpha, list_n_samples, + [n_features], precompute=True) + + pl.figure('scikit-learn LASSO benchmark results') + pl.subplot(211) + pl.plot(list_n_samples, lasso_results, 'b-', + label='Lasso') + pl.plot(list_n_samples, lars_lasso_results, 'r-', + label='LassoLars') + pl.title('precomputed Gram matrix, %d features, alpha=%s' % (n_features, alpha)) + pl.legend(loc='upper left') + pl.xlabel('number of samples') + pl.ylabel('Time (s)') + pl.axis('tight') + + n_samples = 2000 + list_n_features = np.linspace(500, 3000, 5).astype(np.int) + lasso_results, lars_lasso_results = compute_bench(alpha, [n_samples], + list_n_features, precompute=False) + pl.subplot(212) + pl.plot(list_n_features, lasso_results, 'b-', label='Lasso') + pl.plot(list_n_features, lars_lasso_results, 'r-', label='LassoLars') + pl.title('%d samples, alpha=%s' % (n_samples, alpha)) + pl.legend(loc='upper left') + pl.xlabel('number of features') + pl.ylabel('Time (s)') + pl.axis('tight') + pl.show() diff --git a/benchmarks/bench_plot_fastkmeans.py b/benchmarks/bench_plot_fastkmeans.py new file mode 100644 index 0000000..4677fd2 --- /dev/null +++ b/benchmarks/bench_plot_fastkmeans.py @@ -0,0 +1,138 @@ +from __future__ import print_function + +from collections import defaultdict +from time import time + +import numpy as np +from numpy import random as nr + +from sklearn.cluster.k_means_ import KMeans, MiniBatchKMeans + + +def compute_bench(samples_range, features_range): + + it = 0 + results = defaultdict(lambda: []) + chunk = 100 + + max_it = len(samples_range) * len(features_range) + for n_samples in samples_range: + for n_features in features_range: + it += 1 + print('==============================') + print('Iteration %03d of %03d' % (it, max_it)) + print('==============================') + print() + data = nr.random_integers(-50, 50, (n_samples, n_features)) + + print('K-Means') + tstart = time() + kmeans = KMeans(init='k-means++', n_clusters=10).fit(data) + + delta = time() - tstart + print("Speed: %0.3fs" % delta) + 
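+            # inertia_ is the within-cluster sum of squared distances to the
+            # assigned centers (lower is better for a fixed number of clusters)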
print("Inertia: %0.5f" % kmeans.inertia_) + print() + + results['kmeans_speed'].append(delta) + results['kmeans_quality'].append(kmeans.inertia_) + + print('Fast K-Means') + # let's prepare the data in small chunks + mbkmeans = MiniBatchKMeans(init='k-means++', + n_clusters=10, + batch_size=chunk) + tstart = time() + mbkmeans.fit(data) + delta = time() - tstart + print("Speed: %0.3fs" % delta) + print("Inertia: %f" % mbkmeans.inertia_) + print() + print() + + results['MiniBatchKMeans Speed'].append(delta) + results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_) + + return results + + +def compute_bench_2(chunks): + results = defaultdict(lambda: []) + n_features = 50000 + means = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], + [0.5, 0.5], [0.75, -0.5], [-1, 0.75], [1, 0]]) + X = np.empty((0, 2)) + for i in range(8): + X = np.r_[X, means[i] + 0.8 * np.random.randn(n_features, 2)] + max_it = len(chunks) + it = 0 + for chunk in chunks: + it += 1 + print('==============================') + print('Iteration %03d of %03d' % (it, max_it)) + print('==============================') + print() + + print('Fast K-Means') + tstart = time() + mbkmeans = MiniBatchKMeans(init='k-means++', + n_clusters=8, + batch_size=chunk) + + mbkmeans.fit(X) + delta = time() - tstart + print("Speed: %0.3fs" % delta) + print("Inertia: %0.3fs" % mbkmeans.inertia_) + print() + + results['MiniBatchKMeans Speed'].append(delta) + results['MiniBatchKMeans Quality'].append(mbkmeans.inertia_) + + return results + + +if __name__ == '__main__': + from mpl_toolkits.mplot3d import axes3d # register the 3d projection + import matplotlib.pyplot as plt + + samples_range = np.linspace(50, 150, 5).astype(np.int) + features_range = np.linspace(150, 50000, 5).astype(np.int) + chunks = np.linspace(500, 10000, 15).astype(np.int) + + results = compute_bench(samples_range, features_range) + results_2 = compute_bench_2(chunks) + + max_time = max([max(i) for i in [t for (label, t) in results.iteritems() + if "speed" in label]]) + max_inertia = max([max(i) for i in [ + t for (label, t) in results.iteritems() + if "speed" not in label]]) + + fig = plt.figure('scikit-learn K-Means benchmark results') + for c, (label, timings) in zip('brcy', + sorted(results.iteritems())): + if 'speed' in label: + ax = fig.add_subplot(2, 2, 1, projection='3d') + ax.set_zlim3d(0.0, max_time * 1.1) + else: + ax = fig.add_subplot(2, 2, 2, projection='3d') + ax.set_zlim3d(0.0, max_inertia * 1.1) + + X, Y = np.meshgrid(samples_range, features_range) + Z = np.asarray(timings).reshape(samples_range.shape[0], + features_range.shape[0]) + ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.5) + ax.set_xlabel('n_samples') + ax.set_ylabel('n_features') + + i = 0 + for c, (label, timings) in zip('br', + sorted(results_2.iteritems())): + i += 1 + ax = fig.add_subplot(2, 2, i + 2) + y = np.asarray(timings) + ax.plot(chunks, y, color=c, alpha=0.8) + ax.set_xlabel('Chunks') + ax.set_ylabel(label) + + plt.show() diff --git a/benchmarks/bench_plot_lasso_path.py b/benchmarks/bench_plot_lasso_path.py new file mode 100644 index 0000000..4456ab1 --- /dev/null +++ b/benchmarks/bench_plot_lasso_path.py @@ -0,0 +1,117 @@ +"""Benchmarks of Lasso regularization path computation using Lars and CD + +The input data is mostly low rank but is a fat infinite tail. 
+""" +from __future__ import print_function + +from collections import defaultdict +import gc +import sys +from time import time + +import numpy as np + +from sklearn.linear_model import lars_path +from sklearn.linear_model import lasso_path +from sklearn.datasets.samples_generator import make_regression + + +def compute_bench(samples_range, features_range): + + it = 0 + + results = defaultdict(lambda: []) + + max_it = len(samples_range) * len(features_range) + for n_samples in samples_range: + for n_features in features_range: + it += 1 + print('====================') + print('Iteration %03d of %03d' % (it, max_it)) + print('====================') + dataset_kwargs = { + 'n_samples': n_samples, + 'n_features': n_features, + 'n_informative': n_features / 10, + 'effective_rank': min(n_samples, n_features) / 10, + #'effective_rank': None, + 'bias': 0.0, + } + print("n_samples: %d" % n_samples) + print("n_features: %d" % n_features) + X, y = make_regression(**dataset_kwargs) + + gc.collect() + print("benchmarking lars_path (with Gram):", end='') + sys.stdout.flush() + tstart = time() + G = np.dot(X.T, X) # precomputed Gram matrix + Xy = np.dot(X.T, y) + lars_path(X, y, Xy=Xy, Gram=G, method='lasso') + delta = time() - tstart + print("%0.3fs" % delta) + results['lars_path (with Gram)'].append(delta) + + gc.collect() + print("benchmarking lars_path (without Gram):", end='') + sys.stdout.flush() + tstart = time() + lars_path(X, y, method='lasso') + delta = time() - tstart + print("%0.3fs" % delta) + results['lars_path (without Gram)'].append(delta) + + gc.collect() + print("benchmarking lasso_path (with Gram):", end='') + sys.stdout.flush() + tstart = time() + lasso_path(X, y, precompute=True) + delta = time() - tstart + print("%0.3fs" % delta) + results['lasso_path (with Gram)'].append(delta) + + gc.collect() + print("benchmarking lasso_path (without Gram):", end='') + sys.stdout.flush() + tstart = time() + lasso_path(X, y, precompute=False) + delta = time() - tstart + print("%0.3fs" % delta) + results['lasso_path (without Gram)'].append(delta) + + return results + + +if __name__ == '__main__': + from mpl_toolkits.mplot3d import axes3d # register the 3d projection + import matplotlib.pyplot as plt + + samples_range = np.linspace(10, 2000, 5).astype(np.int) + features_range = np.linspace(10, 2000, 5).astype(np.int) + results = compute_bench(samples_range, features_range) + + max_time = max(max(t) for t in results.values()) + + fig = plt.figure('scikit-learn Lasso path benchmark results') + i = 1 + for c, (label, timings) in zip('bcry', sorted(results.items())): + ax = fig.add_subplot(2, 2, i, projection='3d') + X, Y = np.meshgrid(samples_range, features_range) + Z = np.asarray(timings).reshape(samples_range.shape[0], + features_range.shape[0]) + + # plot the actual surface + ax.plot_surface(X, Y, Z.T, cstride=1, rstride=1, color=c, alpha=0.8) + + # dummy point plot to stick the legend to since surface plot do not + # support legends (yet?) 
+        #ax.plot([1], [1], [1], color=c, label=label)
+
+        ax.set_xlabel('n_samples')
+        ax.set_ylabel('n_features')
+        ax.set_zlabel('Time (s)')
+        ax.set_zlim3d(0.0, max_time * 1.1)
+        ax.set_title(label)
+        #ax.legend()
+        i += 1
+    plt.show()
diff --git a/benchmarks/bench_plot_neighbors.py b/benchmarks/bench_plot_neighbors.py
new file mode 100644
index 0000000..2288772
--- /dev/null
+++ b/benchmarks/bench_plot_neighbors.py
@@ -0,0 +1,186 @@
+"""
+Plot the scaling of the nearest neighbors algorithms with k, D, and N
+"""
+from time import time
+
+import numpy as np
+import pylab as pl
+from matplotlib import ticker
+
+from sklearn import neighbors, datasets
+
+
+def get_data(N, D, dataset='dense'):
+    if dataset == 'dense':
+        np.random.seed(0)
+        return np.random.random((N, D))
+    elif dataset == 'digits':
+        X = datasets.load_digits().data
+        i = np.argsort(X[0])[::-1]
+        X = X[:, i]
+        return X[:N, :D]
+    else:
+        raise ValueError("invalid dataset: %s" % dataset)
+
+
+def barplot_neighbors(Nrange=2 ** np.arange(1, 11),
+                      Drange=2 ** np.arange(7),
+                      krange=2 ** np.arange(10),
+                      N=1000,
+                      D=64,
+                      k=5,
+                      leaf_size=30,
+                      dataset='digits'):
+    algorithms = ('kd_tree', 'brute', 'ball_tree')
+    fiducial_values = {'N': N,
+                       'D': D,
+                       'k': k}
+
+    #------------------------------------------------------------
+    # varying N
+    N_results_build = dict([(alg, np.zeros(len(Nrange)))
+                            for alg in algorithms])
+    N_results_query = dict([(alg, np.zeros(len(Nrange)))
+                            for alg in algorithms])
+
+    for i, NN in enumerate(Nrange):
+        print("N = %i (%i out of %i)" % (NN, i + 1, len(Nrange)))
+        X = get_data(NN, D, dataset)
+        for algorithm in algorithms:
+            nbrs = neighbors.NearestNeighbors(n_neighbors=min(NN, k),
+                                              algorithm=algorithm,
+                                              leaf_size=leaf_size)
+            t0 = time()
+            nbrs.fit(X)
+            t1 = time()
+            nbrs.kneighbors(X)
+            t2 = time()
+
+            N_results_build[algorithm][i] = (t1 - t0)
+            N_results_query[algorithm][i] = (t2 - t1)
+
+    #------------------------------------------------------------
+    # varying D
+    D_results_build = dict([(alg, np.zeros(len(Drange)))
+                            for alg in algorithms])
+    D_results_query = dict([(alg, np.zeros(len(Drange)))
+                            for alg in algorithms])
+
+    for i, DD in enumerate(Drange):
+        print("D = %i (%i out of %i)" % (DD, i + 1, len(Drange)))
+        X = get_data(N, DD, dataset)
+        for algorithm in algorithms:
+            nbrs = neighbors.NearestNeighbors(n_neighbors=k,
+                                              algorithm=algorithm,
+                                              leaf_size=leaf_size)
+            t0 = time()
+            nbrs.fit(X)
+            t1 = time()
+            nbrs.kneighbors(X)
+            t2 = time()
+
+            D_results_build[algorithm][i] = (t1 - t0)
+            D_results_query[algorithm][i] = (t2 - t1)
+
+    #------------------------------------------------------------
+    # varying k
+    k_results_build = dict([(alg, np.zeros(len(krange)))
+                            for alg in algorithms])
+    k_results_query = dict([(alg, np.zeros(len(krange)))
+                            for alg in algorithms])
+
+    X = get_data(N, D, dataset)  # use the fiducial D, not the last DD
+
+    for i, kk in enumerate(krange):
+        print("k = %i (%i out of %i)" % (kk, i + 1, len(krange)))
+        for algorithm in algorithms:
+            nbrs = neighbors.NearestNeighbors(n_neighbors=kk,
+                                              algorithm=algorithm,
+                                              leaf_size=leaf_size)
+            t0 = time()
+            nbrs.fit(X)
+            t1 = time()
+            nbrs.kneighbors(X)
+            t2 = time()
+
+            k_results_build[algorithm][i] = (t1 - t0)
+            k_results_query[algorithm][i] = (t2 - t1)
+
+    pl.figure('scikit-learn nearest neighbors benchmark results',
+              figsize=(8, 11))
+
+    for (sbplt, vals, quantity,
+         build_time, query_time) in [(311, Nrange, 'N',
+                                      N_results_build,
+                                      N_results_query),
+                                     (312, Drange, 'D',
+                                      D_results_build,
+                                      D_results_query),
+                                     (313, krange, 'k',
+                                      k_results_build,
+                                      k_results_query)]:
+        ax = pl.subplot(sbplt, yscale='log')
+        pl.grid(True)
+
+        tick_vals = []
+        tick_labels = []
+
+        bottom = 10 ** np.min([min(np.floor(np.log10(build_time[alg])))
+                               for alg in algorithms])
+
+        for i, alg in enumerate(algorithms):
+            xvals = 0.1 + i * (1 + len(vals)) + np.arange(len(vals))
+            width = 0.8
+
+            c_bar = pl.bar(xvals, build_time[alg] - bottom,
+                           width, bottom, color='r')
+            q_bar = pl.bar(xvals, query_time[alg],
+                           width, build_time[alg], color='b')
+
+            tick_vals += list(xvals + 0.5 * width)
+            tick_labels += ['%i' % val for val in vals]
+
+            pl.text((i + 0.02) / len(algorithms), 0.98, alg,
+                    transform=ax.transAxes,
+                    ha='left',
+                    va='top',
+                    bbox=dict(facecolor='w', edgecolor='w', alpha=0.5))
+
+            pl.ylabel('Time (s)')
+
+        ax.xaxis.set_major_locator(ticker.FixedLocator(tick_vals))
+        ax.xaxis.set_major_formatter(ticker.FixedFormatter(tick_labels))
+
+        for label in ax.get_xticklabels():
+            label.set_rotation(-90)
+            label.set_fontsize(10)
+
+        title_string = 'Varying %s' % quantity
+
+        descr_string = ''
+
+        for s in 'NDk':
+            if s == quantity:
+                pass
+            else:
+                descr_string += '%s = %i, ' % (s, fiducial_values[s])
+
+        descr_string = descr_string[:-2]
+
+        pl.text(1.01, 0.5, title_string,
+                transform=ax.transAxes, rotation=-90,
+                ha='left', va='center', fontsize=20)
+
+        pl.text(0.99, 0.5, descr_string,
+                transform=ax.transAxes, rotation=-90,
+                ha='right', va='center')
+
+    pl.gcf().suptitle("%s data set" % dataset.capitalize(), fontsize=16)
+
+    pl.figlegend((c_bar, q_bar), ('construction', 'N-point query'),
+                 'upper right')
+
+if __name__ == '__main__':
+    barplot_neighbors(dataset='digits')
+    barplot_neighbors(dataset='dense')
+    pl.show()
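The script above times index construction and querying separately for each tree type. A minimal sketch of one such measurement (shapes are illustrative):

    from time import time
    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    X = np.random.RandomState(0).random_sample((1000, 64))
    for algorithm in ('kd_tree', 'ball_tree', 'brute'):
        t0 = time()
        nbrs = NearestNeighbors(n_neighbors=5, algorithm=algorithm).fit(X)
        t1 = time()
        nbrs.kneighbors(X)  # same neighbors, different scaling behaviour
        print("%s build=%.4fs query=%.4fs" % (algorithm, t1 - t0,
                                              time() - t1))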
diff --git a/benchmarks/bench_plot_nmf.py b/benchmarks/bench_plot_nmf.py
new file mode 100644
index 0000000..0e753b7
--- /dev/null
+++ b/benchmarks/bench_plot_nmf.py
@@ -0,0 +1,166 @@
+"""
+Benchmarks of Non-Negative Matrix Factorization
+"""
+
+from __future__ import print_function
+
+import gc
+from time import time
+import numpy as np
+from collections import defaultdict
+
+from sklearn.decomposition.nmf import NMF, _initialize_nmf
+from sklearn.datasets.samples_generator import make_low_rank_matrix
+from sklearn.externals.six.moves import xrange
+
+
+def alt_nnmf(V, r, max_iter=1000, tol=1e-3, R=None):
+    '''
+    A, S = nnmf(X, r, tol=1e-3, R=None)
+
+    Implement Lee & Seung's algorithm
+
+    Parameters
+    ----------
+    V : 2-ndarray, [n_samples, n_features]
+        input matrix
+    r : integer
+        number of latent features
+    max_iter : integer, optional
+        maximum number of iterations (default: 1000)
+    tol : double
+        tolerance threshold for early exit (when the update factor is within
+        tol of 1., the function exits)
+    R : integer, optional
+        random seed
+
+    Returns
+    -------
+    A : 2-ndarray, [n_samples, r]
+        Component part of the factorization
+
+    S : 2-ndarray, [r, n_features]
+        Data part of the factorization
+
+    Reference
+    ---------
+    "Algorithms for Non-negative Matrix Factorization"
+    by Daniel D Lee, Sebastian H Seung
+    (available at http://citeseer.ist.psu.edu/lee01algorithms.html)
+    '''
+    # Nomenclature in the function follows Lee & Seung
+    eps = 1e-5
+    n, m = V.shape
+    if R == "svd":
+        W, H = _initialize_nmf(V, r)
+    elif R is None:
+        R = np.random.mtrand._rand
+        W = np.abs(R.standard_normal((n, r)))
+        H = np.abs(R.standard_normal((r, m)))
+
+    for i in xrange(max_iter):
+        updateH = np.dot(W.T, V) / (np.dot(np.dot(W.T, W), H) + eps)
+        H *= updateH
+        updateW = np.dot(V, H.T) / (np.dot(W, np.dot(H, H.T)) + eps)
+        W *= updateW
+        # check for convergence on every iteration (the modulo-10 check
+        # originally here was disabled by a leftover `True or`)
+        max_update = max(updateW.max(), updateH.max())
+        if abs(1. - max_update) < tol:
+            break
+    return W, H
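+# Example usage of the helper above, as a sketch kept in a comment so that it
+# does not run with the benchmark (the shapes are arbitrary):
+#
+#     V = np.abs(np.random.randn(30, 20))
+#     W, H = alt_nnmf(V, r=5, tol=1e-3)
+#     print(np.linalg.norm(V - np.dot(W, H)))  # reconstruction error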
+
+
+def compute_bench(samples_range, features_range, rank=50, tolerance=1e-7):
+    it = 0
+    timeset = defaultdict(lambda: [])
+    err = defaultdict(lambda: [])
+
+    max_it = len(samples_range) * len(features_range)
+    for n_samples in samples_range:
+        for n_features in features_range:
+            it += 1
+            print('====================')
+            print('Iteration %03d of %03d' % (it, max_it))
+            print('====================')
+            X = np.abs(make_low_rank_matrix(n_samples, n_features,
+                       effective_rank=rank, tail_strength=0.2))
+
+            gc.collect()
+            print("benchmarking nndsvd-nmf: ")
+            tstart = time()
+            m = NMF(n_components=30, tol=tolerance, init='nndsvd').fit(X)
+            tend = time() - tstart
+            timeset['nndsvd-nmf'].append(tend)
+            err['nndsvd-nmf'].append(m.reconstruction_err_)
+            print(m.reconstruction_err_, tend)
+
+            gc.collect()
+            print("benchmarking nndsvda-nmf: ")
+            tstart = time()
+            m = NMF(n_components=30, init='nndsvda',
+                    tol=tolerance).fit(X)
+            tend = time() - tstart
+            timeset['nndsvda-nmf'].append(tend)
+            err['nndsvda-nmf'].append(m.reconstruction_err_)
+            print(m.reconstruction_err_, tend)
+
+            gc.collect()
+            print("benchmarking nndsvdar-nmf: ")
+            tstart = time()
+            m = NMF(n_components=30, init='nndsvdar',
+                    tol=tolerance).fit(X)
+            tend = time() - tstart
+            timeset['nndsvdar-nmf'].append(tend)
+            err['nndsvdar-nmf'].append(m.reconstruction_err_)
+            print(m.reconstruction_err_, tend)
+
+            gc.collect()
+            print("benchmarking random-nmf")
+            tstart = time()
+            m = NMF(n_components=30, init=None, max_iter=1000,
+                    tol=tolerance).fit(X)
+            tend = time() - tstart
+            timeset['random-nmf'].append(tend)
+            err['random-nmf'].append(m.reconstruction_err_)
+            print(m.reconstruction_err_, tend)
+
+            gc.collect()
+            print("benchmarking alt-random-nmf")
+            tstart = time()
+            W, H = alt_nnmf(X, r=30, R=None, tol=tolerance)
+            tend = time() - tstart
+            timeset['alt-random-nmf'].append(tend)
+            err['alt-random-nmf'].append(np.linalg.norm(X - np.dot(W, H)))
+            print(np.linalg.norm(X - np.dot(W, H)), tend)
+
+    return timeset, err
+
+
+if __name__ == '__main__':
+    from mpl_toolkits.mplot3d import axes3d  # register the 3d projection
+    axes3d
+    import matplotlib.pyplot as plt
+
+    samples_range = np.linspace(50, 500, 3).astype(np.int)
+    features_range = np.linspace(50, 500, 3).astype(np.int)
+    timeset, err = compute_bench(samples_range, features_range)
+
+    for i, results in enumerate((timeset, err)):
+        fig = plt.figure('scikit-learn Non-Negative Matrix Factorization'
+                         ' benchmark results')
+        ax = fig.gca(projection='3d')
+        for c, (label, timings) in zip('rbgcm', sorted(results.iteritems())):
+            X, Y = np.meshgrid(samples_range, features_range)
+            Z = np.asarray(timings).reshape(samples_range.shape[0],
+                                            features_range.shape[0])
+            # plot the actual surface
+            ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3,
+                            color=c)
+            # dummy point plot to stick the legend to since surface plots do
+            # not support legends (yet?)
+            ax.plot([1], [1], [1], color=c, label=label)
+
+        ax.set_xlabel('n_samples')
+        ax.set_ylabel('n_features')
+        zlabel = 'Time (s)' if i == 0 else 'reconstruction error'
+        ax.set_zlabel(zlabel)
+        ax.legend()
+    plt.show()
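The loop above sweeps the built-in NNDSVD-style initializations; a minimal standalone sketch of the same comparison on one matrix (all parameter values are illustrative):

    import numpy as np
    from sklearn.decomposition import NMF

    X = np.abs(np.random.RandomState(0).randn(100, 50))
    for init in ('nndsvd', 'nndsvda', 'nndsvdar'):
        m = NMF(n_components=10, init=init, tol=1e-4).fit(X)
        print("%s %.4f" % (init, m.reconstruction_err_))  # lower is better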
diff --git a/benchmarks/bench_plot_omp_lars.py b/benchmarks/bench_plot_omp_lars.py
new file mode 100644
index 0000000..a3d9550
--- /dev/null
+++ b/benchmarks/bench_plot_omp_lars.py
@@ -0,0 +1,123 @@
+"""Benchmarks of orthogonal matching pursuit (:ref:`OMP`) versus least angle
+regression (:ref:`least_angle_regression`)
+
+The input data is mostly low rank, but has a fat infinite tail.
+"""
+from __future__ import print_function
+
+import gc
+import sys
+from time import time
+
+import numpy as np
+
+from sklearn.linear_model import lars_path, orthogonal_mp
+from sklearn.datasets.samples_generator import make_sparse_coded_signal
+
+
+def compute_bench(samples_range, features_range):
+
+    it = 0
+
+    results = dict()
+    lars = np.empty((len(features_range), len(samples_range)))
+    lars_gram = lars.copy()
+    omp = lars.copy()
+    omp_gram = lars.copy()
+
+    max_it = len(samples_range) * len(features_range)
+    for i_s, n_samples in enumerate(samples_range):
+        for i_f, n_features in enumerate(features_range):
+            it += 1
+            n_informative = n_features // 10
+            print('====================')
+            print('Iteration %03d of %03d' % (it, max_it))
+            print('====================')
+            # dataset_kwargs = {
+            #     'n_train_samples': n_samples,
+            #     'n_test_samples': 2,
+            #     'n_features': n_features,
+            #     'n_informative': n_informative,
+            #     'effective_rank': min(n_samples, n_features) / 10,
+            #     #'effective_rank': None,
+            #     'bias': 0.0,
+            # }
+            dataset_kwargs = {
+                'n_samples': 1,
+                'n_components': n_features,
+                'n_features': n_samples,
+                'n_nonzero_coefs': n_informative,
+                'random_state': 0
+            }
+            print("n_samples: %d" % n_samples)
+            print("n_features: %d" % n_features)
+            y, X, _ = make_sparse_coded_signal(**dataset_kwargs)
+            X = np.asfortranarray(X)
+
+            gc.collect()
+            print("benchmarking lars_path (with Gram):", end='')
+            sys.stdout.flush()
+            tstart = time()
+            G = np.dot(X.T, X)  # precomputed Gram matrix
+            Xy = np.dot(X.T, y)
+            lars_path(X, y, Xy=Xy, Gram=G, max_iter=n_informative)
+            delta = time() - tstart
+            print("%0.3fs" % delta)
+            lars_gram[i_f, i_s] = delta
+
+            gc.collect()
+            print("benchmarking lars_path (without Gram):", end='')
+            sys.stdout.flush()
+            tstart = time()
+            lars_path(X, y, Gram=None, max_iter=n_informative)
+            delta = time() - tstart
+            print("%0.3fs" % delta)
+            lars[i_f, i_s] = delta
+
+            gc.collect()
+            print("benchmarking orthogonal_mp (with Gram):", end='')
+            sys.stdout.flush()
+            tstart = time()
+            orthogonal_mp(X, y, precompute_gram=True,
+                          n_nonzero_coefs=n_informative)
+            delta = time() - tstart
+            print("%0.3fs" % delta)
+            omp_gram[i_f, i_s] = delta
+
+            gc.collect()
+            print("benchmarking orthogonal_mp (without Gram):", end='')
+            sys.stdout.flush()
+            tstart = time()
+            orthogonal_mp(X, y, precompute_gram=False,
+                          n_nonzero_coefs=n_informative)
+            delta = time() - tstart
+            print("%0.3fs" % delta)
+            omp[i_f, i_s] = delta
+
+    results['time(LARS) / time(OMP)\n (w/ Gram)'] = (lars_gram / omp_gram)
+    results['time(LARS) / time(OMP)\n (w/o Gram)'] = (lars / omp)
+    return results
+
+
+if __name__ == '__main__':
+    samples_range = np.linspace(1000, 5000, 5).astype(np.int)
+    features_range = np.linspace(1000, 5000, 5).astype(np.int)
+    results = compute_bench(samples_range, features_range)
+    max_time = max(np.max(t) for t in results.values())
+
+    import pylab as pl
+    fig = pl.figure('scikit-learn OMP vs. LARS benchmark results')
+    for i, (label, timings) in enumerate(sorted(results.iteritems())):
+        ax = fig.add_subplot(1, 2, i + 1)  # subplot indices are 1-based
+        vmax = max(1 - timings.min(), -1 + timings.max())
+        pl.matshow(timings, fignum=False, vmin=1 - vmax, vmax=1 + vmax)
+        ax.set_xticklabels([''] + map(str, samples_range))
+        ax.set_yticklabels([''] + map(str, features_range))
+        pl.xlabel('n_samples')
+        pl.ylabel('n_features')
+        pl.title(label)
+
+    pl.subplots_adjust(0.1, 0.08, 0.96, 0.98, 0.4, 0.63)
+    ax = pl.axes([0.1, 0.08, 0.8, 0.06])
+    pl.colorbar(cax=ax, orientation='horizontal')
+    pl.show()
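Both solvers recover a sparse coefficient vector, which is what makes the ratio of their run times meaningful; a minimal sketch of the two calls being timed (sizes are illustrative):

    import numpy as np
    from sklearn.linear_model import lars_path, orthogonal_mp

    rng = np.random.RandomState(0)
    X = np.asfortranarray(rng.randn(50, 100))
    y = rng.randn(50)
    coef = orthogonal_mp(X, y, n_nonzero_coefs=5)        # greedy selection
    alphas, active, coefs = lars_path(X, y, max_iter=5)  # stepwise path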
diff --git a/benchmarks/bench_plot_parallel_pairwise.py b/benchmarks/bench_plot_parallel_pairwise.py
new file mode 100644
index 0000000..c17f42d
--- /dev/null
+++ b/benchmarks/bench_plot_parallel_pairwise.py
@@ -0,0 +1,44 @@
+# Author: Mathieu Blondel
+# License: BSD 3 clause
+import time
+
+import pylab as pl
+
+from sklearn.utils import check_random_state
+from sklearn.metrics.pairwise import pairwise_distances
+from sklearn.metrics.pairwise import pairwise_kernels
+
+
+def plot(func):
+    random_state = check_random_state(0)
+    one_core = []
+    multi_core = []
+    sample_sizes = range(1000, 6000, 1000)
+
+    for n_samples in sample_sizes:
+        X = random_state.rand(n_samples, 300)
+
+        start = time.time()
+        func(X, n_jobs=1)
+        one_core.append(time.time() - start)
+
+        start = time.time()
+        func(X, n_jobs=-1)
+        multi_core.append(time.time() - start)
+
+    pl.figure('scikit-learn parallel %s benchmark results' % func.__name__)
+    pl.plot(sample_sizes, one_core, label="one core")
+    pl.plot(sample_sizes, multi_core, label="multi core")
+    pl.xlabel('n_samples')
+    pl.ylabel('Time (s)')
+    pl.title('Parallel %s' % func.__name__)
+    pl.legend()
+
+
+def euclidean_distances(X, n_jobs):
+    return pairwise_distances(X, metric="euclidean", n_jobs=n_jobs)
+
+
+def rbf_kernels(X, n_jobs):
+    return pairwise_kernels(X, metric="rbf", n_jobs=n_jobs, gamma=0.1)
+
+plot(euclidean_distances)
+plot(rbf_kernels)
+pl.show()
diff --git a/benchmarks/bench_plot_svd.py b/benchmarks/bench_plot_svd.py
new file mode 100644
index 0000000..ce5cba9
--- /dev/null
+++ b/benchmarks/bench_plot_svd.py
@@ -0,0 +1,82 @@
+"""Benchmarks of Singular Value Decomposition (Exact and Approximate)
+
+The data is mostly low rank, but has a fat infinite tail.
+""" +import gc +from time import time +import numpy as np +from collections import defaultdict + +from scipy.linalg import svd +from sklearn.utils.extmath import randomized_svd +from sklearn.datasets.samples_generator import make_low_rank_matrix + + +def compute_bench(samples_range, features_range, n_iter=3, rank=50): + + it = 0 + + results = defaultdict(lambda: []) + + max_it = len(samples_range) * len(features_range) + for n_samples in samples_range: + for n_features in features_range: + it += 1 + print('====================') + print('Iteration %03d of %03d' % (it, max_it)) + print('====================') + X = make_low_rank_matrix(n_samples, n_features, + effective_rank=rank, + tail_strength=0.2) + + gc.collect() + print("benchmarking scipy svd: ") + tstart = time() + svd(X, full_matrices=False) + results['scipy svd'].append(time() - tstart) + + gc.collect() + print("benchmarking scikit-learn randomized_svd: n_iter=0") + tstart = time() + randomized_svd(X, rank, n_iter=0) + results['scikit-learn randomized_svd (n_iter=0)'].append( + time() - tstart) + + gc.collect() + print("benchmarking scikit-learn randomized_svd: n_iter=%d " + % n_iter) + tstart = time() + randomized_svd(X, rank, n_iter=n_iter) + results['scikit-learn randomized_svd (n_iter=%d)' + % n_iter].append(time() - tstart) + + return results + + +if __name__ == '__main__': + from mpl_toolkits.mplot3d import axes3d # register the 3d projection + import matplotlib.pyplot as plt + + samples_range = np.linspace(2, 1000, 4).astype(np.int) + features_range = np.linspace(2, 1000, 4).astype(np.int) + results = compute_bench(samples_range, features_range) + + label = 'scikit-learn singular value decomposition benchmark results' + fig = plt.figure(label) + ax = fig.gca(projection='3d') + for c, (label, timings) in zip('rbg', sorted(results.iteritems())): + X, Y = np.meshgrid(samples_range, features_range) + Z = np.asarray(timings).reshape(samples_range.shape[0], + features_range.shape[0]) + # plot the actual surface + ax.plot_surface(X, Y, Z, rstride=8, cstride=8, alpha=0.3, + color=c) + # dummy point plot to stick the legend to since surface plot do not + # support legends (yet?) 
+        ax.plot([1], [1], [1], color=c, label=label)
+
+    ax.set_xlabel('n_samples')
+    ax.set_ylabel('n_features')
+    ax.set_zlabel('Time (s)')
+    ax.legend()
+    plt.show()
diff --git a/benchmarks/bench_plot_ward.py b/benchmarks/bench_plot_ward.py
new file mode 100644
index 0000000..e9ad254
--- /dev/null
+++ b/benchmarks/bench_plot_ward.py
@@ -0,0 +1,43 @@
+"""
+Benchmark scikit-learn's Ward implementation compared to SciPy's
+"""
+
+import time
+
+import numpy as np
+from scipy.cluster import hierarchy
+import pylab as pl
+
+from sklearn.cluster import Ward
+
+ward = Ward(n_clusters=3)
+
+n_samples = np.logspace(.5, 3, 9)
+n_features = np.logspace(1, 3.5, 7)
+N_samples, N_features = np.meshgrid(n_samples,
+                                    n_features)
+scikits_time = np.zeros(N_samples.shape)
+scipy_time = np.zeros(N_samples.shape)
+
+for i, n in enumerate(n_samples):
+    for j, p in enumerate(n_features):
+        # np.logspace returns floats; array shapes must be integers
+        X = np.random.normal(size=(int(n), int(p)))
+        t0 = time.time()
+        ward.fit(X)
+        scikits_time[j, i] = time.time() - t0
+        t0 = time.time()
+        hierarchy.ward(X)
+        scipy_time[j, i] = time.time() - t0
+
+ratio = scikits_time / scipy_time
+
+pl.figure("scikit-learn Ward's method benchmark results")
+pl.imshow(np.log(ratio), aspect='auto', origin="lower")
+pl.colorbar()
+pl.contour(ratio, levels=[1, ], colors='k')
+pl.yticks(range(len(n_features)), n_features.astype(np.int))
+pl.ylabel('N features')
+pl.xticks(range(len(n_samples)), n_samples.astype(np.int))
+pl.xlabel('N samples')
+pl.title("Scikit's time, in units of scipy time (log)")
+pl.show()
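Both libraries expose Ward's criterion, so the ratio above compares like with like; a minimal sketch of the two calls (the `Ward` class is the 0.14-era scikit-learn API):

    import numpy as np
    from scipy.cluster import hierarchy
    from sklearn.cluster import Ward

    X = np.random.RandomState(0).normal(size=(50, 10))
    labels = Ward(n_clusters=3).fit(X).labels_  # flat cluster labels
    Z = hierarchy.ward(X)                       # SciPy linkage matrix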
+ +""" +from __future__ import division +from __future__ import print_function + +import gc +import sys +import optparse +from datetime import datetime +import collections + +import numpy as np +import scipy.sparse as sp + +from sklearn import clone +from sklearn.externals.six.moves import xrange +from sklearn.random_projection import (SparseRandomProjection, + GaussianRandomProjection, + johnson_lindenstrauss_min_dim) + + +def type_auto_or_float(val): + if val == "auto": + return "auto" + else: + return float(val) + + +def type_auto_or_int(val): + if val == "auto": + return "auto" + else: + return int(val) + + +def compute_time(t_start, delta): + mu_second = 0.0 + 10 ** 6 # number of microseconds in a second + + return delta.seconds + delta.microseconds / mu_second + + +def bench_scikit_transformer(X, transfomer): + gc.collect() + + clf = clone(transfomer) + + # start time + t_start = datetime.now() + clf.fit(X) + delta = (datetime.now() - t_start) + # stop time + time_to_fit = compute_time(t_start, delta) + + # start time + t_start = datetime.now() + clf.transform(X) + delta = (datetime.now() - t_start) + # stop time + time_to_transform = compute_time(t_start, delta) + + return time_to_fit, time_to_transform + + +# Make some random data with uniformly located non zero entries with +# Gaussian distributed values +def make_sparse_random_data(n_samples, n_features, n_nonzeros, + random_state=None): + rng = np.random.RandomState(random_state) + data_coo = sp.coo_matrix( + (rng.randn(n_nonzeros), + (rng.randint(n_samples, size=n_nonzeros), + rng.randint(n_features, size=n_nonzeros))), + shape=(n_samples, n_features)) + return data_coo.toarray(), data_coo.tocsr() + + +def print_row(clf_type, time_fit, time_transform): + print("%s | %s | %s" % (clf_type.ljust(30), + ("%.4fs" % time_fit).center(12), + ("%.4fs" % time_transform).center(12))) + + +if __name__ == "__main__": + ########################################################################### + # Option parser + ########################################################################### + op = optparse.OptionParser() + op.add_option("--n-times", + dest="n_times", default=5, type=int, + help="Benchmark results are average over n_times experiments") + + op.add_option("--n-features", + dest="n_features", default=10 ** 4, type=int, + help="Number of features in the benchmarks") + + op.add_option("--n-components", + dest="n_components", default="auto", + help="Size of the random subspace." + "('auto' or int > 0)") + + op.add_option("--ratio-nonzeros", + dest="ratio_nonzeros", default=10 ** -3, type=float, + help="Number of features in the benchmarks") + + op.add_option("--n-samples", + dest="n_samples", default=500, type=int, + help="Number of samples in the benchmarks") + + op.add_option("--random-seed", + dest="random_seed", default=13, type=int, + help="Seed used by the random number generators.") + + op.add_option("--density", + dest="density", default=1 / 3, + help="Density used by the sparse random projection." + "('auto' or float (0.0, 1.0]") + + op.add_option("--eps", + dest="eps", default=0.5, type=float, + help="See the documentation of the underlying transformers.") + + op.add_option("--transformers", + dest="selected_transformers", + default='GaussianRandomProjection,SparseRandomProjection', + type=str, + help="Comma-separated list of transformer to benchmark. " + "Default: %default. 
Available: " + "GaussianRandomProjection,SparseRandomProjection") + + op.add_option("--dense", + dest="dense", + default=False, + action="store_true", + help="Set input space as a dense matrix.") + + (opts, args) = op.parse_args() + if len(args) > 0: + op.error("this script takes no arguments.") + sys.exit(1) + opts.n_components = type_auto_or_int(opts.n_components) + opts.density = type_auto_or_float(opts.density) + selected_transformers = opts.selected_transformers.split(',') + + ########################################################################### + # Generate dataset + ########################################################################### + n_nonzeros = int(opts.ratio_nonzeros * opts.n_features) + + print('Dataset statics') + print("===========================") + print('n_samples \t= %s' % opts.n_samples) + print('n_features \t= %s' % opts.n_features) + if opts.n_components == "auto": + print('n_components \t= %s (auto)' % + johnson_lindenstrauss_min_dim(n_samples=opts.n_samples, + eps=opts.eps)) + else: + print('n_components \t= %s' % opts.n_components) + print('n_elements \t= %s' % (opts.n_features * opts.n_samples)) + print('n_nonzeros \t= %s per feature' % n_nonzeros) + print('ratio_nonzeros \t= %s' % opts.ratio_nonzeros) + print('') + + ########################################################################### + # Set transformer input + ########################################################################### + transformers = {} + + ########################################################################### + # Set GaussianRandomProjection input + gaussian_matrix_params = { + "n_components": opts.n_components, + "random_state": opts.random_seed + } + transformers["GaussianRandomProjection"] = \ + GaussianRandomProjection(**gaussian_matrix_params) + + ########################################################################### + # Set SparseRandomProjection input + sparse_matrix_params = { + "n_components": opts.n_components, + "random_state": opts.random_seed, + "density": opts.density, + "eps": opts.eps, + } + + transformers["SparseRandomProjection"] = \ + SparseRandomProjection(**sparse_matrix_params) + + ########################################################################### + # Perform benchmark + ########################################################################### + time_fit = collections.defaultdict(list) + time_transform = collections.defaultdict(list) + + print('Benchmarks') + print("===========================") + print("Generate dataset benchmarks... ", end="") + X_dense, X_sparse = make_sparse_random_data(opts.n_samples, + opts.n_features, + n_nonzeros, + random_state=opts.random_seed) + X = X_dense if opts.dense else X_sparse + print("done") + + for name in selected_transformers: + print("Perform benchmarks for %s..." % name) + + for iteration in xrange(opts.n_times): + print("\titer %s..." 
% iteration, end="") + time_to_fit, time_to_transform = bench_scikit_transformer(X_dense, + transformers[name]) + time_fit[name].append(time_to_fit) + time_transform[name].append(time_to_transform) + print("done") + + print("") + + ########################################################################### + # Print results + ########################################################################### + print("Script arguments") + print("===========================") + arguments = vars(opts) + print("%s \t | %s " % ("Arguments".ljust(16), + "Value".center(12),)) + print(25 * "-" + ("|" + "-" * 14) * 1) + for key, value in arguments.items(): + print("%s \t | %s " % (str(key).ljust(16), + str(value).strip().center(12))) + print("") + + print("Transformer performance:") + print("===========================") + print("Results are averaged over %s repetition(s)." % opts.n_times) + print("") + print("%s | %s | %s" % ("Transformer".ljust(30), + "fit".center(12), + "transform".center(12))) + print(31 * "-" + ("|" + "-" * 14) * 2) + + for name in sorted(selected_transformers): + print_row(name, + np.mean(time_fit[name]), + np.mean(time_transform[name])) + + print("") + print("") diff --git a/benchmarks/bench_sample_without_replacement.py b/benchmarks/bench_sample_without_replacement.py new file mode 100644 index 0000000..90c382e --- /dev/null +++ b/benchmarks/bench_sample_without_replacement.py @@ -0,0 +1,207 @@ +""" +Benchmarks for sampling without replacement of integer. + +""" +from __future__ import division +from __future__ import print_function + +import gc +import sys +import optparse +from datetime import datetime +import operator + +import matplotlib.pyplot as plt +import numpy as np +import random + +from sklearn.externals.six.moves import xrange +from sklearn.utils.random import sample_without_replacement + + +def compute_time(t_start, delta): + mu_second = 0.0 + 10 ** 6 # number of microseconds in a second + + return delta.seconds + delta.microseconds / mu_second + + +def bench_sample(sampling, n_population, n_samples): + gc.collect() + # start time + t_start = datetime.now() + sampling(n_population, n_samples) + delta = (datetime.now() - t_start) + # stop time + time = compute_time(t_start, delta) + return time + +if __name__ == "__main__": + ########################################################################### + # Option parser + ########################################################################### + op = optparse.OptionParser() + op.add_option("--n-times", + dest="n_times", default=5, type=int, + help="Benchmark results are average over n_times experiments") + + op.add_option("--n-population", + dest="n_population", default=100000, type=int, + help="Size of the population to sample from.") + + op.add_option("--n-step", + dest="n_steps", default=5, type=int, + help="Number of step interval between 0 and n_population.") + + default_algorithms = "custom-tracking-selection,custom-auto," \ + "custom-reservoir-sampling,custom-pool,"\ + "python-core-sample,numpy-permutation" + + op.add_option("--algorithm", + dest="selected_algorithm", + default=default_algorithms, + type=str, + help="Comma-separated list of transformer to benchmark. " + "Default: %default. 
\nAvailable: %default") + + # op.add_option("--random-seed", + # dest="random_seed", default=13, type=int, + # help="Seed used by the random number generators.") + + (opts, args) = op.parse_args() + if len(args) > 0: + op.error("this script takes no arguments.") + sys.exit(1) + + selected_algorithm = opts.selected_algorithm.split(',') + for key in selected_algorithm: + if key not in default_algorithms.split(','): + raise ValueError("Unknown sampling algorithm \"%s\" not in (%s)." + % (key, default_algorithms)) + + ########################################################################### + # List sampling algorithm + ########################################################################### + # We assume that sampling algorithm has the following signature: + # sample(n_population, n_sample) + # + sampling_algorithm = {} + + ########################################################################### + # Set Python core input + sampling_algorithm["python-core-sample"] = \ + lambda n_population, n_sample: \ + random.sample(xrange(n_population), n_sample) + + ########################################################################### + # Set custom automatic method selection + sampling_algorithm["custom-auto"] = \ + lambda n_population, n_samples, random_state=None: \ + sample_without_replacement(n_population, + n_samples, + method="auto", + random_state=random_state) + + ########################################################################### + # Set custom tracking based method + sampling_algorithm["custom-tracking-selection"] = \ + lambda n_population, n_samples, random_state=None: \ + sample_without_replacement(n_population, + n_samples, + method="tracking_selection", + random_state=random_state) + + ########################################################################### + # Set custom reservoir based method + sampling_algorithm["custom-reservoir-sampling"] = \ + lambda n_population, n_samples, random_state=None: \ + sample_without_replacement(n_population, + n_samples, + method="reservoir_sampling", + random_state=random_state) + + ########################################################################### + # Set custom reservoir based method + sampling_algorithm["custom-pool"] = \ + lambda n_population, n_samples, random_state=None: \ + sample_without_replacement(n_population, + n_samples, + method="pool", + random_state=random_state) + + ########################################################################### + # Numpy permutation based + sampling_algorithm["numpy-permutation"] = \ + lambda n_population, n_sample: \ + np.random.permutation(n_population)[:n_sample] + + ########################################################################### + # Remove unspecified algorithm + sampling_algorithm = dict((key, value) + for key, value in sampling_algorithm.items() + if key in selected_algorithm) + + ########################################################################### + # Perform benchmark + ########################################################################### + time = {} + n_samples = np.linspace(start=0, stop=opts.n_population, + num=opts.n_steps).astype(np.int) + + ratio = n_samples / opts.n_population + + print('Benchmarks') + print("===========================") + + for name in sorted(sampling_algorithm): + print("Perform benchmarks for %s..." 
% name, end="") + time[name] = np.zeros(shape=(opts.n_steps, opts.n_times)) + + for step in xrange(opts.n_steps): + for it in xrange(opts.n_times): + time[name][step, it] = bench_sample(sampling_algorithm[name], + opts.n_population, + n_samples[step]) + + print("done") + + print("Averaging results...", end="") + for name in sampling_algorithm: + time[name] = np.mean(time[name], axis=1) + print("done\n") + + # Print results + ########################################################################### + print("Script arguments") + print("===========================") + arguments = vars(opts) + print("%s \t | %s " % ("Arguments".ljust(16), + "Value".center(12),)) + print(25 * "-" + ("|" + "-" * 14) * 1) + for key, value in arguments.items(): + print("%s \t | %s " % (str(key).ljust(16), + str(value).strip().center(12))) + print("") + + print("Sampling algorithm performance:") + print("===============================") + print("Results are averaged over %s repetition(s)." % opts.n_times) + print("") + + fig = plt.figure('scikit-learn sample w/o replacement benchmark results') + plt.title("n_population = %s, n_times = %s" % + (opts.n_population, opts.n_times)) + ax = fig.add_subplot(111) + for name in sampling_algorithm: + ax.plot(ratio, time[name], label=name) + + ax.set_xlabel('ratio of n_sample / n_population') + ax.set_ylabel('Time (s)') + ax.legend() + + # Sort legend labels + handles, labels = ax.get_legend_handles_labels() + hl = sorted(zip(handles, labels), key=operator.itemgetter(1)) + handles2, labels2 = zip(*hl) + ax.legend(handles2, labels2, loc=0) + + plt.show() diff --git a/benchmarks/bench_sgd_regression.py b/benchmarks/bench_sgd_regression.py new file mode 100644 index 0000000..2dd9c56 --- /dev/null +++ b/benchmarks/bench_sgd_regression.py @@ -0,0 +1,131 @@ +""" +Benchmark for SGD regression + +Compares SGD regression against coordinate descent and Ridge +on synthetic data. 
+""" + +print(__doc__) + +# Author: Peter Prettenhofer +# License: BSD 3 clause + +import numpy as np +import pylab as pl + +import gc + +from time import time + +from sklearn.linear_model import Ridge, SGDRegressor, ElasticNet +from sklearn.metrics import mean_squared_error +from sklearn.datasets.samples_generator import make_regression + +if __name__ == "__main__": + list_n_samples = np.linspace(100, 10000, 5).astype(np.int) + list_n_features = [10, 100, 1000] + n_test = 1000 + noise = 0.1 + alpha = 0.01 + sgd_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) + elnet_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) + ridge_results = np.zeros((len(list_n_samples), len(list_n_features), 2)) + for i, n_train in enumerate(list_n_samples): + for j, n_features in enumerate(list_n_features): + X, y, coef = make_regression( + n_samples=n_train + n_test, n_features=n_features, + noise=noise, coef=True) + + X_train = X[:n_train] + y_train = y[:n_train] + X_test = X[n_train:] + y_test = y[n_train:] + + print("=======================") + print("Round %d %d" % (i, j)) + print("n_features:", n_features) + print("n_samples:", n_train) + + # Shuffle data + idx = np.arange(n_train) + np.random.seed(13) + np.random.shuffle(idx) + X_train = X_train[idx] + y_train = y_train[idx] + + std = X_train.std(axis=0) + mean = X_train.mean(axis=0) + X_train = (X_train - mean) / std + X_test = (X_test - mean) / std + + std = y_train.std(axis=0) + mean = y_train.mean(axis=0) + y_train = (y_train - mean) / std + y_test = (y_test - mean) / std + + gc.collect() + print("- benchmarking ElasticNet") + clf = ElasticNet(alpha=alpha, rho=0.5, fit_intercept=False) + tstart = time() + clf.fit(X_train, y_train) + elnet_results[i, j, 0] = mean_squared_error(clf.predict(X_test), + y_test) + elnet_results[i, j, 1] = time() - tstart + + gc.collect() + print("- benchmarking SGD") + n_iter = np.ceil(10 ** 4.0 / n_train) + clf = SGDRegressor(alpha=alpha, fit_intercept=False, + n_iter=n_iter, learning_rate="invscaling", + eta0=.01, power_t=0.25) + + tstart = time() + clf.fit(X_train, y_train) + sgd_results[i, j, 0] = mean_squared_error(clf.predict(X_test), + y_test) + sgd_results[i, j, 1] = time() - tstart + + gc.collect() + print("- benchmarking RidgeRegression") + clf = Ridge(alpha=alpha, fit_intercept=False) + tstart = time() + clf.fit(X_train, y_train) + ridge_results[i, j, 0] = mean_squared_error(clf.predict(X_test), + y_test) + ridge_results[i, j, 1] = time() - tstart + + # Plot results + i = 0 + m = len(list_n_features) + pl.figure('scikit-learn SGD regression benchmark results', + figsize=(5 * 2, 4 * m)) + for j in range(m): + pl.subplot(m, 2, i + 1) + pl.plot(list_n_samples, np.sqrt(elnet_results[:, j, 0]), + label="ElasticNet") + pl.plot(list_n_samples, np.sqrt(sgd_results[:, j, 0]), + label="SGDRegressor") + pl.plot(list_n_samples, np.sqrt(ridge_results[:, j, 0]), + label="Ridge") + pl.legend(prop={"size": 10}) + pl.xlabel("n_train") + pl.ylabel("RMSE") + pl.title("Test error - %d features" % list_n_features[j]) + i += 1 + + pl.subplot(m, 2, i + 1) + pl.plot(list_n_samples, np.sqrt(elnet_results[:, j, 1]), + label="ElasticNet") + pl.plot(list_n_samples, np.sqrt(sgd_results[:, j, 1]), + label="SGDRegressor") + pl.plot(list_n_samples, np.sqrt(ridge_results[:, j, 1]), + label="Ridge") + pl.legend(prop={"size": 10}) + pl.xlabel("n_train") + pl.ylabel("Time [sec]") + pl.title("Training time - %d features" % list_n_features[j]) + i += 1 + + pl.subplots_adjust(hspace=.30) + + pl.show() diff --git 
diff --git a/benchmarks/bench_tree.py b/benchmarks/bench_tree.py
new file mode 100644
index 0000000..ca97cf9
--- /dev/null
+++ b/benchmarks/bench_tree.py
@@ -0,0 +1,124 @@
+"""
+To run this, you'll need to have scikit-learn installed.
+
+Does two benchmarks
+
+First, we fix a training set, increase the number of samples and plot
+the time taken as a function of the number of samples.
+
+In the second benchmark, we increase the number of dimensions of the
+training set, classify a sample and plot the time taken as a function
+of the number of dimensions.
+"""
+import numpy as np
+import pylab as pl
+import gc
+from datetime import datetime
+
+# to store the results
+scikit_classifier_results = []
+scikit_regressor_results = []
+
+mu_second = 0.0 + 10 ** 6  # number of microseconds in a second
+
+
+def bench_scikit_tree_classifier(X, Y):
+    """Benchmark with scikit-learn decision tree classifier"""
+
+    from sklearn.tree import DecisionTreeClassifier
+
+    gc.collect()
+
+    # start time
+    tstart = datetime.now()
+    clf = DecisionTreeClassifier()
+    clf.fit(X, Y).predict(X)
+    delta = (datetime.now() - tstart)
+    # stop time
+
+    scikit_classifier_results.append(
+        delta.seconds + delta.microseconds / mu_second)
+
+
+def bench_scikit_tree_regressor(X, Y):
+    """Benchmark with scikit-learn decision tree regressor"""
+
+    from sklearn.tree import DecisionTreeRegressor
+
+    gc.collect()
+
+    # start time
+    tstart = datetime.now()
+    clf = DecisionTreeRegressor()
+    clf.fit(X, Y).predict(X)
+    delta = (datetime.now() - tstart)
+    # stop time
+
+    scikit_regressor_results.append(
+        delta.seconds + delta.microseconds / mu_second)
+
+
+if __name__ == '__main__':
+
+    print('============================================')
+    print('Warning: this is going to take a looong time')
+    print('============================================')
+
+    n = 10
+    step = 10000
+    n_samples = 10000
+    dim = 10
+    n_classes = 10
+    sample_sizes = []  # record the sizes actually benchmarked
+    for i in range(n):
+        print('============================================')
+        print('Entering iteration %s of %s' % (i, n))
+        print('============================================')
+        n_samples += step
+        sample_sizes.append(n_samples)
+        X = np.random.randn(n_samples, dim)
+        Y = np.random.randint(0, n_classes, (n_samples,))
+        bench_scikit_tree_classifier(X, Y)
+        Y = np.random.randn(n_samples)
+        bench_scikit_tree_regressor(X, Y)
+
+    xx = sample_sizes  # matches the sizes used above
+    pl.figure('scikit-learn tree benchmark results')
+    pl.subplot(211)
+    pl.title('Learning with varying number of samples')
+    pl.plot(xx, scikit_classifier_results, 'g-', label='classification')
+    pl.plot(xx, scikit_regressor_results, 'r-', label='regression')
+    pl.legend(loc='upper left')
+    pl.xlabel('number of samples')
+    pl.ylabel('Time (s)')
+
+    scikit_classifier_results = []
+    scikit_regressor_results = []
+    n = 10
+    step = 500
+    start_dim = 500
+    n_classes = 10
+
+    dim = start_dim
+    dims = []  # record the dimensionalities actually benchmarked
+    for i in range(0, n):
+        print('============================================')
+        print('Entering iteration %s of %s' % (i, n))
+        print('============================================')
+        dim += step
+        dims.append(dim)
+        X = np.random.randn(100, dim)
+        Y = np.random.randint(0, n_classes, (100,))
+        bench_scikit_tree_classifier(X, Y)
+        Y = np.random.randn(100)
+        bench_scikit_tree_regressor(X, Y)
+
+    xx = dims  # matches the dimensionalities used above
+    pl.subplot(212)
+    pl.title('Learning in high dimensional spaces')
+    pl.plot(xx, scikit_classifier_results, 'g-', label='classification')
+    pl.plot(xx, scikit_regressor_results, 'r-', label='regression')
+    pl.legend(loc='upper left')
+    pl.xlabel('number of dimensions')
+    pl.ylabel('Time 
(s)') + pl.axis('tight') + pl.show() diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..01cf701 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,109 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex latexpdf changes linkcheck doctest + +all: html-noplot + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + -rm -rf $(BUILDDIR)/* + -rm -rf auto_examples/ + -rm -rf generated/* + -rm -rf modules/generated/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html/stable" + +html-noplot: + $(SPHINXBUILD) -D plot_gallery=False -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html/stable + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html/stable." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/scikit-learn.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/scikit-learn.qhc" + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + make -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." 
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+download-data:
+	python -c "from sklearn.datasets.lfw import check_fetch_lfw; check_fetch_lfw()"
diff --git a/doc/README b/doc/README
new file mode 100644
index 0000000..01f612d
--- /dev/null
+++ b/doc/README
@@ -0,0 +1,54 @@
+Documentation for scikit-learn
+------------------------------
+
+This section contains the full manual and web page as displayed at
+http://scikit-learn.sf.net. To generate the full web page, including
+the example gallery (this might take a while):
+
+    make html
+
+Or, if you'd rather not build the example gallery:
+
+    make html-noplot
+
+That should create all the docs in the directory _build/html
+
+To build the PDF manual, run
+
+    make latexpdf
+
+
+Upload the generated doc to sourceforge
+---------------------------------------
+
+First off, generate the HTML documentation::
+
+    make html
+
+This should create a directory _build/html/stable with the documentation in
+html format.
+
+
+Now you can upload the generated HTML documentation using scp or some other
+SFTP client.
+
+  * Project web Hostname: web.sourceforge.net
+  * Path: htdocs/
+  * Username: Combine your SourceForge.net Username with your SourceForge.net
+    project UNIX name using a comma ( "," ); see below
+  * Password: Your SourceForge.net Password
+
+An example session might look like the following for Username "jsmith"
+uploading a file for this project, using rsync with the right switch for the
+permissions:
+
+[jsmith@linux ~]$ rsync -rltvz --delete _build/html/stable/ \
+> jsmith,scikit-learn@web.sourceforge.net:htdocs/dev -essh
+Connecting to web.sourceforge.net...
+The authenticity of host 'web.sourceforge.net (216.34.181.57)' can't be established.
+RSA key fingerprint is 68:b3:26:02:a0:07:e4:78:d4:ec:7f:2f:6a:4d:32:c5.
+Are you sure you want to continue connecting (yes/no)? yes
+Warning: Permanently added 'web.sourceforge.net,216.34.181.57' (RSA) to the list of known hosts.
+jsmith,fooproject@web.sourceforge.net's password:
+sending incremental file list
+...
diff --git a/doc/about.rst b/doc/about.rst
new file mode 100644
index 0000000..1fdb2ec
--- /dev/null
+++ b/doc/about.rst
@@ -0,0 +1,154 @@
+
+
+About us
+========
+
+.. include:: ../AUTHORS.rst
+
+.. _citing-scikit-learn:
+
+Citing scikit-learn
+-------------------
+
+If you use scikit-learn in a scientific publication, we would appreciate
+citations to the following paper:
+
+  `Scikit-learn: Machine Learning in Python
+  `_, Pedregosa
+  *et al.*, JMLR 12, pp. 2825-2830, 2011.
+
+  Bibtex entry::
+
+    @article{scikit-learn,
+     title={Scikit-learn: Machine Learning in {P}ython},
+     author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
+             and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
+             and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
+             Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
+     journal={Journal of Machine Learning Research},
+     volume={12},
+     pages={2825--2830},
+     year={2011}
+    }
+
+Funding
+-------
+
+`INRIA `_ actively supports this project. It has
+provided funding for Fabian Pedregosa to work on this project full
+time in the period 2010-2012.
+It also hosts coding sprints and other events.
+
+.. image:: images/inria-logo.jpg
+
+`Google `_ sponsored David
+Cournapeau with a Summer of Code Scholarship in the summer of 2007,
+`Vlad Niculae`_ in 2011, and `Vlad Niculae`_ and Immanuel Bayer in 2012.
+It also provided funding for sprints and events around scikit-learn. If
+you would like to participate in the next Google Summer of Code
+program, please see `this page
+`_
+
+The `NeuroDebian `_ project, which provides `Debian
+`_ packaging and contributions, is supported by
+`Dr. James V. Haxby `_ (`Dartmouth
+College `_).
+
+The `PSF `_ helped find and manage funding for our
+2011 Granada sprint. More information can be found `here
+`_
+
+`tinyclues `_ funded the 2011 international Granada
+sprint.
+
+Donating to the project
+~~~~~~~~~~~~~~~~~~~~~~~
+
+If you are interested in donating to the project or to one of our
+code-sprints, you can use the *PayPal* button below or the `NumFOCUS
+Donations Page `_ (if you use the latter, please indicate that you
+are donating for the scikit-learn project).
+
+All donations will be handled by `NumFOCUS
+`_, a non-profit organization which is
+managed by a board of `Scipy community members
+`_. NumFOCUS's mission is to foster
+scientific computing software, in particular in Python. As a fiscal home
+of scikit-learn, it ensures that money is available when needed to keep
+the project funded and available while in compliance with tax regulations.
+
+The donations received for the scikit-learn project will mostly go towards
+covering travel expenses for code sprints, as well as towards the
+organization budget of the project [#f1]_.
+
+.. raw:: html
+
+    <!-- PayPal donate button form -->
+
+.. rubric:: Notes
+
+.. [#f1] Regarding the organization budget in particular, we might use some
+   of the donated funds to pay for other project expenses such as DNS,
+   hosting or continuous integration services.
+
+
+The 2013 Paris international sprint
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+|center-div| |telecom| |tinyclues| |afpy| |FNRS|
+
+|end-div|
+
+
+.. |center-div| raw:: html
+
+    <div style="text-align: center">
+
+
+.. |telecom| image:: http://f.hypotheses.org/wp-content/blogs.dir/331/files/2011/03/Logo-TPT.jpg
+    :width: 150px
+    :target: http://www.telecom-paristech.fr/
+
+
+.. |tinyclues| image:: http://www.tinyclues.com/item/50b77d01e4b0bff132989dfd?format=original
+    :width: 150px
+    :target: http://www.tinyclues.fr
+
+
+.. |afpy| image:: http://www.afpy.org/logo.png
+    :width: 150px
+    :target: http://www.afpy.org
+
+
+.. |SGR| image:: http://www.svi.cnrs-bellevue.fr/wikimedia/images/Logo_svi_inp.png
+    :width: 150px
+    :target: http://www.svi.cnrs-bellevue.fr
+
+.. |FNRS| image:: http://www.fnrs.be/uploaddocs/images/COMMUNIQUER/FRS-FNRS_rose_transp.png
+    :width: 150px
+    :target: http://www.frs-fnrs.be/
+
+.. figure:: http://sites.uclouvain.be/dysco/pmwiki/uploads/Main/dysco.gif
+    :width: 150px
+    :target: http://sites.uclouvain.be/dysco/
+
+    IAP VII/19 - DYSCO
+
+.. |end-div| raw:: html
+
+    </div>
+
+*For more information on this sprint, see* `here `_
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..2f6f41b
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,226 @@
+# -*- coding: utf-8 -*-
+#
+# scikit-learn documentation build configuration file, created by
+# sphinx-quickstart on Fri Jan 8 09:13:42 2010.
+#
+# This file is execfile()d with the current directory set to its containing
+# dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+from sklearn.externals.six import u
+
+# If extensions (or modules to document with autodoc) are in another
+# directory, add these directories to sys.path here. If the directory
+# is relative to the documentation root, use os.path.abspath to make it
+# absolute, as shown here.
+sys.path.insert(0, os.path.abspath('sphinxext'))
+
+# -- General configuration ---------------------------------------------------
+
+# Try to override the matplotlib configuration as early as possible
+try:
+    import gen_rst
+except Exception:
+    pass
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['gen_rst',
+              'sphinx.ext.autodoc', 'sphinx.ext.autosummary',
+              'sphinx.ext.pngmath', 'numpy_ext.numpydoc'
+              ]
+
+# generate autosummary even if no references
+autosummary_generate = True
+
+autodoc_default_flags = ['members', 'inherited-members']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8'
+
+# Generate the plots for the gallery
+plot_gallery = True
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u('scikit-learn')
+copyright = u('2010 - 2013, scikit-learn developers (BSD License)')
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.14'
+# The full version, including alpha/beta/rc tags.
+import sklearn
+release = sklearn.__version__
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of documents that shouldn't be included in the build.
+#unused_docs = []
+
+# List of directories, relative to source directory, that shouldn't be
+# searched for source files.
+exclude_trees = ['_build', 'templates', 'includes']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+add_function_parentheses = False
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. Major themes that come with
+# Sphinx are currently 'default' and 'sphinxdoc'.
+html_theme = 'scikit-learn'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+html_theme_options = {'oldversion': False, 'collapsiblesidebar': True,
+                      'google_analytics': True, 'surveybanner': False,
+                      'sprintbanner': True}
+
+# Add any paths that contain custom themes here, relative to this directory.
+html_theme_path = ['themes']
+
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+html_short_title = 'scikit-learn'
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+html_logo = 'logos/scikit-learn-logo-small.png'
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+html_favicon = 'logos/favicon.ico'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['images']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+html_use_modindex = False
+
+# If false, no index is generated.
+html_use_index = False
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = ''
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'scikit-learndoc'
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+#latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+#latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass
+# [howto/manual]).
+latex_documents = [('index', 'user_guide.tex', u('scikit-learn user guide'),
+                    u('scikit-learn developers'), 'manual'), ]
+
+# The name of an image file (relative to this directory) to place at the top
+# of the title page.
+latex_logo = "logos/scikit-learn-logo.png"
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# Additional stuff for the LaTeX preamble.
+latex_preamble = r"""
+\usepackage{amsmath}\usepackage{amsfonts}\usepackage{bm}\usepackage{morefloats}
+\usepackage{enumitem} \setlistdepth{10}
+"""
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_use_modindex = True
+
+trim_doctest_flags = True
diff --git a/doc/data_transforms.rst b/doc/data_transforms.rst
new file mode 100644
index 0000000..2888780
--- /dev/null
+++ b/doc/data_transforms.rst
@@ -0,0 +1,14 @@
+.. include:: includes/big_toc_css.rst
+
+.. _data-transforms:
+
+Dataset transformations
+-----------------------
+
+.. toctree::
+
+    modules/feature_extraction
+    modules/preprocessing
+    modules/kernel_approximation
+    modules/random_projection
+    modules/metrics
diff --git a/doc/datasets/covtype.rst b/doc/datasets/covtype.rst
new file mode 100644
index 0000000..c0ed4ea
--- /dev/null
+++ b/doc/datasets/covtype.rst
@@ -0,0 +1,20 @@
+
+.. _covtype:
+
+Forest covertypes
+=================
+
+The samples in this dataset correspond to 30×30m patches of forest in the US,
+collected for the task of predicting each patch's cover type,
+i.e. the dominant species of tree.
+There are seven covertypes, making this a multiclass classification problem.
+Each sample has 54 features, described on the
+`dataset's homepage `_.
+Some of the features are boolean indicators,
+while others are discrete or continuous measurements.
+
+:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
+it returns a dictionary-like object
+with the feature matrix in the ``data`` member
+and the target values in ``target``.
+The dataset will be downloaded from the web if necessary.
diff --git a/doc/datasets/index.rst b/doc/datasets/index.rst
new file mode 100644
index 0000000..e86af99
--- /dev/null
+++ b/doc/datasets/index.rst
@@ -0,0 +1,185 @@
+.. _datasets:
+
+=========================
+Dataset loading utilities
+=========================
+
+.. currentmodule:: sklearn.datasets
+
+The ``sklearn.datasets`` package embeds some small toy datasets
+as introduced in the :ref:`Getting Started ` section.
+
+To evaluate the impact of the scale of the dataset (``n_samples`` and
+``n_features``) while controlling the statistical properties of the data
+(typically the correlation and informativeness of the features), it is
+also possible to generate synthetic data.
+
+This package also features helpers to fetch larger datasets commonly
+used by the machine learning community to benchmark algorithms on data
+that comes from the 'real world'.
+
+General dataset API
+===================
+
+There are three distinct kinds of dataset interfaces for different types
+of datasets.
+The simplest one is the interface for sample images, which is described
+below in the :ref:`sample_images` section.
+
+The dataset generation functions and the svmlight loader share a simplistic
+interface, returning a tuple ``(X, y)`` consisting of an ``n_samples`` x
+``n_features`` numpy array ``X`` and an array of length ``n_samples``
+containing the targets ``y``.
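+
+For instance, a generator such as ``make_classification`` follows this
+convention; the snippet below is a minimal sketch (the keyword values
+shown are illustrative, not required)::
+
+    >>> from sklearn.datasets import make_classification
+    >>> X, y = make_classification(n_samples=100, n_features=20,
+    ...                            random_state=0)
+    >>> X.shape
+    (100, 20)
+    >>> y.shape
+    (100,)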
+
+The toy datasets as well as the 'real world' datasets and the datasets
+fetched from mldata.org have more sophisticated structure.
+These functions return a dictionary-like object holding at least two items:
+an array of shape ``n_samples`` * ``n_features`` with key ``data``
+(except for 20newsgroups)
+and a NumPy array of length ``n_samples``, containing the target values,
+with key ``target``.
+
+The datasets also contain a description in ``DESCR`` and some contain
+``feature_names`` and ``target_names``.
+See the dataset descriptions below for details.
+
+
+Toy datasets
+============
+
+scikit-learn comes with a few small standard datasets that do not
+require downloading any file from an external website.
+
+.. autosummary::
+    :toctree: ../modules/generated/
+    :template: function.rst
+
+    load_boston
+    load_iris
+    load_diabetes
+    load_digits
+    load_linnerud
+
+These datasets are useful to quickly illustrate the behavior of the
+various algorithms implemented in the scikit. They are however often too
+small to be representative of real world machine learning tasks.
+
+.. _sample_images:
+
+Sample images
+=============
+
+The scikit also embeds a couple of sample JPEG images published under a
+Creative Commons license by their authors. Those images can be useful to
+test algorithms and pipelines on 2D data.
+
+.. autosummary::
+    :toctree: ../modules/generated/
+    :template: function.rst
+
+    load_sample_images
+    load_sample_image
+
+.. image:: ../auto_examples/cluster/images/plot_color_quantization_1.png
+    :target: ../auto_examples/cluster/plot_color_quantization.html
+    :scale: 30
+    :align: right
+
+
+.. warning::
+
+    The default coding of images is based on the ``uint8`` dtype to
+    spare memory. Often machine learning algorithms work best if the
+    input is converted to a floating point representation first. Also,
+    if you plan to use ``pylab.imshow``, don't forget to scale to the range
+    0 - 1 as done in the following example.
+
+.. topic:: Examples:
+
+    * :ref:`example_cluster_plot_color_quantization.py`
+
+
+.. _sample_generators:
+
+Sample generators
+=================
+
+In addition, scikit-learn includes various random sample generators that
+can be used to build artificial datasets of controlled size and complexity.
+
+.. image:: ../auto_examples/datasets/images/plot_random_dataset_1.png
+    :target: ../auto_examples/datasets/plot_random_dataset.html
+    :scale: 50
+    :align: center
+
+.. autosummary::
+    :toctree: ../modules/generated/
+    :template: function.rst
+
+    make_classification
+    make_multilabel_classification
+    make_regression
+    make_blobs
+    make_friedman1
+    make_friedman2
+    make_friedman3
+    make_hastie_10_2
+    make_low_rank_matrix
+    make_sparse_coded_signal
+    make_sparse_uncorrelated
+    make_spd_matrix
+    make_swiss_roll
+    make_s_curve
+    make_sparse_spd_matrix
+    make_biclusters
+    make_checkerboard
+
+.. _libsvm_loader:
+
+Datasets in svmlight / libsvm format
+====================================
+
+scikit-learn includes utility functions for loading
+datasets in the svmlight / libsvm format. In this format, each line
+takes the form ``