diff --git a/.gitignore b/.gitignore
index ce64169..f96e6d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+Gemfile.lock
 doc.yaml
 *.swp
 *.rbc
@@ -11,3 +12,4 @@ examples/images/*
 examples/*.html
 web/upload_task.rb
 .idea
+*.gem
diff --git a/.travis.yml b/.travis.yml
index f4a0791..4741681 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,13 +2,22 @@ language: ruby
 rvm:
-  - '1.9.3'
-  - '2.0.0'
-  - '2.1.1'
+  - '2.0'
+  - '2.1'
+  - '2.2'
+  - '2.3.0'
+  - '2.4'
+
+matrix:
+  fast_finish:
+    true
+
+script: "bundle exec rake test"
+
+install:
+  - gem install bundler
+  - bundle install
-script:
-  bundle exec rake test
-
 before_install:
  - sudo apt-get update -qq
  - sudo apt-get install -y libgsl0-dev r-base r-base-dev
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..3365674
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,17 @@
+# Contributing guide
+
+## Installing statsample development dependencies
+
+Keep in mind that neither nmatrix nor rb-gsl is necessary for using statsample; they are only required for an optional speed-up.
+
+Statsample also works with [rb-gsl](https://github.com/sciruby/rb-gsl).
+
+Install dependencies:
+
+    `bundle install`
+
+And run the test suite (should be all green):
+
+    `bundle exec rake test`
+
+If you have problems installing nmatrix, please consult the [nmatrix installation wiki](https://github.com/SciRuby/nmatrix/wiki/Installation) or the [mailing list](https://groups.google.com/forum/#!forum/sciruby-dev).
\ No newline at end of file
diff --git a/Gemfile b/Gemfile
index ea8fc56..38eb365 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,18 +1,2 @@
 source "https://www.rubygems.org"
-gem 'minitest'
-gem 'rdoc'
-gem 'mocha', '0.14.0' #:require=>'mocha/setup'
-gem 'shoulda','3.5.0'
-gem 'shoulda-matchers','2.2.0'
-gem 'hoe'
-#gem 'bio-statsample-timeseries'
-gem 'reportbuilder'
-gem 'dirty-memoize'
-gem 'distribution'
-gem 'extendmatrix'
-gem 'minimization'
-gem 'rserve-client'
-gem 'rubyvis'
-gem 'spreadsheet'
-gem 'rb-gsl'
-gem 'awesome_print'
+gemspec
diff --git a/Gemfile.lock b/Gemfile.lock
deleted file mode 100644
index ef5d88d..0000000
--- a/Gemfile.lock
+++ /dev/null
@@ -1,81 +0,0 @@
-GEM
-  remote: https://www.rubygems.org/
-  specs:
-    activesupport (4.1.6)
-      i18n (~> 0.6, >= 0.6.9)
-      json (~> 1.7, >= 1.7.7)
-      minitest (~> 5.1)
-      thread_safe (~> 0.1)
-      tzinfo (~> 1.1)
-    awesome_print (1.2.0)
-    clbustos-rtf (0.4.2)
-    dirty-memoize (0.0.4)
-    distribution (0.7.1)
-    extendmatrix (0.3.1)
-    hoe (3.13.0)
-      rake (>= 0.8, < 11.0)
-    i18n (0.6.11)
-    json (1.8.1)
-    metaclass (0.0.4)
-    minimization (0.2.1)
-      rb-gsl (~> 1.2)
-      text-table (~> 1.2)
-    minitest (5.4.2)
-    mocha (0.14.0)
-      metaclass (~> 0.0.1)
-    narray (0.6.0.9)
-    prawn (0.8.4)
-      prawn-core (>= 0.8.4, < 0.9)
-      prawn-layout (>= 0.8.4, < 0.9)
-      prawn-security (>= 0.8.4, < 0.9)
-    prawn-core (0.8.4)
-    prawn-layout (0.8.4)
-    prawn-security (0.8.4)
-    prawn-svg (0.9.1.11)
-      prawn (>= 0.8.4)
-    rake (10.3.2)
-    rb-gsl (1.16.0.2)
-      narray (>= 0.5.9)
-    rdoc (4.1.2)
-      json (~> 1.4)
-    reportbuilder (1.4.2)
-      clbustos-rtf (~> 0.4.0)
-      prawn (~> 0.8.4)
-      prawn-svg (~> 0.9.1)
-      text-table (~> 1.2)
-    rserve-client (0.3.1)
-    ruby-ole (1.2.11.7)
-    rubyvis (0.6.1)
-    shoulda (3.5.0)
-      shoulda-context (~> 1.0, >= 1.0.1)
-      shoulda-matchers (>= 1.4.1, < 3.0)
-    shoulda-context (1.2.1)
-    shoulda-matchers (2.2.0)
-      activesupport (>= 3.0.0)
-    spreadsheet (1.0.0)
-      ruby-ole (>= 1.0)
-    text-table (1.2.3)
-    thread_safe (0.3.4)
-    tzinfo (1.2.2)
-      thread_safe (~> 0.1)
-
-PLATFORMS
-  ruby
-
-DEPENDENCIES
-  awesome_print
-  dirty-memoize
-  distribution
-  extendmatrix
-  hoe
-  minimization
-  minitest
-  mocha (= 0.14.0)
-  rb-gsl
-  rdoc
-  reportbuilder
-  rserve-client
-  rubyvis
-  shoulda (= 3.5.0)
-  shoulda-matchers (= 2.2.0)
-  spreadsheet
diff --git a/History.txt b/History.txt
index a438896..40bc0db 100644
--- a/History.txt
+++ b/History.txt
@@ -1,9 +1,47 @@
+=== 2.1.0 / 2017-08-10
+ * Update documentation to reflect methods that have been removed (@lokeshh)
+ * Update daru dependency to v0.1.6 (@lokeshh)
+ * Remove pre-daru legacy methods like n_valid, missing value functions (@lokeshh)
+ * Update test suite with rubocop and rake. New tests for methods like Regression (@lokeshh)
+ * Introduce fitting a regression using string formulas (@lokeshh)
+
+=== 2.0.2 / 2016-03-11
+ * Update dependencies (spreadsheet, GSL)
+
+=== 2.0.1 / 2015-08-19
+ * Cleaned legacy containers in favor of `Daru::DataFrame` and `Daru::Vector`.
+
+=== 2.0.0 / 2015-06-20
+ * Added dependency on daru and replaced Statsample::Vector and Dataset with
+   Daru::Vector and Daru::DataFrame.
+ * NMatrix and gsl-nmatrix are used as development dependencies.
+
+=== 1.5.0 / 2015-06-11
+ * Made sure all methods work properly with and without GSL.
+ * Statsample works with either rb-gsl or gsl-nmatrix.
+ * Changed the data types of Statsample::Vector from :ordinal, :scale and
+   :nominal to only :numeric and :object. :numeric replaces :ordinal/:scale
+   and :object replaces :nominal. Methods for creating the older data types still
+   exist, but throw a warning prodding the user to use the new methods.
+
+=== 1.4.3 / 2015-04-27
+ * Removed rb-gsl dependency.
+
+=== 1.4.2 / 2015-04-07
+ * Statsample::CSV.read accepts numbers in scientific notation.
+ * Test on Ruby 2.2 via Travis CI.
+
+=== 1.4.1 / 2015-03-26
+ * Removed Hoe gem in order to use `statsample.gemspec`.
+ * Improved readability of some files by using rubocop.
+ * Removed a bad check in `cronbach_alpha` (#10).
+
 === 1.4.0 / 2014-10-11
 * Replaced README.txt for README.md
 * Replace File.exists? for File.exist?
 + New Dataset.join to join two dataset based on some fields
 * Deleted MLE based regression (Probit and logistic). Now all GML methods are on statsample-glm
-
+
 === 1.3.1 / 2014-06-26
 * Example referred to a SimpleRegression class which doesn't exist. Updated to working example.
@@ -23,7 +61,7 @@
 * open svg on mac osx
 
 === 1.2.0 / 2011-12-15
-
+
 * Added support for time series (TimeSeries object): MA, EMA, MACD, acf, lag and delta. [Rob Britton]
 * Changed summary attribute to properly display 'b' value for simple linear regression [hstove]
 * Merge pull request #6 from hstove/patch-1Changed summary attribute to properly display 'b' value for simple linear regression [Claudio Bustos]
@@ -34,9 +72,9 @@
 * New Statsample::Anova::Contrast
 * Jacknife and bootstrap for Vector. Thanks to John Firebaugh for the idea
 * Improved Statsample::Analysis API
-* Updated CSV.read. Third argument is a Hash with options to CSV class
+* Updated CSV.read. Third argument is a Hash with options to CSV class
 * Added restriction on Statsample::Excel.read
-* Updated spanish po
+* Updated spanish po
 * Better summary for Vector
 * Improving summary of t related test (confidence interval and estimate output)
 * Replaced c for vector on Statsample::Analysis examples
@@ -51,7 +89,7 @@
 === 1.0.0 / 2011-01-27
 
 * Added Statsample::Analysis, a beautiful DSL to perform fast statistical analysis using statsample. See directory /examples
-* Created benchmarks directory
+* Created benchmarks directory
 * Removed Distribution module from statsample and moved to a gem. Changes on code to reflect new API
 * Optimized simple regression. Better library detection
 * New 'should_with_gsl' to test methods with gsl. Refactored Factor::MAP
@@ -62,17 +100,17 @@
 * Modified examples using Statsample::Analysis
 * Simplified eigen calculations
 * Updated some examples. Added correlation matrix speed suite
-* Correlation matrix optimized. Better specs
+* Correlation matrix optimized. Better specs
-* Optimized correlation matrix. Use gsl matrix algebra or pairwise correlations depending on empiric calculated equations. See benchmarks/correlation_matrix.rb to see implementation of calculation
+* Optimized correlation matrix. Use gsl matrix algebra or pairwise correlations depending on empiric calculated equations. See benchmarks/correlation_matrix.rb to see implementation of calculation
 * Moved tests fixtures from data to test/fixtures
 * Fixed some errors on tests
-* Bug fix: constant_se on binomial regression have an error
+* Bug fix: constant_se on binomial regression have an error
-* All test should work on ruby 1.9.3
+* All test should work on ruby 1.9.3
 * New Vector.[] and Vector.new_scale
-* Detect linearly dependent predictors on OLS.
+* Detect linearly dependent predictors on OLS.
 
 === 0.18.0 / 2011-01-07
-* New Statsample.load_excel
+* New Statsample.load_excel
 * New Statsample.load_csv
 * Statsample::Dataset#[] accepts an array of fields and uses clone
 * New Dataset#correlation_matrix and Statsample::Dataset#covariance_matrix
@@ -83,19 +121,19 @@
 * Improved summary for PCA using covariance matrix
 * New attribute :label_angle for Statsample::Graph::Boxplot
 * Fixed Scatterplots scaling problems
-* New attributes for Scatterplots: groups, minimum_x, minimum_y, maximum_x,
+* New attributes for Scatterplots: groups, minimum_x, minimum_y, maximum_x,
 * New Statsample::Multiset#union allows to create a new dataset based on a m
 * New Statsample::Multiset#each to traverse through datasets
 * Bug fix: Vector#standarized and Vector#percentile crash on nil data
 * Bug fix: Vector#mean and Vector#sd crash on data without valid values
 * Modified methods names on Statsample::Factor::PCA : feature_vector to feature_matrix, data_transformation to principal_components
 * Added Statsample::Vector.vector_centered
-* Factor::MAP.with_dataset() implemented
-* Bug fix: Factor::MAP with correlation matrix with non-real eigenvalues crashes * Added documentation for Graph::Histogram
+* Factor::MAP.with_dataset() implemented
+* Bug fix: Factor::MAP with correlation matrix with non-real eigenvalues crashes * Added documentation for Graph::Histogram
 * Added MPA to Reliability::MultiScaleAnalysis
-* Added custom names for returned vectors and datasets
-* Updated spanish traslation
-* Graph::Histogram updated. Custom x and y max and min, optional normal distribution drawing
+* Added custom names for returned vectors and datasets
+* Updated spanish traslation
+* Graph::Histogram updated. Custom x and y max and min, optional normal distribution drawing
 * Updated Histogram class, with several new methods compatibles with GSL::Histogram
 
 === 0.17.0 / 2010-12-09
@@ -106,18 +144,18 @@
 === 0.16.0 / 2010-11-13
 * Works on ruby 1.9.2 and HEAD. Updated Rakefile and manifest
-* Removed all graph based on Svg::Graph.
+* Removed all graph based on Svg::Graph.
 * First operative version of Graph with Rubyvis
-* Corrected bug on Distribution::Normal.cdf.
+* Corrected bug on Distribution::Normal.cdf.
 * Added reference on references.txt
 * Ruby-based random gaussian distribution generator when gsl not available
 * Added population average deviation [Al Chou]
 
 === 0.15.1 / 2010-10-20
-* Statsample::Excel and Statsample::PlainText add name to vectors equal to field name
+* Statsample::Excel and Statsample::PlainText add name to vectors equal to field name
 * Statsample::Dataset.delete_vector accept multiple fields.
-* Statsample::Dataset.dup_only_valid allows duplication of specific fields
-* ScaleAnalysis doesn't crash on one-item scales
+* Statsample::Dataset.dup_only_valid allows duplication of specific fields
+* ScaleAnalysis doesn't crash on one-item scales
 * Updated references
 
 === 0.15.0 / 2010-09-07
@@ -126,14 +164,14 @@
 * Added Spearman-Brown prophecy on Reliability module
 * Distribution::F uses Gsl when available
 * Added mean r.p.b. and item sd on Scale Analysis
-* Corrected bug on Vector.ary_method and example of Anova Two Way using vector.
+* Corrected bug on Vector.ary_method and example of Anova Two Way using vector.
 
 === 0.14.1 / 2010-08-18
 
-* Added extra information on $DEBUG=true.
-* Changed ParallelAnalysis: with_random_data parameters, bootstrap_method options are data and random, resolve bug related to number of factors to preserve, resolved bug related to original eigenvalues, can support failed bootstrap of data for Tetrachoric correlation.
-* Optimized eigenpairs on Matrix when GSL is available.
+* Added extra information on $DEBUG=true.
+* Changed ParallelAnalysis: with_random_data parameters, bootstrap_method options are data and random, resolve bug related to number of factors to preserve, resolved bug related to original eigenvalues, can support failed bootstrap of data for Tetrachoric correlation.
+* Optimized eigenpairs on Matrix when GSL is available.
 * Added test for parallel analysis using data bootstraping
 * Updated .pot and Manifest.txt
 * Added test for kmo(global and univariate), bartlett and anti-image. Kmo and Bartlett have test based on Dziuban and Shirkey with correct results
@@ -142,16 +180,16 @@
 * Added reference for Statsample::Factor::MAP
 
 === 0.14.0 / 2010-08-16
-* Added Statsample::Factor::MAP, to execute Velicer's (1976) MAP to determine the number of factors to retain on EFA
+* Added Statsample::Factor::MAP, to execute Velicer's (1976) MAP to determine the number of factors to retain on EFA
 * Bug fix on test suite on Ruby 1.8.7
 * Horn's Parallel Analysis operational and tested for pure random data
-* Fixed bug on Excel writer on Ruby1.9 (frozen string on header raises an error).
+* Fixed bug on Excel writer on Ruby1.9 (frozen string on header raises an error).
 * Extra information on Factorial Analysis on summaries
-* Fixed bug on Factor::Rotation when used ::Matrix without field method.
+* Fixed bug on Factor::Rotation when used ::Matrix without field method.
 * Added Vector#vector_percentil method
-* Summaries for PCA, Rotation, MultiScale and ScaleAnalysis created or improved.
+* Summaries for PCA, Rotation, MultiScale and ScaleAnalysis created or improved.
 * Factor::PCA could have rotation and parallel analysis on summary.
-* Cronbach's alpha from covariance matrix raise an error on size<2
+* Cronbach's alpha from covariance matrix raise an error on size<2
 * MultiScaleAnalysis could have Parallel Analysis on summary.
 * Added Chi Square test
 * Added new information on README.txt
@@ -168,7 +206,7 @@
 * Polychoric and Tetrachoric moved to gem statsample-bivariate-extension
 * All classes left with summary method include Summarizable now. Every method which return localizable string is now parsed with _()
-* Correct implementation of Reliability::MultiScaleAnalysis.
+* Correct implementation of Reliability::MultiScaleAnalysis.
 * Spanish translation for Mann-Whitney's U
 * Added example for Mann-Whitney's U test
 * Better summary for Mann-Whitney's U Test
@@ -179,10 +217,10 @@
 * Modified Rakefile to remove dependencies based on C extensions. These are moved to statsample-optimization
 * T test with unequal variance fixed on i686
-* API Change: Renamed Reliability::ItemAnalysis and moved to independent file
+* API Change: Renamed Reliability::ItemAnalysis and moved to independent file
 * New Reliability::MultiScaleAnalysis for easy analysis of scales on a same survey, includind reliability, correlation matrix and Factor Analysis
 * Updated README to reflect changes on Reliability module
-* SvgGraph works with reportbuilder.
+* SvgGraph works with reportbuilder.
 * Added methods on Polychoric based on Olsson(1979): the idea is estimate using second derivatives.
 * Distribution test changed (reduced precision on 32 bits system
@@ -196,7 +234,7 @@
 New features:
 * Added Statsample::Anova::TwoWay and Statsample::Anova::TwoWayWithVectors
 * Added Statsample.clone_only valid and Statsample::Dataset.clone_only_valid, for cheap copy on already clean vectors
-Optimizations and bug fix
+Optimizations and bug fix
 * Removed library statistics2 from package. Used gem statistics2 instead, because have a extension version
 * Added example for Reliability class
 * Bug fix on Statsample::DominanceAnalysis
@@ -204,7 +242,7 @@
 === 0.10.0 / 2010-04-13
 API modifications
-* Refactoring of Statsample::Anova module.
+* Refactoring of Statsample::Anova module.
 * Statsample::Anova::OneWay :implementation of generic ANOVA One-Way, used by Multiple Regression, for example.
 * Statsample::Anova::OneWayWithVectors: implementation of ANOVA One-Way to test differences of means.
@@ -228,7 +266,7 @@
 === 0.8.1 / 2010-03-29
 * Fixed Regression summaries
 
 === 0.8.0 / 2010-03-29
-* New Statsample::Test::T module, with classes and methods to do Student's t tests for one and two samples.
+* New Statsample::Test::T module, with classes and methods to do Student's t tests for one and two samples.
 * Statsample::PromiseAfter module to set a number of variables without explicitly call the compute or iterate method
 * All tests ported to MiniUnit
 * Directory 'demo' renamed to 'examples'
@@ -266,7 +304,7 @@
 === 0.6.4 / 2010-02-19
 
-* Dominance Analysis and Dominance Analysis Bootstrap allows multivariate dependent analysis.
+* Dominance Analysis and Dominance Analysis Bootstrap allows multivariate dependent analysis.
 * Test suite for Dominance Analysis, using Azen and Budescu papers as references
 * X^2 for polychoric correlation
@@ -285,12 +323,12 @@
 * New Statsample::Factor module. Include classes for extracting factors (Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis) and rotate component matrix ( Statsample::Factor::Rotation subclasses). For now, only orthogonal rotations
 * New Statsample::Dataset.crosstab_with_asignation, Statsample::Dataset.one_to_many
 * New class Statsample::Permutation to produce permutations of a given array
-* New class Statsample::Histogram, with same interface as GSL one
+* New class Statsample::Histogram, with same interface as GSL one
 * New class Statsample::Test::UMannWhitney, to perform Mann-Whitney's U test. Gives z based and exact calculation of probability
 * Improved support for ReportBuilder
 * Statsample::Codification module reworked
 * Fixed bugs on Dominance Analysis classes
-* Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
+* Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
 
 === 0.5.1 / 2009-10-06
@@ -354,15 +392,15 @@
 * One Way Anova on Statsample::Anova::OneWay
 * Dominance Analysis!!!! The one and only reason to develop a Multiple Regression on pure ruby.
-* Multiple Regression on Multiple Regression module. Pairwise (pure ruby) or MultipleRegressionPairwise and Listwise (optimized) on MultipleRegressionAlglib and
+* Multiple Regression on Multiple Regression module. Pairwise (pure ruby) or MultipleRegressionPairwise and Listwise (optimized) on MultipleRegressionAlglib and
 * New Dataset#to_gsl_matrix, #from_to,#[..],#bootstrap,#vector_missing_values, #vector_count_characters, #each_with_index, #collect_with_index
 * New Vector#box_cox_transformation
 * Module Correlation renamed to Bivariate
 * Some fancy methods and classes to create Summaries
 * Some documentation about Algorithm used on doc_latex
 * Deleted 'distributions' extension. Ruby/GSL has all the pdf and cdf you ever need.
-* Tests work without any dependency. Only nags about missing deps.
-* Test for MultipleRegression, Anova, Excel, Bivariate.correlation_matrix and many others
+* Tests work without any dependency. Only nags about missing deps.
+* Test for MultipleRegression, Anova, Excel, Bivariate.correlation_matrix and many others
 
 === 0.1.9 / 2009-05-22
@@ -372,8 +410,8 @@
 * Module SRS: New methods estimation_n0 and estimation_n
 * Module Reliability: new ItemCharacteristicCurve class
 * New HtmlReport class
-* New experimental SPSS Class.
-* Converters: Module CSV with new options. Added write() method for GGobi module
+* New experimental SPSS Class.
+* Converters: Module CSV with new options. Added write() method for GGobi module
 * New Mx exporter (http://www.vcu.edu/mx/)
 * Class SimpleRegression: new methods standard error
@@ -404,7 +442,7 @@
 === 0.1.4 / 2008-08-27
 
 * New extension, with cdf functions for
-  chi-square, t, gamma and normal distributions.
+  chi-square, t, gamma and normal distributions.
   Based on dcdflib (http://www.netlib.org/random/)
   Also, has a function to calculate the tail for a noncentral T distribution
diff --git a/LICENSE.txt b/LICENSE.txt
index 9d0b178..6886323 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2009-2014, Claudio Bustos
+Copyright (c) 2009-2015, Claudio Bustos
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/Manifest.txt b/Manifest.txt
deleted file mode 100644
index f465a24..0000000
--- a/Manifest.txt
+++ /dev/null
@@ -1,157 +0,0 @@
-.travis.yml
-Gemfile
-Gemfile.lock
-History.txt
-LICENSE.txt
-Manifest.txt
-README.md
-Rakefile
-benchmarks/correlation_matrix_15_variables.rb
-benchmarks/correlation_matrix_5_variables.rb
-benchmarks/correlation_matrix_methods/correlation_matrix.ds
-benchmarks/correlation_matrix_methods/correlation_matrix.html
-benchmarks/correlation_matrix_methods/correlation_matrix.rb
-benchmarks/correlation_matrix_methods/correlation_matrix.xls
-benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods
-benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods
-benchmarks/correlation_matrix_methods/results.ds
-benchmarks/factor_map.rb
-benchmarks/helpers_benchmark.rb
-data/locale/es/LC_MESSAGES/statsample.mo
-doc_latex/manual/equations.tex
-examples/boxplot.rb
-examples/correlation_matrix.rb
-examples/dataset.rb
-examples/dominance_analysis.rb
-examples/dominance_analysis_bootstrap.rb
-examples/histogram.rb
-examples/icc.rb
-examples/levene.rb
-examples/multiple_regression.rb
-examples/multivariate_correlation.rb
-examples/parallel_analysis.rb
-examples/polychoric.rb
-examples/principal_axis.rb
-examples/reliability.rb
-examples/scatterplot.rb
-examples/t_test.rb
-examples/tetrachoric.rb
-examples/u_test.rb
-examples/vector.rb
-examples/velicer_map_test.rb
-grab_references.rb
-lib/spss.rb
-lib/statsample.rb
-lib/statsample/analysis.rb
-lib/statsample/analysis/suite.rb
-lib/statsample/analysis/suitereportbuilder.rb
-lib/statsample/anova.rb
-lib/statsample/anova/contrast.rb
-lib/statsample/anova/oneway.rb
-lib/statsample/anova/twoway.rb
-lib/statsample/bivariate.rb
-lib/statsample/bivariate/pearson.rb
-lib/statsample/codification.rb
-lib/statsample/converter/csv.rb
-lib/statsample/converter/spss.rb
-lib/statsample/converters.rb
-lib/statsample/crosstab.rb
-lib/statsample/dataset.rb
-lib/statsample/dominanceanalysis.rb
-lib/statsample/dominanceanalysis/bootstrap.rb
-lib/statsample/factor.rb
-lib/statsample/factor/map.rb
-lib/statsample/factor/parallelanalysis.rb
-lib/statsample/factor/pca.rb
-lib/statsample/factor/principalaxis.rb
-lib/statsample/factor/rotation.rb
-lib/statsample/graph.rb
-lib/statsample/graph/boxplot.rb
-lib/statsample/graph/histogram.rb
-lib/statsample/graph/scatterplot.rb
-lib/statsample/histogram.rb
-lib/statsample/matrix.rb
-lib/statsample/multiset.rb
-lib/statsample/regression.rb
-lib/statsample/regression/multiple.rb
-lib/statsample/regression/multiple/alglibengine.rb
-lib/statsample/regression/multiple/baseengine.rb
-lib/statsample/regression/multiple/gslengine.rb
-lib/statsample/regression/multiple/matrixengine.rb
-lib/statsample/regression/multiple/rubyengine.rb
-lib/statsample/regression/simple.rb
-lib/statsample/reliability.rb
-lib/statsample/reliability/icc.rb
-lib/statsample/reliability/multiscaleanalysis.rb
-lib/statsample/reliability/scaleanalysis.rb
-lib/statsample/reliability/skillscaleanalysis.rb
-lib/statsample/resample.rb
-lib/statsample/rserve_extension.rb
-lib/statsample/shorthand.rb
-lib/statsample/srs.rb
-lib/statsample/test.rb
-lib/statsample/test/bartlettsphericity.rb
-lib/statsample/test/chisquare.rb
-lib/statsample/test/f.rb
-lib/statsample/test/kolmogorovsmirnov.rb
-lib/statsample/test/levene.rb
-lib/statsample/test/t.rb
-lib/statsample/test/umannwhitney.rb
-lib/statsample/test/wilcoxonsignedrank.rb
-lib/statsample/vector.rb
-lib/statsample/vector/gsl.rb
-lib/statsample/version.rb
-po/es/statsample.mo
-po/es/statsample.po
-po/statsample.pot
-references.txt
-setup.rb
-test/fixtures/bank2.dat
-test/fixtures/correlation_matrix.rb
-test/fixtures/hartman_23.matrix
-test/fixtures/repeated_fields.csv
-test/fixtures/stock_data.csv
-test/fixtures/test_csv.csv
-test/fixtures/test_xls.xls
-test/fixtures/tetmat_matrix.txt
-test/fixtures/tetmat_test.txt
-test/helpers_tests.rb
-test/test_analysis.rb
-test/test_anova_contrast.rb
-test/test_anovaoneway.rb
-test/test_anovatwoway.rb
-test/test_anovatwowaywithdataset.rb
-test/test_anovawithvectors.rb
-test/test_bartlettsphericity.rb
-test/test_bivariate.rb
-test/test_codification.rb
-test/test_crosstab.rb
-test/test_csv.rb
-test/test_dataset.rb
-test/test_dominance_analysis.rb
-test/test_factor.rb
-test/test_factor_map.rb
-test/test_factor_pa.rb
-test/test_ggobi.rb
-test/test_gsl.rb
-test/test_histogram.rb
-test/test_matrix.rb
-test/test_multiset.rb
-test/test_regression.rb
-test/test_reliability.rb
-test/test_reliability_icc.rb
-test/test_reliability_skillscale.rb
-test/test_resample.rb
-test/test_rserve_extension.rb
-test/test_srs.rb
-test/test_statistics.rb
-test/test_stest.rb
-test/test_stratified.rb
-test/test_test_f.rb
-test/test_test_kolmogorovsmirnov.rb
-test/test_test_t.rb
-test/test_umannwhitney.rb
-test/test_vector.rb
-test/test_wilcoxonsignedrank.rb
-test/test_xls.rb
-web/Rakefile
diff --git a/README.md b/README.md
index 8c8151d..cadaeae 100644
--- a/README.md
+++ b/README.md
@@ -1,192 +1,174 @@
 # Statsample
 
-Homepage :: https://github.com/clbustos/statsample
-
-[![Build Status](https://travis-ci.org/clbustos/statsample.svg?branch=master)](https://travis-ci.org/clbustos/statsample)
+[![Build Status](https://travis-ci.org/SciRuby/statsample.svg?branch=master)](https://travis-ci.org/SciRuby/statsample)
+[![Code Climate](https://codeclimate.com/github/SciRuby/statsample/badges/gpa.svg)](https://codeclimate.com/github/SciRuby/statsample)
 [![Gem Version](https://badge.fury.io/rb/statsample.svg)](http://badge.fury.io/rb/statsample)
 
-## DESCRIPTION
-A suite for basic and advanced statistics on Ruby. Tested on Ruby 2.1.1p76 (June 2014), 1.8.7, 1.9.1, 1.9.2 (April, 2010), ruby-head(June, 2011) and JRuby 1.4 (Ruby 1.8.7 compatible).
+Homepage :: https://github.com/sciruby/statsample
 
-Include:
-* Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others).
-* Imports and exports datasets from and to Excel, CSV and plain text files.
-* Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlation provides by +statsample-bivariate-extension+ gem.
-* Intra-class correlation
-* Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
-* Tests: F, T, Levene, U-Mannwhitney.
-* Regression: Simple, Multiple (OLS), Probit and Logit
-* Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors.
-* Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
-* Basic time series support
-* Dominance Analysis, with multivariate dependent and bootstrap (Azen & Budescu)
-* Sample calculation related formulas
-* Structural Equation Modeling (SEM), using R libraries +sem+ and +OpenMx+
-* Creates reports on text, html and rtf, using ReportBuilder gem
-* Graphics: Histogram, Boxplot and Scatterplot
+# Installation
-## Principles
+You should have a recent version of GSL and R (with the `irr` and `Rserve` libraries) installed. In Ubuntu:
-* Software Design:
-  * One module/class for each type of analysis
-  * Options can be set as hash on initialize() or as setters methods
-  * Clean API for interactive sessions
-  * summary() returns all necessary informacion for interactive sessions
-  * All statistical data available though methods on objects
-  * All (important) methods should be tested. Better with random data.
-* Statistical Design
-  * Results are tested against text results, SPSS and R outputs.
-  * Go beyond Null Hiphotesis Testing, using confidence intervals and effect sizes when possible
-  * (When possible) All references for methods are documented, providing sensible information on documentation
-
-## Features
-
-* Classes for manipulation and storage of data:
-  * Statsample::Vector: An extension of an array, with statistical methods like sum, mean and standard deviation
-  * Statsample::Dataset: a group of Statsample::Vector, analog to a excel spreadsheet or a dataframe on R. The base of almost all operations on statsample.
-  * Statsample::Multiset: multiple datasets with same fields and type of vectors
-* Anova module provides generic Statsample::Anova::OneWay and vector based Statsample::Anova::OneWayWithVectors. Also you can create contrast using Statsample::Anova::Contrast
-* Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
-* Multiple types of regression.
-  * Simple Regression : Statsample::Regression::Simple
-  * Multiple Regression: Statsample::Regression::Multiple
-  * Logit Regression: Statsample::Regression::Binomial::Logit
-  * Probit Regression: Statsample::Regression::Binomial::Probit
-* Factorial Analysis algorithms on Statsample::Factor module.
-  * Classes for Extraction of factors:
-    * Statsample::Factor::PCA
-    * Statsample::Factor::PrincipalAxis
-  * Classes for Rotation of factors:
-    * Statsample::Factor::Varimax
-    * Statsample::Factor::Equimax
-    * Statsample::Factor::Quartimax
-  * Classes for calculation of factors to retain
-    * Statsample::Factor::ParallelAnalysis performs Horn's 'parallel analysis' to a principal components analysis to adjust for sample bias in the retention of components.
-    * Statsample::Factor::MAP performs Velicer's Minimum Average Partial (MAP) test, which retain components as long as the variance in the correlation matrix represents systematic variance.
-* Dominance Analysis. Based on Budescu and Azen papers, dominance analysis is a method to analyze the relative importance of one predictor relative to another on multiple regression
-  * Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables
-  * Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recomended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
-* Module Statsample::Codification, to help to codify open questions
-* Converters to import and export data:
-  * Statsample::Database : Can create sql to create tables, read and insert data
-  * Statsample::CSV : Read and write CSV files
-  * Statsample::Excel : Read and write Excel files
-  * Statsample::Mx : Write Mx Files
-  * Statsample::GGobi : Write Ggobi files
-* Module Statsample::Crosstab provides function to create crosstab for categorical data
-* Module Statsample::Reliability provides functions to analyze scales with psychometric methods.
-  * Class Statsample::Reliability::ScaleAnalysis provides statistics like mean, standard deviation for a scale, Cronbach's alpha and standarized Cronbach's alpha, and for each item: mean, correlation with total scale, mean if deleted, Cronbach's alpha is deleted.
-  * Class Statsample::Reliability::MultiScaleAnalysis provides a DSL to easily analyze reliability of multiple scales and retrieve correlation matrix and factor analysis of them.
-  * Class Statsample::Reliability::ICC provides intra-class correlation, using Shrout & Fleiss(1979) and McGraw & Wong (1996) formulations.
-* Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several type of samples
-* Module Statsample::Test provides several methods and classes to perform inferencial statistics
-  * Statsample::Test::BartlettSphericity
-  * Statsample::Test::ChiSquare
-  * Statsample::Test::F
-  * Statsample::Test::KolmogorovSmirnov (only D value)
-  * Statsample::Test::Levene
-  * Statsample::Test::UMannWhitney
-  * Statsample::Test::T
-  * Statsample::Test::WilcoxonSignedRank
-* Module Graph provides several classes to create beautiful graphs using rubyvis
-  * Statsample::Graph::Boxplot
-  * Statsample::Graph::Histogram
-  * Statsample::Graph::Scatterplot
-* Gem bio-statsample-timeseries provides module Statsample::TimeSeries with support for time series, including ARIMA estimation using Kalman-Filter.
-* Gem statsample-sem provides a DSL to R libraries +sem+ and +OpenMx+
-* Gem statsample-glm provides you with GML method, to work with Logistic, Poisson and Gaussian regression ,using ML or IRWLS.
-* Close integration with gem reportbuilder, to easily create reports on text, html and rtf formats.
-
-# Examples of use:
-
-See the [examples folder](https://github.com/clbustos/statsample/tree/master/examples/) too.
-
-## Boxplot
-
-```ruby
-require 'statsample'
-
-ss_analysis(Statsample::Graph::Boxplot) do
-  n=30
-  a=rnorm(n-1,50,10)
-  b=rnorm(n, 30,5)
-  c=rnorm(n,5,1)
-  a.push(2)
-  boxplot(:vectors=>[a,b,c], :width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0)
-end
-Statsample::Analysis.run # Open svg file on *nix application defined
+```bash
+$ sudo apt-get install libgsl0-dev r-base r-base-dev
+$ sudo Rscript -e "install.packages(c('Rserve', 'irr'))"
 ```
-
-## Correlation matrix
-
-```ruby
-require 'statsample'
-# Note R like generation of random gaussian variable
-# and correlation matrix
-
-ss_analysis("Statsample::Bivariate.correlation_matrix") do
-  samples=1000
-  ds=data_frame(
-    'a'=>rnorm(samples),
-    'b'=>rnorm(samples),
-    'c'=>rnorm(samples),
-    'd'=>rnorm(samples))
-  cm=cor(ds)
-  summary(cm)
-end
-
-Statsample::Analysis.run_batch # Echo output to console
+With these libraries in place, just install from rubygems:
+
+```bash
+$ [sudo] gem install statsample
 ```
-
-# Requirements
+On *nix, you should install statsample-optimization to retrieve gems gsl, statistics2 and a C extension to speed some methods.
-Optional:
+```bash
+$ [sudo] gem install statsample-optimization
+```
-* Plotting: gnuplot and rbgnuplot, SVG::Graph
-* Factorial analysis and polychorical correlation(joint estimate and polychoric series): gsl library and rb-gsl (https://rubygems.org/gems/rb-gsl/). You should install it using gem install rb-gsl.
+If you need to work on Structural Equation Modeling, you could see +statsample-sem+. You need R with +sem+ or +OpenMx+ [http://openmx.psyc.virginia.edu/] libraries installed.
-*Note*: Use gsl 1.12.109 or later.
+```bash
+$ [sudo] gem install statsample-sem
+```
+# Testing
-# Resources
+See CONTRIBUTING for information on testing and contributing to statsample.
-* Source code on github :: http://github.com/clbustos/statsample
-* Docs :: http://statsample.apsique.cl/
-* Bug report and feature request :: http://github.com/clbustos/statsample/issues
-* E-mailing list :: http://groups.google.com/group/statsample
+# Documentation
-# Installation
+You can see the latest documentation in [rubydoc.info](http://www.rubydoc.info/github/sciruby/statsample/master).
-```bash
-$ sudo gem install statsample
-```
+# Usage
-On *nix, you should install statsample-optimization to retrieve gems gsl, statistics2 and a C extension to speed some methods.
+## Notebooks
-There are available precompiled version for Ruby 1.9 on x86, x86_64 and mingw32 archs.
+You can see some iruby notebooks here:
-```bash
-$ sudo gem install statsample-optimization
-```
+### Statistics
-If you use Ruby 1.8, you should compile statsample-optimization, usign parameter --platform ruby
+* [Correlation Matrix with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Correlation%20Matrix%20with%20daru%20and%20statsample.ipynb)
+* [Dominance Analysis with statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Dominance%20Analysis%20with%20statsample.ipynb)
+* [Reliability ICC](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Reliability%20ICC%20with%20statsample.ipynb)
+* [Levene Test](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Levene%20Test.ipynb)
+* [Multiple Regression](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Multiple%20Regression.ipynb)
+* [Parallel Analysis on PCA](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Parallel%20Analysis%20on%20PCA.ipynb)
+* [Polychoric Analysis](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Polychoric%20Correlation.ipynb)
+* [Reliability Scale and Multiscale Analysis](https://github.com/SciRuby/sciruby-notebooks/blob/master/Statistics/Reliability%20Scale%20Analysis.ipynb)
+* [Velicer MAP Test](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Velicer%20MAP%20test.ipynb)
+
+### Visualizations
-If you need to work on Structural Equation Modeling, you could see +statsample-sem+. You need R with +sem+ or +OpenMx+ [http://openmx.psyc.virginia.edu/] libraries installed
+* [Creating Boxplots with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Boxplot%20with%20daru%20and%20statsample.ipynb)
+* [Creating A Histogram](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Creating%20a%20Histogram.ipynb)
+* [Creating a Scatterplot](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Scatterplot%20with%20statsample.ipynb)
-```bash
-$ sudo gem install statsample-sem
-```
+### Working with DataFrame and Vector
-Available setup.rb file
+* [Creating Vectors and DataFrames with daru](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Creation%20of%20Vector%20and%20DataFrame.ipynb)
+* [Detailed Usage of Daru::Vector](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20Vector.ipynb)
+* [Detailed Usage of Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20DataFrame.ipynb)
+* [Visualizing Data with Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Visualizing%20data%20with%20daru%20DataFrame.ipynb)
-```bash
-sudo gem ruby setup.rb
-```
+## Examples
+
+See the /examples directory for some use cases. The notebooks listed above have mostly
+the same examples, and they render better, so you may want to look at those first.
+
+# Description
+
+A suite for basic and advanced statistics on Ruby. Tested on CRuby 2.0.0, 2.1.1, 2.2 and 2.3.0. See `.travis.yml` for more information.
+
+Include:
+- Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others).
+- Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlation provided by +statsample-bivariate-extension+ gem.
+- Intra-class correlation
+- Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
+- Tests: F, T, Levene, U-Mannwhitney.
+- Regression: Simple, Multiple (OLS)
+- Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors.
+- Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
+- Basic time series support
+- Dominance Analysis, with multivariate dependent and bootstrap (Azen & Budescu)
+- Sample calculation related formulas
+- Structural Equation Modeling (SEM), using R libraries +sem+ and +OpenMx+
+- Creates reports on text, html and rtf, using ReportBuilder gem
+- Graphics: Histogram, Boxplot and Scatterplot
+
+## Principles
+
+- Software Design:
+  - One module/class for each type of analysis
+  - Options can be set as hash on initialize() or as setters methods
+  - Clean API for interactive sessions
+  - summary() returns all necessary information for interactive sessions
+  - All statistical data available through methods on objects
+  - All (important) methods should be tested. Better with random data.
+- Statistical Design
+  - Results are tested against text results, SPSS and R outputs.
+  - Go beyond Null Hypothesis Testing, using confidence intervals and effect sizes when possible
+  - (When possible) All references for methods are documented, providing sensible information on documentation
+
+# Features
+
+- Classes for manipulation and storage of data:
+  - Uses [daru](https://github.com/v0dro/daru) for storing data and basic statistics.
+  - Statsample::Multiset: multiple datasets with same fields and type of vectors
+- Anova module provides generic Statsample::Anova::OneWay and vector based Statsample::Anova::OneWayWithVectors. Also you can create contrasts using Statsample::Anova::Contrast
+- Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Includes methods to create correlation and covariance matrices
+- Multiple types of regression.
+  - Simple Regression : Statsample::Regression::Simple
+  - Multiple Regression: Statsample::Regression::Multiple
+- Factorial Analysis algorithms on Statsample::Factor module.
+  - Classes for Extraction of factors:
+    - Statsample::Factor::PCA
+    - Statsample::Factor::PrincipalAxis
+  - Classes for Rotation of factors:
+    - Statsample::Factor::Varimax
+    - Statsample::Factor::Equimax
+    - Statsample::Factor::Quartimax
+  - Classes for calculation of factors to retain
+    - Statsample::Factor::ParallelAnalysis performs Horn's 'parallel analysis' to a principal components analysis to adjust for sample bias in the retention of components.
+    - Statsample::Factor::MAP performs Velicer's Minimum Average Partial (MAP) test, which retains components as long as the variance in the correlation matrix represents systematic variance.
+- Dominance Analysis. Based on Budescu and Azen papers, dominance analysis is a method to analyze the relative importance of one predictor relative to another on multiple regression
+  - Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables
+  - Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recommended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
+- Module Statsample::Codification, to help codify open questions
+- Converters to export data:
+  - Statsample::Mx : Write Mx Files
+  - Statsample::GGobi : Write Ggobi files
+- Module Statsample::Crosstab provides functions to create crosstabs for categorical data
+- Module Statsample::Reliability provides functions to analyze scales with psychometric methods.
+  - Class Statsample::Reliability::ScaleAnalysis provides statistics like mean, standard deviation for a scale, Cronbach's alpha and standardized Cronbach's alpha, and for each item: mean, correlation with total scale, mean if deleted, Cronbach's alpha if deleted.
+  - Class Statsample::Reliability::MultiScaleAnalysis provides a DSL to easily analyze reliability of multiple scales and retrieve correlation matrix and factor analysis of them.
+  - Class Statsample::Reliability::ICC provides intra-class correlation, using Shrout & Fleiss(1979) and McGraw & Wong (1996) formulations.
+- Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several types of samples
+- Module Statsample::Test provides several methods and classes to perform inferential statistics
+  - Statsample::Test::BartlettSphericity
+  - Statsample::Test::ChiSquare
+  - Statsample::Test::F
+  - Statsample::Test::KolmogorovSmirnov (only D value)
+  - Statsample::Test::Levene
+  - Statsample::Test::UMannWhitney
+  - Statsample::Test::T
+  - Statsample::Test::WilcoxonSignedRank
+- Module Graph provides several classes to create beautiful graphs using rubyvis
+  - Statsample::Graph::Boxplot
+  - Statsample::Graph::Histogram
+  - Statsample::Graph::Scatterplot
+- Gem bio-statsample-timeseries provides module Statsample::TimeSeries with support for time series, including ARIMA estimation using Kalman-Filter.
+- Gem statsample-sem provides a DSL to R libraries +sem+ and +OpenMx+
+- Gem statsample-glm provides you with GLM methods, to work with Logistic, Poisson and Gaussian regression, using ML or IRWLS.
+- Close integration with gem reportbuilder, to easily create reports on text, html and rtf formats.
+
+# Resources
+
+- Source code on github :: http://github.com/sciruby/statsample
+- Bug report and feature request :: http://github.com/sciruby/statsample/issues
+- E-mailing list :: https://groups.google.com/forum/#!forum/sciruby-dev
 
-## License
+# License
 
 BSD-3 (See LICENSE.txt)
diff --git a/Rakefile b/Rakefile
index d4e23b9..4d78a8f 100644
--- a/Rakefile
+++ b/Rakefile
@@ -1,32 +1,31 @@
-#!/usr/bin/ruby
-# -*- ruby -*-
-# -*- coding: utf-8 -*-
-$:.unshift(File.dirname(__FILE__)+'/lib/')
+$:.unshift File.expand_path("../lib/", __FILE__)
+lib_folder = File.expand_path("../lib", __FILE__)
 
-require 'rubygems'
-require 'statsample'
-require 'hoe'
-require 'rdoc'
+require 'statsample/version'
+require 'rake'
+require 'rake/testtask'
+require 'rdoc/task'
+require 'bundler/gem_tasks'
 
-Hoe.plugin :git
-Hoe.plugin :doofus
-desc "Ruby Lint"
-task :lint do
-  executable=Config::CONFIG['RUBY_INSTALL_NAME']
-  Dir.glob("lib/**/*.rb") {|f|
-    if !system %{#{executable} -w -c "#{f}"}
-      puts "Error on: #{f}"
-    end
-  }
+# Setup the necessary gems, specified in the gemspec.
+require 'bundler'
+begin
+  Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+  $stderr.puts e.message
+  $stderr.puts "Run `bundle install` to install missing gems"
+  exit e.status_code
 end
 
-task :release do
-system %{git push origin master}
+Rake::TestTask.new do |t|
+  t.pattern = "test/test_*.rb"
 end
 
-task "clobber_docs" do
-  # Only to omit warnings
+RDoc::Task.new do |rdoc|
+  rdoc.main = "README.md"
+  rdoc.rdoc_files.include("README.md", "lib", "History.txt", "LICENSE.txt", "references.txt")
 end
 
 desc "Update pot/po files."
 task "gettext:updatepo" do
   require 'gettext/tools'
@@ -37,83 +36,9 @@
 desc "Create mo-files"
 task "gettext:makemo" do
   require 'gettext/tools'
   GetText.create_mofiles()
-  # GetText.create_mofiles(true, "po", "locale") # This is for "Ruby on Rails".
-end - -h=Hoe.spec('statsample') do - self.version=Statsample::VERSION - self.urls=["https://github.com/clbustos/statsample"] - #self.testlib=:minitest - self.readme_file = 'README.md' - self.urls = ['https://github.com/clbustos/statsample'] - self.developer('Claudio Bustos', 'clbustos@gmail.com') - self.extra_deps << ["spreadsheet","~>0.6"] << ["reportbuilder", "~>1.4"] << ["minimization", "~>0.2.0"] << ["fastercsv", ">0"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", ">0"] << ["rserve-client"] << ["rubyvis"] << ["distribution"] - - self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>3"] << ["minitest", "~>2"] << ["gettext", "~>0"] << ["mocha", "~>0"] << ["hoe-git", "~>0"] - - self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression" - self.post_install_message = <<-EOF -*************************************************** -Thanks for installing statsample. - -On *nix, you could install statsample-optimization -to retrieve gems gsl, statistics2 and a C extension -to speed some methods. - - $ sudo gem install statsample-optimization - -On Ubuntu, install build-essential and libgsl0-dev -using apt-get. Compile ruby 1.8 or 1.9 from -source code first. - - $ sudo apt-get install build-essential libgsl0-dev - - -***************************************************** - EOF - self.need_rdoc=false -end - -if Rake.const_defined?(:RDocTask) -Rake::RDocTask.new(:docs) do |rd| - rd.main = h.readme_file - rd.options << '-d' if (`which dot` =~ /\/dot/) unless - ENV['NODOT'] || Hoe::WINDOZE - rd.rdoc_dir = 'doc' - - rd.rdoc_files.include("lib/**/*.rb") - rd.rdoc_files += h.spec.extra_rdoc_files - rd.rdoc_files.reject! {|f| f=="Manifest.txt"} - title = h.spec.rdoc_options.grep(/^(-t|--title)=?$/).first - if title then - rd.options << title - - unless title =~ /\=/ then # for ['-t', 'title here'] - title_index = spec.rdoc_options.index(title) - rd.options << spec.rdoc_options[title_index + 1] - end - else - title = "#{h.name}-#{h.version} Documentation" - title = "#{h.rubyforge_name}'s " + title if h.rubyforge_name != h.name - rd.options << '--title' << title - end end +desc 'Run pry' +task :pry do |task| + sh "pry -r #{lib_folder}/statsample.rb" end - -desc 'Publish rdocs with analytics support' -task :publicar_docs => [:clean] do -# ruby %{agregar_adsense_a_doc.rb} - path = File.expand_path("./doc.yaml") - config = YAML.load(File.read(path)) - host = "#{config["user"]}@#{config["host"]}" - - remote_dir = config["dir"] - local_dir = h.local_rdoc_dir - Dir.glob(local_dir+"/**/*") {|file| - sh %{chmod 755 #{file}} - } - sh %{rsync #{h.rsync_args} #{local_dir}/ #{host}:#{remote_dir}} -end - -# vim: syntax=Ruby diff --git a/benchmarks/correlation_matrix_15_variables.rb b/benchmarks/correlation_matrix_15_variables.rb index 82f56eb..5e6a725 100644 --- a/benchmarks/correlation_matrix_15_variables.rb +++ b/benchmarks/correlation_matrix_15_variables.rb @@ -4,7 +4,6 @@ cases=250 vars=20 - name "gsl matrix based vs. 
manual ruby correlation matrix (#{vars} vars, #{cases} cases)" author 'Clbustos' date '2011-01-18' @@ -17,10 +16,12 @@ reps 200 #number of repetitions -ds=vars.times.inject({}) {|ac,v| -ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()} -ac -}.to_dataset +ds = Daru::DataFrame.new( + vars.times.inject({}) do |ac,v| + ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()} + ac + end +) measure "Statsample::Bivariate.correlation_matrix_optimized" do Statsample::Bivariate.correlation_matrix_optimized(ds) diff --git a/benchmarks/correlation_matrix_5_variables.rb b/benchmarks/correlation_matrix_5_variables.rb index e84f25c..418ebe4 100644 --- a/benchmarks/correlation_matrix_5_variables.rb +++ b/benchmarks/correlation_matrix_5_variables.rb @@ -17,11 +17,12 @@ reps 200 #number of repetitions - -ds=vars.times.inject({}) {|ac,v| -ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()} -ac -}.to_dataset +ds = Daru::DataFrame.new( + vars.times.inject({}) do |ac,v| + ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()} + ac + end +) measure "Statsample::Bivariate.correlation_matrix_optimized" do Statsample::Bivariate.correlation_matrix_optimized(ds) diff --git a/benchmarks/correlation_matrix_methods/correlation_matrix.rb b/benchmarks/correlation_matrix_methods/correlation_matrix.rb index 4f5f842..dfb6add 100644 --- a/benchmarks/correlation_matrix_methods/correlation_matrix.rb +++ b/benchmarks/correlation_matrix_methods/correlation_matrix.rb @@ -5,11 +5,13 @@ require 'benchmark' def create_dataset(vars,cases) - ran=Distribution::Normal.rng - ds=vars.times.inject({}) {|ac,v| - ac["x#{v}"]=Statsample::Vector.new_scale(cases) {ran.call} - ac - }.to_dataset + ran = Distribution::Normal.rng + ds = Daru::DataFrame.new( + vars.times.inject({}) do |ac,v| + ac["x#{v}".to_sym] = Daru::Vector.new_with_size(cases) {ran.call} + ac + end + ) end def prediction_pairwise(vars,cases) @@ -19,19 +21,17 @@ def prediction_optimized(vars,cases) Statsample::Bivariate.prediction_optimized(vars,cases) / 10 end - - if !File.exists?("correlation_matrix.ds") or File.mtime(__FILE__) > File.mtime("correlation_matrix.ds") reps=100 #number of repetitions ds_sizes=[5,10,30,50,100,150,200,500,1000] ds_vars=[3,4,5,10,20,30,40] #ds_sizes=[5,10] #ds_vars=[3,5,20] -rs=Statsample::Dataset.new(%w{cases vars time_optimized time_pairwise}) +rs = Daru::DataFrame.new({}, order: [:cases, :vars, :time_optimized, :time_pairwise]) ds_sizes.each do |cases| ds_vars.each do |vars| - ds=create_dataset(vars,cases) + ds = create_dataset(vars,cases) time_optimized= Benchmark.realtime do reps.times { Statsample::Bivariate.correlation_matrix_optimized(ds) @@ -40,36 +40,32 @@ def prediction_optimized(vars,cases) end time_pairwise= Benchmark.realtime do - reps.times { - Statsample::Bivariate.correlation_matrix_pairwise(ds) - } + reps.times { Statsample::Bivariate.correlation_matrix_pairwise(ds) } end puts "Cases:#{cases}, vars:#{vars} -> opt:%0.3f (%0.3f) | pair: %0.3f (%0.3f)" % [time_optimized, prediction_optimized(vars,cases), time_pairwise, prediction_pairwise(vars,cases)] - rs.add_case({'cases'=>cases,'vars'=>vars,'time_optimized'=>Math.sqrt(time_optimized*1000),'time_pairwise'=>Math.sqrt(time_pairwise*1000)}) + rs.add_row(Daru::Vector.new({ + :cases => cases, + :vars => vars, + :time_optimized => Math.sqrt(time_optimized*1000), + :time_pairwise =>Math.sqrt(time_pairwise*1000) + }) + ) end - end - + end else rs=Statsample.load("correlation_matrix.ds") end +rs[:c_v] = rs.collect {|row| row[:cases]*row[:vars]} -rs.fields.each {|f| 
rs[f].type=:scale} - -rs['c_v']=rs.collect {|row| row['cases']*row['vars']} - -rs.update_valid_data rs.save("correlation_matrix.ds") Statsample::Excel.write(rs,"correlation_matrix.xls") +rb = ReportBuilder.new(:name=>"Correlation matrix analysis") - -rb=ReportBuilder.new(:name=>"Correlation matrix analysis") - -rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_optimized','c_v']],'time_optimized', :digits=>6)) -rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_pairwise','c_v']],'time_pairwise', :digits=>6)) - +rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_optimized,:c_v],:time_optimized, :digits=>6)) +rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_pairwise,:c_v],:time_pairwise, :digits=>6)) rb.save_html("correlation_matrix.html") diff --git a/examples/boxplot.rb b/examples/boxplot.rb index 49feeab..ab91a0a 100644 --- a/examples/boxplot.rb +++ b/examples/boxplot.rb @@ -1,14 +1,26 @@ #!/usr/bin/ruby +# == Description +# +# This example illustrates how daru, combined with Statsample::Graph::Boxplot +# can be used for generating box plots of a normally distributed set of data. +# +# The 'rnorm' function, defined in statsample/shorthands generates a Daru::Vector +# object which contains the specified number of random variables in a normal distribution. +# It uses the 'distribution' gem for this purpose. +# +# Create a boxplot of the data by specifying the vectors a, b and c and providing +# necessary options to Statsample::Graph::Boxplot. The 'boxplot' function is shorthand +# for calling Statsample::Graph::Boxplot. $:.unshift(File.dirname(__FILE__)+'/../lib/') require 'statsample' Statsample::Analysis.store(Statsample::Graph::Boxplot) do - n=30 - a=rnorm(n-1,50,10) - b=rnorm(n, 30,5) - c=rnorm(n,5,1) + n = 30 + a = rnorm(n-1,50,10) + b = rnorm(n, 30,5) + c = rnorm(n,5,1) a.push(2) + boxplot(:vectors=>[a,b,c],:width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0) - end if __FILE__==$0 diff --git a/examples/chisquare_test.rb b/examples/chisquare_test.rb new file mode 100644 index 0000000..650753c --- /dev/null +++ b/examples/chisquare_test.rb @@ -0,0 +1,23 @@ +#!/usr/bin/ruby +$:.unshift(File.dirname(__FILE__)+'/../lib') +require 'statsample' + +Statsample::Analysis.store(Statsample::Test::ChiSquare) do + # Collect the two vectors with the categorical data (raw number of occurences) into one matrix. Here + #-------------------------------------------- + #| category | observation 1 | observation 2 | + #|------------------------------------------| + #| A | 100 | 20 | + #| B | 50 | 70 | + #| C | 30 | 100 | + #|------------------------------------------| + # + m=Matrix[[100, 50, 30],[20, 70, 100]] + x_2=Statsample::Test.chi_square(m) + # after the test is done, look at the p-value. + puts x_2.probability +end + +if __FILE__==$0 + Statsample::Analysis.run_batch +end diff --git a/examples/correlation_matrix.rb b/examples/correlation_matrix.rb index 844e859..1a8a77e 100644 --- a/examples/correlation_matrix.rb +++ b/examples/correlation_matrix.rb @@ -1,15 +1,28 @@ #!/usr/bin/ruby + +# == Description +# +# Creating and summarizing a correlation matrix with daru and statsample $:.unshift(File.dirname(__FILE__)+'/../lib/') -require 'statsample' +require 'statsample' Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do + # Create a Daru::DataFrame containing 4 vectors a, b, c and d. + # + # Notice that the `clone` option has been set to *false*. 
This tells Daru + # to not clone the Daru::Vectors being supplied by `rnorm`, since it would + # be unnecessarily counter productive to clone the vectors once they have + # been assigned to the dataframe. samples=1000 - ds=data_frame( - 'a'=>rnorm(samples), - 'b'=>rnorm(samples), - 'c'=>rnorm(samples), - 'd'=>rnorm(samples)) - cm=cor(ds) + ds = Daru::DataFrame.new({ + :a => rnorm(samples), + :b => rnorm(samples), + :c => rnorm(samples), + :d => rnorm(samples) + }, clone: false) + + # Calculate correlation matrix by calling the `cor` shorthand. + cm = cor(ds) summary(cm) end diff --git a/examples/dataset.rb b/examples/dataset.rb index b993ddc..b7622de 100644 --- a/examples/dataset.rb +++ b/examples/dataset.rb @@ -1,13 +1,26 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') +# == Description +# +# This example demonstrates creation of basic Vectors and DataFrames. require 'statsample' -Statsample::Analysis.store(Statsample::Dataset) do - samples=1000 - a=Statsample::Vector.new_scale(samples) {r=rand(5); r==4 ? nil: r} - b=Statsample::Vector.new_scale(samples) {r=rand(5); r==4 ? nil: r} +Statsample::Analysis.store(Daru::DataFrame) do + samples = 1000 - ds={'a'=>a,'b'=>b}.to_dataset + # The 'new_with_size' function lets you specify the size of the + # vector as the argument and the block specifies how each element + # of the vector will be created. + a = Daru::Vector.new_with_size(samples) {r=rand(5); r==4 ? nil: r} + b = Daru::Vector.new_with_size(samples) {r=rand(5); r==4 ? nil: r} + + # Pass the Daru::Vector objects in a Hash to the DataFrame constructor + # to make a DataFrame. + # + # The *order* option lets you specify the way the vectors in the Hash + # will be ordered. Not specifyin this will order vectors in alphabetical + # order by default. 
+ ds = Daru::DataFrame.new({:a=>a,:b=>b}, order: [:b, :a]) summary(ds) end diff --git a/examples/dominance_analysis.rb b/examples/dominance_analysis.rb index a832a8e..1208343 100644 --- a/examples/dominance_analysis.rb +++ b/examples/dominance_analysis.rb @@ -1,9 +1,10 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') +# == Description +# +# Dominance Analysis with statsample require 'statsample' - - Statsample::Analysis.store(Statsample::DominanceAnalysis) do sample=300 a=rnorm(sample) @@ -11,17 +12,17 @@ c=rnorm(sample) d=rnorm(sample) - ds={'a'=>a,'b'=>b,'cc'=>c,'d'=>d}.to_dataset + ds = Daru::DataFrame.new({:a => a,:b => b,:cc => c,:d => d}, clone: false) attach(ds) - ds['y']=a*5+b*3+cc*2+d+rnorm(300) + ds[:y]=a*5 + b*3 + cc*2 + d + rnorm(300) cm=cor(ds) summary(cm) - lr=lr(ds,'y') + lr=lr(ds,:y) summary(lr) - da=dominance_analysis(ds,'y') + da=dominance_analysis(ds,:y) summary(da) - da=dominance_analysis(ds,'y',:name=>"Dominance Analysis using group of predictors", :predictors=>['a', 'b', %w{cc d}]) + da = dominance_analysis(ds,:y,:name=>"Dominance Analysis using group of predictors", :predictors=>[:a, :b, [:cc, :d]]) summary(da) end diff --git a/examples/dominance_analysis_bootstrap.rb b/examples/dominance_analysis_bootstrap.rb index 6735e9f..c15efdc 100644 --- a/examples/dominance_analysis_bootstrap.rb +++ b/examples/dominance_analysis_bootstrap.rb @@ -3,27 +3,26 @@ require 'statsample' Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do - sample=300 a=rnorm(sample) b=rnorm(sample) c=rnorm(sample) d=rnorm(sample) - a.name="a" - b.name="b" - c.name="c" - d.name="d" + a.rename :a + b.rename :b + c.rename :c + d.rename :d - ds={'a'=>a,'b'=>b,'cc'=>c,'d'=>d}.to_dataset + ds = Daru::DataFrame.new({:a => a,:b => b,:cc => c,:d => d}) attach(ds) - ds['y1']=a*5+b*2+cc*2+d*2+rnorm(sample,0,10) - ds['y2']=a*10+rnorm(sample) + ds[:y1] = a*5 + b*2 + cc*2 + d*2 + rnorm(sample,0,10) + ds[:y2] = a*10 + rnorm(sample) - dab=dominance_analysis_bootstrap(ds, ['y1','y2'], :debug=>true) + dab=dominance_analysis_bootstrap(ds, [:y1,:y2], :debug=>true) dab.bootstrap(100,nil) summary(dab) - ds2=ds['a'..'y1'] - dab2=dominance_analysis_bootstrap(ds2, 'y1', :debug=>true) + ds2=ds[:a..:y1] + dab2=dominance_analysis_bootstrap(ds2, :y1, :debug=>true) dab2.bootstrap(100,nil) summary(dab2) end diff --git a/examples/histogram.rb b/examples/histogram.rb index ec36e1f..772c69f 100644 --- a/examples/histogram.rb +++ b/examples/histogram.rb @@ -1,12 +1,26 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') + +# == Description +# +# This example demonstrates how a histogram can be created +# with statsample. +# +# The 'histogram' function creates a histogram using the +# Statsample::Graph::Histogram class. This class accepts data +# in a Daru::Vector (as created by `rnorm`). +# +# A line showing the normal distribution can be drawn by setting +# the `:line_normal_distribution` option to *true*.
+# +# See this notebook for an illustration: +# http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/statistics/Creating%20a%20Histogram.ipynb require 'statsample' Statsample::Analysis.store(Statsample::Graph::Histogram) do - histogram(rnorm(3000,0,20)) + histogram(rnorm(3000,0,20), :line_normal_distribution => true) end - if __FILE__==$0 Statsample::Analysis.run end diff --git a/examples/icc.rb b/examples/icc.rb index b563ae4..1ef3b38 100644 --- a/examples/icc.rb +++ b/examples/icc.rb @@ -6,18 +6,17 @@ Statsample::Analysis.store(Statsample::Reliability::ICC) do size=1000 - a=Statsample::Vector.new_scale(size) {rand(10)} - b=a.recode{|i|i+rand(4)-2} - c=a.recode{|i|i+rand(4)-2} - d=a.recode{|i|i+rand(4)-2} - @ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset + a = Daru::Vector.new_with_size(size) {rand(10)} + b = a.recode{|i|i+rand(4)-2} + c = a.recode{|i|i+rand(4)-2} + d = a.recode{|i|i+rand(4)-2} + @ds = Daru::DataFrame.new({:a => a,:b => b,:c => c,:d => d}) @icc=Statsample::Reliability::ICC.new(@ds) summary(@icc) @icc.type=:icc_3_1 summary(@icc) @icc.type=:icc_a_k summary(@icc) - end if __FILE__==$0 diff --git a/examples/levene.rb b/examples/levene.rb index 8529ee2..fe75e78 100644 --- a/examples/levene.rb +++ b/examples/levene.rb @@ -1,15 +1,29 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') +# == Description +# +# This example demonstrates how a Levene test can be performed +# using Daru::Vector and the Statsample::Test::Levene class. +# +# Levene's test is an inferential statistic used to assess the +# equality of variances for a variable calculated for two or more groups. +# +# == References +# +# http://en.wikipedia.org/wiki/Levene%27s_test require 'statsample' Statsample::Analysis.store(Statsample::Test::Levene) do - a=[1,2,3,4,5,6,7,8,100,10].to_scale - b=[30,40,50,60,70,80,90,100,110,120].to_scale + a = Daru::Vector.new([1,2,3,4,5,6,7,8,100,10]) + b = Daru::Vector.new([30,40,50,60,70,80,90,100,110,120]) + + # The 'levene' function is used as a shorthand + # for creating a Statsample::Test::Levene object. summary(levene([a,b])) end if __FILE__==$0 - Statsample::Analysis.run_batch + Statsample::Analysis.run_batch end diff --git a/examples/multiple_regression.rb b/examples/multiple_regression.rb index 371be4b..4ae1277 100644 --- a/examples/multiple_regression.rb +++ b/examples/multiple_regression.rb @@ -1,15 +1,18 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') +# == Description +# +# This example shows how multiple regression can be performed using statsample and daru. require 'statsample' Statsample::Analysis.store(Statsample::Regression::Multiple) do samples=2000 - ds=dataset('a'=>rnorm(samples),'b'=>rnorm(samples),'cc'=>rnorm(samples),'d'=>rnorm(samples)) + ds=dataset(:a => rnorm(samples),:b => rnorm(samples),:cc => rnorm(samples),:d => rnorm(samples)) attach(ds) - ds['y']=a*5+b*3+cc*2+d+rnorm(samples) - summary lr(ds,'y') + ds[:y] = a*5+b*3+cc*2+d+rnorm(samples) + summary lr(ds,:y) end if __FILE__==$0 diff --git a/examples/parallel_analysis.rb b/examples/parallel_analysis.rb index 0684bda..1020ff0 100644 --- a/examples/parallel_analysis.rb +++ b/examples/parallel_analysis.rb @@ -1,6 +1,11 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') +# == Description +# +# This example explains how a parallel analysis can be performed on a PCA. +# Parallel analysis helps determine how many components should be retained +# from the PCA.
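+# +# In outline, each eigenvalue of the observed correlation matrix is compared +# with the same-ranked eigenvalue obtained from random data, and a component +# is retained only while the observed eigenvalue is the larger. A rough +# sketch of that rule (illustrative names, not part of the statsample API): +# +# keep = observed_eigenvalues.zip(random_eigenvalues) +# .take_while { |obs, ran| obs > ran }.size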
require 'statsample' samples=150 variables=30 @@ -8,18 +13,18 @@ Statsample::Analysis.store(Statsample::Factor::ParallelAnalysis) do rng = Distribution::Normal.rng() -f1=rnorm(samples) -f2=rnorm(samples) -f3=rnorm(samples) +f1 = rnorm(samples) +f2 = rnorm(samples) +f3 = rnorm(samples) vectors={} variables.times do |i| - vectors["v#{i}"]=samples.times.collect {|nv| f1[nv]*i+(f2[nv]*(15-i))+((f3[nv]*(30-i))*1.5)*rng.call}.to_scale - vectors["v#{i}"].name="Vector #{i}" + vectors["v#{i}".to_sym] = Daru::Vector.new(samples.times.collect {|nv| f1[nv]*i+(f2[nv]*(15-i))+((f3[nv]*(30-i))*1.5)*rng.call}) + vectors["v#{i}".to_sym].rename "Vector #{i}" end - ds=vectors.to_dataset + ds = Daru::DataFrame.new(vectors) pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>iterations, :debug=>true) pca=pca(cor(ds)) diff --git a/examples/polychoric.rb b/examples/polychoric.rb index ca99e7c..dec2c3f 100644 --- a/examples/polychoric.rb +++ b/examples/polychoric.rb @@ -1,26 +1,39 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') -$:.unshift("/home/cdx/usr/lib/statsample-bivariate-extension/lib/") +# == Description +# Polychoric correlation using the two-step and joint methods +# +# Polychoric correlation in statsample requires installation of +# the [statsample-bivariate-extension](https://rubygems.org/gems/statsample-bivariate-extension) +# gem. This gem extends the Statsample::Bivariate class with useful +# algorithms for polychoric and tetrachoric correlation. +# +# Statsample will automatically detect the presence of polychoric/tetrachoric +# algorithms, so there is no need to explicitly require the gem. +# +# In this example we'll see how polychoric correlation can be +# performed using statsample. require 'statsample' Statsample::Analysis.store(Statsample::Bivariate::Polychoric) do -ct=Matrix[[rand(10)+50, rand(10)+50, rand(10)+1], - [rand(20)+5, rand(50)+4, rand(10)+1], - [rand(8)+1, rand(12)+1, rand(10)+1]] + ct=Matrix[[rand(10)+50, rand(10)+50, rand(10)+1], + [rand(20)+5, rand(50)+4, rand(10)+1], + [rand(8)+1, rand(12)+1, rand(10)+1]] -# Estimation of polychoric correlation using two-step (default) -poly=polychoric(ct, :name=>"Polychoric with two-step", :debug=>false) -summary poly + # Estimation of polychoric correlation using two-step (default) + poly=polychoric(ct, :name=>"Polychoric with two-step", :debug=>false) + summary poly -# Estimation of polychoric correlation using joint method (slow) -poly=polychoric(ct, :method=>:joint, :name=>"Polychoric with joint") -summary poly + # Estimation of polychoric correlation using joint method (slow) + poly=polychoric(ct, :method=>:joint, :name=>"Polychoric with joint") + summary poly -# Uses polychoric series (not recommended) -poly=polychoric(ct, :method=>:polychoric_series, :name=>"Polychoric with polychoric series") -summary poly + # Uses polychoric series (not recommended) + + poly=polychoric(ct, :method=>:polychoric_series, :name=>"Polychoric with polychoric series") + summary poly end + if __FILE__==$0 Statsample::Analysis.run_batch end diff --git a/examples/principal_axis.rb b/examples/principal_axis.rb index 75ae6a0..0e25b04 100644 --- a/examples/principal_axis.rb +++ b/examples/principal_axis.rb @@ -1,16 +1,20 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') - +# Principal Axis Analysis +# +# Here we use the Statsample::Factor::PrincipalAxis class +# for principal axis analysis of a correlation or covariance matrix.
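+# +# In outline, principal axis factoring replaces the diagonal of the +# correlation matrix with communality estimates (squared multiple +# correlations when :smc is true) and re-extracts factors from the reduced +# matrix until the communalities stabilize. A rough sketch of that loop, +# with illustrative helper names only: +# +# h2 = initial_communalities(matrix) +# until converged?(h2) +# reduced = with_diagonal(matrix, h2) +# h2 = communalities(extract_factors(reduced, m)) +# end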
require 'statsample' Statsample::Analysis.store(Statsample::Factor::PrincipalAxis) do matrix=Matrix[ - [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]] + [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], + [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], + [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], + [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]] matrix.extend Statsample::CovariateMatrix - - #matrix.fields=%w{a b c d} fa=principal_axis(matrix,:m=>1,:smc=>false) summary fa diff --git a/examples/reliability.rb b/examples/reliability.rb index 27d7e25..3667c16 100644 --- a/examples/reliability.rb +++ b/examples/reliability.rb @@ -1,27 +1,26 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib') -require 'statsample' -Statsample::Analysis.store(Statsample::Reliability) do - +# == Description +# +# Reliability Scale Analysis with statsample +require 'statsample' +Statsample::Analysis.store(Statsample::Reliability) do samples=100 a=rnorm(samples) - ds=Statsample::Dataset.new + ds = Daru::DataFrame.new({}) 20.times do |i| - ds["v#{i}"]=a+rnorm(samples,0,0.2) + ds["v#{i}".to_sym]= a + rnorm(samples,0,0.2) end - ds.update_valid_data - rel=Statsample::Reliability::ScaleAnalysis.new(ds) summary rel - ms=Statsample::Reliability::MultiScaleAnalysis.new(:name=>"Multi Scale analysis") do |m| - m.scale "Scale 1", ds.clone(%w{v1 v2 v3 v4 v5 v6 v7 v8 v9 v10}) - m.scale "Scale 2", ds.clone(%w{v11 v12 v13 v14 v15 v16 v17 v18 v19}) + m.scale "Scale 1", ds.clone([:v1, :v2, :v3, :v4, :v5, :v6, :v7, :v8, :v9, :v10]) + m.scale "Scale 2", ds.clone([:v11, :v12, :v13, :v14, :v15, :v16, :v17, :v18, :v19]) end summary ms @@ -30,4 +29,3 @@ if __FILE__==$0 Statsample::Analysis.run_batch end - diff --git a/examples/scatterplot.rb b/examples/scatterplot.rb index f238c5f..72a0c5f 100644 --- a/examples/scatterplot.rb +++ b/examples/scatterplot.rb @@ -2,6 +2,14 @@ $:.unshift(File.dirname(__FILE__)+'/../lib/') $:.unshift('/home/cdx/dev/reportbuilder/lib/') +# == Description +# +# Creating a scatterplot with statsample's Statsample::Graph::Scatterplot class. +# +# In this example we'll demonstrate how a normally distributed Daru::Vector can +# be created using the daru and distribution gems, and how the values generated +# can be plotted very easily using the 'scatterplot' shorthand and supplying X +# and Y coordinates. require 'benchmark' require 'statsample' n=100 diff --git a/examples/t_test.rb b/examples/t_test.rb index ab1abf0..0a44cd9 100644 --- a/examples/t_test.rb +++ b/examples/t_test.rb @@ -1,5 +1,12 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib') +# == Description +# +# This example illustrates how a t-test can be performed and summarized with statsample +# +# == References +# +# http://en.wikipedia.org/wiki/Student%27s_t-test require 'statsample' Statsample::Analysis.store(Statsample::Test::T) do diff --git a/examples/u_test.rb b/examples/u_test.rb index d5ae14f..00d345d 100644 --- a/examples/u_test.rb +++ b/examples/u_test.rb @@ -1,11 +1,19 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib') + +# == Description +# +# Example illustrating the Mann-Whitney U test with statsample.
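+# +# In textbook form (a sketch of the standard definition, not statsample's +# internals): with sample sizes n1 and n2, and r1 the rank sum of the first +# sample in the pooled ranking, +# +# u1 = r1 - n1 * (n1 + 1) / 2 +# u2 = n1 * n2 - u1 +# u = [u1, u2].min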
+# +# == References +# +# http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test require 'statsample' Statsample::Analysis.store(Statsample::Test::UMannWhitney) do - a=10.times.map {rand(100)}.to_scale - b=20.times.map {(rand(20))**2+50}.to_scale + a = Daru::Vector.new(10.times.map {rand(100)}) + b = Daru::Vector.new(20.times.map {(rand(20))**2+50}) u=Statsample::Test::UMannWhitney.new(a,b) summary u diff --git a/examples/vector.rb b/examples/vector.rb index f64e62b..964f870 100644 --- a/examples/vector.rb +++ b/examples/vector.rb @@ -1,15 +1,18 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') - +# == Description +# +# This example provides a small sneak peek into creating a Daru::Vector. +# For details on using Daru::Vector (with examples of math, statistics and plotting) +# see the notebook at this link: +# http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20Vector.ipynb require 'statsample' -Statsample::Analysis.store(Statsample::Vector) do - - a=Statsample::Vector.new_scale(1000) {r=rand(5); r==4 ? nil: r;} +Statsample::Analysis.store(Daru::Vector) do + a = Daru::Vector.new_with_size(1000) {r=rand(5); r==4 ? nil: r;} summary a - b=c(1,2,3,4,6..10) + b = Daru::Vector[1,2,3,4,6..10] summary b - end if __FILE__==$0 diff --git a/examples/velicer_map_test.rb b/examples/velicer_map_test.rb index 8ec3ed4..5a114d7 100644 --- a/examples/velicer_map_test.rb +++ b/examples/velicer_map_test.rb @@ -1,5 +1,8 @@ #!/usr/bin/ruby $:.unshift(File.dirname(__FILE__)+'/../lib/') +# == Description +# +# Velicer MAP test. require 'statsample' @@ -15,17 +18,18 @@ vectors={} variables.times do |i| - vectors["v#{i}"]=samples.times.collect {|nv| - if i<5 - f1[nv]*5 + f2[nv] *2 +rng.call - else - f1[nv]*2 + f2[nv] *3 +rng.call - end - }.to_scale + vectors["v#{i}".to_sym]= Daru::Vector.new( + samples.times.collect do |nv| + if i<5 + f1[nv]*5 + f2[nv] *2 +rng.call + else + f1[nv]*2 + f2[nv] *3 +rng.call + end + end) end - ds=vectors.to_dataset + ds = Daru::DataFrame.new(vectors) cor=cor(ds) pca=pca(cor) diff --git a/lib/spss.rb b/lib/spss.rb index 3c60dd3..50a2dca 100644 --- a/lib/spss.rb +++ b/lib/spss.rb @@ -1,4 +1,4 @@ -# = spss.rb - +# = spss.rb - # # Provides utilities for working with SPSS files # @@ -12,40 +12,43 @@ class Element def add(a) @elements.push(a) end - def parse_elements(func=:to_s) - @elements.collect{|e| " "+e.send(func)}.join("\n") + + def parse_elements(func = :to_s) + @elements.collect{ |e| " "+e.send(func) }.join("\n") end + def init_with config - config.each {|key,value| - self.send(key.to_s+"=",value) if methods.include? key.to_s - } + config.each do |key, value| + self.send(key.to_s + "=", value) if methods.include?
key.to_s + end end - def initialize(config={}) - @config=config - @elements=[] + + def initialize(config = {}) + @config = config + @elements = [] end end class Dictionary < Element attr_accessor :locale, :date_time, :row_count - def initialize(config={}) + def initialize(config = {}) super init_with ({ - :locale=>"en_US", + :locale=>"en_US", :date_time=>Time.new().strftime("%Y-%m-%dT%H:%M:%S"), :row_count=>1 }) init_with config end - + def to_xml "\n"+parse_elements(:to_xml)+"\n" - + end def to_spss parse_elements(:to_spss) end end - + class MissingValue < Element attr_accessor :data, :type, :from, :to def initialize(data,type=nil) diff --git a/lib/statsample.rb b/lib/statsample.rb index 30b4608..1352a54 100644 --- a/lib/statsample.rb +++ b/lib/statsample.rb @@ -1,4 +1,4 @@ -# = statsample.rb - +# = statsample.rb - # Statsample - Statistics package for Ruby # Copyright (C) 2008-2014 Claudio Bustos # @@ -17,17 +17,18 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. # - -#$:.unshift(File.dirname(__FILE__)) require 'matrix' require 'extendmatrix' require 'distribution' require 'dirty-memoize' require 'reportbuilder' - +require 'daru' +require 'statsample/daru' class Numeric - def square ; self * self ; end + def square + self * self + end end class String @@ -41,10 +42,10 @@ def is_number? end class Module - def include_aliasing(m, suffix="ruby") + def include_aliasing(m, suffix = 'ruby') m.instance_methods.each do |f| if instance_methods.include? f - alias_method("#{f}_#{suffix}",f) + alias_method("#{f}_#{suffix}", f) remove_method f end end @@ -53,78 +54,83 @@ def include_aliasing(m, suffix = 'ruby') end class Array - # Recode repeated values on an array, adding the number of repetition - # at the end - # Example: - # a=%w{a b c c d d d e} - # a.recode_repeated - # => ["a","b","c_1","c_2","d_1","d_2","d_3","e"] - def recode_repeated - if self.size!=self.uniq.size - # Find repeated - repeated=self.inject({}) {|a,v| - (a[v].nil? ? a[v]=1 : a[v]+=1); a }.find_all{|k,v| v>1}.collect{|k,v| k} - ns=repeated.inject({}) {|a,v| a[v]=0;a} - self.collect do |f| - if repeated.include? f - ns[f]+=1 - sprintf("%s_%d",f,ns[f]) - else - f - end - end - else - self + unless method_defined?(:sum) + def sum + inject(:+) end end + + def mean + sum.fdiv(size) + end + + # Calculate sum of squares + def sum_of_squares(m=nil) + m ||= mean + inject(0) {|a,x| a + (x-m).square } + end + + # Calculate sample variance + def variance_sample(m=nil) + m ||= mean + sum_of_squares(m).quo(size - 1) + end + + # Calculate sample standard deviation + def sd(m = nil) + m ||= mean + Math::sqrt(variance_sample(m)) + end end -def create_test(*args,&proc) - description=args.shift - fields=args +def create_test(*args, &_proc) + description = args.shift + fields = args [description, fields, Proc.new] end + #-- # Test extensions begin require 'gettext' rescue LoadError def bindtextdomain(d) #:nodoc: - d + d end - + # Bored module module GetText #:nodoc: - def _(t) - t + def _(t) + t end end end + # Library for statistical analysis on Ruby # # * Classes for manipulation and storage of data: # * Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Includes methods to create correlation and covariance matrices # * Multiple types of regression on Statsample::Regression # * Factorial Analysis algorithms on Statsample::Factor module. -# * Dominance Analysis.
Based on Budescu and Azen papers.link[http://psycnet.apa.org/journals/met/8/2/129/]. +# * Dominance Analysis, based on Budescu and Azen's papers (link[http://psycnet.apa.org/journals/met/8/2/129/]). # * Module Statsample::Codification, to help codify open questions # * Converters to import and export data from databases, csv and excel files. # * Module Statsample::Crosstab provides functions to create crosstabs for categorical data # * Reliability analysis provides functions to analyze scales. # * Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several types of samples -# * Interfaces to gdchart, gnuplot and SVG::Graph +# * Interfaces to gdchart, gnuplot and SVG::Graph # module Statsample - def self.create_has_library(library) define_singleton_method("has_#{library}?") do - cv="@@#{library}" - if !class_variable_defined? cv - begin + cv = "@@#{library}" + unless class_variable_defined? cv + begin + gem library.to_s # activate gem require library.to_s - class_variable_set(cv,true) + class_variable_set(cv, true) rescue LoadError - class_variable_set(cv,false) + class_variable_set(cv, false) end end class_variable_get(cv) @@ -132,8 +138,8 @@ def self.create_has_library(library) end create_has_library :gsl - - SPLIT_TOKEN = "," + + SPLIT_TOKEN = ',' autoload(:Analysis, 'statsample/analysis') autoload(:Database, 'statsample/converters') autoload(:Anova, 'statsample/anova') @@ -154,133 +160,123 @@ def self.create_has_library(library) autoload(:Multivariate, 'statsample/multivariate') autoload(:Multiset, 'statsample/multiset') autoload(:StratifiedSample, 'statsample/multiset') - autoload(:MLE, 'statsample/mle') + autoload(:MLE, 'statsample/mle') autoload(:Regression, 'statsample/regression') + autoload(:FitModel, 'statsample/formula/fit_model') autoload(:Test, 'statsample/test') autoload(:Factor, 'statsample/factor') autoload(:Graph, 'statsample/graph') - - + class << self # Load an object saved on a file. def load(filename) if File.exist? filename - o=false - File.open(filename,"r") {|fp| o=Marshal.load(fp) } + o = false + File.open(filename, 'r') { |fp| o = Marshal.load(fp) } o else false end end - - - + # Create a matrix using vectors as columns. # Use: # # matrix=Statsample.vector_cols_matrix(v1,v2) def vector_cols_matrix(*vs) # test - size=vs[0].size - vs.each{|v| - raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector - raise ArgumentError,"Vectors size should be the same" if v.size!=size - } - Matrix.rows((0...size).to_a.collect() {|i| - vs.collect{|v| v[i]} - }) + size = vs[0].size + + vs.each do |v| + fail ArgumentError, 'Arguments should be Vector' unless v.instance_of? Daru::Vector + fail ArgumentError, 'Vectors size should be the same' if v.size != size + end + + Matrix.rows((0...size).to_a.collect { |i| vs.collect { |v| v[i] } }) end + # Returns a duplicate of the input vectors, without missing data + # for any of the vectors.
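+ # + # (Deletion is listwise: a row is dropped from every returned vector + # when any input vector is missing at that position, as the rewritten + # example below shows.)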
- # - # a=[1,2,3,6,7,nil,3,5].to_scale - # b=[nil,nil,5,6,4,5,10,2].to_scale - # c=[2,4,6,7,4,5,6,7].to_scale + # + # a = Daru::Vector.new([1,2,3,6,7,nil,3,5]) + # b = Daru::Vector.new([nil,nil,5,6,4,5,10,2]) + # c = Daru::Vector.new([2,4,6,7,4,5,6,7]) # a2,b2,c2=Statsample.only_valid(a,b,c) - # => [#<Statsample::Vector ...>, - # #<Statsample::Vector ...>, - # #<Statsample::Vector ...>] + # => [#<Daru::Vector ...>, + # #<Daru::Vector ...>, + # #<Daru::Vector ...>] # def only_valid(*vs) - i=1 - h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a} - ds=Statsample::Dataset.new(h).dup_only_valid - ds.vectors.values + i = 1 + h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc } + df = Daru::DataFrame.new(h).reject_values(*Daru::MISSING_VALUES) + df.map { |v| v } end - - # Cheap version of #only_valid. + + # Cheap version of #only_valid. # If any vector has missing values, return only valid data. # If not, return the vectors themselves def only_valid_clone(*vs) - if vs.any? {|v| v.flawed?} + if vs.any? { |v| v.include_values?(*Daru::MISSING_VALUES) } only_valid(*vs) else vs end end - end - - - - + end + module Util # Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/normprpl.htm - def normal_order_statistic_medians(i,n) - if i==1 - u= 1.0 - normal_order_statistic_medians(n,n) - elsif i==n - u=0.5**(1 / n.to_f) + def normal_order_statistic_medians(i, n) + if i == 1 + u = 1.0 - normal_order_statistic_medians(n, n) + elsif i == n + u = 0.5**(1 / n.to_f) else - u= (i - 0.3175) / (n + 0.365) + u = (i - 0.3175) / (n + 0.365) end u end - - def self.nice(s,e) # :nodoc: - reverse = etrue).add(self).send(method) + bindtextdomain('statsample') + def summary(method = :to_text) + ReportBuilder.new(no_title: true).add(self).send(method) end end module STATSAMPLE__ #:nodoc: end end - - #-- -begin +begin require 'statsamplert' rescue LoadError module Statsample - OPTIMIZED=false + OPTIMIZED = false end end diff --git a/lib/statsample/analysis/suite.rb b/lib/statsample/analysis/suite.rb index f4d97c4..49b4677 100644 --- a/lib/statsample/analysis/suite.rb +++ b/lib/statsample/analysis/suite.rb @@ -80,7 +80,7 @@ def scatterplot(*args) def method_missing(name, *args,&block) @attached.reverse.each do |ds| - return ds[name.to_s] if ds.fields.include? (name.to_s) + return ds[name] if ds.vectors.to_a.include? (name) end raise "Method #{name} doesn't exist" end diff --git a/lib/statsample/anova/oneway.rb b/lib/statsample/anova/oneway.rb index e0c20a5..a2d5bca 100644 --- a/lib/statsample/anova/oneway.rb +++ b/lib/statsample/anova/oneway.rb @@ -67,9 +67,9 @@ def report_building_table(builder) #:nodoc: # One Way Anova with vectors # Example: - # v1=[2,3,4,5,6].to_scale - # v2=[3,3,4,5,6].to_scale - # v3=[5,3,1,5,6].to_scale + # v1 = Daru::Vector.new([2,3,4,5,6]) + # v2 = Daru::Vector.new([3,3,4,5,6]) + # v3 = Daru::Vector.new([5,3,1,5,6]) # anova=Statsample::Anova::OneWayWithVectors.new([v1,v2,v3]) # anova.f # => 0.0243902439024391 @@ -90,10 +90,10 @@ class OneWayWithVectors < OneWay def initialize(*args) if args[0].is_a? Array - @vectors=args.shift + @vectors = args.shift else - @vectors=args.find_all {|v| v.is_a? Statsample::Vector} - opts=args.find {|v| v.is_a? Hash} + @vectors = args.find_all {|v| v.is_a? Daru::Vector} + opts = args.find {|v| v.is_a?
Hash} end opts||=Hash.new opts_default={:name=>_("Anova One-Way"), @@ -164,7 +164,7 @@ def report_building(builder) # :nodoc: if summary_descriptives s.table(:name=>_("Descriptives"),:header=>%w{Name N Mean SD Min Max}.map {|v| _(v)}) do |t| @vectors.each do |v| - t.row [v.name, v.n_valid, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max] + t.row [v.name, v.reject_values(*Daru::MISSING_VALUES).size, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max] end end end diff --git a/lib/statsample/anova/twoway.rb b/lib/statsample/anova/twoway.rb index f623e6c..49dae07 100644 --- a/lib/statsample/anova/twoway.rb +++ b/lib/statsample/anova/twoway.rb @@ -107,9 +107,9 @@ def report_building_table(builder) #:nodoc: # Two Way Anova with vectors # Example: - # v1=[1,1,2,2].to_scale - # v2=[1,2,1,2].to_scale - # v3=[5,3,1,5].to_scale + # v1 = Daru::Vector.new([1,1,2,2]) + # v2 = Daru::Vector.new([1,2,1,2]) + # v3 = Daru::Vector.new([5,3,1,5]) # anova=Statsample::Anova::TwoWayWithVectors.new(:a=>v1,:b=>v2, :dependent=>v3) # class TwoWayWithVectors < TwoWay @@ -121,25 +121,26 @@ class TwoWayWithVectors < TwoWay # For now, only equal sample cells allowed def initialize(opts=Hash.new) raise "You should insert at least :a, :b and :dependent" unless [:a, :b, :dependent].all? {|v| opts.has_key? v} - @a_var='a' - @b_var='b' - @dep_var='dependent' - @a_vector, @b_vector, @dep_vector=Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent] + @a_var = :a + @b_var = :b + @dep_var = :dependent + @a_vector, @b_vector, @dep_vector = + Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent] - ds={@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector}.to_dataset - @ds=ds.clone_only_valid - _p=@a_vector.factors.size - _q=@b_vector.factors.size - @x_general=@dep_vector.mean - @axb_means={} - @axb_sd={} - @vectors=[] + ds = Daru::DataFrame.new({@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector}) + @ds = ds.clone_only_valid + _p = @a_vector.factors.size + _q = @b_vector.factors.size + @x_general = @dep_vector.mean + @axb_means = {} + @axb_sd = {} + @vectors = [] n=nil @ds.to_multiset_by_split(a_var,b_var).each_vector(dep_var) {|k,v| - @axb_means[k]=v.mean - @axb_sd[k]=v.sd + @axb_means[k] = v.mean + @axb_sd[k] = v.sd @vectors << v - n||=v.size + n ||= v.size raise "All cell sizes should be equal" if n!=v.size } @@ -151,20 +152,21 @@ def initialize(opts=Hash.new) @ds.to_multiset_by_split(b_var).each_vector(dep_var) {|k,v| @b_means[k]=v.mean } - ss_a=n*_q*@ds[a_var].factors.inject(0) {|ac,v| - ac+(@a_means[v]-@x_general)**2 + ss_a = n*_q*@ds[a_var].factors.inject(0) {|ac,v| + ac + (@a_means[v]-@x_general)**2 } ss_b=n*_p*@ds[b_var].factors.inject(0) {|ac,v| ac+(@b_means[v]-@x_general)**2 } - ss_within=@ds.collect {|row| + ss_within = @ds.collect(:row) { |row| (row[dep_var]-@axb_means[[row[a_var],row[b_var]]])**2 }.sum - ss_axb=n*@axb_means.inject(0) {|ac,v| + ss_axb = n*@axb_means.inject(0) {|ac,v| j,k=v[0] xjk=v[1] ac+(xjk-@a_means[j]-@b_means[k]+@x_general)**2 } + df_a=_p-1 df_b=_q-1 df_within=(_p*_q)*(n-1) @@ -186,9 +188,9 @@ def levene def report_building(builder) #:nodoc:# builder.section(:name=>@name) do |s| if summary_descriptives - s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].labeling(a)}+[_("%s Mean") % @name_b]) do |t| + s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].index_of(a)}+[_("%s Mean") % @name_b]) do |t| @ds[b_var].factors.each do |b| - t.row([@ds[b_var].labeling(b)]+@ds[a_var].factors.map {|a| "%0.3f" % 
@axb_means[[a,b]] } + ["%0.3f" % @b_means[b]]) + t.row([@ds[b_var].index_of(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]]) end t.row([_("%s Mean") % @name_a]+@ds[a_var].factors.map {|a| "%0.3f" % @a_means[a]}+ ["%0.3f" % @x_general]) end diff --git a/lib/statsample/bivariate.rb b/lib/statsample/bivariate.rb index d24e5ff..3ba1150 100644 --- a/lib/statsample/bivariate.rb +++ b/lib/statsample/bivariate.rb @@ -12,9 +12,10 @@ class << self # Covariance between two vectors def covariance(v1,v2) v1a,v2a=Statsample.only_valid_clone(v1,v2) + return nil if v1a.size==0 if Statsample.has_gsl? - GSL::Stats::covariance(v1a.gsl, v2a.gsl) + GSL::Stats::covariance(v1a.to_gsl, v2a.to_gsl) else covariance_slow(v1a,v2a) end @@ -34,7 +35,9 @@ def covariance_slow(v1,v2) # :nodoc: sum_of_squares(v1a,v2a) / (v1a.size-1) end def sum_of_squares(v1,v2) - v1a,v2a=Statsample.only_valid_clone(v1,v2) + v1a,v2a=Statsample.only_valid_clone(v1,v2) + v1a.reset_index! + v2a.reset_index! m1=v1a.mean m2=v2a.mean (v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)} @@ -44,13 +47,14 @@ def pearson(v1,v2) v1a,v2a=Statsample.only_valid_clone(v1,v2) return nil if v1a.size ==0 if Statsample.has_gsl? - GSL::Stats::correlation(v1a.gsl, v2a.gsl) + GSL::Stats::correlation(v1a.to_gsl, v2a.to_gsl) else pearson_slow(v1a,v2a) end end def pearson_slow(v1,v2) # :nodoc: v1a,v2a=Statsample.only_valid_clone(v1,v2) + # Calculate sum of squares ss=sum_of_squares(v1a,v2a) ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares)) @@ -118,14 +122,16 @@ def residuals(from,del) r=Statsample::Bivariate.pearson(from,del) froms, dels = from.vector_standarized, del.vector_standarized nv=[] - froms.data_with_nils.each_index do |i| + froms.reset_index! + dels.reset_index! + froms.each_index do |i| if froms[i].nil? or dels[i].nil? nv.push(nil) else nv.push(froms[i]-r*dels[i]) end end - nv.to_vector(:scale) + Daru::Vector.new(nv) end # Correlation between v1 and v2, controling the effect of # control on both. @@ -135,7 +141,6 @@ def partial_correlation(v1,v2,control) rv1con=pearson(v1a,cona) rv2con=pearson(v2a,cona) (rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2)) - end def covariance_matrix_optimized(ds) @@ -153,50 +158,53 @@ def covariance_matrix_optimized(ds) # Order of rows and columns depends on Dataset#fields order def covariance_matrix(ds) - vars,cases=ds.fields.size,ds.cases - if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases) + vars,cases = ds.ncols, ds.nrows + if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases) cm=covariance_matrix_optimized(ds) else cm=covariance_matrix_pairwise(ds) - end cm.extend(Statsample::CovariateMatrix) - cm.fields=ds.fields + cm.fields = ds.vectors.to_a cm end def covariance_matrix_pairwise(ds) cache={} - matrix=ds.collect_matrix do |row,col| - if (ds[row].type!=:scale or ds[col].type!=:scale) - nil - elsif row==col - ds[row].variance - else - if cache[[col,row]].nil? - cov=covariance(ds[row],ds[col]) - cache[[row,col]]=cov - cov + vectors = ds.vectors.to_a + mat_rows = vectors.collect do |row| + vectors.collect do |col| + if (ds[row].type!=:numeric or ds[col].type!=:numeric) + nil + elsif row==col + ds[row].variance else - cache[[col,row]] + if cache[[col,row]].nil? 
+ cov=covariance(ds[row],ds[col]) + cache[[row,col]]=cov + cov + else + cache[[col,row]] + end end end end - matrix + + Matrix.rows mat_rows end # Correlation matrix. # Order of rows and columns depends on Dataset#fields order def correlation_matrix(ds) - vars,cases=ds.fields.size,ds.cases - if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases) + vars, cases = ds.ncols, ds.nrows + if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases) cm=correlation_matrix_optimized(ds) else cm=correlation_matrix_pairwise(ds) end cm.extend(Statsample::CovariateMatrix) - cm.fields=ds.fields + cm.fields = ds.vectors.to_a cm end @@ -212,33 +220,43 @@ def correlation_matrix_optimized(ds) end def correlation_matrix_pairwise(ds) cache={} - cm=ds.collect_matrix do |row,col| - if row==col - 1.0 - elsif (ds[row].type!=:scale or ds[col].type!=:scale) - nil - else - if cache[[col,row]].nil? - r=pearson(ds[row],ds[col]) - cache[[row,col]]=r - r + vectors = ds.vectors.to_a + cm = vectors.collect do |row| + vectors.collect do |col| + if row==col + 1.0 + elsif (ds[row].type!=:numeric or ds[col].type!=:numeric) + nil else - cache[[col,row]] - end + if cache[[col,row]].nil? + r=pearson(ds[row],ds[col]) + cache[[row,col]]=r + r + else + cache[[col,row]] + end + end end end + + Matrix.rows cm end # Retrieves the n valid pairwise. def n_valid_matrix(ds) - ds.collect_matrix do |row,col| - if row==col - ds[row].valid_data.size - else - rowa,rowb=Statsample.only_valid_clone(ds[row],ds[col]) - rowa.size + vectors = ds.vectors.to_a + m = vectors.collect do |row| + vectors.collect do |col| + if row==col + ds[row].reject_values(*Daru::MISSING_VALUES).size + else + rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col]) + rowa.size + end end end + + Matrix.rows m end # Matrix of correlation probabilities. @@ -248,7 +266,7 @@ def correlation_probability_matrix(ds, tails=:both) rows=ds.fields.collect do |row| ds.fields.collect do |col| v1a,v2a=Statsample.only_valid_clone(ds[row],ds[col]) - (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails) + (row==col or ds[row].type!=:numeric or ds[col].type!=:numeric) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails) end end Matrix.rows(rows) @@ -256,27 +274,27 @@ def correlation_probability_matrix(ds, tails=:both) # Spearman ranked correlation coefficient (rho) between 2 vectors def spearman(v1,v2) - v1a,v2a=Statsample.only_valid_clone(v1,v2) - v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale) + v1a,v2a = Statsample.only_valid_clone(v1,v2) + v1r,v2r = v1a.ranked, v2a.ranked pearson(v1r,v2r) end # Calculate Point biserial correlation. 
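# In textbook form (a sketch; sdp is the population standard deviation # used by the implementation below): # # r_pb = ((m1 - m0) / sdp) * Math.sqrt(n0 * n1 / n.to_f**2) # # where m0 and m1 are the means of the two groups and n0, n1 their sizes.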
It is equal to the Pearson correlation, with # one dichotomous value replaced by "0" and the other by "1" def point_biserial(dichotomous,continous) - ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid - raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2 - raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale - f0=ds['d'].factors.sort[0] - m0=ds.filter_field('c') {|c| c['d']==f0} - m1=ds.filter_field('c') {|c| c['d']!=f0} - ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2) + ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES) + raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2 + raise(TypeError, "Second vector should be continuous") if ds[:c].type != :numeric + f0=ds[:d].factors.sort.to_a[0] + m0=ds.filter_vector(:c) {|c| c[:d] == f0} + m1=ds.filter_vector(:c) {|c| c[:d] != f0} + ((m1.mean-m0.mean).to_f / ds[:c].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.nrows**2) end # Kendall Rank Correlation Coefficient (Tau a) # Based on Hervé Abdi's article def tau_a(v1,v2) v1a,v2a=Statsample.only_valid_clone(v1,v2) n=v1.size - v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale) + v1r,v2r=v1a.ranked,v2a.ranked o1=ordered_pairs(v1r) o2=ordered_pairs(v2r) delta= o1.size*2-(o2 & o1).size*2 @@ -348,14 +366,15 @@ def pairs(matrix) } {'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x} end + def ordered_pairs(vector) - d=vector.data - a=[] - (0...(d.size-1)).each{|i| - ((i+1)...(d.size)).each {|j| + d = vector.to_a + a = [] + (0...(d.size-1)).each do |i| + ((i+1)...(d.size)).each do |j| a.push([d[i],d[j]]) - } - } + end + end a end =begin @@ -371,8 +390,8 @@ def sum_of_codeviated(v1,v2) # Report the minimum number of valid cases of a covariate matrix # based on a dataset def min_n_valid(ds) - min=ds.cases - m=n_valid_matrix(ds) + min = ds.nrows + m = n_valid_matrix(ds) for x in 0...m.row_size for y in 0...m.column_size min=m[x,y] if m[x,y] < min @@ -380,8 +399,6 @@ def min_n_valid(ds) end min end - - end end end diff --git a/lib/statsample/bivariate/pearson.rb b/lib/statsample/bivariate/pearson.rb index 8dd6dea..4060ad4 100644 --- a/lib/statsample/bivariate/pearson.rb +++ b/lib/statsample/bivariate/pearson.rb @@ -7,8 +7,8 @@ module Bivariate # variables. # # == Usage - # a = [1,2,3,4,5,6].to_scale - # b = [2,3,4,5,6,7].to_scale + # a = Daru::Vector.new([1,2,3,4,5,6]) + # b = Daru::Vector.new([2,3,4,5,6,7]) # pearson = Statsample::Bivariate::Pearson.new(a,b) # puts pearson.r # puts pearson.t diff --git a/lib/statsample/codification.rb b/lib/statsample/codification.rb index bf76ef0..96d089f 100644 --- a/lib/statsample/codification.rb +++ b/lib/statsample/codification.rb @@ -34,24 +34,33 @@ class << self # will be hashes, with keys = values, for recodification def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN) raise ArgumentError,"Array shouldn't be empty" if vectors.size==0 - pro_hash=vectors.inject({}){|h,v_name| - raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name - v=dataset[v_name] - split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?} + pro_hash = vectors.inject({}) do |h,v_name| + v_name = v_name.is_a?(Numeric) ? v_name : v_name.to_sym + raise Exception, "Vector #{v_name} doesn't exist on Dataset" if + !dataset.vectors.include?(v_name) + v = dataset[v_name] + split_data = v.splitted(sep) + .flatten + .collect { |c| c.to_s } + .find_all{ |c| !c.nil?
} - factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac } - h[v_name]=factors + factors = split_data.uniq + .compact + .sort + .inject({}) { |ac,val| ac[val] = val; ac } + h[v_name] = factors h - } + end + pro_hash end # Create a YAML dictionary, based on vectors. # The keys will be the vector names on the dataset and the values # will be hashes, with keys = values, for recodification # - # v1=%w{a,b b,c d}.to_vector - # ds={"v1"=>v1}.to_dataset - # Statsample::Codification.create_yaml(ds,['v1']) + # v1 = Daru::Vector.new(%w{a,b b,c d}) + # ds = Daru::DataFrame.new({:v1 => v1}) + # Statsample::Codification.create_yaml(ds,[:v1]) # => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n" def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN) pro_hash=create_hash(dataset, vectors, sep) @@ -69,16 +78,17 @@ def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN) if File.exist?(filename) raise "A file named #{filename} already exists. Delete it before overwriting." end - book = Spreadsheet::Workbook.new + book = Spreadsheet::Workbook.new sheet = book.create_worksheet - sheet.row(0).concat(%w{field original recoded}) - i=1 + sheet.row(0).concat(%w(field original recoded)) + i = 1 create_hash(dataset, vectors, sep).sort.each do |field, inner_hash| inner_hash.sort.each do |k,v| - sheet.row(i).concat([field.dup,k.dup,v.dup]) - i+=1 + sheet.row(i).concat([field.to_s,k.to_s,v.to_s]) + i += 1 end end + book.write(filename) end # From an Excel file, generates a dictionary hash @@ -91,10 +101,11 @@ def excel_to_recoded_hash(filename) sheet= book.worksheet 0 row_i=0 sheet.each do |row| - row_i+=1 - next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil? - h[row[0]]={} if h[row[0]].nil? - h[row[0]][row[1]]=row[2] + row_i += 1 + next if row_i == 1 or row[0].nil? or row[1].nil? or row[2].nil? + key = row[0].to_sym + h[key] ||= {} + h[key][row[1]] = row[2] end h end @@ -110,12 +121,12 @@ def inverse_hash(h, sep=Statsample::SPLIT_TOKEN) end def dictionary(h, sep=Statsample::SPLIT_TOKEN) - h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a } + h.inject({}) { |a,v| a[v[0]]=v[1].split(sep); a } end def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN) - dict=dictionary(h,sep) - new_data=v.splitted(sep) + dict = dictionary(h,sep) + new_data = v.splitted(sep) new_data.collect do |c| if c.nil? nil @@ -134,20 +145,22 @@ def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN) def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false) v_names||=h.keys v_names.each do |v_name| - raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name - recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c| - if c.nil? - nil - else - c.join(sep) + raise Exception, "Vector #{v_name} doesn't exist on Dataset" if !dataset.vectors.include? v_name + recoded = Daru::Vector.new( + recode_vector(dataset[v_name], h[v_name],sep).collect do |c| + if c.nil? + nil + else + c.join(sep) + end end - }.to_vector - if(split) + ) + if split recoded.split_by_separator(sep).each {|k,v| - dataset[v_name+"_"+k]=v + dataset[(v_name.to_s + "_" + k).to_sym] = v } else - dataset[v_name+"_recoded"]=recoded + dataset[(v_name.to_s + "_recoded").to_sym] = recoded end end end diff --git a/lib/statsample/converter/csv.rb b/lib/statsample/converter/csv.rb index e84442d..9834fac 100644 --- a/lib/statsample/converter/csv.rb +++ b/lib/statsample/converter/csv.rb @@ -1,78 +1,27 @@ +# This module will be removed in the next release.
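+# A migration sketch ('data.csv' is a placeholder path): +# +# df = Daru::DataFrame.from_csv('data.csv') +# df.write_csv('data_out.csv') +#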
+# Please shift to using Daru::DataFrame.from_csv and #write_csv for CSV +# related operations. module Statsample - class CSV < SpreadsheetBase - if RUBY_VERSION<"1.9" - require 'fastercsv' - CSV_klass=::FasterCSV - else - require 'csv' - CSV_klass=::CSV - end + class CSV class << self - - def read19(filename,ignore_lines=0,csv_opts=Hash.new) - #default first line is header - csv_opts.merge!(:headers=>true, :header_converters => :symbol) - csv = CSV_klass::Table.new(CSV_klass::read(filename,'r',csv_opts)) - csv_headers = if csv_opts[:headers] - csv.headers - else - #as in R, if no header we name the headers as V1,V2,V3,V4,.. - 1.upto(csv.first.length).collect { |i| "V#{i}" } - end - #we invert row -> column. It means csv[0] is the first column and not row. Similar to R - csv.by_col! - thash = {} - csv_headers.each_with_index do |header,idx| - thash[header] = Statsample::Vector.new(csv[idx].drop(ignore_lines)) - end - Statsample::Dataset.new(thash) - end - # Returns a Dataset based on a csv file + # Return a DataFrame created from a CSV file. # - # USE: - # ds=Statsample::CSV.read("test_csv.csv") - def read(filename, empty=[''],ignore_lines=0,csv_opts=Hash.new) - first_row=true - fields=[] - #fields_data={} - ds=nil - line_number=0 - csv=CSV_klass.open(filename,'rb', csv_opts) - csv.each do |row| - line_number+=1 - if(line_number<=ignore_lines) - #puts "Skip line" - next - end - row.collect!{|c| c.to_s } - if first_row - fields=extract_fields(row) - ds=Statsample::Dataset.new(fields) - first_row=false - else - rowa=process_row(row,empty) - ds.add_case(rowa,false) - end - end - convert_to_scale_and_date(ds,fields) - ds.update_valid_data - ds + # == NOTE + # + # This method has been DEPRECATED in favor of Daru::DataFrame.from_csv. + # Please switch to using that. + def read(filename, empty = [''], ignore_lines = 0, opts = {}) + raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_csv instead." end - # Save a Dataset on a csv file + + # Save a Dataset on a csv file. # - # USE: - # Statsample::CSV.write(ds,"test_csv.csv") - def write(dataset,filename, convert_comma=false,*opts) - - writer=CSV_klass.open(filename,'w',*opts) - writer << dataset.fields - dataset.each_array do|row| - if(convert_comma) - row.collect!{|v| v.to_s.gsub(".",",")} - end - writer << row - end - writer.close + # == NOTE + # + # This method has been DEPRECATED in favor of Daru::DataFrame#write_csv. + # Please use that instead. + def write(dataset, filename, convert_comma = false, opts = {}) + raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_csv instead." end end end diff --git a/lib/statsample/converter/spss.rb b/lib/statsample/converter/spss.rb index 2599c82..8be7d1e 100644 --- a/lib/statsample/converter/spss.rb +++ b/lib/statsample/converter/spss.rb @@ -4,26 +4,27 @@ class << self # Export an SPSS matrix with tetrachoric correlations.
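# (Tetrachoric correlation estimates the Pearson correlation of two latent # continuous variables that are each observed only as a dichotomy; the # algorithm itself is supplied by the statsample-bivariate-extension gem # mentioned earlier.)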
# # Use: - # ds=Statsample::Excel.read("my_data.xls") + # ds=Daru::DataFrame.from_excel("my_data.xls") # puts Statsample::SPSS.tetrachoric_correlation_matrix(ds) def tetrachoric_correlation_matrix(ds) - dsv=ds.dup_only_valid + dsv=ds.reject_values(*Daru::MISSING_VALUES) # Delete all vectors that don't have variation - dsv.fields.each{|f| + dsv.vectors.each { |f| if dsv[f].factors.size==1 dsv.delete_vector(f) else dsv[f]=dsv[f].dichotomize end } + tcm=Statsample::Bivariate.tetrachoric_correlation_matrix(dsv) - n=dsv.fields.collect {|f| + n=dsv.vectors.to_a.collect {|f| sprintf("%d",dsv[f].size) } - meanlist=dsv.fields.collect{|f| + meanlist=dsv.vectors.to_a.collect{|f| sprintf("%0.3f", dsv[f].mean) } - stddevlist=dsv.fields.collect{|f| + stddevlist=dsv.vectors.to_a.collect{|f| sprintf("%0.3f", dsv[f].sd) } out=<<-HEREDOC diff --git a/lib/statsample/converters.rb b/lib/statsample/converters.rb index f5201ee..fbb1342 100644 --- a/lib/statsample/converters.rb +++ b/lib/statsample/converters.rb @@ -1,63 +1,36 @@ require 'statsample/converter/spss' module Statsample - # Create and dumps Datasets on a database + # Creates and dumps Datasets on a database + # + # == NOTE + # + # Deprecated. Use Daru::DataFrame.from_sql and Daru::DataFrame#write_sql module Database class << self # Reads a database query and returns a Dataset # - # USE: - # - # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password") - # Statsample.read(dbh, "SELECT * FROM test") - # + # == NOTE + # + # Deprecated. Use Daru::DataFrame.from_sql instead. def read(dbh,query) - require 'dbi' - sth=dbh.execute(query) - vectors={} - fields=[] - sth.column_info.each {|c| - vectors[c['name']]=Statsample::Vector.new([]) - vectors[c['name']].name=c['name'] - vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal - fields.push(c['name']) - } - ds=Statsample::Dataset.new(vectors,fields) - sth.fetch do |row| - ds.add_case(row.to_a, false ) - end - ds.update_valid_data - ds + raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_sql instead." end + # Insert each case of the Dataset on the selected table # - # USE: - # - # ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset - # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password") - # Statsample::Database.insert(ds,dbh,"test") - # + # == NOTE + # + # Deprecated. Use Daru::DataFrame#write_sql instead. def insert(ds, dbh, table) - require 'dbi' - query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")" - sth=dbh.prepare(query) - ds.each_array{|c| sth.execute(*c) } - return true + raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_sql instead." end # Create SQL, based on a given Dataset # - # USE: - # - # ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset - # Statsample::Database.create_sql(ds,'names') - # ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;" + # == NOTE # + # Deprecated. Use Daru::DataFrame#create_sql instead. def create_sql(ds,table,charset="UTF8") - sql="CREATE TABLE #{table} (" - fields=ds.fields.collect{|f| - v=ds[f] - f+" "+v.db_type - } - sql+fields.join(",\n ")+") CHARACTER SET=#{charset};" + raise NoMethodError, "Deprecated. Use Daru::DataFrame#create_sql instead."
end end end @@ -65,183 +38,49 @@ module Mondrian class << self def write(dataset,filename) File.open(filename,"wb") do |fp| - fp.puts dataset.fields.join("\t") - dataset.each_array_with_nils do |row| - row2=row.collect{|v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") } + fp.puts dataset.vectors.to_a.join("\t") + dataset.each_row do |row| + row2 = row.map { |v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") } fp.puts row2.join("\t") end end end end end - class SpreadsheetBase + + class PlainText class << self - def extract_fields(row) - i=0; - fields=row.to_a.collect{|c| - if c.nil? - i+=1 - "var%05d" % i - else - c.to_s.downcase - end - } - fields.recode_repeated + def read(filename, fields) + raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_plaintext instead." end - - def process_row(row,empty) - row.to_a.map do |c| - if empty.include?(c) - nil - else - if c.is_a? String and c.is_number? - if c=~/^\d+$/ - c.to_i - else - c.gsub(",",".").to_f - end - else - c - end - end - end - end - def convert_to_scale_and_date(ds,fields) - fields.each do |f| - if ds[f].can_be_scale? - ds[f].type=:scale - elsif ds[f].can_be_date? - ds[f].type=:date - end - end - end - end end - class PlainText < SpreadsheetBase - class << self - def read(filename, fields) - ds=Statsample::Dataset.new(fields) - fp=File.open(filename,"r") - fp.each_line do |line| - row=process_row(line.strip.split(/\s+/),[""]) - next if row==["\x1A"] - ds.add_case_array(row) - end - convert_to_scale_and_date(ds,fields) - ds.update_valid_data - fields.each {|f| - ds[f].name=f - } - ds - end - end - end - class Excel < SpreadsheetBase + + # This class has been DEPRECATED. Use Daru::DataFrame.from_excel and + # Daru::DataFrame#write_excel for XLS file operations. + class Excel class << self # Write an Excel spreadsheet based on a dataset # * TODO: Format date values nicely + # + # == NOTE + # + # Deprecated. Use Daru::DataFrame#write_excel. def write(dataset,filename) - require 'spreadsheet' - book = Spreadsheet::Workbook.new - sheet = book.create_worksheet - format = Spreadsheet::Format.new :color => :blue, - :weight => :bold - sheet.row(0).concat(dataset.fields.map {|i| i.dup}) # Unfreeze strings - sheet.row(0).default_format = format - i=1 - dataset.each_array{|row| - sheet.row(i).concat(row) - i+=1 - } - book.write(filename) + raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_excel instead." end - # This should be fixed. - # If we have a Formula, should be resolver first - def preprocess_row(row, dates) - i=-1 - row.collect!{|c| - i+=1 - if c.is_a? Spreadsheet::Formula - if(c.value.is_a? Spreadsheet::Excel::Error) - nil - else - c.value - end - elsif dates.include? i and !c.nil? and c.is_a? Numeric - row.date(i) - else - c - end - } - end - private :process_row, :preprocess_row - # Returns a dataset based on a xls file - # USE: - # ds = Statsample::Excel.read("test.xls") - # + # + # == NOTE + # + # Deprecated. Use Daru::DataFrame.from_excel instead. def read(filename, opts=Hash.new) - require 'spreadsheet' - raise "options should be Hash" unless opts.is_a? Hash - opts_default={ - :worksheet_id=>0, - :ignore_lines=>0, - :empty=>[''] - } - - opts=opts_default.merge opts - - worksheet_id=opts[:worksheet_id] - ignore_lines=opts[:ignore_lines] - empty=opts[:empty] - - first_row=true - fields=[] - fields_data={} - ds=nil - line_number=0 - book = Spreadsheet.open filename - sheet= book.worksheet worksheet_id - sheet.each do |row| - begin - dates=[] - row.formats.each_index{|i| - if !row.formats[i].nil?
and row.formats[i].number_format=="DD/MM/YYYY" - dates.push(i) - end - } - line_number+=1 - next if(line_number<=ignore_lines) - - preprocess_row(row,dates) - if first_row - fields=extract_fields(row) - ds=Statsample::Dataset.new(fields) - first_row=false - else - rowa=process_row(row,empty) - (fields.size - rowa.size).times { - rowa << nil - } - ds.add_case(rowa,false) - end - rescue => e - error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}" - raise - end - end - convert_to_scale_and_date(ds, fields) - ds.update_valid_data - fields.each {|f| - ds[f].name=f - } - ds.name=filename - ds + raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_excel instead." end end end + module Mx class << self def write(dataset,filename,type=:covariance) @@ -250,12 +89,12 @@ def write(dataset,filename,type=:covariance) fp.puts "! #{filename}" fp.puts "! Output generated by Statsample" fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}" - fp.puts "Labels "+dataset.fields.join(" ") + fp.puts "Labels " + dataset.vectors.to_a.join(" ") case type when :raw fp.puts "Rectangular" dataset.each do |row| - out=dataset.fields.collect do |f| + out=dataset.vectors.to_a.collect do |f| if dataset[f].is_valid? row[f] row[f] else @@ -293,22 +132,22 @@ def out(dataset,opt={}) carrier=OpenStruct.new carrier.categorials=[] carrier.conversions={} - variables_def=dataset.fields.collect{|k| + variables_def=dataset.vectors.to_a.collect{|k| variable_definition(carrier,dataset[k],k) }.join("\n") - + indexes=carrier.categorials.inject({}) {|s,c| - s[dataset.fields.index(c)]=c + s[dataset.vectors.to_a.index(c)]=c s } records="" - dataset.each_array {|c| - indexes.each{|ik,iv| - c[ik]=carrier.conversions[iv][c[ik]] + dataset.each_row {|c| + indexes.each { |ik,iv| + c[ik] = carrier.conversions[iv][c[ik]] } records << "#{values_definition(c, default_opt[:missing])}\n" } - + out=< @@ -346,7 +185,7 @@ def values_definition(c,missing) # nickname = nickname def variable_definition(carrier,v,name,nickname=nil) nickname = (nickname.nil? ? "" : "nickname=\"#{nickname}\"" ) - if v.type==:nominal or v.data.find {|d| d.is_a? String } + if v.type==:object or v.to_a.find {|d| d.is_a? String } carrier.categorials.push(name) carrier.conversions[name]={} factors=v.factors @@ -354,17 +193,16 @@ def variable_definition(carrier,v,name,nickname=nil) out << "\n" out << (1..factors.size).to_a.collect{|i| carrier.conversions[name][factors[i-1]]=i - "#{v.labeling(factors[i-1])}" + "#{(v.labels[factors[i-1]] || factors[i-1])}" }.join("\n") out << "\n\n" out - elsif v.data.find {|d| d.is_a? Float} + elsif v.to_a.find {|d| d.is_a? Float} "" else "" end end - end end end diff --git a/lib/statsample/crosstab.rb b/lib/statsample/crosstab.rb index 75cf075..6dc4710 100644 --- a/lib/statsample/crosstab.rb +++ b/lib/statsample/crosstab.rb @@ -8,45 +8,46 @@ class Crosstab attr_reader :v_rows, :v_cols attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total def initialize(v1, v2, opts=Hash.new) - #raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? 
Statsample::Vector raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size - @v_rows, @v_cols=Statsample.only_valid_clone(v1.to_vector,v2.to_vector) - @cases=@v_rows.size - @row_label=v1.name - @column_label=v2.name - @name=nil + @v_rows, @v_cols = Statsample.only_valid_clone( + Daru::Vector.new(v1), + Daru::Vector.new(v2)) + @cases = @v_rows.size + @row_label = v1.name + @column_label = v2.name + @name = nil @percentage_row = @percentage_column = @percentage_total=false - opts.each{|k,v| + opts.each do |k,v| self.send("#{k}=",v) if self.respond_to? k - } - @name||=_("Crosstab %s - %s") % [@row_label, @column_label] + end + @name ||= _("Crosstab %s - %s") % [@row_label, @column_label] end def rows_names - @v_rows.factors.sort + @v_rows.factors.sort.reset_index! end def cols_names - @v_cols.factors.sort + @v_cols.factors.sort.reset_index! end def rows_total - @v_rows.frequencies + @v_rows.frequencies.to_h end def cols_total - @v_cols.frequencies + @v_cols.frequencies.to_h end def frequencies - base=rows_names.inject([]){|s,row| - s+=cols_names.collect{|col| [row,col]} - }.inject({}) {|s,par| + base = rows_names.inject([]) do |s,row| + s += cols_names.collect { |col| [row,col] } + end.inject({}) do |s,par| s[par]=0 s - } - base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies) + end + base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies.to_h) end def to_matrix - f=frequencies - rn=rows_names - cn=cols_names + f = frequencies + rn = rows_names + cn = cols_names Matrix.rows(rn.collect{|row| cn.collect{|col| f[[row,col]]} }) @@ -67,8 +68,8 @@ def frequencies_by_col end # Chi square, based on expected and real matrix def chi_square - require 'statsample/test' - Statsample::Test.chi_square(self.to_matrix, matrix_expected) + require 'statsample/test' + Statsample::Test.chi_square(self.to_matrix, matrix_expected) end # Useful to obtain chi square def matrix_expected @@ -98,10 +99,10 @@ def report_building(builder) generator.text(_("Rows: %s") % @row_label) unless @row_label.nil? generator.text(_("Columns: %s") % @column_label) unless @column_label.nil? - t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")]) + t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c)}+[_("Total")]) rn.each do |row| total_row=0 - t_row=[@v_rows.labeling(row)] + t_row=[@v_rows.index_of(row)] cn.each do |col| data=fq[[row,col]] total_row+=fq[[row,col]] @@ -148,9 +149,9 @@ def table_percentage(generator,type) when :total then _("% Total") end - t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c) } + [_("Total")]) + t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c) } + [_("Total")]) rn.each do |row| - t_row=[@v_rows.labeling(row)] + t_row=[@v_rows.index_of(row)] cn.each do |col| total=case type when :row then rt[row] diff --git a/lib/statsample/daru.rb b/lib/statsample/daru.rb new file mode 100644 index 0000000..21f111a --- /dev/null +++ b/lib/statsample/daru.rb @@ -0,0 +1,115 @@ +# Opening the Daru::Vector and Daru::DataFrame classes for adding methods to convert from +# data structures to specialized statsample data structures like Multiset. +module Daru + class Vector + def histogram(bins=10) + type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."
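+ # `bins` may be either an Array of bin boundaries or an Integer bin + # count; the two branches below build the Statsample::Histogram + # accordingly.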
diff --git a/lib/statsample/daru.rb b/lib/statsample/daru.rb
new file mode 100644
index 0000000..21f111a
--- /dev/null
+++ b/lib/statsample/daru.rb
@@ -0,0 +1,115 @@
+# Opening the Daru::DataFrame class for adding methods to convert from
+# data structures to specialized statsample data structures like Multiset.
+module Daru
+  class Vector
+    def histogram(bins=10)
+      type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."
+
+      if bins.is_a? Array
+        h = Statsample::Histogram.alloc(bins)
+      else
+        # ugly patch. The upper limit for a bin has the form
+        # x < range
+        #h=Statsample::Histogram.new(self, bins)
+        valid = reject_values(*Daru::MISSING_VALUES)
+        min,max=Statsample::Util.nice(valid.min,valid.max)
+        # fix last data
+        if max == valid.max
+          max += 1e-10
+        end
+        h = Statsample::Histogram.alloc(bins,[min,max])
+        # Fix last bin
+      end
+
+      h.increment(valid)
+      h
+    end
+
+    # Variance of p, according to population size
+    def variance_proportion(n_poblation, v=1)
+      Statsample::proportion_variance_sample(self.proportion(v), reject_values(*Daru::MISSING_VALUES).size, n_poblation)
+    end
+
+    # Variance of p, according to population size
+    def variance_total(n_poblation, v=1)
+      Statsample::total_variance_sample(self.proportion(v), reject_values(*Daru::MISSING_VALUES).size, n_poblation)
+    end
+
+    def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
+      Statsample::proportion_confidence_interval_t(proportion(v), reject_values(*Daru::MISSING_VALUES).size, n_poblation, margin)
+    end
+
+    def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
+      Statsample::proportion_confidence_interval_z(proportion(v), reject_values(*Daru::MISSING_VALUES).size, n_poblation, margin)
+    end
+  end
+
+  class DataFrame
+    def crosstab(v1,v2,opts={})
+      Statsample::Crosstab.new(self[v1], self[v2],opts)
+    end
+
+    # Functions for converting to Statsample::Multiset
+    def to_multiset_by_split(*vecs)
+      require 'statsample/multiset'
+
+      if vecs.size == 1
+        to_multiset_by_split_one_field(vecs[0])
+      else
+        to_multiset_by_split_multiple_fields(*vecs)
+      end
+    end
+
+    # Creates a Statsample::Multiset, using one field
+    def to_multiset_by_split_one_field(field)
+      raise ArgumentError,"Should use a correct field name" if
+        !@vectors.include? field
+
+      factors = self[field].factors
+      ms = Statsample::Multiset.new_empty_vectors(@vectors.to_a, factors)
+      each_row do |row|
+        ms[row[field]].add_row(row)
+      end
+      #puts "Entering the datasets"
+      ms.datasets.each do |k,ds|
+        ds.rename self[field].index_of(k)
+      end
+
+      ms
+    end
+
+    def to_multiset_by_split_multiple_fields(*fields)
+      fields.map!(&:to_sym)
+      factors_total=nil
+      fields.each do |f|
+        if factors_total.nil?
+          factors_total = self[f].factors.collect { |c| [c] }
+        else
+          suma = []
+          factors = self[f].factors
+          factors_total.each do |f1|
+            factors.each do |f2|
+              suma.push(f1+[f2])
+            end
+          end
+          factors_total = suma
+        end
+      end
+      ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)
+
+      # Build a proc that routes each row to the sub-dataset keyed by
+      # that row's values on the given fields
+      p1 = eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}'.to_sym]"}.join(",")+"]].add_row(c) }"
+      each_row { |r| p1.call(r) }
+
+      ms.datasets.each do |k,ds|
+        ds.rename(
+          fields.size.times.map do |i|
+            f = fields[i]
+            sk = k[i]
+            self[f].index_of(sk)
+          end.join("-")
+        )
+      end
+      ms
+    end
+  end
+end
\ No newline at end of file
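With these patches in place, the conveniences that used to live on Statsample::Dataset are reachable straight from Daru objects. A hedged sketch (the column data is invented; method names come from the patch above):

    df = Daru::DataFrame.new({
      :sex    => Daru::Vector.new(%w[m f m f m f]),
      :smoker => Daru::Vector.new(%w[y y n n y n]),
      :height => Daru::Vector.new([170, 162, 178, 159, 171, 165])
    })

    df[:height].histogram(4)            # Statsample::Histogram with 4 bins
    df.crosstab(:sex, :smoker)          # Statsample::Crosstab of the two vectors
    ms = df.to_multiset_by_split(:sex)  # Statsample::Multiset, one dataset per sex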
diff --git a/lib/statsample/dataset.rb b/lib/statsample/dataset.rb
index fbeea85..5243d12 100644
--- a/lib/statsample/dataset.rb
+++ b/lib/statsample/dataset.rb
@@ -1,1005 +1,10 @@
 require 'statsample/vector'
 class Hash
-  # Creates a Statsample::Dataset based on a Hash
-  def to_dataset(*args)
-    Statsample::Dataset.new(self, *args)
+  # Creates a Daru::DataFrame based on a Hash
+  def to_dataframe(*args)
+    Daru::DataFrame.new(self, *args)
   end
-end
-
-class Array
-  def prefix(s) # :nodoc:
-    self.collect{|c| s+c.to_s }
-  end
-  def suffix(s) # :nodoc:
-    self.collect{|c| c.to_s+s }
-  end
-end
-
-module Statsample
-  class DatasetException < RuntimeError # :nodoc:
-    attr_reader :ds,:exp
-    def initialize(ds,e)
-      @ds=ds
-      @exp=e
-    end
-    def to_s
-      m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
-      m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
-      m
-    end
-  end
-  # Set of cases with values for one or more variables,
-  # analog to a dataframe on R or a standard data file of SPSS.
-  # Every vector has #field name, which represent it. By default,
-  # the vectors are ordered by it field name, but you can change it
-  # the fields order manually.
- # The Dataset work as a Hash, with keys are field names - # and values are Statsample::Vector - # - # - # ==Usage - # Create a empty dataset: - # Dataset.new() - # Create a dataset with three empty vectors, called v1, v2 and v3: - # Dataset.new(%w{v1 v2 v3}) - # Create a dataset with two vectors, called v1 - # and v2: - # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector}) - # Create a dataset with two given vectors (v1 and v2), - # with vectors on inverted order: - # Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1']) - # - # The fast way to create a dataset uses Hash#to_dataset, with - # field order as arguments - # v1 = [1,2,3].to_scale - # v2 = [1,2,3].to_scale - # ds = {'v1'=>v2, 'v2'=>v2}.to_dataset(%w{v2 v1}) - - class Dataset - include Writable - include Summarizable - # Hash of Statsample::Vector - attr_reader :vectors - # Ordered ids of vectors - attr_reader :fields - # Name of dataset - attr_accessor :name - # Number of cases - attr_reader :cases - # Location of pointer on enumerations methods (like #each) - attr_reader :i - - # Generates a new dataset, using three vectors - # - Rows - # - Columns - # - Values - # - # For example, you have these values - # - # x y v - # a a 0 - # a b 1 - # b a 1 - # b b 0 - # - # You obtain - # id a b - # a 0 1 - # b 1 0 - # - # Useful to process outputs from databases - def self.crosstab_by_asignation(rows,columns,values) - raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size - cols_values=columns.factors - cols_n=cols_values.size - h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){ - |a1,v1| a1[v1]=nil; a1 - } - ;a} - values.each_index{|i| - h_rows[rows[i]][columns[i]]=values[i] - } - ds=Dataset.new(["_id"]+cols_values) - cols_values.each{|c| - ds[c].type=values.type - } - rows.factors.each {|row| - n_row=Array.new(cols_n+1) - n_row[0]=row - cols_values.each_index {|i| - n_row[i+1]=h_rows[row][cols_values[i]] - } - ds.add_case_array(n_row) - } - ds.update_valid_data - ds - end - # Return true if any vector has missing data - def has_missing_data? - @vectors.any? {|k,v| v.has_missing_data?} - end - # Return a nested hash using fields as keys and - # an array constructed of hashes with other values. - # If block provided, is used to provide the - # values, with parameters +row+ of dataset, - # +current+ last hash on hierarchy and - # +name+ of the key to include - def nest(*tree_keys,&block) - tree_keys=tree_keys[0] if tree_keys[0].is_a? Array - out=Hash.new - each do |row| - current=out - # Create tree - tree_keys[0,tree_keys.size-1].each do |f| - root=row[f] - current[root]||=Hash.new - current=current[root] - end - name=row[tree_keys.last] - if !block - current[name]||=Array.new - current[name].push(row.delete_if{|key,value| tree_keys.include? key}) - else - current[name]=block.call(row, current,name) - end - end - out - end - # Creates a new dataset. A dataset is a set of ordered named vectors - # of the same size. - # - # [vectors] With an array, creates a set of empty vectors named as - # values on the array. With a hash, each Vector is assigned as - # a variable of the Dataset named as its key - # [fields] Array of names for vectors. Is only used for set the - # order of variables. If empty, vectors keys on alfabethic order as - # used as fields. - def initialize(vectors={}, fields=[]) - @@n_dataset||=0 - @@n_dataset+=1 - @name=_("Dataset %d") % @@n_dataset - @cases=0 - @gsl=nil - @i=nil - - if vectors.instance_of? 
Array - @fields=vectors.dup - @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a} - else - # Check vectors - @vectors=vectors - @fields=fields - check_order - check_length - end - end - # - # Creates a copy of the given dataset, deleting all the cases with - # missing data on one of the vectors. - # - # @param array of fields to include. No value include all fields - # - def dup_only_valid(*fields_to_include) - if fields_to_include.size==1 and fields_to_include[0].is_a? Array - fields_to_include=fields_to_include[0] - end - fields_to_include=@fields if fields_to_include.size==0 - if fields_to_include.any? {|f| @vectors[f].has_missing_data?} - ds=Dataset.new(fields_to_include) - fields_to_include.each {|f| ds[f].type=@vectors[f].type} - each {|row| - unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid? row[f]} - row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac} - ds.add_case(row_2) - end - } - else - ds=dup fields_to_include - end - ds.name= self.name - ds - end - # - # Returns a duplicate of the Dataset. - # All vectors are copied, so any modification on new - # dataset doesn't affect original dataset's vectors. - # If fields given as parameter, only include those vectors. - # - # @param array of fields to include. No value include all fields - # @return {Statsample::Dataset} - def dup(*fields_to_include) - if fields_to_include.size==1 and fields_to_include[0].is_a? Array - fields_to_include=fields_to_include[0] - end - fields_to_include=@fields if fields_to_include.size==0 - vectors={} - fields=[] - fields_to_include.each{|f| - raise "Vector #{f} doesn't exists" unless @vectors.has_key? f - vectors[f]=@vectors[f].dup - fields.push(f) - } - ds=Dataset.new(vectors,fields) - ds.name= self.name - ds - end - - - # Returns an array with the fields from first argumen to last argument - def from_to(from,to) - raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from - raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to - @fields.slice(@fields.index(from)..@fields.index(to)) - end - - # Returns (when possible) a cheap copy of dataset. - # If no vector have missing values, returns original vectors. - # If missing values presents, uses Dataset.dup_only_valid. - # - # @param array of fields to include. No value include all fields - # @return {Statsample::Dataset} - def clone_only_valid(*fields_to_include) - if fields_to_include.size==1 and fields_to_include[0].is_a? Array - fields_to_include=fields_to_include[0] - end - fields_to_include=@fields.dup if fields_to_include.size==0 - if fields_to_include.any? {|v| @vectors[v].has_missing_data?} - dup_only_valid(fields_to_include) - else - clone(fields_to_include) - end - end - # Returns a shallow copy of Dataset. - # Object id will be distinct, but @vectors will be the same. - # @param array of fields to include. No value include all fields - # @return {Statsample::Dataset} - def clone(*fields_to_include) - if fields_to_include.size==1 and fields_to_include[0].is_a? Array - fields_to_include=fields_to_include[0] - end - fields_to_include=@fields.dup if fields_to_include.size==0 - ds=Dataset.new - fields_to_include.each{|f| - raise "Vector #{f} doesn't exists" unless @vectors.has_key? 
f - ds[f]=@vectors[f] - } - ds.fields=fields_to_include - ds.name=@name - ds.update_valid_data - ds - end - # Creates a copy of the given dataset, without data on vectors - # - # @return {Statsample::Dataset} - def dup_empty - vectors=@vectors.inject({}) {|a,v| - a[v[0]]=v[1].dup_empty - a - } - Dataset.new(vectors,@fields.dup) - end - # Merge vectors from two datasets - # In case of name collition, the vectors names are changed to - # x_1, x_2 .... - # - # @return {Statsample::Dataset} - def merge(other_ds) - raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases - types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type} - new_fields = (@fields+other_ds.fields).recode_repeated - ds_new=Statsample::Dataset.new(new_fields) - new_fields.each_index{|i| - field=new_fields[i] - ds_new[field].type=types[i] - } - @cases.times {|i| - row=case_as_array(i)+other_ds.case_as_array(i) - ds_new.add_case_array(row) - } - ds_new.update_valid_data - ds_new - end - - # Join 2 Datasets by given fields - # type is one of :left and :inner, default is :left - # - # @return {Statsample::Dataset} - def join(other_ds,fields_1=[],fields_2=[],type=:left) - fields_new = other_ds.fields - fields_2 - fields = self.fields + fields_new - other_ds_hash = {} - other_ds.each do |row| - key = row.select{|k,v| fields_2.include?(k)}.values - value = row.select{|k,v| fields_new.include?(k)} - if other_ds_hash[key].nil? - other_ds_hash[key] = [value] - else - other_ds_hash[key] << value - end - end - - new_ds = Dataset.new(fields) - - self.each do |row| - key = row.select{|k,v| fields_1.include?(k)}.values - - new_case = row.dup - - if other_ds_hash[key].nil? - if type == :left - fields_new.each{|field| new_case[field] = nil} - new_ds.add_case(new_case) - end - else - other_ds_hash[key].each do |new_values| - new_ds.add_case new_case.merge(new_values) - end - end - - end - new_ds - end - # Returns a dataset with standarized data. - # - # @return {Statsample::Dataset} - def standarize - ds=dup() - ds.fields.each do |f| - ds[f]=ds[f].vector_standarized - end - ds - end - # Generate a matrix, based on fields of dataset - # - # @return {::Matrix} - - def collect_matrix - rows=@fields.collect{|row| - @fields.collect{|col| - yield row,col - } - } - Matrix.rows(rows) - end - - # We have the same datasets if +vectors+ and +fields+ are the same - # - # @return {Boolean} - def ==(d2) - @vectors==d2.vectors and @fields==d2.fields - end - # Returns vector c - # - # @return {Statsample::Vector} - def col(c) - @vectors[c] - end - alias_method :vector, :col - # Equal to Dataset[name]=vector - # - # @return self - def add_vector(name, vector) - raise ArgumentError, "Vector have different size" if vector.size!=@cases - @vectors[name]=vector - check_order - self - end - # Returns true if dataset have vector v. - # - # @return {Boolean} - def has_vector? (v) - return @vectors.has_key?(v) - end - # Creates a dataset with the random data, of a n size - # If n not given, uses original number of cases. - # - # @return {Statsample::Dataset} - def bootstrap(n=nil) - n||=@cases - ds_boot=dup_empty - n.times do - ds_boot.add_case_array(case_as_array(rand(n))) - end - ds_boot.update_valid_data - ds_boot - end - # Fast version of #add_case. 
- # Can only add one case and no error check if performed - # You SHOULD use #update_valid_data at the end of insertion cycle - # - # - def add_case_array(v) - v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])} - end - # Insert a case, using: - # * Array: size equal to number of vectors and values in the same order as fields - # * Hash: keys equal to fields - # If uvd is false, #update_valid_data is not executed after - # inserting a case. This is very useful if you want to increase the - # performance on inserting many cases, because #update_valid_data - # performs check on vectors and on the dataset - - def add_case(v,uvd=true) - case v - when Array - if (v[0].is_a? Array) - v.each{|subv| add_case(subv,false)} - else - raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size - v.each_index {|i| @vectors[@fields[i]].add(v[i],false)} - end - when Hash - raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort - @fields.each{|f| @vectors[f].add(v[f],false)} - else - raise TypeError, 'Value must be a Array or a Hash' - end - if uvd - update_valid_data - end - end - # Check vectors and fields after inserting data. Use only - # after #add_case_array or #add_case with second parameter to false - def update_valid_data - @gsl=nil - @fields.each{|f| @vectors[f].set_valid_data} - check_length - end - # Delete vector named +name+. Multiple fields accepted. - def delete_vector(*args) - if args.size==1 and args[0].is_a? Array - names=args[0] - else - names=args - end - names.each do |name| - @fields.delete(name) - @vectors.delete(name) - end - end - - def add_vectors_by_split_recode(name_,join='-',sep=Statsample::SPLIT_TOKEN) - split=@vectors[name_].split_by_separator(sep) - i=1 - split.each{|k,v| - new_field=name_+join+i.to_s - v.name=name_+":"+k - add_vector(new_field,v) - i+=1 - } - end - def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN) - split=@vectors[name].split_by_separator(sep) - split.each{|k,v| - add_vector(name+join+k,v) - } - end - - def vector_by_calculation(type=:scale) - a=[] - each do |row| - a.push(yield(row)) - end - a.to_vector(type) - end - # Returns a vector with sumatory of fields - # if fields parameter is empty, sum all fields - def vector_sum(fields=nil) - fields||=@fields - vector=collect_with_index do |row, i| - if(fields.find{|f| !@vectors[f].data_with_nils[i]}) - nil - else - fields.inject(0) {|ac,v| ac + row[v].to_f} - end - end - vector.name=_("Sum from %s") % @name - vector - end - # Check if #fields attribute is correct, after inserting or deleting vectors - def check_fields(fields) - fields||=@fields - raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0 - fields - end - - # Returns a vector with the numbers of missing values for a case - def vector_missing_values(fields=nil) - fields=check_fields(fields) - collect_with_index do |row, i| - fields.inject(0) {|a,v| - a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0) - } - end - end - def vector_count_characters(fields=nil) - fields=check_fields(fields) - collect_with_index do |row, i| - fields.inject(0){|a,v| - a+((@vectors[v].data_with_nils[i].nil?) ? 
0: row[v].to_s.size) - } - end - end - # Returns a vector with the mean for a set of fields - # if fields parameter is empty, return the mean for all fields - # if max invalid parameter > 0, returns the mean for all tuples - # with 0 to max_invalid invalid fields - def vector_mean(fields=nil, max_invalid=0) - a=[] - fields=check_fields(fields) - size=fields.size - each_with_index do |row, i | - # numero de invalidos - sum=0 - invalids=0 - fields.each{|f| - if !@vectors[f].data_with_nils[i].nil? - sum+=row[f].to_f - else - invalids+=1 - end - } - if(invalids>max_invalid) - a.push(nil) - else - a.push(sum.quo(size-invalids)) - end - end - a=a.to_vector(:scale) - a.name=_("Means from %s") % @name - a - end - # Check vectors for type and size. - def check_length # :nodoc: - size=nil - @vectors.each do |k,v| - raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector - if size.nil? - size=v.size - else - if v.size!=size - raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}" - end - end - end - @cases=size - end - # Retrieves each vector as [key, vector] - def each_vector # :yield: |key, vector| - @fields.each{|k| yield k, @vectors[k]} - end - - if Statsample::STATSAMPLE__.respond_to?(:case_as_hash) - def case_as_hash(c) # :nodoc: - Statsample::STATSAMPLE__.case_as_hash(self,c) - end - else - # Retrieves case i as a hash - def case_as_hash(i) - _case_as_hash(i) - end - end - - if Statsample::STATSAMPLE__.respond_to?(:case_as_array) - def case_as_array(c) # :nodoc: - Statsample::STATSAMPLE__.case_as_array(self,c) - end - else - # Retrieves case i as a array, ordered on #fields order - def case_as_array(i) - _case_as_array(i) - end - end - def _case_as_hash(c) # :nodoc: - @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a } - end - def _case_as_array(c) # :nodoc: - @fields.collect {|x| @vectors[x][c]} - end - - # Returns each case as a hash - def each - begin - @i=0 - @cases.times {|i| - @i=i - row=case_as_hash(i) - yield row - } - @i=nil - rescue =>e - raise DatasetException.new(self, e) - end - end - - # Returns each case as hash and index - def each_with_index # :yield: |case, i| - begin - @i=0 - @cases.times{|i| - @i=i - row=case_as_hash(i) - yield row, i - } - @i=nil - rescue =>e - raise DatasetException.new(self, e) - end - end - - # Returns each case as an array, coding missing values as nils - def each_array_with_nils - m=fields.size - @cases.times {|i| - @i=i - row=Array.new(m) - fields.each_index{|j| - f=fields[j] - row[j]=@vectors[f].data_with_nils[i] - } - yield row - } - @i=nil - end - # Returns each case as an array - def each_array - @cases.times {|i| - @i=i - row=case_as_array(i) - yield row - } - @i=nil - end - # Set fields order. If you omit one or more vectors, they are - # ordered by alphabetic order. - def fields=(f) - @fields=f - check_order - end - # Check congruence between +fields+ attribute - # and keys on +vectors - def check_order #:nodoc: - if(@vectors.keys.sort!=@fields.sort) - @fields=@fields&@vectors.keys - @fields+=@vectors.keys.sort-@fields - end - end - # Returns the vector named i - def[](i) - if i.is_a? Range - fields=from_to(i.begin,i.end) - clone(*fields) - elsif i.is_a? Array - clone(i) - else - raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i) - @vectors[i] - end - end - # Retrieves a Statsample::Vector, based on the result - # of calculation performed on each case. 
- def collect(type=:scale) - data=[] - each {|row| - data.push yield(row) - } - Statsample::Vector.new(data,type) - end - # Same as Statsample::Vector.collect, but giving case index as second parameter on yield. - def collect_with_index(type=:scale) - data=[] - each_with_index {|row, i| - data.push(yield(row, i)) - } - Statsample::Vector.new(data,type) - end - # Recode a vector based on a block - def recode!(vector_name) - 0.upto(@cases-1) {|i| - @vectors[vector_name].data[i]=yield case_as_hash(i) - } - @vectors[vector_name].set_valid_data - end - - def crosstab(v1,v2,opts={}) - Statsample::Crosstab.new(@vectors[v1], @vectors[v2],opts) - end - def[]=(i,v) - if v.instance_of? Statsample::Vector - @vectors[i]=v - check_order - else - raise ArgumentError,"Should pass a Statsample::Vector" - end - end - # Return data as a matrix. Column are ordered by #fields and - # rows by orden of insertion - def to_matrix - rows=[] - self.each_array{|c| - rows.push(c) - } - Matrix.rows(rows) - end - - if Statsample.has_gsl? - def clear_gsl - @gsl=nil - end - - def to_gsl - if @gsl.nil? - if cases.nil? - update_valid_data - end - @gsl=GSL::Matrix.alloc(cases,fields.size) - self.each_array{|c| - @gsl.set_row(@i,c) - } - end - @gsl - end - - end - - # Return a correlation matrix for fields included as parameters. - # By default, uses all fields of dataset - def correlation_matrix(fields=nil) - if fields - ds=clone(fields) - else - ds=self - end - Statsample::Bivariate.correlation_matrix(ds) - end - # Return a correlation matrix for fields included as parameters. - # By default, uses all fields of dataset - def covariance_matrix(fields=nil) - if fields - ds=clone(fields) - else - ds=self - end - Statsample::Bivariate.covariance_matrix(ds) - end - - # Create a new dataset with all cases which the block returns true - def filter - ds=self.dup_empty - each {|c| - ds.add_case(c, false) if yield c - } - ds.update_valid_data - ds.name=_("%s(filtered)") % @name - ds - end - - # creates a new vector with the data of a given field which the block returns true - def filter_field(field) - a=[] - each do |c| - a.push(c[field]) if yield c - end - a.to_vector(@vectors[field].type) - end - - # Creates a Stastample::Multiset, using one or more fields - # to split the dataset. - - - def to_multiset_by_split(*fields) - require 'statsample/multiset' - if fields.size==1 - to_multiset_by_split_one_field(fields[0]) - else - to_multiset_by_split_multiple_fields(*fields) - end - end - # Creates a Statsample::Multiset, using one field - - def to_multiset_by_split_one_field(field) - raise ArgumentError,"Should use a correct field name" if !@fields.include? field - factors=@vectors[field].factors - ms=Multiset.new_empty_vectors(@fields, factors) - each {|c| - ms[c[field]].add_case(c,false) - } - #puts "Ingreso a los dataset" - ms.datasets.each {|k,ds| - ds.update_valid_data - ds.name=@vectors[field].labeling(k) - ds.vectors.each{|k1,v1| - # puts "Vector #{k1}:"+v1.to_s - v1.type=@vectors[k1].type - v1.name=@vectors[k1].name - v1.labels=@vectors[k1].labels - - } - } - ms - end - def to_multiset_by_split_multiple_fields(*fields) - factors_total=nil - fields.each do |f| - if factors_total.nil? 
- factors_total=@vectors[f].factors.collect{|c| - [c] - } - else - suma=[] - factors=@vectors[f].factors - factors_total.each{|f1| factors.each{|f2| suma.push(f1+[f2]) } } - factors_total=suma - end - end - ms=Multiset.new_empty_vectors(@fields,factors_total) - - p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }" - each{|c| p1.call(c)} - - ms.datasets.each do |k,ds| - ds.update_valid_data - ds.name=fields.size.times.map {|i| - f=fields[i] - sk=k[i] - @vectors[f].labeling(sk) - }.join("-") - ds.vectors.each{|k1,v1| - v1.type=@vectors[k1].type - v1.name=@vectors[k1].name - v1.labels=@vectors[k1].labels - - } - end - ms - - end - # Returns a vector, based on a string with a calculation based - # on vector - # The calculation will be eval'ed, so you can put any variable - # or expression valid on ruby - # For example: - # a=[1,2].to_vector(scale) - # b=[3,4].to_vector(scale) - # ds={'a'=>a,'b'=>b}.to_dataset - # ds.compute("a+b") - # => Vector [4,6] - def compute(text) - @fields.each{|f| - if @vectors[f].type=:scale - text.gsub!(f,"row['#{f}'].to_f") - else - text.gsub!(f,"row['#{f}']") - end - } - collect_with_index {|row, i| - invalid=false - @fields.each{|f| - if @vectors[f].data_with_nils[i].nil? - invalid=true - end - } - if invalid - nil - else - eval(text) - end - } - end - # Test each row with one or more tests - # each test is a Proc with the form - # Proc.new {|row| row['age']>0} - # The function returns an array with all errors - def verify(*tests) - if(tests[0].is_a? String) - id=tests[0] - tests.shift - else - id=@fields[0] - end - vr=[] - i=0 - each do |row| - i+=1 - tests.each{|test| - if ! test[2].call(row) - values="" - if test[1].size>0 - values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")" - end - vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}") - end - } - end - vr - end - def to_s - "#<"+self.class.to_s+":"+self.object_id.to_s+" @name=#{@name} @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s - end - def inspect - self.to_s - end - # Creates a new dataset for one to many relations - # on a dataset, based on pattern of field names. - # - # for example, you have a survey for number of children - # with this structure: - # id, name, child_name_1, child_age_1, child_name_2, child_age_2 - # with - # ds.one_to_many(%w{id}, "child_%v_%n" - # the field of first parameters will be copied verbatim - # to new dataset, and fields which responds to second - # pattern will be added one case for each different %n. - # For example - # cases=[ - # ['1','george','red',10,'blue',20,nil,nil], - # ['2','fred','green',15,'orange',30,'white',20], - # ['3','alfred',nil,nil,nil,nil,nil,nil] - # ] - # ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3}) - # cases.each {|c| ds.add_case_array c } - # ds.one_to_many(['id'],'car_%v%n').to_matrix - # => Matrix[ - # ["red", "1", 10], - # ["blue", "1", 20], - # ["green", "2", 15], - # ["orange", "2", 30], - # ["white", "2", 20] - # ] - # - def one_to_many(parent_fields, pattern) - #base_pattern=pattern.gsub(/%v|%n/,"") - re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)") - ds_vars=parent_fields - vars=[] - max_n=0 - h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a } - # Adding _row_id - h['_col_id']=[].to_scale - ds_vars.push("_col_id") - @fields.each do |f| - if f=~re - if !vars.include? 
$1 - vars.push($1) - h[$1]=Statsample::Vector.new([], @vectors[f].type) - end - max_n=$2.to_i if max_n < $2.to_i - end - end - ds=Dataset.new(h,ds_vars+vars) - each do |row| - row_out={} - parent_fields.each do |f| - row_out[f]=row[f] - end - max_n.times do |n1| - n=n1+1 - any_data=false - vars.each do |v| - data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)] - row_out[v]=data - any_data=true if !data.nil? - end - if any_data - row_out["_col_id"]=n - ds.add_case(row_out,false) - end - - end - end - ds.update_valid_data - ds - end - def report_building(b) - b.section(:name=>@name) do |g| - g.text _"Cases: %d" % cases - @fields.each do |f| - g.text "Element:[#{f}]" - g.parse_element(@vectors[f]) - end - end - end - end + alias :to_dataset :to_dataframe end diff --git a/lib/statsample/dominanceanalysis.rb b/lib/statsample/dominanceanalysis.rb index 6b4da5a..fed0a91 100644 --- a/lib/statsample/dominanceanalysis.rb +++ b/lib/statsample/dominanceanalysis.rb @@ -7,13 +7,13 @@ module Statsample # # == Use # - # a=1000.times.collect {rand}.to_scale - # b=1000.times.collect {rand}.to_scale - # c=1000.times.collect {rand}.to_scale - # ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset - # ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()} - # da=Statsample::DominanceAnalysis.new(ds,'y') - # puts da.summary + # a = Daru::Vector.new(1000.times.collect {rand}) + # b = Daru::Vector.new(1000.times.collect {rand}) + # c = Daru::Vector.new(1000.times.collect {rand}) + # ds= Daru::DataFrame.new({:a => a,:b => b,:c => c}) + # ds[:y] = ds.collect_rows {|row| row[:a]*5 + row[:b]*3 + row[:c]*2 + rand()} + # da=Statsample::DominanceAnalysis.new(ds, :y) + # puts da.summary # # === Output: # @@ -115,21 +115,21 @@ def initialize(input, dependent, opts=Hash.new) } @dependent=dependent @dependent=[@dependent] unless @dependent.is_a? Array - - @predictors ||= input.fields-@dependent - - @name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil? - - if input.is_a? Statsample::Dataset + + if input.kind_of? Daru::DataFrame + @predictors ||= input.vectors.to_a - @dependent @ds=input @matrix=Statsample::Bivariate.correlation_matrix(input) @cases=Statsample::Bivariate.min_n_valid(input) elsif input.is_a? ::Matrix + @predictors ||= input.fields-@dependent @ds=nil @matrix=input else raise ArgumentError.new("You should use a Matrix or a Dataset") end + + @name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil? @models=nil @models_data=nil @general_averages=nil @@ -264,22 +264,21 @@ def general_dominance end def md(m) - models_data[m.sort {|a,b| a.to_s<=>b.to_s}] + models_data[m.sort {|a,b| a.to_s <=> b.to_s}] end # Get all model of size k def md_k(k) out=[] - @models.each{|m| out.push(md(m)) if m.size==k } + @models.each{ |m| out.push(md(m)) if m.size==k } out end # For a hash with arrays of numbers as values # Returns a hash with same keys and # value as the mean of values of original hash - def get_averages(averages) out={} - averages.each{|key,val| out[key]=val.to_vector(:scale).mean } + averages.each{ |key,val| out[key] = Daru::Vector.new(val).mean } out end # Hash with average for each k size model. 
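The substitution pattern above repeats through the rest of the changeset: `to_scale` becomes `Daru::Vector.new`, `to_dataset` becomes `Daru::DataFrame.new`, `ds.fields` becomes `ds.vectors.to_a`, `ds.cases` becomes `ds.nrows`, and row iteration moves from `each` to `each_row`. Roughly, side by side:

    # before this changeset
    a  = 100.times.collect { rand }.to_scale
    ds = { 'a' => a }.to_dataset
    ds.fields # => ['a']

    # after this changeset
    a  = Daru::Vector.new(100.times.collect { rand })
    ds = Daru::DataFrame.new(:a => a)
    ds.vectors.to_a # => [:a]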
diff --git a/lib/statsample/dominanceanalysis/bootstrap.rb b/lib/statsample/dominanceanalysis/bootstrap.rb index 32d1588..d81a6fd 100644 --- a/lib/statsample/dominanceanalysis/bootstrap.rb +++ b/lib/statsample/dominanceanalysis/bootstrap.rb @@ -5,16 +5,16 @@ class DominanceAnalysis # # == Usage # - # require 'statsample' - # a=100.times.collect {rand}.to_scale - # b=100.times.collect {rand}.to_scale - # c=100.times.collect {rand}.to_scale - # d=100.times.collect {rand}.to_scale - # ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset - # ds['y']=ds.collect{|row| row['a']*5+row['b']*2+row['c']*2+row['d']*2+10*rand()} - # dab=Statsample::DominanceAnalysis::Bootstrap.new(ds2, 'y', :debug=>true) - # dab.bootstrap(100,nil) - # puts dab.summary + # require 'statsample' + # a = Daru::Vector.new(100.times.collect {rand}) + # b = Daru::Vector.new(100.times.collect {rand}) + # c = Daru::Vector.new(100.times.collect {rand}) + # d = Daru::Vector.new(100.times.collect {rand}) + # ds = Daru::DataFrame.new({:a => a,:b => b,:c => c,:d => d}) + # ds[:y] = ds.collect_rows { |row| row[:a]*5+row[:b]*2+row[:c]*2+row[:d]*2+10*rand() } + # dab=Statsample::DominanceAnalysis::Bootstrap.new(ds, :y, :debug=>true) + # dab.bootstrap(100,nil) + # puts dab.summary # Output # Sample size: 100 # t: 1.98421693632958 @@ -91,28 +91,28 @@ class Bootstrap ALPHA=0.95 # Create a new Dominance Analysis Bootstrap Object # - # * ds: A Dataset object + # * ds: A Daru::DataFrame object # * y_var: Name of dependent variable # * opts: Any other attribute of the class def initialize(ds,y_var, opts=Hash.new) - @ds=ds - @y_var=y_var - @n=ds.cases + @ds = ds + @y_var = y_var.respond_to?(:to_sym) ? y_var.to_sym : y_var + @n = ds.nrows @n_samples=0 @alpha=ALPHA @debug=false if y_var.is_a? Array - @fields=ds.fields-y_var + @fields=ds.vectors.to_a - y_var @regression_class=Regression::Multiple::MultipleDependent else - @fields=ds.fields-[y_var] + @fields=ds.vectors.to_a - [y_var] @regression_class=Regression::Multiple::MatrixEngine end - @samples_ga=@fields.inject({}){|a,v| a[v]=[];a} + @samples_ga=@fields.inject({}) { |a,v| a[v]=[]; a } - @name=_("Bootstrap dominance Analysis: %s over %s") % [ ds.fields.join(",") , @y_var] + @name=_("Bootstrap dominance Analysis: %s over %s") % [ ds.vectors.to_a.join(",") , @y_var] opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } @@ -130,15 +130,14 @@ def da # each sample on @samples_td, @samples_cd, @samples_gd, @samples_ga # # * number_samples: Number of new samples to add - # * n: size of each new sample. If nil, equal to original sample size - + # * n: size of each new sample. 
If nil, equal to original sample size def bootstrap(number_samples,n=nil) number_samples.times{ |t| @n_samples+=1 puts _("Bootstrap %d of %d") % [t+1, number_samples] if @debug - ds_boot=@ds.bootstrap(n) + ds_boot=@ds.bootstrap(n) da_1=DominanceAnalysis.new(ds_boot, @y_var, :regression_class => @regression_class) - + da_1.total_dominance.each{|k,v| @samples_td[k].push(v) } @@ -182,7 +181,7 @@ def report_building(builder) # :nodoc: table.row([_("Complete dominance"),"","","","","","",""]) table.hr @pairs.each{|pair| - std=@samples_td[pair].to_vector(:scale) + std=Daru::Vector.new(@samples_td[pair]) ttd=da.total_dominance_pairwise(pair[0],pair[1]) table.row(summary_pairs(pair,std,ttd)) } @@ -190,7 +189,7 @@ def report_building(builder) # :nodoc: table.row([_("Conditional dominance"),"","","","","","",""]) table.hr @pairs.each{|pair| - std=@samples_cd[pair].to_vector(:scale) + std=Daru::Vector.new(@samples_cd[pair]) ttd=da.conditional_dominance_pairwise(pair[0],pair[1]) table.row(summary_pairs(pair,std,ttd)) @@ -199,7 +198,7 @@ def report_building(builder) # :nodoc: table.row([_("General Dominance"),"","","","","","",""]) table.hr @pairs.each{|pair| - std=@samples_gd[pair].to_vector(:scale) + std=Daru::Vector.new(@samples_gd[pair]) ttd=da.general_dominance_pairwise(pair[0],pair[1]) table.row(summary_pairs(pair,std,ttd)) } @@ -208,10 +207,9 @@ def report_building(builder) # :nodoc: table=ReportBuilder::Table.new(:name=>_("General averages"), :header=>[_("var"), _("mean"), _("se"), _("p.5"), _("p.95")]) @fields.each{|f| - v=@samples_ga[f].to_vector(:scale) + v=Daru::Vector.new(@samples_ga[f]) row=[@ds[f].name, sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))] - table.row(row) - + table.row(row) } generator.parse_element(table) diff --git a/lib/statsample/factor.rb b/lib/statsample/factor.rb index 686b8b2..ac99348 100644 --- a/lib/statsample/factor.rb +++ b/lib/statsample/factor.rb @@ -34,7 +34,7 @@ module Factor # matrix is not appropriate for factor analysis." # def self.anti_image_covariance_matrix(matrix) - s2=Matrix.diag(*(matrix.inverse.diagonal)).inverse + s2=Matrix.diagonal(*(matrix.inverse.diagonal)).inverse aicm=(s2)*matrix.inverse*(s2) aicm.extend(Statsample::CovariateMatrix) aicm.fields=matrix.fields if matrix.respond_to? :fields @@ -42,13 +42,12 @@ def self.anti_image_covariance_matrix(matrix) end def self.anti_image_correlation_matrix(matrix) matrix=matrix.to_matrix - s=Matrix.diag(*(matrix.inverse.diagonal)).sqrt.inverse + s=Matrix.diagonal(*(matrix.inverse.diagonal)).sqrt.inverse aicm=s*matrix.inverse*s aicm.extend(Statsample::CovariateMatrix) aicm.fields=matrix.fields if matrix.respond_to? :fields aicm - end # Kaiser-Meyer-Olkin measure of sampling adequacy for correlation matrix. @@ -101,6 +100,5 @@ def self.kmo_univariate(matrix, var) end sum_r.quo(sum_r+sum_q) end - end end diff --git a/lib/statsample/factor/map.rb b/lib/statsample/factor/map.rb index 963763a..26ac880 100644 --- a/lib/statsample/factor/map.rb +++ b/lib/statsample/factor/map.rb @@ -75,7 +75,8 @@ def compute (ncol-1).times do |m| puts "MAP:Eigenvalue #{m+1}" if $DEBUG - a=loadings[0..(loadings.row_size-1),0..m] + a=use_gsl ? 
loadings[0..(loadings.row_size-1),0..m] : + loadings.minor(0..(loadings.row_size-1),0..m) partcov= gsl_m - (a*a.transpose) d=klass_m.diagonal(*(partcov.diagonal.collect {|v| Math::sqrt(1/v)})) diff --git a/lib/statsample/factor/parallelanalysis.rb b/lib/statsample/factor/parallelanalysis.rb index 5a7ff28..4f9cb48 100644 --- a/lib/statsample/factor/parallelanalysis.rb +++ b/lib/statsample/factor/parallelanalysis.rb @@ -22,13 +22,13 @@ module Factor class ParallelAnalysis def self.with_random_data(cases,vars,opts=Hash.new) - require 'ostruct' - ds=OpenStruct.new - ds.fields=vars.times.map {|i| "v#{i+1}"} - ds.cases=cases + ds= Daru::DataFrame.new({}, + order: vars.times.map {|i| "v#{i+1}".to_sym}, + index: cases ) opts=opts.merge({:bootstrap_method=> :random, :no_data=>true}) new(ds, opts) end + include DirtyMemoize include Summarizable # Number of random sets to produce. 50 by default @@ -61,9 +61,9 @@ def self.with_random_data(cases,vars,opts=Hash.new) attr_accessor :use_gsl def initialize(ds, opts=Hash.new) @ds=ds - @fields=@ds.fields + @fields=@ds.vectors.to_a @n_variables=@fields.size - @n_cases=ds.cases + @n_cases=ds.nrows opts_default={ :name=>_("Parallel Analysis"), :iterations=>50, # See Liu and Rijmen (2008) @@ -82,7 +82,7 @@ def initialize(ds, opts=Hash.new) # Number of factor to retent def number_of_factors total=0 - ds_eigenvalues.fields.each_with_index do |f,i| + ds_eigenvalues.vectors.to_a.each_with_index do |f,i| if (@original[i]>0 and @original[i]>ds_eigenvalues[f].percentil(percentil)) total+=1 else @@ -101,7 +101,7 @@ def report_building(g) #:nodoc: s.text _("Number of iterations: %d") % @iterations if @no_data s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("generated eigenvalue"), "p.#{percentil}"]) do |t| - ds_eigenvalues.fields.each_with_index do |f,i| + ds_eigenvalues.vectors.to_a.each_with_index do |f,i| v=ds_eigenvalues[f] t.row [i+1, "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), ] end @@ -109,7 +109,7 @@ def report_building(g) #:nodoc: else s.text _("Number or factors to preserve: %d") % number_of_factors s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("data eigenvalue"), _("generated eigenvalue"),"p.#{percentil}",_("preserve?")]) do |t| - ds_eigenvalues.fields.each_with_index do |f,i| + ds_eigenvalues.vectors.to_a.each_with_index do |f,i| v=ds_eigenvalues[f] t.row [i+1, "%0.4f" % @original[i], "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), (v.percentil(percentil)>0 and @original[i] > v.percentil(percentil)) ? "Yes":""] end @@ -120,11 +120,9 @@ def report_building(g) #:nodoc: end # Perform calculation. 
Shouldn't be called directly for the user def compute + @original=Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data + @ds_eigenvalues=Daru::DataFrame.new({}, order: (1..@n_variables).map{|v| ("ev_%05d" % v).to_sym}) - - @original=Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data - @ds_eigenvalues=Statsample::Dataset.new((1..@n_variables).map{|v| "ev_%05d" % v}) - @ds_eigenvalues.fields.each {|f| @ds_eigenvalues[f].type=:scale} if bootstrap_method==:parameter or bootstrap_method==:random rng = Distribution::Normal.rng end @@ -133,19 +131,18 @@ def compute begin puts "#{@name}: Iteration #{i}" if $DEBUG or debug # Create a dataset of dummy values - ds_bootstrap=Statsample::Dataset.new(@ds.fields) + ds_bootstrap = Daru::DataFrame.new({}, order: @ds.vectors, index: @n_cases) @fields.each do |f| if bootstrap_method==:random - ds_bootstrap[f]=@n_cases.times.map {|c| rng.call}.to_scale + ds_bootstrap[f] = Daru::Vector.new(@n_cases.times.map {|c| rng.call}) elsif bootstrap_method==:data - ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases) + ds_bootstrap[f] = ds[f].sample_with_replacement(@n_cases) else raise "bootstrap_method doesn't recogniced" end end - ds_bootstrap.update_valid_data - + matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap) matrix=matrix.to_gsl if @use_gsl if smc @@ -155,13 +152,12 @@ def compute end end ev=matrix.eigenvalues - @ds_eigenvalues.add_case_array(ev) + @ds_eigenvalues.add_row(ev) rescue Statsample::Bivariate::Tetrachoric::RequerimentNotMeet => e puts "Error: #{e}" if $DEBUG redo end end - @ds_eigenvalues.update_valid_data end dirty_memoize :number_of_factors, :ds_eigenvalues dirty_writer :iterations, :bootstrap_method, :percentil, :smc diff --git a/lib/statsample/factor/pca.rb b/lib/statsample/factor/pca.rb index fa5fb37..799c185 100644 --- a/lib/statsample/factor/pca.rb +++ b/lib/statsample/factor/pca.rb @@ -13,11 +13,11 @@ module Factor # # == Usage: # require 'statsample' - # a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale - # b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale - # ds={'a'=>a,'b'=>b}.to_dataset - # cor_matrix=Statsample::Bivariate.correlation_matrix(ds) - # pca=Statsample::Factor::PCA.new(cor_matrix) + # a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1]) + # b = Daru::Vector.new([2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9]) + # ds = Daru::DataFrame.new({:a => a,:b => b}) + # cor_matrix = Statsample::Bivariate.correlation_matrix(ds) + # pca= Statsample::Factor::PCA.new(cor_matrix) # pca.m # => 1 # pca.eigenvalues @@ -52,11 +52,13 @@ class PCA attr_accessor :rotation_type attr_accessor :matrix_type def initialize(matrix, opts=Hash.new) - @use_gsl=nil + @use_gsl = opts[:use_gsl] + opts.delete :use_gsl + @name=_("Principal Component Analysis") @matrix=matrix @n_variables=@matrix.column_size - @variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| _("VAR_%d") % (i+1)} + @variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| "VAR_#{i+1}".to_sym } @matrix_type = @matrix.respond_to?(:_type) ? @matrix._type : :correlation @@ -67,13 +69,14 @@ def initialize(matrix, opts=Hash.new) opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } + if @use_gsl.nil? @use_gsl=Statsample.has_gsl? end if @matrix.respond_to? 
:fields @variables_names=@matrix.fields else - @variables_names=@n_variables.times.map {|i| "V#{i+1}"} + @variables_names=@n_variables.times.map {|i| "V#{i+1}".to_sym} end calculate_eigenpairs @@ -81,7 +84,6 @@ def initialize(matrix, opts=Hash.new) # Set number of factors with eigenvalues > 1 @m=@eigenpairs.find_all {|ev,ec| ev>=1.0}.size end - end def rotation @rotation_type.new(component_matrix) @@ -92,10 +94,10 @@ def total_eigenvalues def create_centered_ds h={} @original_ds.factors.each {|f| - mean=@original_ds[f].mean - h[f]=@original_ds[f].recode {|c| c-mean} + mean = @original_ds[f].mean + h[f] = @original_ds[f].recode {|c| c-mean} } - @ds=h.to_dataset + @ds = Daru::DataFrame.new(h) end # Feature matrix for +m+ factors @@ -137,8 +139,8 @@ def principal_components(input, m=nil) pcs=(fv.transpose*data_matrix.transpose).transpose pcs.extend Statsample::NamedMatrix - pcs.fields_y=m.times.map {|i| "PC_%d" % (i+1)} - pcs.to_dataset + pcs.fields_y = m.times.map { |i| "PC_#{i+1}".to_sym } + pcs.to_dataframe end def component_matrix(m=nil) var="component_matrix_#{matrix_type}" @@ -159,7 +161,7 @@ def component_matrix_covariance(m=nil) cm.extend NamedMatrix cm.name=_("Component matrix (from covariance)") cm.fields_x = @variables_names - cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)} + cm.fields_y = m.times.map {|i| "PC_#{i+1}".to_sym } cm end @@ -180,17 +182,16 @@ def component_matrix_correlation(m=nil) cm.extend CovariateMatrix cm.name=_("Component matrix") cm.fields_x = @variables_names - cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)} + cm.fields_y = m.times.map { |i| "PC_#{i+1}".to_sym } cm end def communalities(m=nil) - m||=@m h=[] @n_variables.times do |i| sum=0 m.times do |j| - sum+=(@eigenpairs[j][0].abs*@eigenpairs[j][1][i]**2) + sum += (@eigenpairs[j][0].abs*@eigenpairs[j][1][i]**2) end h.push(sum) end @@ -202,11 +203,11 @@ def eigenvalues end def eigenvectors @eigenpairs.collect {|c| - @use_gsl ? c[1].to_gsl : c[1].to_vector + @use_gsl ? c[1].to_gsl : Daru::Vector.new(c[1]) } end def calculate_eigenpairs - @eigenpairs= @use_gsl ? @matrix.to_gsl.eigenpairs : @matrix.to_matrix.eigenpairs_ruby + @eigenpairs= @use_gsl ? 
@matrix.to_gsl.eigenpairs : @matrix.to_matrix.eigenpairs_ruby
     end
diff --git a/lib/statsample/factor/principalaxis.rb b/lib/statsample/factor/principalaxis.rb
index 4420bf3..1df7aa7 100644
--- a/lib/statsample/factor/principalaxis.rb
+++ b/lib/statsample/factor/principalaxis.rb
@@ -6,9 +6,9 @@ module Factor
   #
   # == Usage:
   # require 'statsample'
-  # a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
-  # b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
-  # ds={'a'=>a,'b'=>b}.to_dataset
+  # a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1])
+  # b = Daru::Vector.new([2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9])
+  # ds= Daru::DataFrame.new({:a => a,:b => b})
   # cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
   # pa=Statsample::Factor::PrincipalAxis.new(cor_matrix)
   # pa.iterate(1)
diff --git a/lib/statsample/formula/fit_model.rb b/lib/statsample/formula/fit_model.rb
new file mode 100644
index 0000000..5ed76b3
--- /dev/null
+++ b/lib/statsample/formula/fit_model.rb
@@ -0,0 +1,46 @@
+require 'statsample/formula/formula'
+
+module Statsample
+  # Class for performing regression
+  class FitModel
+    def initialize(formula, df, opts = {})
+      @formula = FormulaWrapper.new formula, df
+      @df = df
+      @opts = opts
+    end
+
+    def model
+      @model || fit_model
+    end
+
+    def predict(new_data)
+      model.predict(df_for_prediction(new_data))
+    end
+
+    def df_for_prediction df
+      canonicalize_df(df)
+    end
+
+    def df_for_regression
+      df = canonicalize_df(@df)
+      df[@formula.y.value] = @df[@formula.y.value]
+      df
+    end
+
+    def canonicalize_df(orig_df)
+      tokens = @formula.canonical_tokens
+      tokens.shift if tokens.first.value == '1'
+      df = tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
+      df
+    end
+
+    def fit_model
+      # TODO: Add support for inclusion/exclusion of intercept
+      @model = Statsample::Regression.multiple(
+        df_for_regression,
+        @formula.y.value,
+        @opts
+      )
+    end
+  end
+end
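FitModel lands without a usage example; a minimal sketch of the intended call pattern, reusing the fixture from the FormulaWrapper docs below (`new_df` is a hypothetical frame holding the same predictor columns):

    df = Daru::DataFrame.from_csv 'spec/data/df.csv'
    df.to_category 'c'

    model = Statsample::FitModel.new 'y~a+c', df
    model.model            # the fitted Statsample::Regression object
    model.predict(new_df)  # predictions for the hypothetical new_df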
diff --git a/lib/statsample/formula/formula.rb b/lib/statsample/formula/formula.rb
new file mode 100644
index 0000000..47d5943
--- /dev/null
+++ b/lib/statsample/formula/formula.rb
@@ -0,0 +1,306 @@
+module Statsample
+  # This class recognizes what terms are numeric
+  # and accordingly forms groups which are fed to Formula.
+  # Once they are parsed with Formula, they are combined back.
+  class FormulaWrapper
+    attr_reader :tokens, :y, :canonical_tokens
+
+    # Initializes formula wrapper object to parse a given formula into
+    # some tokens which do not overlap one another.
+    # @note Specify 0 as a term in the formula if you do not want the constant
+    #   to be included in the parsed formula
+    # @param [String] formula the formula to parse
+    # @param [Daru::DataFrame] df dataframe required to know what vectors
+    #   are numerical
+    # @example
+    #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
+    #   df.to_category 'c', 'd', 'e'
+    #   formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
+    #   formula.canonical_to_s
+    #   #=> "1+c(-)+d(-):c+a"
+    def initialize(formula, df)
+      @df = df
+      # @y stores the LHS term, the name of the vector to be predicted
+      # @tokens stores the RHS terms of the formula
+      @y, *@tokens = split_to_tokens(formula)
+      @tokens = @tokens.uniq.sort
+      manage_constant_term
+      @canonical_tokens = non_redundant_tokens
+    end
+
+    # Returns canonical tokens in a readable form.
+    # @return [String] canonical tokens in a readable form.
+    # @note 'y~a+b(-)' means 'a' exists in the full rank expansion
+    #   and 'b(-)' exists in the reduced rank expansion
+    # @example
+    #   df = Daru::DataFrame.from_csv 'spec/data/df.csv'
+    #   df.to_category 'c', 'd', 'e'
+    #   formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
+    #   formula.canonical_to_s
+    #   #=> "1+c(-)+d(-):c+a"
+    def canonical_to_s
+      canonical_tokens.join '+'
+    end
+
+    # Returns tokens to produce a non-redundant design matrix
+    # @return [Array] array of tokens that do not produce a redundant matrix
+    def non_redundant_tokens
+      groups = split_to_groups
+      # TODO: An enhancement
+      # Right now x:c appears as c:x
+      groups.each { |k, v| groups[k] = strip_numeric v, k }
+      groups.each { |k, v| groups[k] = Formula.new(v).canonical_tokens }
+      groups.flat_map { |k, v| add_numeric v, k }
+    end
+
+    private
+
+    # Removes intercept token if term '0' is found in the formula.
+    # Intercept token remains if term '1' is found.
+    # If neither term '0' nor term '1' is found, the intercept token is added.
+    def manage_constant_term
+      @tokens.unshift Token.new('1') unless
+        @tokens.include?(Token.new('1')) ||
+        @tokens.include?(Token.new('0'))
+      @tokens.delete Token.new('0')
+    end
+
+    # Groups the tokens into groups based on the numerical terms
+    # they are interacting with.
+    def split_to_groups
+      @tokens.group_by { |t| extract_numeric t }
+    end
+
+    # Add numeric interaction term which was removed earlier
+    # @param [Array] tokens tokens on which to add numerical terms
+    # @param [Array] numeric array of numeric terms to add
+    def add_numeric(tokens, numeric)
+      tokens.map do |t|
+        terms = t.interact_terms + numeric
+        if terms == ['1']
+          Token.new('1')
+        else
+          terms = terms.reject { |i| i == '1' }
+          Token.new terms.join(':'), t.full
+        end
+      end
+    end
+
+    # Strip numerical interacting terms
+    # @param [Array] tokens tokens from which to strip numeric terms
+    # @param [Array] numeric array of numeric terms to strip from tokens
+    # @return [Array] array of tokens with stripped numerical terms
+    def strip_numeric(tokens, numeric)
+      tokens.map do |t|
+        terms = t.interact_terms - numeric
+        terms = ['1'] if terms.empty?
+        Token.new terms.join(':')
+      end
+    end
+
+    # Extract numeric interacting terms
+    # @param [Statsample::GLM::Token] token token from which to extract numeric terms
+    # @return [Array] array of numeric terms
+    def extract_numeric(token)
+      terms = token.interact_terms
+      return [] if terms == ['1']
+      terms.reject { |t| @df[t].category? }
+    end
+
+    def split_to_tokens(formula)
+      formula = formula.gsub(/\s+/, '')
+      lhs_term, rhs = formula.split '~'
+      rhs_terms = rhs.split '+'
+      ([lhs_term] + rhs_terms).map { |t| Token.new t }
+    end
+  end
+
+  # To process formula language
+  class Formula
+    attr_reader :tokens, :canonical_tokens
+
+    def initialize(tokens)
+      @tokens = tokens
+      @canonical_tokens = parse_formula
+    end
+
+    def canonical_to_s
+      canonical_tokens.join '+'
+    end
+
+    private
+
+    def parse_formula
+      @tokens.inject([]) do |acc, token|
+        acc + add_non_redundant_elements(token, acc)
+      end
+    end
+
+    def add_non_redundant_elements(token, result_so_far)
+      return [token] if token.value == '1'
+      tokens = token.expand
+      result_so_far = result_so_far.flat_map(&:expand)
+      tokens -= result_so_far
+      contract_if_possible tokens
+    end
+
+    def contract_if_possible(tokens)
+      tokens.combination(2).each do |a, b|
+        result = a.add b
+        next unless result
+        tokens.delete a
+        tokens.delete b
+        tokens << result
+        return contract_if_possible tokens
+      end
+      tokens.sort
+    end
+  end
+
+  # To encapsulate interaction as well as non-interaction terms
+  class Token
+    attr_reader :value, :full, :interact_terms
+
+    def initialize(value, full = true)
+      @interact_terms = value.include?(':') ? value.split(':') : [value]
+      @full = coerce_full full
+    end
+
+    def value
+      interact_terms.join(':')
+    end
+
+    def size
+      # TODO: Return size 1 for value '1' also.
+      # Can't do this at the moment because we have to make
+      # changes in sorting first
+      value == '1' ? 0 : interact_terms.size
+    end
+
+    def add(other)
+      # ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
+      # ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
+      if size > other.size
+        other.add self
+
+      elsif other.size == 2 &&
+            size == 1 &&
+            other.interact_terms.last == value &&
+            other.full.last == full.first &&
+            other.full.first == false
+        Token.new(
+          "#{other.interact_terms.first}:#{value}",
+          [true, other.full.last]
+        )
+
+      elsif other.size == 2 &&
+            size == 1 &&
+            other.interact_terms.first == value &&
+            other.full.first == full.first &&
+            other.full.last == false
+        Token.new(
+          "#{value}:#{other.interact_terms.last}",
+          [other.full.first, true]
+        )
+
+      elsif value == '1' &&
+            other.size == 1
+        Token.new(other.value, true)
+      end
+    end
+
+    def ==(other)
+      value == other.value &&
+        full == other.full
+    end
+
+    alias eql? ==
+
+    def hash
+      value.hash ^ full.hash
+    end
+
+    def <=>(other)
+      size <=> other.size
+    end
+
+    def to_s
+      interact_terms
+        .zip(full)
+        .map { |t, f| f ? t : t + '(-)' }
+        .join ':'
+    end
+
+    def expand
+      case size
+      when 0
+        [self]
+      when 1
+        [Token.new('1'), Token.new(value, false)]
+      when 2
+        a, b = interact_terms
+        [Token.new('1'), Token.new(a, false), Token.new(b, false),
+         Token.new(a + ':' + b, [false, false])]
+      end
+    end
+
+    def to_df(df)
+      case size
+      when 1
+        if df[value].category?
+          df[value].contrast_code full: full.first
+        else
+          Daru::DataFrame.new value => df[value].to_a
+        end
+      when 2
+        to_df_when_interaction(df)
+      end
+    end
+
+    private
+
+    def coerce_full(value)
+      if value.is_a? Array
+        value + Array.new((@interact_terms.size - value.size), true)
+      else
+        [value] * @interact_terms.size
+      end
+    end
+
+    def to_df_when_interaction(df)
+      case interact_terms.map { |t| df[t].category?
} + when [true, true] + df.interact_code(interact_terms, full) + when [false, false] + to_df_numeric_interact_with_numeric df + when [true, false] + to_df_category_interact_with_numeric df + when [false, true] + to_df_numeric_interact_with_category df + end + end + + def to_df_numeric_interact_with_numeric(df) + Daru::DataFrame.new value => (df[interact_terms.first] * + df[interact_terms.last]).to_a + end + + def to_df_category_interact_with_numeric(df) + a, b = interact_terms + Daru::DataFrame.new( + df[a].contrast_code(full: full.first) + .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] } + .to_h + ) + end + + def to_df_numeric_interact_with_category(df) + a, b = interact_terms + Daru::DataFrame.new( + df[b].contrast_code(full: full.last) + .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] } + .to_h + ) + end + end +end diff --git a/lib/statsample/graph/boxplot.rb b/lib/statsample/graph/boxplot.rb index da1cd7d..f07b7d2 100644 --- a/lib/statsample/graph/boxplot.rb +++ b/lib/statsample/graph/boxplot.rb @@ -8,12 +8,12 @@ module Graph # # == Usage # === Svg output - # a=[1,2,3,4].to_scale - # b=[3,4,5,6].to_scale - # puts Statsample::Graph::Boxplot.new(:vectors=>[a,b]).to_svg + # a = Daru::Vector.new([1,2,3,4]) + # b = Daru::Vector.new([3,4,5,6]) + # puts Statsample::Graph::Boxplot.new(:vectors=>[a,b]).to_svg # === Using ReportBuilder - # a=[1,2,3,4].to_scale - # b=[3,4,5,6].to_scale + # a = Daru::Vector.new([1,2,3,4]) + # b = Daru::Vector.new([3,4,5,6]) # rb=ReportBuilder.new # rb.add(Statsample::Graph::Boxplot.new(:vectors=>[a,b])) # rb.save_html('boxplot.html') @@ -85,8 +85,6 @@ def rubyvis_panel # :nodoc: min||=@vectors.map {|v| v.min}.min max||=@vectors.map {|v| v.max}.max - - margin_hor=margin_left + margin_right margin_vert=margin_top + margin_bottom x_scale = pv.Scale.ordinal(@vectors.size.times.map.to_a).split_banded(0, width-margin_hor, 4.0/5) @@ -115,12 +113,10 @@ def rubyvis_panel # :nodoc: out[:low_whisker]=min out[:high_whisker]=max # And now, data outside whiskers - out[:outliers]=v.data_with_nils.find_all {|d| d < min or d > max } + out[:outliers]=v.to_a.find_all {|d| d < min or d > max } out } - - - + vis=Rubyvis::Panel.new do |pan| pan.width width - margin_hor pan.height height - margin_vert @@ -157,7 +153,6 @@ def rubyvis_panel # :nodoc: bp.left {|v| x_scale[index]} bp.width x_scale.range_band - # Bar bp.bar do |b| b.bottom {|v| y_scale[v[:percentil_25]]} @@ -168,9 +163,7 @@ def rubyvis_panel # :nodoc: colors.scale(that.groups[parent.index]).darker else colors.scale(index).darker - end - - + end } b.fill_style {|v| if that.groups @@ -237,7 +230,6 @@ def report_building(builder) # :nodoc: builder.section(:name=>name) do |b| b.image(to_svg, :type=>'svg', :width=>width, :height=>height) end - end end end diff --git a/lib/statsample/graph/histogram.rb b/lib/statsample/graph/histogram.rb index 3fd21d7..696cfa5 100644 --- a/lib/statsample/graph/histogram.rb +++ b/lib/statsample/graph/histogram.rb @@ -6,10 +6,10 @@ module Graph # # == Usage # === Svg output - # a=[1,2,3,4].to_scale - # puts Statsample::Graph::Histogram.new(a).to_svg + # a = Daru::Vector.new([1,2,3,4]) + # puts Statsample::Graph::Histogram.new(a).to_svg # === Using ReportBuilder - # a=[1,2,3,4].to_scale + # a = Daru::Vector.new([1,2,3,4]) # rb=ReportBuilder.new # rb.add(Statsample::Graph::Histogram.new(a)) # rb.save_html('histogram.html') @@ -70,7 +70,7 @@ def pre_vis # :nodoc: @hist=@data @mean=@hist.estimated_mean @sd=@hist.estimated_standard_deviation - elsif @data.is_a? 
Statsample::Vector + elsif @data.is_a? Daru::Vector @mean=@data.mean @sd=@data.sd @bins||=Math::sqrt(@data.size).floor diff --git a/lib/statsample/graph/scatterplot.rb b/lib/statsample/graph/scatterplot.rb index d6f2ee8..6bc29bb 100644 --- a/lib/statsample/graph/scatterplot.rb +++ b/lib/statsample/graph/scatterplot.rb @@ -10,12 +10,12 @@ module Graph # The data is displayed as a collection of points, each having the value of one variable determining the position on the horizontal axis and the value of the other variable determining the position on the vertical axis.[2] This kind of plot is also called a scatter chart, scatter diagram and scatter graph. # == Usage # === Svg output - # a=[1,2,3,4].to_scale - # b=[3,4,5,6].to_scale + # a = Daru::Vector.new([1,2,3,4]) + # b = Daru::Vector.new([3,4,5,6]) # puts Statsample::Graph::Scatterplot.new(a,b).to_svg # === Using ReportBuilder - # a=[1,2,3,4].to_scale - # b=[3,4,5,6].to_scale + # a = Daru::Vector.new([1,2,3,4]) + # b = Daru::Vector.new([3,4,5,6]) # rb=ReportBuilder.new # rb.add(Statsample::Graph::Scatterplot.new(a,b)) # rb.save_html('scatter.html') @@ -195,17 +195,18 @@ def rubyvis_panel # :nodoc: end vis end + # Returns SVG with scatterplot def to_svg - rp=rubyvis_panel + rp = rubyvis_panel rp.render rp.to_svg end + def report_building(builder) # :nodoc: builder.section(:name=>name) do |b| b.image(to_svg, :type=>'svg', :width=>width, :height=>height) - end - + end end end end diff --git a/lib/statsample/histogram.rb b/lib/statsample/histogram.rb index be6564e..4825890 100644 --- a/lib/statsample/histogram.rb +++ b/lib/statsample/histogram.rb @@ -37,135 +37,144 @@ module Statsample # == Reference: # * http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html - class Histogram - include Enumerable - class << self - # Alloc +n_bins+, using +range+ as ranges of bins - def alloc(n_bins, range=nil, opts=Hash.new) - Histogram.new(n_bins, range, opts) - - end - # Alloc +n_bins+ bins, using +p1+ as minimum and +p2+ - # as maximum - def alloc_uniform(n_bins, p1=nil,p2=nil) - if p1.is_a? Array - min,max=p1 - else - min,max=p1,p2 - end - range=max - min - step=range / n_bins.to_f - range=(n_bins+1).times.map {|i| min + (step*i)} - Histogram.new(range) - end - end - attr_accessor :name - attr_reader :bin - attr_reader :range - include GetText - bindtextdomain("statsample") - def initialize(p1, min_max=false, opts=Hash.new) + class Histogram + include Enumerable + + class << self + # Alloc +n_bins+, using +range+ as ranges of bins + def alloc(n_bins, range=nil, opts=Hash.new) + Histogram.new(n_bins, range, opts) + end + # Alloc +n_bins+ bins, using +p1+ as minimum and +p2+ + # as maximum + def alloc_uniform(n_bins, p1=nil,p2=nil) if p1.is_a? Array - range=p1 - @n_bins=p1.size-1 - elsif p1.is_a? Integer - @n_bins=p1 + min,max=p1 + else + min,max=p1,p2 end - - @bin=[0.0]*(@n_bins) - if(min_max) - min, max=min_max[0], min_max[1] - range=Array.new(@n_bins+1) - (@n_bins+1).times {|i| range[i]=min+(i*(max-min).quo(@n_bins)) } - end - range||=[0.0]*(@n_bins+1) - set_ranges(range) - @name="" - opts.each{|k,v| - self.send("#{k}=",v) if self.respond_to? k - } + range=max - min + step=range / n_bins.to_f + range=(n_bins+1).times.map {|i| min + (step*i)} + Histogram.new(range) end - # Number of bins - def bins - @n_bins + end + + attr_accessor :name + attr_reader :bin + attr_reader :range + + include GetText + bindtextdomain("statsample") + + def initialize(p1, min_max=false, opts=Hash.new) + + if p1.is_a? 
Array + range=p1 + @n_bins=p1.size-1 + elsif p1.is_a? Integer + @n_bins=p1 end - # - def increment(x, w=1) - if x.respond_to? :each - x.each{|y| increment(y,w) } - elsif x.is_a? Numeric - (range.size-1).times do |i| - if x>=range[i] and xi, :low=>r[0],:high=>r[1], :middle=>(r[0]+r[1]) / 2.0, :value=>@bin[i]} - yield arg - end - end - def estimated_variance - sum,n=0,0 - mean=estimated_mean - each do |v| - sum+=v[:value]*(v[:middle]-mean)**2 - n+=v[:value] - end - sum / (n-1) - end - def estimated_standard_deviation - Math::sqrt(estimated_variance) - end - def estimated_mean - sum,n=0,0 - each do |v| - sum+= v[:value]* v[:middle] - n+=v[:value] - end - sum / n - end - alias :mean :estimated_mean - alias :sigma :estimated_standard_deviation - - def sum(start=nil,_end=nil) - start||=0 - _end||=@n_bins-1 - (start.._end).inject(0) {|ac,i| ac+@bin[i]} + end + + def set_ranges(range) + raise "Range size should be bin+1" if range.size!=@bin.size+1 + @range=range + end + + def get_range(i) + [@range[i],@range[i+1]] + end + + def max + @range.last + end + + def min + @range.first + end + def max_val + @bin.max + end + def min_val + @bin.min + end + def each + bins.times.each do |i| + r=get_range(i) + arg={:i=>i, :low=>r[0],:high=>r[1], :middle=>(r[0]+r[1]) / 2.0, :value=>@bin[i]} + yield arg end - def report_building(generator) - hg=Statsample::Graph::Histogram.new(self) - generator.parse_element(hg) + end + def estimated_variance + sum,n=0,0 + mean=estimated_mean + each do |v| + sum+=v[:value]*(v[:middle]-mean)**2 + n+=v[:value] + end + sum / (n-1) + end + def estimated_standard_deviation + Math::sqrt(estimated_variance) + end + def estimated_mean + sum,n=0,0 + each do |v| + sum+= v[:value]* v[:middle] + n+=v[:value] end - def report_building_text(generator) - @range.each_with_index do |r,i| - next if i==@bin.size - generator.text(sprintf("%5.2f : %d", r, @bin[i])) - end + sum / n + end + alias :mean :estimated_mean + alias :sigma :estimated_standard_deviation + + def sum(start=nil,_end=nil) + start||=0 + _end||=@n_bins-1 + (start.._end).inject(0) {|ac,i| ac+@bin[i]} + end + def report_building(generator) + hg=Statsample::Graph::Histogram.new(self) + generator.parse_element(hg) + end + def report_building_text(generator) + @range.each_with_index do |r,i| + next if i==@bin.size + generator.text(sprintf("%5.2f : %d", r, @bin[i])) end end + end end diff --git a/lib/statsample/matrix.rb b/lib/statsample/matrix.rb index 662bd0a..a7102b0 100644 --- a/lib/statsample/matrix.rb +++ b/lib/statsample/matrix.rb @@ -10,45 +10,46 @@ class ::Matrix def to_matrix self end - def to_dataset - f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) } - ds=Statsample::Dataset.new(f) + + def to_dataframe + f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| "VAR_#{i+1}".to_sym } + f = [f] unless f.is_a?(Array) + ds = Daru::DataFrame.new({}, order: f) f.each do |ff| - ds[ff].type=:scale - ds[ff].name=ff + ds[ff].rename ff end row_size.times {|i| - ds.add_case_array(self.row(i).to_a) + ds.add_row(self.row(i).to_a) } - ds.update_valid_data - ds.name=self.name if self.respond_to? :name + ds.rename(self.name) if self.respond_to? :name ds end + + alias :to_dataset :to_dataframe + if defined? :eigenpairs alias_method :eigenpairs_ruby, :eigenpairs end - + if Statsample.has_gsl? 
# Optimize eigenpairs of extendmatrix module using gsl def eigenpairs to_gsl.eigenpairs end end - + def eigenvalues eigenpairs.collect {|v| v[0]} end + def eigenvectors eigenpairs.collect {|v| v[1]} end + def eigenvectors_matrix Matrix.columns(eigenvectors) end - - - - def to_gsl out=[] self.row_size.times{|i| @@ -56,6 +57,10 @@ def to_gsl } GSL::Matrix[*out] end + + def []=(i, j, x) + @rows[i][j] = x + end end module GSL @@ -64,9 +69,11 @@ class Col def to_matrix ::Matrix.columns([self.size.times.map {|i| self[i]}]) end + def to_ary to_a end + def to_gsl self end @@ -76,53 +83,60 @@ class Matrix def to_gsl self end - - def to_dataset - f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) } - ds=Statsample::Dataset.new(f) + + def to_dataframe + f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map { |i| "VAR_#{i+1}".to_sym } + ds=Daru::DataFrame.new({}, order: f) f.each do |ff| - ds[ff].type=:scale - ds[ff].name=ff + ds[ff].rename ff end + row_size.times {|i| - ds.add_case_array(self.row(i).to_a) + ds.add_row(self.row(i).to_a) } - ds.update_valid_data - ds.name=self.name if self.respond_to? :name + ds.rename(self.name) if self.respond_to? :name ds end - + + alias :to_dataset :to_dataframe + def row_size size1 end + def column_size size2 end + def determinant det end + def inverse GSL::Linalg::LU.invert(self) end + def eigenvalues eigenpairs.collect {|v| v[0]} end + def eigenvectors eigenpairs.collect {|v| v[1]} end - + # Matrix sum of squares def mssq sum=0 to_v.each {|i| sum+=i**2} sum end - + def eigenvectors_matrix eigval, eigvec= GSL::Eigen.symmv(self) GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC) - eigvec + eigvec end + def eigenpairs eigval, eigvec= GSL::Eigen.symmv(self) GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC) @@ -130,19 +144,21 @@ def eigenpairs [eigval[i],eigvec.get_col(i)] } end - + #def eigenpairs_ruby # self.to_matrix.eigenpairs_ruby #end def square? size1==size2 end + def to_matrix rows=self.size1 cols=self.size2 out=(0...rows).collect{|i| (0...cols).collect {|j| self[i,j]} } ::Matrix.rows(out) end + def total_sum sum=0 size1.times {|i| @@ -158,7 +174,7 @@ def total_sum module Statsample # Module to add names to X and Y fields module NamedMatrix - include Summarizable + include Summarizable def fields raise "Should be square" if !square? 
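For reference, a minimal usage sketch (not part of the diff) of the to_dataframe conversion added above, assuming statsample with daru is loaded; the :VAR_1/:VAR_2 vector names are the defaults generated by the hunk:

    require 'statsample'

    m  = ::Matrix[[1, 2], [3, 4]]
    df = m.to_dataframe     # Daru::DataFrame with vectors :VAR_1, :VAR_2
    df[:VAR_1].to_a         # => [1, 3] (first column of the matrix)
    df.row[0].to_a          # => [1, 2] (first row of the matrix)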
@@ -178,10 +194,10 @@ def fields_y=(v) @fields_y=v end def fields_x - @fields_x||=row_size.times.collect {|i| _("X%d") % i} + @fields_x||=row_size.times.collect {|i| _("X%d") % i} end def fields_y - @fields_y||=column_size.times.collect {|i| _("Y%d") % i} + @fields_y||=column_size.times.collect {|i| _("Y%d") % i} end def name @@ -195,13 +211,13 @@ def get_new_name @@named_matrix+=1 _("Matrix %d") % @@named_matrix end - + end # Module to add methods for variance/covariance and correlation matrices # == Usage # matrix=Matrix[[1,2],[2,3]] # matrix.extend CovariateMatrix - # + # module CovariateMatrix include NamedMatrix @@covariatematrix=0 @@ -217,7 +233,7 @@ def _type else @type end - + end def _type=(t) @type=t @@ -233,7 +249,7 @@ def correlation end } }) - matrix.extend CovariateMatrix + matrix.extend CovariateMatrix matrix.fields_x=fields_x matrix.fields_y=fields_y matrix._type=:correlation @@ -242,19 +258,19 @@ def correlation self end end - - + + # Get variance for field k - # + # def variance(k) submatrix([k])[0,0] end - + def get_new_name @@covariatematrix+=1 _("Covariate matrix %d") % @@covariatematrix end - + # Select a submatrix of factors. If you have a correlation matrix # with a, b and c, you could obtain a submatrix of correlations of # a and b, b and c, or a and c @@ -264,7 +280,7 @@ def get_new_name # # Example: # a=Matrix[[1.0, 0.3, 0.2], - # [0.3, 1.0, 0.5], + # [0.3, 1.0, 0.5], # [0.2, 0.5, 1.0]] # a.extend CovariateMatrix # a.fields=%w{a b c} @@ -272,31 +288,31 @@ def get_new_name # => Matrix[[0.5],[0.3]] # a.submatrix(%w{c a}) # => Matrix[[1.0, 0.2] , [0.2, 1.0]] - def submatrix(rows,columns=nil) - raise ArgumentError, "rows shouldn't be empty" if rows.respond_to? :size and rows.size==0 - columns||=rows + def submatrix(rows,columns = nil) + raise ArgumentError, "rows shouldn't be empty" if rows.respond_to? :size and rows.size == 0 + columns ||= rows # Convert all fields to indexes - row_index=rows.collect {|v| - r=v.is_a?(Numeric) ? v : fields_x.index(v) + row_index = rows.collect do |v| + r = v.is_a?(Numeric) ? v : fields_x.index(v) raise "Index #{v} doesn't exist on matrix" if r.nil? r - } - column_index=columns.collect {|v| - r=v.is_a?(Numeric) ? v : fields_y.index(v) + end + + column_index = columns.collect do |v| + r = v.is_a?(Numeric) ? v : fields_y.index(v) raise "Index #{v} doesn't exist on matrix" if r.nil?
r - } - - + end + + fx=row_index.collect {|v| fields_x[v]} fy=column_index.collect {|v| fields_y[v]} - - matrix= Matrix.rows(row_index.collect {|i| - row=column_index.collect {|j| self[i,j]}}) - matrix.extend CovariateMatrix - matrix.fields_x=fx - matrix.fields_y=fy - matrix._type=_type + + matrix = Matrix.rows(row_index.collect { |i| column_index.collect { |j| self[i, j] }}) + matrix.extend CovariateMatrix + matrix.fields_x = fx + matrix.fields_y = fy + matrix._type = _type matrix end def report_building(generator) diff --git a/lib/statsample/multiset.rb b/lib/statsample/multiset.rb index e7cbe4f..9f50762 100644 --- a/lib/statsample/multiset.rb +++ b/lib/statsample/multiset.rb @@ -5,20 +5,21 @@ module Statsample class Multiset # Name of fields attr_reader :fields - # Array with Statsample::Dataset + # Array with Daru::DataFrame attr_reader :datasets # To create a multiset # * Multiset.new(%w{f1 f2 f3}) # define only fields def initialize(fields) - @fields=fields - @datasets={} + @fields=fields + @datasets={} end def self.new_empty_vectors(fields,ds_names) - ms=Multiset.new(fields) - ds_names.each{|d| - ms.add_dataset(d,Dataset.new(fields)) - } - ms + ms = Multiset.new(fields) + ds_names.each do |d| + ms.add_dataset(d, Daru::DataFrame.new({}, order: fields)) + end + + ms end # Generate a new dataset as a union of partial datasets # If block given, this is applied to each dataset before union @@ -29,65 +30,64 @@ def union(&block) labels={} each do |k,ds| if block - ds=ds.dup + ds = ds.dup yield k,ds end @fields.each do |f| - union_field[f]||=Array.new - union_field[f].concat(ds[f].data) - types[f]||=ds[f].type - names[f]||=ds[f].name - labels[f]||=ds[f].labels + union_field[f] ||= Array.new + union_field[f].concat(ds[f].to_a) + types[f] ||= ds[f].type + names[f] ||= ds[f].name + labels[f] ||= ds[f].index.to_a end end @fields.each do |f| - union_field[f]=union_field[f].to_vector(types[f]) - union_field[f].name=names[f] - union_field[f].labels=labels[f] + union_field[f] = Daru::Vector.new(union_field[f], name: names[f]) end - ds_union=union_field.to_dataset - ds_union.fields=@fields + + ds_union = Daru::DataFrame.new(union_field, order: @fields) ds_union end + def datasets_names - @datasets.keys.sort + @datasets.keys.sort end + def n_datasets - @datasets.size + @datasets.size end + def add_dataset(key,ds) - if(ds.fields!=@fields) - raise ArgumentError, "Dataset(#{ds.fields.to_s})must have the same fields of the Multiset(#{@fields})" + if ds.vectors.to_a != @fields + raise ArgumentError, "Dataset(#{ds.vectors.to_a.to_s}) must have the same fields as the Multiset(#{@fields})" else - @datasets[key]=ds + @datasets[key] = ds end end def sum_field(field) @datasets.inject(0) {|a,da| - stratum_name=da[0] - vector=da[1][field] - val=yield stratum_name,vector - a+val + stratum_name = da[0] + vector = da[1][field] + val = yield stratum_name,vector + a + val } end def collect_vector(field) - @datasets.collect {|k,v| - yield k, v[field] - } + @datasets.collect { |k,v| yield k, v[field] } end def each_vector(field) - @datasets.each {|k,v| - yield k, v[field] - } + @datasets.each { |k,v| yield k, v[field] } end - def[](i) + def [](i) @datasets[i] end + def each(&block) @datasets.each {|k,ds| - next if ds.cases==0 + next if ds.nrows == 0 block.call(k,ds) } end @@ -204,9 +204,9 @@ def initialize(ms,strata_sizes) @ms=ms raise ArgumentError,"You should put a strata size for each dataset" if strata_sizes.keys.sort!=ms.datasets_names @strata_sizes=strata_sizes - @population_size=@strata_sizes.inject(0) {|a,x|
a+x[1]} + @population_size=@strata_sizes.inject(0) { |a,x| a+x[1] } @strata_number=@ms.n_datasets - @sample_size=@ms.datasets.inject(0) {|a,x| a+x[1].cases} + @sample_size=@ms.datasets.inject(0) { |a,x| a+x[1].nrows } end # Number of strata def strata_number diff --git a/lib/statsample/regression.rb b/lib/statsample/regression.rb index 0016e8a..b2ae630 100644 --- a/lib/statsample/regression.rb +++ b/lib/statsample/regression.rb @@ -15,8 +15,6 @@ module Statsample # # * Simple Regression : Statsample::Regression::Simple # * Multiple Regression: Statsample::Regression::Multiple - # * Logit Regression: Statsample::Regression::Binomial::Logit - # * Probit Regression: Statsample::Regression::Binomial::Probit module Regression LinearDependency=Class.new(Exception) @@ -25,8 +23,8 @@ module Regression # * x: independent Vector # * y: dependent Vector # Usage: - # x=100.times.collect {|i| rand(100)}.to_scale - # y=100.times.collect {|i| 2+x[i]*2+rand()}.to_scale + # x = Daru::Vector.new(100.times.collect {|i| rand(100)}) + # y = Daru::Vector.new(100.times.collect {|i| 2+x[i]*2+rand()}) # sr=Statsample::Regression.simple(x,y) # sr.a # => 2.51763295177808 @@ -49,7 +47,7 @@ def self.simple(x,y) # * :pairwise: uses correlation matrix. Use with caution. # # Usage: - # lr=Statsample::Regression::multiple(ds,'y') + # lr=Statsample::Regression::multiple(ds,:y) def self.multiple(ds,y_var, opts=Hash.new) missing_data= (opts[:missing_data].nil? ) ? :listwise : opts.delete(:missing_data) if missing_data==:pairwise @@ -58,7 +56,7 @@ def self.multiple(ds,y_var, opts=Hash.new) if Statsample.has_gsl? and false Statsample::Regression::Multiple::GslEngine.new(ds, y_var, opts) else - ds2=ds.dup_only_valid + ds2=ds.reject_values(*Daru::MISSING_VALUES) Statsample::Regression::Multiple::RubyEngine.new(ds2,y_var, opts) end end diff --git a/lib/statsample/regression/multiple.rb b/lib/statsample/regression/multiple.rb index 317efbc..a641363 100644 --- a/lib/statsample/regression/multiple.rb +++ b/lib/statsample/regression/multiple.rb @@ -6,12 +6,12 @@ module Regression # Use:. 
# # require 'statsample' - # a=1000.times.collect {rand}.to_scale - # b=1000.times.collect {rand}.to_scale - # c=1000.times.collect {rand}.to_scale - # ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset - # ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()} - # lr=Statsample::Regression.multiple(ds,'y') + # a = Daru::Vector.new(1000.times.collect {rand}) + # b = Daru::Vector.new(1000.times.collect {rand}) + # c = Daru::Vector.new(1000.times.collect {rand}) + # ds= Daru::DataFrame.new({:a => a,:b => b,:c => c}) + # ds[:y]=ds.collect{|row| row[:a]*5 + row[:b]*3 + row[:c]*2 + rand()} + # lr=Statsample::Regression.multiple(ds, :y) # puts lr.summary # Summary for regression of a,b,c over y # ************************************************************* @@ -53,8 +53,8 @@ def significance def initialize(matrix,y_var, opts=Hash.new) matrix.extend Statsample::CovariateMatrix @matrix=matrix - @fields=matrix.fields-y_var - @y_var=y_var + @fields=matrix.fields - y_var + @y_var = y_var @q=@y_var.size @matrix_cor=matrix.correlation @matrix_cor_xx = @matrix_cor.submatrix(@fields) @@ -84,8 +84,6 @@ def p2yx vxy.quo(@q) end end - - end end end diff --git a/lib/statsample/regression/multiple/alglibengine.rb b/lib/statsample/regression/multiple/alglibengine.rb index d6ab942..05964ee 100644 --- a/lib/statsample/regression/multiple/alglibengine.rb +++ b/lib/statsample/regression/multiple/alglibengine.rb @@ -9,108 +9,115 @@ module Multiple # If you need pairwise, use RubyEngine # Example: # -# @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale) -# @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale) -# @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale) -# @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale) -# ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset -# lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,'y') +# @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7]) +# @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4]) +# @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100]) +# @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30]) +# ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y}) +# lr=Statsample::Regression::Multiple::AlglibEngine.new(ds, :y) # class AlglibEngine < BaseEngine def initialize(ds,y_var, opts=Hash.new) super - @ds=ds.dup_only_valid - @ds_valid=@ds - @dy=@ds[@y_var] - @ds_indep=ds.dup(ds.fields-[y_var]) + @ds = ds.reject_values(*Daru::MISSING_VALUES) + @ds_valid = @ds + @dy = @ds[@y_var] + @ds_indep = ds.dup(ds.vectors.to_a - [y_var]) # Create a custom matrix - columns=[] - @fields=[] - @ds.fields.each{|f| - if f!=@y_var - columns.push(@ds[f].to_a) - @fields.push(f) - end - } - @dep_columns=columns.dup + columns = [] + @fields = [] + @ds.vectors.each do |f| + if f != @y_var + columns.push(@ds[f].to_a) + @fields.push(f) + end + end + @dep_columns = columns.dup columns.push(@ds[@y_var]) matrix=Matrix.columns(columns) @lr_s=nil @lr=::Alglib::LinearRegression.build_from_matrix(matrix) @coeffs=assign_names(@lr.coeffs) - end - def _dump(i) - Marshal.dump({'ds'=>@ds,'y_var'=>@y_var}) - end - def self._load(data) - h=Marshal.load(data) - self.new(h['ds'], h['y_var']) - end - - def coeffs - @coeffs - end - # Coefficients using a constant - # Based on http://www.xycoon.com/ols1.htm - def matrix_resolution - mse_p=mse - columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}} - columns.unshift([1.0]*@ds.cases) - y=Matrix.columns([@dy.data.map {|i| i.to_f}]) - x=Matrix.columns(columns) - xt=x.t - matrix=((xt*x)).inverse*xt - matrix*y - end - def r2 - r**2 - end - def r - Bivariate::pearson(@dy,predicted) - end - def sst - 
@dy.ss - end - def constant - @lr.constant - end - def standarized_coeffs - l=lr_s - assign_names(l.coeffs) - end - def lr_s - if @lr_s.nil? - build_standarized - end - @lr_s - end - def build_standarized - @ds_s=@ds.standarize - columns=[] - @ds_s.fields.each{|f| - columns.push(@ds_s[f].to_a) unless f==@y_var - } - @dep_columns_s=columns.dup - columns.push(@ds_s[@y_var]) - matrix=Matrix.columns(columns) - @lr_s=Alglib::LinearRegression.build_from_matrix(matrix) - end - def process(v) - @lr.process(v) - end - def process_s(v) - lr_s.process(v) - end - # ???? Not equal to SPSS output - def standarized_residuals - res=residuals - red_sd=residuals.sds - res.collect {|v| - v.quo(red_sd) - }.to_vector(:scale) + def _dump(i) + Marshal.dump({'ds'=>@ds,'y_var'=>@y_var}) + end + + def self._load(data) + h=Marshal.load(data) + self.new(h['ds'], h['y_var']) + end + + def coeffs + @coeffs + end + # Coefficients using a constant + # Based on http://www.xycoon.com/ols1.htm + def matrix_resolution + mse_p=mse + columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}} + columns.unshift([1.0]*@ds.nrows) + y=Matrix.columns([@dy.to_a.map {|i| i.to_f}]) + x=Matrix.columns(columns) + xt=x.t + matrix=((xt*x)).inverse*xt + matrix*y + end + + def r2 + r**2 + end + + def r + Bivariate::pearson(@dy,predicted) + end + + def sst + @dy.ss + end + + def constant + @lr.constant + end + + def standarized_coeffs + l=lr_s + assign_names(l.coeffs) + end + + def lr_s + if @lr_s.nil? + build_standarized end + @lr_s + end + + def build_standarized + @ds_s=@ds.standardize + columns=[] + @ds_s.vectors.each{|f| + columns.push(@ds_s[f].to_a) unless f == @y_var + } + @dep_columns_s=columns.dup + columns.push(@ds_s[@y_var]) + matrix=Matrix.columns(columns) + @lr_s=Alglib::LinearRegression.build_from_matrix(matrix) + end + + def process(v) + @lr.process(v) + end + + def process_s(v) + lr_s.process(v) + end + # ???? Not equal to SPSS output + def standarized_residuals + res = residuals + red_sd = residuals.sds + Daru::Vector.new(res.collect {|v| v.quo(red_sd) }) + end end end end diff --git a/lib/statsample/regression/multiple/baseengine.rb b/lib/statsample/regression/multiple/baseengine.rb index d5e08ae..f2fdf82 100644 --- a/lib/statsample/regression/multiple/baseengine.rb +++ b/lib/statsample/regression/multiple/baseengine.rb @@ -19,13 +19,12 @@ def self.univariate? end def initialize(ds, y_var, opts = Hash.new) @ds=ds - @predictors_n=@ds.fields.size-1 - @total_cases=@ds.cases - @cases=@ds.cases + @predictors_n=@ds.vectors.size-1 + @total_cases=@ds.nrows + @cases=@ds.nrows @y_var=y_var @r2=nil - @name=_("Multiple Regression: %s over %s") % [ ds.fields.join(",") , @y_var] - + @name=_("Multiple Regression: %s over %s") % [ ds.vectors.to_a.join(",") , @y_var] opts_default={:digits=>3} @opts=opts_default.merge opts @@ -33,7 +32,6 @@ def initialize(ds, y_var, opts = Hash.new) @opts.each{|k,v| self.send("#{k}=",v) if self.respond_to?
k } - end # Calculate F Test def anova @@ -45,15 +43,17 @@ def se_estimate end # Retrieves a vector with predicted values for y def predicted - @total_cases.times.collect { |i| - invalid=false - vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]} - if invalid - nil - else - process(vect) + Daru::Vector.new( + @total_cases.times.collect do |i| + invalid = false + vect = @dep_columns.collect {|v| invalid = true if v[i].nil?; v[i]} + if invalid + nil + else + process(vect) + end end - }.to_vector(:scale) + ) end # Retrieves a vector with standarized values for y def standarized_predicted @@ -61,15 +61,17 @@ def standarized_predicted end # Retrieves a vector with residuals values for y def residuals - (0...@total_cases).collect{|i| - invalid=false - vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]} - if invalid or @ds[@y_var][i].nil? - nil - else - @ds[@y_var][i] - process(vect) + Daru::Vector.new( + (0...@total_cases).collect do |i| + invalid=false + vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]} + if invalid or @ds[@y_var][i].nil? + nil + else + @ds[@y_var][i] - process(vect) + end end - }.to_vector(:scale) + ) end # R Multiple def r @@ -131,12 +133,10 @@ def probability # Tolerance for a given variable # http://talkstats.com/showthread.php?t=5056 def tolerance(var) - ds=assign_names(@dep_columns) - ds.each{|k,v| - ds[k]=v.to_vector(:scale) - } - lr=self.class.new(ds.to_dataset,var) - 1-lr.r2 + ds = assign_names(@dep_columns) + ds.each { |k,v| ds[k] = Daru::Vector.new(v) } + lr = self.class.new(Daru::DataFrame.new(ds),var) + 1 - lr.r2 end # Tolerances for each coefficient def coeffs_tolerances @@ -165,12 +165,12 @@ def se_r2 def estimated_variance_covariance_matrix #mse_p=mse columns=[] - @ds_valid.fields.each{|k| - v=@ds_valid[k] - columns.push(v.data) unless k==@y_var + @ds_valid.vectors.each{|k| + v = @ds_valid[k] + columns.push(v.to_a) unless k == @y_var } columns.unshift([1.0]*@valid_cases) - x=Matrix.columns(columns) + x=::Matrix.columns(columns) matrix=((x.t*x)).inverse * mse matrix.collect {|i| Math::sqrt(i) if i>=0 } end diff --git a/lib/statsample/regression/multiple/gslengine.rb b/lib/statsample/regression/multiple/gslengine.rb index 5f3ef32..2462900 100644 --- a/lib/statsample/regression/multiple/gslengine.rb +++ b/lib/statsample/regression/multiple/gslengine.rb @@ -9,43 +9,44 @@ module Multiple # If you need pairwise, use RubyEngine # Example: # - # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale) - # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale) - # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale) - # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale) - # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset - # lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y') + # @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7]) + # @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4]) + # @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100]) + # @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30]) + # ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y}) + # lr=Statsample::Regression::Multiple::GslEngine.new(ds,:y) # class GslEngine < BaseEngine def initialize(ds,y_var, opts=Hash.new) super - @ds=ds.dup_only_valid - @ds_valid=@ds - @valid_cases=@ds_valid.cases - @dy=@ds[@y_var] - @ds_indep=ds.dup(ds.fields-[y_var]) + @ds = ds.reject_values(*Daru::MISSING_VALUES) + @ds_valid = @ds + @valid_cases = @ds_valid.nrows + @dy = @ds[@y_var] + @ds_indep = ds.dup(ds.vectors.to_a - [y_var]) # Create a custom matrix columns=[] @fields=[] - max_deps = 
GSL::Matrix.alloc(@ds.cases, @ds.fields.size) - constant_col=@ds.fields.size-1 - for i in 0...@ds.cases + max_deps = GSL::Matrix.alloc(@ds.nrows, @ds.vectors.size) + constant_col=@ds.vectors.size-1 + for i in 0...@ds.nrows max_deps.set(i,constant_col,1) end - j=0 - @ds.fields.each{|f| - if f!=@y_var - @ds[f].each_index{|i1| + j = 0 + @ds.vectors.each do |f| + if f != @y_var + @ds[f].each_index do |i1| max_deps.set(i1,j,@ds[f][i1]) - } + end + columns.push(@ds[f].to_a) @fields.push(f) - j+=1 + j += 1 end - } - @dep_columns=columns.dup - @lr_s=nil - c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.gsl) + end + @dep_columns = columns.dup + @lr_s = nil + c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.to_gsl) @constant=c[constant_col] @coeffs_a=c.to_a.slice(0...constant_col) @coeffs=assign_names(@coeffs_a) @@ -97,7 +98,7 @@ def lr_s @lr_s end def build_standarized - @ds_s=@ds.standarize + @ds_s=@ds.standardize @lr_s=GslEngine.new(@ds_s,@y_var) end def process_s(v) @@ -107,24 +108,20 @@ def process_s(v) def standarized_residuals res=residuals red_sd=residuals.sds - res.collect {|v| - v.quo(red_sd) - }.to_vector(:scale) + Daru::Vector.new(res.collect {|v| v.quo(red_sd) }) end # Standard error for coeffs def coeffs_se - out={} - evcm=estimated_variance_covariance_matrix - @ds_valid.fields.each_with_index do |f,i| - - mi=i+1 - next if f==@y_var - out[f]=evcm[mi,mi] + out = {} + evcm = estimated_variance_covariance_matrix + @ds_valid.vectors.to_a.each_with_index do |f,i| + mi = i+1 + next if f == @y_var + out[f] = evcm[mi,mi] end out end - end end end diff --git a/lib/statsample/regression/multiple/matrixengine.rb b/lib/statsample/regression/multiple/matrixengine.rb index 86ddc52..9c780f3 100644 --- a/lib/statsample/regression/multiple/matrixengine.rb +++ b/lib/statsample/regression/multiple/matrixengine.rb @@ -59,8 +59,6 @@ def initialize(matrix,y_var, opts=Hash.new) @matrix_y = @matrix_cor.submatrix(@fields, [y_var]) @matrix_y_cov = @matrix_cov.submatrix(@fields, [y_var]) - - @y_sd=Math::sqrt(@matrix_cov.submatrix([y_var])[0,0]) @x_sd=@n_predictors.times.inject({}) {|ac,i| @@ -77,14 +75,14 @@ def initialize(matrix,y_var, opts=Hash.new) @y_mean=0.0 @name=_("Multiple regression of %s on %s") % [@fields.join(","), @y_var] - opts_default={:digits=>3} - opts=opts_default.merge opts + opts_default = {:digits=>3} + opts = opts_default.merge opts opts.each{|k,v| self.send("#{k}=",v) if self.respond_to?
k } result_matrix=@matrix_x_cov.inverse * @matrix_y_cov - if matrix._type==:covariance + if matrix._type == :covariance @coeffs=result_matrix.column(0).to_a @coeffs_stan=coeffs.collect {|k,v| coeffs[k]*@x_sd[k].quo(@y_sd) @@ -116,12 +114,12 @@ def r end # Value of constant def constant - c=coeffs - @y_mean - @fields.inject(0){|a,k| a + (c[k] * @x_mean[k])} + c = coeffs + @y_mean - @fields.inject(0) { |a,k| a + (c[k] * @x_mean[k])} end # Hash of b or raw coefficients def coeffs - assign_names(@coeffs) + assign_names(@coeffs) end # Hash of beta or standarized coefficients @@ -185,7 +183,7 @@ def constant_se sd[:constant]=0 fields=[:constant]+@matrix_cov.fields-[@y_var] # Recreate X'X using the variance-covariance matrix - xt_x=Matrix.rows(fields.collect {|i| + xt_x=::Matrix.rows(fields.collect {|i| fields.collect {|j| if i==:constant or j==:constant cov=0 diff --git a/lib/statsample/regression/multiple/rubyengine.rb b/lib/statsample/regression/multiple/rubyengine.rb index fcee05f..6b36804 100644 --- a/lib/statsample/regression/multiple/rubyengine.rb +++ b/lib/statsample/regression/multiple/rubyengine.rb @@ -8,76 +8,72 @@ module Multiple # # Example: # -# @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale) -# @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale) -# @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale) -# @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale) -# ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset -# lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y') +# @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7]) +# @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4]) +# @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100]) +# @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30]) +# ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y}) +# lr=Statsample::Regression::Multiple::RubyEngine.new(ds,:y) class RubyEngine < MatrixEngine def initialize(ds,y_var, opts=Hash.new) - matrix=ds.correlation_matrix - fields_indep=ds.fields-[y_var] - default={ - :y_mean=>ds[y_var].mean, - :x_mean=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].mean; ac}, - :y_sd=>ds[y_var].sd, - :x_sd=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].sd; ac}, - :cases=>Statsample::Bivariate.min_n_valid(ds) + matrix = Statsample::Bivariate.correlation_matrix ds + fields_indep=ds.vectors.to_a - [y_var] + default= { + :y_mean => ds[y_var].mean, + :x_mean => fields_indep.inject({}) {|ac,f| ac[f]=ds[f].mean; ac}, + :y_sd => ds[y_var].sd, + :x_sd => fields_indep.inject({}) {|ac,f| ac[f]=ds[f].sd; ac}, + :cases => Statsample::Bivariate.min_n_valid(ds) } - opts=opts.merge(default) + opts = opts.merge(default) super(matrix, y_var, opts) - @ds=ds - @dy=ds[@y_var] - @ds_valid=ds.dup_only_valid - @total_cases=@ds.cases - @valid_cases=@ds_valid.cases - @ds_indep = ds.dup(ds.fields-[y_var]) + @ds = ds + @dy = ds[@y_var] + @ds_valid = ds.reject_values(*Daru::MISSING_VALUES) + @total_cases = @ds.nrows + @valid_cases = @ds_valid.nrows + @ds_indep = ds.dup(ds.vectors.to_a - [y_var]) set_dep_columns end def set_dep_columns - @dep_columns=[] - @ds_indep.each_vector{|k,v| - @dep_columns.push(v.data_with_nils) - } + @dep_columns = [] + @ds_indep.each_vector { |v| @dep_columns.push(v.to_a) } end def fix_with_mean i=0 - @ds_indep.each do |row| + @ds_indep.each(:row) do |row| empty=[] - row.each do |k,v| - empty.push(k) if v.nil? - end + row.each_with_index do |v, k| + empty.push(k) if v.nil? + end + if empty.size==1 @ds_indep[empty[0]][i]=@ds[empty[0]].mean end - i+=1 + i += 1 end - @ds_indep.update_valid_data set_dep_columns end def fix_with_regression - i=0 - @ds_indep.each{|row| - empty=[] - row.each{|k,v| - empty.push(k) if v.nil?
- } + i = 0 + @ds_indep.each(:row) do |row| + empty = [] + row.each_with_index { |v, k| empty.push(k) if v.nil? } if empty.size==1 - field=empty[0] - lr=MultipleRegression.new(@ds_indep,field) - fields=[] - @ds_indep.fields.each{|f| - fields.push(row[f]) unless f==field + field = empty[0] + lr = MultipleRegression.new(@ds_indep,field) + fields = [] + @ds_indep.vectors.each { |f| + fields.push(row[f]) unless f == field } + @ds_indep[field][i]=lr.process(fields) end i+=1 - } - @ds_indep.update_valid_data + end set_dep_columns end # Standard error for constant diff --git a/lib/statsample/reliability.rb b/lib/statsample/reliability.rb index e5fb50c..5e81fd3 100644 --- a/lib/statsample/reliability.rb +++ b/lib/statsample/reliability.rb @@ -4,31 +4,30 @@ class << self # Calculate Cronbach's alpha for a given dataset. # Only uses tuples without missing data def cronbach_alpha(ods) - ds=ods.dup_only_valid - return nil if ds.vectors.any? {|k,v| v.variance==0} - n_items=ds.fields.size - return nil if n_items<=1 - s2_items=ds.vectors.inject(0) {|ac,v| - ac+v[1].variance } - total=ds.vector_sum + ds = ods.reject_values(*Daru::MISSING_VALUES) + n_items = ds.ncols + return nil if n_items <= 1 + s2_items = ds.to_h.values.inject(0) { |ac,v| + ac + v.variance } + total = ds.vector_sum - (n_items.quo(n_items-1)) * (1-(s2_items.quo(total.variance))) + (n_items.quo(n_items - 1)) * (1 - (s2_items.quo(total.variance))) end # Calculate Cronbach's alpha for a given dataset # using standardized values for every vector. # Only uses tuples without missing data # Return nil if one or more vectors have 0 variance def cronbach_alpha_standarized(ods) + ds = ods.reject_values(*Daru::MISSING_VALUES) + return nil if ds.any? { |v| v.variance==0} - ds=ods.dup_only_valid - - return nil if ds.vectors.any?
{|k,v| v.variance==0} - - ds=ds.fields.inject({}){|a,f| - a[f]=ods[f].standarized; - a - }.to_dataset - + ds = Daru::DataFrame.new( + ds.vectors.to_a.inject({}) { |a,i| + a[i] = ods[i].standardize + a + } + ) + cronbach_alpha(ds) end # Predicted reliability of a test by replicating @@ -55,10 +54,10 @@ def cronbach_alpha_from_n_s2_cov(n,s2,cov) end # Get Cronbach's alpha from a covariance matrix def cronbach_alpha_from_covariance_matrix(cov) - n=cov.row_size + n = cov.row_size raise "covariance matrix should have at least 2 variables" if n < 2 - s2=n.times.inject(0) {|ac,i| ac+cov[i,i]} - (n.quo(n-1))*(1-(s2.quo(cov.total_sum))) + s2 = n.times.inject(0) { |ac,i| ac + cov[i,i] } + (n.quo(n - 1)) * (1 - (s2.quo(cov.total_sum))) end # Returns n necessary to obtain specific alpha # given variance and covariance mean of items @@ -83,8 +82,6 @@ def n_for_desired_alpha(alpha,s2,cov) end c_a=cronbach_alpha_from_n_s2_cov(n,s2,cov) dif=c_a - alpha - #puts "#{n} , #{c_a}" - end n end @@ -111,20 +108,20 @@ class ItemCharacteristicCurve attr_reader :totals, :counts, :vector_total def initialize (ds, vector_total=nil) vector_total||=ds.vector_sum - raise ArgumentError, "Total size != Dataset size" if vector_total.size!=ds.cases + raise ArgumentError, "Total size != Dataset size" if vector_total.size != ds.nrows @vector_total=vector_total @ds=ds @totals={} - @counts=@ds.fields.inject({}) {|a,v| a[v]={};a} + @counts=@ds.vectors.to_a.inject({}) {|a,v| a[v]={};a} process end def process i=0 - @ds.each do |row| + @ds.each_row do |row| tot=@vector_total[i] @totals[tot]||=0 @totals[tot]+=1 - @ds.fields.each do |f| + @ds.vectors.each do |f| item=row[f].to_s @counts[f][tot]||={} @counts[f][tot][item]||=0 @@ -150,4 +147,4 @@ def curve_field(field, item) require 'statsample/reliability/icc.rb' require 'statsample/reliability/scaleanalysis.rb' require 'statsample/reliability/skillscaleanalysis.rb' -require 'statsample/reliability/multiscaleanalysis.rb' \ No newline at end of file +require 'statsample/reliability/multiscaleanalysis.rb' diff --git a/lib/statsample/reliability/icc.rb b/lib/statsample/reliability/icc.rb index 1277acc..8780d95 100644 --- a/lib/statsample/reliability/icc.rb +++ b/lib/statsample/reliability/icc.rb @@ -6,12 +6,12 @@ module Reliability # several ratings) on a target and another measurement obtained on that target" # == Usage # require 'statsample' - # size=1000 - # a = size.times.map {rand(10)}.to_scale + # size = 1000 + # a = Daru::Vector.new(size.times.map {rand(10)}) # b = a.recode{|i|i+rand(4)-2} - # c =a.recode{|i|i+rand(4)-2} + # c = a.recode{|i|i+rand(4)-2} # d = a.recode{|i|i+rand(4)-2} - # ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset + # ds = Daru::DataFrame.new({:a => a,:b => b,:c => c,:d => d}) # # Use :type attribute to set type to summarize # icc=Statsample::Reliability::ICC.new(ds, :type=>:icc_1_k) # puts icc.summary @@ -96,10 +96,10 @@ class ICC attr_accessor :alpha attr_accessor :name def initialize(ds, opts=Hash.new) - @ds=ds.dup_only_valid - @vectors=@ds.vectors.values - @n=@ds.cases - @k=@ds.fields.size + @ds=ds.reject_values(*Daru::MISSING_VALUES) + @vectors=@ds.map { |e| e } + @n=@ds.nrows + @k=@ds.ncols compute @g_rho=0 @alpha=0.05 diff --git a/lib/statsample/reliability/multiscaleanalysis.rb b/lib/statsample/reliability/multiscaleanalysis.rb index 3222593..ae74cb8 100644 --- a/lib/statsample/reliability/multiscaleanalysis.rb +++ b/lib/statsample/reliability/multiscaleanalysis.rb @@ -6,17 +6,17 @@ module Reliability # PCA and Factor Analysis. 
# # == Usage - # @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale) - # @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale) - # @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale) - # @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale) - # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset + # @x1 = Daru::Vector.new([1,1,1,1,2,2,2,2,3,3,3,30]) + # @x2 = Daru::Vector.new([1,1,1,2,2,3,3,3,3,4,4,50]) + # @x3 = Daru::Vector.new([2,2,1,1,1,2,2,2,3,4,5,40]) + # @x4 = Daru::Vector.new([1,2,3,4,4,4,4,3,4,4,5,30]) + # ds = Daru::DataFrame.new({:x1 => @x1,:x2 => @x2,:x3 => @x3,:x4 => @x4}) # opts={:name=>"Scales", # Name of analysis # :summary_correlation_matrix=>true, # Add correlation matrix # :summary_pca } # Add PCA between scales # msa=Statsample::Reliability::MultiScaleAnalysis.new(opts) do |m| - # m.scale :s1, ds.clone(%w{x1 x2}) - # m.scale :s2, ds.clone(%w{x3 x4}), {:name=>"Scale 2"} + # m.scale :s1, ds.clone([:x1, :x2]) + # m.scale :s2, ds.clone([:x3, :x4]), {:name=>"Scale 2"} # end # # Retrieve summary # puts msa.summary @@ -107,7 +107,7 @@ def delete_scale(code) # Retrieves a Principal Component Analysis (Factor::PCA) # using all scales, using opts as options. def pca(opts=nil) - opts||=pca_options + opts ||= pca_options Statsample::Factor::PCA.new(correlation_matrix, opts) end # Retrieve Velicer's MAP @@ -123,14 +123,14 @@ def principal_axis_analysis(opts=nil) Statsample::Factor::PrincipalAxis.new(correlation_matrix, opts) end def dataset_from_scales - ds=Dataset.new(@scales_keys) + ds = Daru::DataFrame.new({}, order: @scales_keys.map(&:to_sym)) @scales.each_pair do |code,scale| - ds[code.to_s]=scale.ds.vector_sum - ds[code.to_s].name=scale.name + ds[code.to_sym] = scale.ds.vector_sum end - ds.update_valid_data + ds end + def parallel_analysis(opts=nil) opts||=parallel_analysis_options Statsample::Factor::ParallelAnalysis.new(dataset_from_scales, opts) @@ -140,6 +140,7 @@ def parallel_analysis(opts=nil) def correlation_matrix Statsample::Bivariate.correlation_matrix(dataset_from_scales) end + def report_building(b) # :nodoc: b.section(:name=>name) do |s| s.section(:name=>_("Reliability analysis of scales")) do |s2| diff --git a/lib/statsample/reliability/scaleanalysis.rb b/lib/statsample/reliability/scaleanalysis.rb index 9a48d0e..9a52230 100644 --- a/lib/statsample/reliability/scaleanalysis.rb +++ b/lib/statsample/reliability/scaleanalysis.rb @@ -3,12 +3,12 @@ module Reliability # Analysis of a Scale. Analogue of Scale Reliability analysis in SPSS.
# Returns several statistics for complete scale and each item # == Usage - # @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale) - # @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale) - # @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale) - # @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale) - # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset - # ia=Statsample::Reliability::ScaleAnalysis.new(ds) + # @x1 = Daru::Vector.new([1,1,1,1,2,2,2,2,3,3,3,30]) + # @x2 = Daru::Vector.new([1,1,1,2,2,3,3,3,3,4,4,50]) + # @x3 = Daru::Vector.new([2,2,1,1,1,2,2,2,3,4,5,40]) + # @x4 = Daru::Vector.new([1,2,3,4,4,4,4,3,4,4,5,30]) + # ds = Daru::DataFrame.new({:x1 => @x1,:x2 => @x2,:x3 => @x3,:x4 => @x4}) + # ia = Statsample::Reliability::ScaleAnalysis.new(ds) # puts ia.summary class ScaleAnalysis include Summarizable @@ -16,40 +16,40 @@ class ScaleAnalysis attr_accessor :name attr_accessor :summary_histogram def initialize(ds, opts=Hash.new) - @dumped=ds.fields.find_all {|f| - ds[f].variance==0 + @dumped=ds.vectors.to_a.find_all {|f| + ds[f].variance == 0 } - @ods=ds - @ds=ds.dup_only_valid(ds.fields - @dumped) - @ds.name=ds.name + @ods = ds + @ds = ds.reject_values(*Daru::MISSING_VALUES).dup(ds.vectors.to_a - @dumped) + @ds.rename ds.name - @k=@ds.fields.size - @total=@ds.vector_sum + @k = @ds.ncols + @total = @ds.vector_sum @o_total=@dumped.size > 0 ? @ods.vector_sum : nil - @vector_mean=@ds.vector_mean - @item_mean=@vector_mean.mean - @item_sd=@vector_mean.sd + @vector_mean = @ds.vector_mean + @item_mean = @vector_mean.mean + @item_sd = @vector_mean.sd - @mean=@total.mean - @median=@total.median - - @skew=@total.skew - @kurtosis=@total.kurtosis - @sd = @total.sd - @variance=@total.variance - @valid_n = @total.size - opts_default={ - :name=>_("Reliability Analysis"), - :summary_histogram=>true + @mean = @total.mean + @median = @total.median + @skew = @total.skew + @kurtosis = @total.kurtosis + @sd = @total.sd + @variance = @total.variance + @valid_n = @total.size + + opts_default = { + :name => _("Reliability Analysis"), + :summary_histogram => true } - @opts=opts_default.merge(opts) - @opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k } + @opts = opts_default.merge(opts) + @opts.each{ |k,v| self.send("#{k}=",v) if self.respond_to? 
k } @cov_m=Statsample::Bivariate.covariance_matrix(@ds) # Mean for covariances and variances - @variances=@k.times.map {|i| @cov_m[i,i]}.to_scale + @variances = Daru::Vector.new(@k.times.map { |i| @cov_m[i,i] }) @variances_mean=@variances.mean @covariances_mean=(@variance-@variances.sum).quo(@k**2-@k) #begin @@ -66,7 +66,7 @@ def item_characteristic_curve total={} @ds.each do |row| tot=@total[i] - @ds.fields.each do |f| + @ds.vectors.each do |f| out[f]||= {} total[f]||={} out[f][tot]||= 0 @@ -87,43 +87,40 @@ def item_characteristic_curve # Adjusted RPB(Point biserial-correlation) for each item # def item_total_correlation - @itc||=@ds.fields.inject({}) do |a,v| - vector=@ds[v].clone - ds2=@ds.clone - ds2.delete_vector(v) - total=ds2.vector_sum - a[v]=Statsample::Bivariate.pearson(vector,total) + vecs = @ds.vectors.to_a + @itc ||= vecs.inject({}) do |a,v| + total=@ds.vector_sum(vecs - [v]) + a[v]=Statsample::Bivariate.pearson(@ds[v],total) a end end def mean_rpb - item_total_correlation.values.to_scale.mean + Daru::Vector.new(item_total_correlation.values).mean end def item_statistics - @is||=@ds.fields.inject({}) do |a,v| - a[v]={:mean=>@ds[v].mean, :sds=>Math::sqrt(@cov_m.variance(v))} - a - end + @is||=@ds.vectors.to_a.inject({}) do |a,v| + a[v]={:mean=>@ds[v].mean, :sds=>Math::sqrt(@cov_m.variance(v))} + a + end end # Returns a dataset with cases ordered by score # and variables ordered by difficulty def item_difficulty_analysis dif={} - @ds.fields.each{|f| dif[f]=@ds[f].mean } - dif_sort=dif.sort{|a,b| -(a[1]<=>b[1])} + @ds.vectors.each{|f| dif[f]=@ds[f].mean } + dif_sort = dif.sort { |a,b| -(a[1]<=>b[1]) } scores_sort={} scores=@ds.vector_mean - scores.each_index{|i| scores_sort[i]=scores[i] } + scores.each_index{ |i| scores_sort[i]=scores[i] } scores_sort=scores_sort.sort{|a,b| a[1]<=>b[1]} - ds_new=Statsample::Dataset.new(['case','score'] + dif_sort.collect{|a,b| a}) + ds_new = Daru::DataFrame.new({}, order: ([:case,:score] + dif_sort.collect{|a,b| a.to_sym})) scores_sort.each do |i,score| - row=[i, score] - case_row=@ds.case_as_hash(i) - dif_sort.each{|variable,dif_value| row.push(case_row[variable]) } - ds_new.add_case_array(row) + row = [i, score] + case_row = @ds.row[i].to_h + dif_sort.each{ |variable,dif_value| row.push(case_row[variable]) } + ds_new.add_row(row) end - ds_new.update_valid_data ds_new end @@ -132,9 +129,10 @@ def stats_if_deleted end def stats_if_deleted_intern # :nodoc: - return Hash.new if @ds.fields.size==1 - @ds.fields.inject({}) do |a,v| - cov_2=@cov_m.submatrix(@ds.fields-[v]) + return Hash.new if @ds.ncols == 1 + vecs = @ds.vectors.to_a + vecs.inject({}) do |a,v| + cov_2=@cov_m.submatrix(vecs - [v]) #ds2=@ds.clone #ds2.delete_vector(v) #total=ds2.vector_sum @@ -151,11 +149,10 @@ def stats_if_deleted_intern # :nodoc: def report_building(builder) #:nodoc: builder.section(:name=>@name) do |s| - if @dumped.size>0 s.section(:name=>"Items with variance=0") do |s1| s.table(:name=>_("Summary for %s with all items") % @name) do |t| - t.row [_("Items"), @ods.fields.size] + t.row [_("Items"), @ods.ncols] t.row [_("Sum mean"), "%0.4f" % @o_total.mean] t.row [_("S.d. 
mean"), "%0.4f" % @o_total.sd] end @@ -170,7 +167,7 @@ def report_building(builder) #:nodoc: s.table(:name=>_("Summary for %s") % @name) do |t| - t.row [_("Valid Items"), @ds.fields.size] + t.row [_("Valid Items"), @ds.ncols] t.row [_("Valid cases"), @valid_n] t.row [_("Sum mean"), "%0.4f" % @mean] @@ -193,8 +190,8 @@ def report_building(builder) #:nodoc: end if (@alpha) - s.text _("Items for obtain alpha(0.8) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.8, @ds.fields.size)) - s.text _("Items for obtain alpha(0.9) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.9, @ds.fields.size)) + s.text _("Items for obtain alpha(0.8) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.8, @ds.ncols)) + s.text _("Items for obtain alpha(0.9) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.9, @ds.ncols)) end @@ -203,7 +200,7 @@ def report_building(builder) #:nodoc: itc=item_total_correlation s.table(:name=>_("Items report for %s") % @name, :header=>["item","mean","sd", "mean if deleted", "var if deleted", "sd if deleted"," item-total correl.", "alpha if deleted"]) do |t| - @ds.fields.each do |f| + @ds.vectors.each do |f| row=["#{@ds[f].name}(#{f})"] if is[f] row+=[sprintf("%0.5f",is[f][:mean]), sprintf("%0.5f", is[f][:sds])] diff --git a/lib/statsample/reliability/skillscaleanalysis.rb b/lib/statsample/reliability/skillscaleanalysis.rb index 5ce410b..aff272b 100644 --- a/lib/statsample/reliability/skillscaleanalysis.rb +++ b/lib/statsample/reliability/skillscaleanalysis.rb @@ -4,11 +4,11 @@ module Reliability # Given a dataset with results and a correct answers hash, # generates a ScaleAnalysis # == Usage - # x1=%{a b b c}.to_vector - # x2=%{b a b c}.to_vector - # x3=%{a c b a}.to_vector - # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3}.to_dataset - # key={'x1'=>'a','x2'=>'b','x3'=>'a'} + # x1 = Daru::Vector.new(%{a b b c}) + # x2 = Daru::Vector.new(%{b a b c}) + # x3 = Daru::Vector.new(%{a c b a}) + # ds = Daru::DataFrame.new({:x1 => @x1, :x2 => @x2, :x3 => @x3}) + # key={ :x1 => 'a',:x2 => 'b', :x3 => 'a'} # ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds,key) # puts ssa.summary class SkillScaleAnalysis @@ -30,53 +30,59 @@ def initialize(ds,key,opts=Hash.new) end # Dataset only corrected vectors def corrected_dataset_minimal - cds=corrected_dataset - dsm=@key.keys.inject({}) {|ac,v| ac[v]=cds[v];ac}.to_dataset - @key.keys.each do |k| - dsm[k].name=_("%s(corrected)") % @ds[k].name - dsm[k].labels=@ds[k].labels - end + cds = corrected_dataset + dsm = Daru::DataFrame.new( + @key.keys.inject({}) do |ac,v| + ac[v] = cds[v] + ac + end + ) - dsm.name=_("Corrected dataset from %s") % @ds.name + dsm.rename _("Corrected dataset from %s") % @ds.name dsm end + def vector_sum corrected_dataset_minimal.vector_sum end + def vector_mean corrected_dataset_minimal.vector_mean end + def scale_analysis - sa=ScaleAnalysis.new(corrected_dataset_minimal) + sa = ScaleAnalysis.new(corrected_dataset_minimal) sa.name=_("%s (Scale Analysis)") % @name sa end + def corrected_dataset if @cds.nil? - @cds=@ds.dup_empty - @key.keys.each {|k| @cds[k].type=:scale; @cds[k].name=@ds[k].name} - @ds.each do |row| - out={} - row.each do |k,v| - if @key.keys.include? k - if @ds[k].is_valid? v - out[k]= @key[k]==v ? 1 : 0 + @cds = Daru::DataFrame.new({}, order: @ds.vectors, name: @ds.name) + @ds.each_row do |row| + out = {} + row.each_with_index do |v, k| + if @key.has_key? k + if @ds[k].reject_values(*Daru::MISSING_VALUES).include_values? v + out[k]= @key[k] == v ? 
1 : 0 else - out[k]=nil + out[k] = nil end else - out[k]=v + out[k] = v end end - @cds.add_case(out,false) + + @cds.add_row(Daru::Vector.new(out)) end - @cds.update_valid_data + @cds.update end @cds end + def report_building(builder) builder.section(:name=>@name) do |s| - sa=scale_analysis + sa = scale_analysis s.parse_element(sa) if summary_show_problematic_items s.section(:name=>_("Problematic Items")) do |spi| @@ -91,17 +97,16 @@ def report_building(builder) spi.table(:name=>"Proportions",:header=>[_("Value"), _("%")]) do |table| props.each do |k1,v| - table.row [ @ds[k].labeling(k1), "%0.3f" % v] + table.row [ @ds[k].index_of(k1), "%0.3f" % v] end end - end end end + spi.text _("No problematic items") if count==0 end end - end end end diff --git a/lib/statsample/resample.rb b/lib/statsample/resample.rb index 8a1795d..d6ca1b7 100644 --- a/lib/statsample/resample.rb +++ b/lib/statsample/resample.rb @@ -7,7 +7,7 @@ def repeat_and_save(times,&action) def generate (size,low,upper) range=upper-low+1 - Vector.new((0...size).collect {|x| rand(range)+low },:scale) + Daru::Vector.new((0...size).collect {|x| rand(range)+low }) end end diff --git a/lib/statsample/rserve_extension.rb b/lib/statsample/rserve_extension.rb deleted file mode 100644 index d439c91..0000000 --- a/lib/statsample/rserve_extension.rb +++ /dev/null @@ -1,20 +0,0 @@ -# Several additions to Statsample objects, to support -# rserve-client - -module Statsample - class Vector - def to_REXP - Rserve::REXP::Wrapper.wrap(data_with_nils) - end - end - class Dataset - def to_REXP - names=@fields - data=@fields.map {|f| - Rserve::REXP::Wrapper.wrap(@vectors[f].data_with_nils) - } - l=Rserve::Rlist.new(data,names) - Rserve::REXP.create_data_frame(l) - end - end -end \ No newline at end of file diff --git a/lib/statsample/shorthand.rb b/lib/statsample/shorthand.rb index d4956f3..6f2c5c4 100644 --- a/lib/statsample/shorthand.rb +++ b/lib/statsample/shorthand.rb @@ -11,30 +11,20 @@ module Shorthand ### # :section: R like methods ### - def read_with_cache(klass, filename,opts=Hash.new, cache=true) - file_ds=filename+".ds" - if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename)) - ds=Statsample.load(file_ds) - else - ds=klass.read(filename) - ds.save(file_ds) if cache - end - ds - end - # Import an Excel file. Cache result by default - def read_excel(filename, opts=Hash.new, cache=true) - read_with_cache(Statsample::Excel, filename, opts, cache) + # Import an Excel file + def read_excel(filename, opts=Hash.new) + Daru::DataFrame.from_excel filename, opts end - # Import an CSV file. Cache result by default - def read_csv - read_with_cache(Statsample::CSV, filename, opts, cache) + # Import a CSV file
+ def read_csv(filename, opts=Hash.new) + Daru::DataFrame.from_csv filename, opts end # Retrieve names (fields) from dataset def names(ds) - ds.fields + ds.vectors.to_a end # Create a correlation matrix from a dataset def cor(ds) @@ -44,21 +34,25 @@ def cor(ds) def cov(ds) Statsample::Bivariate.covariate_matrix(ds) end - # Create a Statsample::Vector + # Create a Daru::Vector # Analog to R's c def vector(*args) - Statsample::Vector[*args] + Daru::Vector[*args] end # Random generation for the normal distribution def rnorm(n,mean=0,sd=1) rng=Distribution::Normal.rng(mean,sd) - Statsample::Vector.new_scale(n) { rng.call} + Daru::Vector.new_with_size(n) { rng.call} end - # Creates a new Statsample::Dataset - # Each key is transformed into string + # Creates a new Daru::DataFrame - # Each key is transformed into a Symbol wherever possible. def dataset(vectors=Hash.new) - vectors=vectors.inject({}) {|ac,v| ac[v[0].to_s]=v[1];ac} - Statsample::Dataset.new(vectors) + vectors = vectors.inject({}) do |ac,v| + n = v[0].respond_to?(:to_sym) ? v[0].to_sym : v[0] + ac[n] = v[1] + ac + end + Daru::DataFrame.new(vectors) end alias :data_frame :dataset # Returns a Statsample::Graph::Boxplot @@ -78,13 +72,15 @@ def scatterplot(*args) def levene(*args) Statsample::Test::Levene.new(*args) end + def principal_axis(*args) Statsample::Factor::PrincipalAxis.new(*args) end + def polychoric(*args) Statsample::Bivariate::Polychoric.new(*args) end + def tetrachoric(*args) Statsample::Bivariate::Tetrachoric.new(*args) end @@ -95,27 +91,35 @@ def tetrachoric(*args) def lr(*args) Statsample::Regression.multiple(*args) end + def pca(ds,opts=Hash.new) Statsample::Factor::PCA.new(ds,opts) end + def dominance_analysis(*args) Statsample::DominanceAnalysis.new(*args) end + def dominance_analysis_bootstrap(*args) Statsample::DominanceAnalysis::Bootstrap.new(*args) end + def scale_analysis(*args) Statsample::Reliability::ScaleAnalysis.new(*args) end + def skill_scale_analysis(*args) Statsample::Reliability::SkillScaleAnalysis.new(*args) end + def multiscale_analysis(*args,&block) Statsample::Reliability::MultiScaleAnalysis.new(*args,&block) end + def test_u(*args) Statsample::Test::UMannWhitney.new(*args) end + module_function :test_u, :rnorm end end diff --git a/lib/statsample/test/bartlettsphericity.rb b/lib/statsample/test/bartlettsphericity.rb index 98b6676..b05ed02 100644 --- a/lib/statsample/test/bartlettsphericity.rb +++ b/lib/statsample/test/bartlettsphericity.rb @@ -31,7 +31,7 @@ def initialize(matrix,ncases) # def compute @value=-((@ncases-1)-(2*@nvars+5).quo(6))*Math::log(@matrix.determinant) - @df=(@nvars*(@nvars-1)).quo(2) + @df=(@nvars*(@nvars-1)) / 2 end def probability 1-Distribution::ChiSquare.cdf(@value,@df) end diff --git a/lib/statsample/test/chisquare.rb b/lib/statsample/test/chisquare.rb index 2180ea0..28acb04 100644 --- a/lib/statsample/test/chisquare.rb +++ b/lib/statsample/test/chisquare.rb @@ -1,9 +1,26 @@ module Statsample module Test module ChiSquare - class WithMatrix + module Shared attr_reader :df attr_reader :value + + def to_f + @value + end + + def chi_square + @value + end + + def probability + 1-Distribution::ChiSquare.cdf(@value.to_f,@df) + end + end + + class WithMatrix + include Statsample::Test::ChiSquare::Shared + def initialize(observed, expected=nil) @observed=observed @expected=expected or calculate_expected @@ -11,33 +28,46 @@ def initialize(observed, expected=nil) @df=(@observed.row_size-1)*(@observed.column_size-1) @value=compute_chi end + def
calculate_expected sum=@observed.total_sum @expected=Matrix.rows( @observed.row_size.times.map {|i| @observed.column_size.times.map {|j| (@observed.row_sum[i].quo(sum) * @observed.column_sum[j].quo(sum))*sum } - }) - end - def to_f - @value - end - def chi_square - @value - end - def probability - 1-Distribution::ChiSquare.cdf(@value.to_f,@df) + }) end + def compute_chi - sum=0 - (0...@observed.row_size).each {|i| - (0...@observed.column_size).each {|j| + sum=0 + (0...@observed.row_size).each {|i| + (0...@observed.column_size).each {|j| sum+=((@observed[i, j] - @expected[i,j])**2).quo(@expected[i,j]) - } } - sum + } + sum + end + end + + class WithVector + include Statsample::Test::ChiSquare::Shared + + def initialize(observed, expected) + @observed = observed + @expected = expected + raise "Observed size != expected size" if @observed.size!=@expected.size + @df = @observed.size - 1 + @value = compute_chi + end + + def compute_chi + sum=0 + (0...@observed.size).each {|i| + sum+=((@observed[i] - @expected[i])**2).quo(@expected[i]) + } + sum end end end end -end \ No newline at end of file +end diff --git a/lib/statsample/test/kolmogorovsmirnov.rb b/lib/statsample/test/kolmogorovsmirnov.rb index 31c60f9..f6e7436 100644 --- a/lib/statsample/test/kolmogorovsmirnov.rb +++ b/lib/statsample/test/kolmogorovsmirnov.rb @@ -22,6 +22,7 @@ def initialize(d1,d2) end calculate end + def calculate d=0 @d1.each {|x| @@ -31,12 +32,13 @@ def calculate } @d=d end + # Make a wrapper EmpiricDistribution to any method which implements - # each - # On Statsample::Vector, only uses #valid_data + # each. For a Daru::Vector, only non-missing data is used. def make_cdf(v) - v.is_a?(Statsample::Vector) ? EmpiricDistribution.new(v.valid_data) : EmpiricDistribution.new(v) + v.is_a?(Daru::Vector) ? EmpiricDistribution.new(v.only_valid.to_a) : EmpiricDistribution.new(v) end + class EmpiricDistribution def initialize(data) @min=data.min diff --git a/lib/statsample/test/levene.rb b/lib/statsample/test/levene.rb index 4727ceb..4293bdc 100644 --- a/lib/statsample/test/levene.rb +++ b/lib/statsample/test/levene.rb @@ -5,8 +5,8 @@ module Test #
Levene's test (Levene, 1960) is used to test whether k samples have equal variances. Equal variance across samples is called homogeneity of variance. Some statistical tests, for example the analysis of variance, assume that variances are equal across groups or samples; the Levene test can be used to verify that assumption.
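For reference, the statistic assembled in the `compute` method of this hunk is Levene's W; the formula below is the standard textbook formulation (the notation is mine, not quoted from the source):

$$
W = \frac{N - k}{k - 1} \cdot \frac{\sum_{i=1}^{k} n_i\,(\bar{Z}_{i\cdot} - \bar{Z}_{\cdot\cdot})^2}{\sum_{i=1}^{k} \sum_{j=1}^{n_i} (Z_{ij} - \bar{Z}_{i\cdot})^2},
\qquad Z_{ij} = |Y_{ij} - \bar{Y}_{i\cdot}|
$$

where N is the total number of valid observations and k the number of groups. Under the null hypothesis W follows an F(k-1, N-k) distribution, which is why `compute` sets `@d1 = k - 1` and `@d2 = n - k`, and `probability` takes the right tail of `Distribution::F.cdf`.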
# Use: # require 'statsample' - # a=[1,2,3,4,5,6,7,8,100,10].to_scale - # b=[30,40,50,60,70,80,90,100,110,120].to_scale + # a = Daru::Vector.new([1,2,3,4,5,6,7,8,100,10]) + # b = Daru::Vector.new([30,40,50,60,70,80,90,100,110,120]) # # levene=Statsample::Test::Levene.new([a,b]) # puts levene.summary @@ -29,10 +29,10 @@ class Levene attr_accessor :name # Input could be an array of vectors or a dataset def initialize(input, opts=Hash.new()) - if input.is_a? Statsample::Dataset - @vectors=input.vectors.values + if input.is_a? Daru::DataFrame + @vectors = input.to_h.values else - @vectors=input + @vectors = input end @name=_("Levene Test") opts.each{|k,v| @@ -48,32 +48,34 @@ def report_building(builder) # :nodoc: builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @d1, @d2, f, probability] end def compute - n=@vectors.inject(0) {|ac,v| ac+v.n_valid} + n=@vectors.inject(0) { |ac,v| ac + v.reject_values(*Daru::MISSING_VALUES).size } - zi=@vectors.collect {|vector| + zi=@vectors.collect do |vector| mean=vector.mean - vector.collect {|v| (v-mean).abs }.to_scale - } + Daru::Vector.new(vector.collect { |v| (v - mean).abs }) + end - total_mean=zi.inject([]) {|ac,vector| - ac+vector.valid_data - }.to_scale.mean + total_mean = Daru::Vector.new( + zi.inject([]) do |ac,vector| + ac + vector.reject_values(*Daru::MISSING_VALUES).to_a + end + ).mean - k=@vectors.size - - sum_num=zi.inject(0) {|ac,vector| - ac+(vector.size*(vector.mean-total_mean)**2) - } + k = @vectors.size + sum_num = zi.inject(0) do |ac,vector| + ac + (vector.size * (vector.mean - total_mean)**2) + end - sum_den=zi.inject(0) {|ac,vector| - z_mean=vector.mean - ac+vector.valid_data.inject(0) {|acp,zij| - acp+(zij-z_mean)**2 - } - } - @w=((n-k)*sum_num).quo((k-1)*sum_den) - @d1=k-1 - @d2=n-k + sum_den = zi.inject(0) do |ac,vector| + z_mean = vector.mean + ac + vector.reject_values(*Daru::MISSING_VALUES).to_a.inject(0) do |acp,zij| + acp + (zij - z_mean)**2 + end + end + + @w = ((n - k) * sum_num).quo((k - 1) * sum_den) + @d1 = k - 1 + @d2 = n - k end private :compute # Probability. @@ -81,7 +83,6 @@ def compute def probability p_using_cdf(Distribution::F.cdf(f, @d1, @d2), :right) end - end end end diff --git a/lib/statsample/test/t.rb b/lib/statsample/test/t.rb index d0306a9..b132be8 100644 --- a/lib/statsample/test/t.rb +++ b/lib/statsample/test/t.rb @@ -1,10 +1,8 @@ module Statsample module Test - - - - - # A t-test is any statistical hypothesis test in which the test statistic follows a Student's t distribution, if the null hypothesis is supported + # A t-test is any statistical hypothesis test in which the test + # statistic follows a Student's t distribution, if the null + # hypothesis is supported class T class << self @@ -125,7 +123,7 @@ def report_building_t(s) # One Sample t-test # == Usage - # a=1000.times.map {rand(100)}.to_scale + # a = Daru::Vector.new(1000.times.map {rand(100)}) # t_1=Statsample::Test::T::OneSample.new(a, {:u=>50}) # t_1.summary # @@ -165,11 +163,11 @@ def initialize(vector, opts=Hash.new) @u=@opts[:u] @tails=@opts[:tails] @confidence_level=@opts[:confidence_level] || 0.95 - @df= @vector.n_valid-1 + @df= @vector.reject_values(*Daru::MISSING_VALUES).size-1 @t=nil end def t_object - T.new(@vector.mean-u, @vector.se, @vector.n_valid-1, opts) + T.new(@vector.mean-u, @vector.se, @vector.reject_values(*Daru::MISSING_VALUES).size-1, opts) end def t t_object.t @@ -196,8 +194,8 @@ def report_building(b) # :nodoc: # Two Sample t-test. 
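Both branches of the `compute` method in the hunk below follow the standard two-sample t formulas; for reference (standard textbook notation, not taken from the source):

$$
t_{\text{pooled}} = \frac{\bar{x}_1 - \bar{x}_2}{s_p\sqrt{\tfrac{1}{n_1}+\tfrac{1}{n_2}}},\qquad
s_p^2 = \frac{(n_1-1)s_1^2 + (n_2-1)s_2^2}{n_1+n_2-2},\qquad
t_{\text{Welch}} = \frac{\bar{x}_1 - \bar{x}_2}{\sqrt{\tfrac{s_1^2}{n_1}+\tfrac{s_2^2}{n_2}}}
$$

with df = n1 + n2 - 2 for the pooled (equal-variance) form and the Welch-Satterthwaite approximation for the unequal-variance form; these correspond to `@t_equal_variance`/`@df_equal_variance` and `@t_not_equal_variance`/`@df_not_equal_variance` in `compute`.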
# # == Usage - # a=1000.times.map {rand(100)}.to_scale - # b=1000.times.map {rand(100)}.to_scale + # a = Daru::Vector.new(1000.times.map {rand(100)}) + # b = Daru::Vector.new(1000.times.map {rand(100)}) # t_2=Statsample::Test::T::TwoSamplesIndependent.new(a,b) # t_2.summary # === Output @@ -266,12 +264,12 @@ def initialize(v1, v2, opts=Hash.new) # Set t and probability for given u def compute - @t_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.n_valid, @v2.n_valid,true) + @t_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size,true) - @t_not_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.n_valid, @v2.n_valid, false) + @t_not_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size, false) - @df_equal_variance=T.df_equal_variance(@v1.n_valid, @v2.n_valid) - @df_not_equal_variance=T.df_not_equal_variance(@v1.sd, @v2.sd, @v1.n_valid, @v2.n_valid) + @df_equal_variance=T.df_equal_variance(@v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size) + @df_not_equal_variance=T.df_not_equal_variance(@v1.sd, @v2.sd, @v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size) @probability_equal_variance = p_using_cdf(Distribution::T.cdf(@t_equal_variance, @df_equal_variance), tails) @@ -280,8 +278,8 @@ def compute end # Cohen's d is a measure of effect size. Its defined as the difference between two means divided by a standard deviation for the data def d - n1=@v1.n_valid - n2=@v2.n_valid + n1=@v1.reject_values(*Daru::MISSING_VALUES).size + n2=@v2.reject_values(*Daru::MISSING_VALUES).size num=@v1.mean-@v2.mean den=Math::sqrt( ((n1-1)*@v1.sd+(n2-1)*@v2.sd).quo(n1+n2)) num.quo(den) @@ -290,8 +288,8 @@ def d def report_building(b) # :nodoc: b.section(:name=>@name) {|g| g.table(:name=>_("Mean and standard deviation"), :header=>[_("Variable"), _("mean"), _("sd"),_("n")]) {|t| - t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd,@v1.n_valid]) - t.row([@v2.name,"%0.4f" % @v2.mean,"%0.4f" % @v2.sd, @v2.n_valid]) + t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd, @v1.reject_values(*Daru::MISSING_VALUES).size]) + t.row([@v2.name,"%0.4f" % @v2.mean,"%0.4f" % @v2.sd, @v2.reject_values(*Daru::MISSING_VALUES).size]) } g.parse_element(Statsample::Test.levene([@v1,@v2],:name=>_("Levene test for equality of variances"))) diff --git a/lib/statsample/test/umannwhitney.rb b/lib/statsample/test/umannwhitney.rb index e41d93d..43195cf 100644 --- a/lib/statsample/test/umannwhitney.rb +++ b/lib/statsample/test/umannwhitney.rb @@ -113,36 +113,36 @@ def self.distribution_permutations(n1,n2) include Summarizable # # Create a new U Mann-Whitney test - # Params: Two Statsample::Vectors + # Params: Two Daru::Vectors # def initialize(v1,v2, opts=Hash.new) - @v1=v1 - @v2=v2 - @n1=v1.valid_data.size - @n2=v2.valid_data.size - data=(v1.valid_data+v2.valid_data).to_scale - groups=(([0]*@n1)+([1]*@n2)).to_vector - ds={'g'=>groups, 'data'=>data}.to_dataset - @t=nil - @ties=data.data.size!=data.data.uniq.size - if(@ties) - adjust_for_ties(ds['data']) + @v1 = v1 + @v2 = v2 + v1_valid = v1.reject_values(*Daru::MISSING_VALUES).reset_index! + v2_valid = v2.reject_values(*Daru::MISSING_VALUES).reset_index! 
+ @n1 = v1_valid.size + @n2 = v2_valid.size + data = Daru::Vector.new(v1_valid.to_a + v2_valid.to_a) + groups = Daru::Vector.new(([0] * @n1) + ([1] * @n2)) + ds = Daru::DataFrame.new({:g => groups, :data => data}) + @t = nil + @ties = data.to_a.size != data.to_a.uniq.size + if @ties + adjust_for_ties(ds[:data]) end - ds['ranked']=ds['data'].ranked(:scale) - - @n=ds.cases + ds[:ranked] = ds[:data].ranked + @n = ds.nrows - @r1=ds.filter{|r| r['g']==0}['ranked'].sum - @r2=((ds.cases*(ds.cases+1)).quo(2))-r1 - @u1=r1-((@n1*(@n1+1)).quo(2)) - @u2=r2-((@n2*(@n2+1)).quo(2)) - @u=(u1<u2) ? u1 : u2 - opts_default={:name=>_("Mann-Whitney's U")} - @opts=opts_default.merge(opts) + @r1 = ds.filter_rows { |r| r[:g] == 0}[:ranked].sum + @r2 = ((ds.nrows * (ds.nrows + 1)).quo(2)) - r1 + @u1 = r1 - ((@n1 * (@n1 + 1)).quo(2)) + @u2 = r2 - ((@n2 * (@n2 + 1)).quo(2)) + @u = (u1 < u2) ? u1 : u2 + opts_default = { :name=>_("Mann-Whitney's U") } + @opts = opts_default.merge(opts) opts_default.keys.each {|k| send("#{k}=", @opts[k]) - } - + } end def report_building(generator) # :nodoc: generator.section(:name=>@name) do |s| @@ -160,8 +160,8 @@ def report_building(generator) # :nodoc: # Exact probability of finding values of U lower or equal to sample on U distribution. Use with caution with m*n>100000. # Uses u_sampling_distribution_as62 def probability_exact - dist=UMannWhitney.u_sampling_distribution_as62(@n1,@n2) - sum=0 + dist = UMannWhitney.u_sampling_distribution_as62(@n1,@n2) + sum = 0 (0..@u.to_i).each {|i| sum+=dist[i] } @@ -172,8 +172,8 @@ def probability_exact # == Reference: # * http://europe.isixsigma.com/library/content/c080806a.asp def adjust_for_ties(data) - @t=data.frequencies.find_all{|k,v| v>1}.inject(0) {|a,v| - a+(v[1]**3-v[1]).quo(12) + @t = data.frequencies.to_h.find_all { |k,v| v > 1 }.inject(0) { |a,v| + a + (v[1]**3 - v[1]).quo(12) } end diff --git a/lib/statsample/test/wilcoxonsignedrank.rb b/lib/statsample/test/wilcoxonsignedrank.rb index be8b223..5661904 100644 --- a/lib/statsample/test/wilcoxonsignedrank.rb +++ b/lib/statsample/test/wilcoxonsignedrank.rb @@ -8,13 +8,13 @@ class WilcoxonSignedRank # Name of F analysis attr_accessor :name - attr_reader :w - attr_reader :nr - attr_writer :tails + attr_reader :w + attr_reader :nr + attr_writer :tails # Parameters: def initialize(v1,v2, opts=Hash.new) - @v1=v1 - @v2=v2 + @v1 = v1 + @v2 = v2 opts_default={:name=>_("Wilcoxon Signed Rank Test"),:tails=>:both} @opts=opts_default.merge(opts) opts_default.keys.each {|k| @@ -22,66 +22,68 @@ def initialize(v1,v2, opts=Hash.new) } calculate end + def calculate - df=Statsample::Dataset.new({'v1'=>@v1,'v2'=>@v2}) - df["abs"]=df.collect {|row| - r=(row["v2"]-row["v1"]).abs - } - df["sgn"]=df.collect {|row| - r=row["v2"]-row["v1"] - r==0 ? 0 : r/r.abs - } - df=df.filter {|row| row["sgn"]!=0} - df["rank"]=df["abs"].ranked - @nr=df.cases - @w=df.collect {|row| - row["sgn"]*row["rank"] - #p row["sgn"]*row["rank"] - }.sum + df = Daru::DataFrame.new({:v1 => @v1,:v2 => @v2}) + # df[:abs]=df.collect(:row) { |row| (row[:v2] - row[:v1]).abs } + df[:abs] = (df[:v2] - df[:v1]).abs + df[:sgn] = df.collect(:row) { |row| + r = row[:v2] - row[:v1] + r == 0 ?
0 : r/r.abs + } + df = df.filter_rows { |row| row[:sgn] != 0} + df[:rank] = df[:abs].ranked + @nr = df.nrows + + @w = df.collect(:row) { |row| + row[:sgn] * row[:rank] + }.sum end + def report_building(generator) # :nodoc: generator.section(:name=>@name) do |s| s.table(:name=>_("%s results") % @name) do |t| t.row([_("W Value"), "%0.3f" % @w]) t.row([_("Z"), "%0.3f (p: %0.3f)" % [z, probability_z]]) if(nr<=10) - t.row([_("Exact probability"), "p-exact: %0.3f" % [probability_exact]]) + t.row([_("Exact probability"), "p-exact: %0.3f" % [probability_exact]]) end end end end def z - sigma=Math.sqrt((nr*(nr+1)*(2*nr+1))/6) - (w-0.5)/sigma + sigma=Math.sqrt((nr*(nr+1)*(2*nr+1))/6) + (w-0.5)/sigma end # Assuming normal distribution of W, this calculate # the probability of samples with Z equal or higher than # obtained on sample def probability_z - (1-Distribution::Normal.cdf(z))*(@tails==:both ? 2:1) + (1-Distribution::Normal.cdf(z))*(@tails==:both ? 2:1) end # Calculate exact probability. # Don't calculate for large Nr, please! def probability_exact - str_format="%0#{nr}b" - combinations=2**nr - #p str_format - total_w=combinations.times.map {|i| - comb=sprintf(str_format,i) - w_local=comb.length.times.inject(0) {|ac,j| - sgn=comb[j]=="0" ? -1 : 1 - ac+(j+1)*sgn - } - }.sort - total_w.find_all {|v| - if @tails==:both - v<=-w.abs or v>=w.abs - elsif @tails==:left - v<=w - elsif @tails==:right - v>=w - end - }.count/(combinations.to_f) + str_format="%0#{nr}b" + combinations=2**nr + #p str_format + total_w=combinations.times.map do |i| + comb=sprintf(str_format,i) + w_local=comb.length.times.inject(0) do |ac,j| + sgn=comb[j]=="0" ? -1 : 1 + ac+(j+1)*sgn + end + end.sort + + total_w.find_all do |v| + if @tails==:both + v<=-w.abs or v>=w.abs + elsif @tails==:left + v<=w + elsif @tails==:right + v>=w + end + end.count/(combinations.to_f) end end end diff --git a/lib/statsample/vector.rb b/lib/statsample/vector.rb index 64f5111..caf7ac2 100644 --- a/lib/statsample/vector.rb +++ b/lib/statsample/vector.rb @@ -1,15 +1,8 @@ -require 'date' -require 'statsample/vector/gsl' - module Statsample::VectorShorthands # Creates a new Statsample::Vector object # Argument should be equal to Vector.new def to_vector(*args) - Statsample::Vector.new(self,*args) - end - # Creates a new Statsample::Vector object of type :scale - def to_scale(*args) - Statsample::Vector.new(self, :scale, *args) + Daru::Vector.new(self) end end @@ -24,1047 +17,3 @@ class Vector end end end -module Statsample - - - # Collection of values on one dimension. Works as a column on a Spreadsheet. - # - # == Usage - # The fast way to create a vector uses Array.to_vector or Array.to_scale. - # - # v=[1,2,3,4].to_vector(:scale) - # v=[1,2,3,4].to_scale - # - class Vector - include Enumerable - include Writable - include Summarizable - include Statsample::VectorShorthands - - # Level of measurement. Could be :nominal, :ordinal or :scale - attr_reader :type - # Original data. - attr_reader :data - # Valid data. Equal to data, minus values assigned as missing values - attr_reader :valid_data - # Array of values considered as missing. Nil is a missing value, by default - attr_reader :missing_values - # Array of values considered as "Today", with date type. 
"NOW", "TODAY", :NOW and :TODAY are 'today' values, by default - attr_reader :today_values - # Missing values array - attr_reader :missing_data - # Original data, with all missing values replaced by nils - attr_reader :data_with_nils - # Date date, with all missing values replaced by nils - attr_reader :date_data_with_nils - # Change label for specific values - attr_accessor :labels - # Name of vector. Should be used for output by many classes - attr_accessor :name - - # Creates a new Vector object. - # * data Any data which can be converted on Array - # * type Level of meausurement. See Vector#type - # * opts Hash of options - # * :missing_values Array of missing values. See Vector#missing_values - # * :today_values Array of 'today' values. See Vector#today_values - # * :labels Labels for data values - # * :name Name of vector - def initialize(data=[], type=:nominal, opts=Hash.new) - @data=data.is_a?(Array) ? data : data.to_a - @type=type - opts_default={ - :missing_values=>[], - :today_values=>['NOW','TODAY', :NOW, :TODAY], - :labels=>{}, - :name=>nil - } - @opts=opts_default.merge(opts) - if @opts[:name].nil? - @@n_table||=0 - @@n_table+=1 - @opts[:name]="Vector #{@@n_table}" - end - @missing_values=@opts[:missing_values] - @labels=@opts[:labels] - @today_values=@opts[:today_values] - @name=@opts[:name] - @valid_data=[] - @data_with_nils=[] - @date_data_with_nils=[] - @missing_data=[] - @has_missing_data=nil - @scale_data=nil - set_valid_data - self.type=type - end - # Create a vector using (almost) any object - # * Array: flattened - # * Range: transformed using to_a - # * Statsample::Vector - # * Numeric and string values - def self.[](*args) - values=[] - args.each do |a| - case a - when Array - values.concat a.flatten - when Statsample::Vector - values.concat a.to_a - when Range - values.concat a.to_a - else - values << a - end - end - vector=new(values) - vector.type=:scale if vector.can_be_scale? - vector - end - # Create a new scale type vector - # Parameters - # [n] Size - # [val] Value of each value - # [&block] If block provided, is used to set the values of vector - def self.new_scale(n,val=nil, &block) - if block - vector=n.times.map {|i| block.call(i)}.to_scale - else - vector=n.times.map { val}.to_scale - end - vector.type=:scale - vector - end - # Creates a duplicate of the Vector. - # Note: data, missing_values and labels are duplicated, so - # changes on original vector doesn't propages to copies. - def dup - Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=>@name) - end - # Returns an empty duplicate of the vector. Maintains the type, - # missing values and labels. - def dup_empty - Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name) - end - - if Statsample::STATSAMPLE__.respond_to?(:check_type) - # Raises an exception if type of vector is inferior to t type - def check_type(t) - Statsample::STATSAMPLE__.check_type(self,t) - end - else - def check_type(t) #:nodoc: - _check_type(t) - end - end - - - def _check_type(t) #:nodoc: - raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date) or (:date==@type) - end - - def vector_standarized_compute(m,sd) # :nodoc: - @data_with_nils.collect{|x| x.nil? ? nil : (x.to_f - m).quo(sd) }.to_vector(:scale) - end - # Return a vector usign the standarized values for data - # with sd with denominator n-1. 
With variance=0 or mean nil, - # returns a vector of equal size full of nils - # - def vector_standarized(use_population=false) - check_type :scale - m=mean - sd=use_population ? sdp : sds - return ([nil]*size).to_scale if mean.nil? or sd==0.0 - vector=vector_standarized_compute(m,sd) - vector.name=_("%s(standarized)") % @name - vector - end - def vector_centered_compute(m) #:nodoc: - @data_with_nils.collect {|x| x.nil? ? nil : x.to_f-m }.to_scale - end - # Return a centered vector - def vector_centered - check_type :scale - m=mean - return ([nil]*size).to_scale if mean.nil? - vector=vector_centered_compute(m) - vector.name=_("%s(centered)") % @name - vector - end - - alias_method :standarized, :vector_standarized - alias_method :centered, :vector_centered - # Return a vector with values replaced with the percentiles - # of each values - def vector_percentil - check_type :ordinal - c=@valid_data.size - vector=ranked.map {|i| i.nil? ? nil : (i.quo(c)*100).to_f }.to_vector(@type) - vector.name=_("%s(percentil)") % @name - vector - end - def box_cox_transformation(lambda) # :nodoc: - raise "Should be a scale" unless @type==:scale - @data_with_nils.collect{|x| - if !x.nil? - if(lambda==0) - Math.log(x) - else - (x**lambda-1).quo(lambda) - end - else - nil - end - }.to_vector(:scale) - end - - # Vector equality. - # Two vector will be the same if their data, missing values, type, labels are equals - def ==(v2) - return false unless v2.instance_of? Statsample::Vector - @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels - end - - def _dump(i) # :nodoc: - Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type,'name'=>@name}) - end - - def self._load(data) # :nodoc: - h=Marshal.load(data) - Vector.new(h['data'], h['type'], :missing_values=> h['missing_values'], :labels=>h['labels'], :name=>h['name']) - end - # Returns a new vector, with data modified by block. - # Equivalent to create a Vector after #collect on data - def recode(type=nil) - type||=@type - @data.collect{|x| - yield x - }.to_vector(type) - end - # Modifies current vector, with data modified by block. - # Equivalent to #collect! on @data - def recode! - @data.collect!{|x| - yield x - } - set_valid_data - end - def push(v) - @data.push(v) - set_valid_data - end - # Dicotomize the vector with 0 and 1, based on lowest value - # If parameter if defined, this value and lower - # will be 0 and higher, 1 - def dichotomize(low=nil) - fs=factors - low||=factors.min - @data_with_nils.collect{|x| - if x.nil? - nil - elsif x>low - 1 - else - 0 - end - }.to_scale - end - # Iterate on each item. - # Equivalent to - # @data.each{|x| yield x} - def each - @data.each{|x| yield(x) } - end - - # Iterate on each item, retrieving index - def each_index - (0...@data.size).each {|i| - yield(i) - } - end - # Add a value at the end of the vector. - # If second argument set to false, you should update the Vector usign - # Vector.set_valid_data at the end of your insertion cycle - # - def add(v,update_valid=true) - @data.push(v) - set_valid_data if update_valid - end - # Update valid_data, missing_data, data_with_nils and gsl - # at the end of an insertion. 
- # - # Use after Vector.add(v,false) - # Usage: - # v=Statsample::Vector.new - # v.add(2,false) - # v.add(4,false) - # v.data - # => [2,3] - # v.valid_data - # => [] - # v.set_valid_data - # v.valid_data - # => [2,3] - def set_valid_data - @valid_data.clear - @missing_data.clear - @data_with_nils.clear - @date_data_with_nils.clear - set_valid_data_intern - set_scale_data if(@type==:scale) - set_date_data if(@type==:date) - end - if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern) - def set_valid_data_intern #:nodoc: - Statsample::STATSAMPLE__.set_valid_data_intern(self) - end - else - def set_valid_data_intern #:nodoc: - _set_valid_data_intern - end - end - def _set_valid_data_intern #:nodoc: - @data.each do |n| - if is_valid? n - @valid_data.push(n) - @data_with_nils.push(n) - else - @data_with_nils.push(nil) - @missing_data.push(n) - end - end - @has_missing_data=@missing_data.size>0 - end - - # Retrieves true if data has one o more missing values - def has_missing_data? - @has_missing_data - end - alias :flawed? :has_missing_data? - - # Retrieves label for value x. Retrieves x if - # no label defined. - def labeling(x) - @labels.has_key?(x) ? @labels[x].to_s : x.to_s - end - alias :label :labeling - # Returns a Vector with data with labels replaced by the label. - def vector_labeled - d=@data.collect{|x| - if @labels.has_key? x - @labels[x] - else - x - end - } - Vector.new(d,@type) - end - # Size of total data - def size - @data.size - end - alias_method :n, :size - - # Retrieves i element of data - def [](i) - @data[i] - end - # Set i element of data. - # Note: Use set_valid_data if you include missing values - def []=(i,v) - @data[i]=v - end - # Return true if a value is valid (not nil and not included on missing values) - def is_valid?(x) - !(x.nil? or @missing_values.include? x) - end - # Set missing_values. - # set_valid_data is called after changes - def missing_values=(vals) - @missing_values = vals - set_valid_data - end - # Set data considered as "today" on data vectors - def today_values=(vals) - @today_values = vals - set_valid_data - end - # Set level of measurement. - def type=(t) - @type=t - set_scale_data if(t==:scale) - set_date_data if (t==:date) - end - def to_a - if @data.is_a? Array - @data.dup - else - @data.to_a - end - end - alias_method :to_ary, :to_a - - # Vector sum. - # - If v is a scalar, add this value to all elements - # - If v is a Array or a Vector, should be of the same size of this vector - # every item of this vector will be added to the value of the - # item at the same position on the other vector - def +(v) - _vector_ari("+",v) - end - # Vector rest. - # - If v is a scalar, rest this value to all elements - # - If v is a Array or a Vector, should be of the same - # size of this vector - # every item of this vector will be rested to the value of the - # item at the same position on the other vector - - def -(v) - _vector_ari("-",v) - end - - def *(v) - _vector_ari("*",v) - end - # Reports all values that doesn't comply with a condition. - # Returns a hash with the index of data and the invalid data. - def verify - h={} - (0...@data.size).to_a.each{|i| - if !(yield @data[i]) - h[i]=@data[i] - end - } - h - end - def _vector_ari(method,v) # :nodoc: - if(v.is_a? Vector or v.is_a? Array) - raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size - sum=[] - v.size.times {|i| - if((v.is_a? 
Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?)) - sum.push(@data[i].send(method,v[i])) - else - sum.push(nil) - end - } - Statsample::Vector.new(sum, :scale) - elsif(v.respond_to? method ) - Statsample::Vector.new( - @data.collect {|x| - if(!x.nil?) - x.send(method,v) - else - nil - end - } , :scale) - else - raise TypeError,"You should pass a scalar or a array/vector" - end - - end - # Return an array with the data splitted by a separator. - # a=Vector.new(["a,b","c,d","a,b","d"]) - # a.splitted - # => - # [["a","b"],["c","d"],["a","b"],["d"]] - def splitted(sep=Statsample::SPLIT_TOKEN) - @data.collect{|x| - if x.nil? - nil - elsif (x.respond_to? :split) - x.split(sep) - else - [x] - end - } - end - # Returns a hash of Vectors, defined by the different values - # defined on the fields - # Example: - # - # a=Vector.new(["a,b","c,d","a,b"]) - # a.split_by_separator - # => {"a"=>#, - # "b"=>#, - # "c"=>#} - # - def split_by_separator(sep=Statsample::SPLIT_TOKEN) - split_data=splitted(sep) - factors=split_data.flatten.uniq.compact - out=factors.inject({}) {|a,x| - a[x]=[] - a - } - split_data.each do |r| - if r.nil? - factors.each do |f| - out[f].push(nil) - end - else - factors.each do |f| - out[f].push(r.include?(f) ? 1:0) - end - end - end - out.inject({}){|s,v| - s[v[0]]=Vector.new(v[1],:nominal) - s - } - end - def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN) - split_by_separator(sep).inject({}) {|a,v| - a[v[0]]=v[1].inject {|s,x| s+x.to_i} - a - } - end - - # == Bootstrap - # Generate +nr+ resamples (with replacement) of size +s+ - # from vector, computing each estimate from +estimators+ - # over each resample. - # +estimators+ could be - # a) Hash with variable names as keys and lambdas as values - # a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000) - # b) Array with names of method to bootstrap - # a.bootstrap([:mean, :sd],1000) - # c) A single method to bootstrap - # a.jacknife(:mean, 1000) - # If s is nil, is set to vector size by default. - # - # Returns a dataset where each vector is an vector - # of length +nr+ containing the computed resample estimates. - def bootstrap(estimators, nr, s=nil) - s||=n - - h_est, es, bss= prepare_bootstrap(estimators) - - - nr.times do |i| - bs=sample_with_replacement(s) - es.each do |estimator| - # Add bootstrap - bss[estimator].push(h_est[estimator].call(bs)) - end - end - - es.each do |est| - bss[est]=bss[est].to_scale - bss[est].type=:scale - end - bss.to_dataset - - end - - # == Jacknife - # Returns a dataset with jacknife delete-+k+ +estimators+ - # +estimators+ could be: - # a) Hash with variable names as keys and lambdas as values - # a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)}) - # b) Array with method names to jacknife - # a.jacknife([:mean, :sd]) - # c) A single method to jacknife - # a.jacknife(:mean) - # +k+ represent the block size for block jacknife. By default - # is set to 1, for classic delete-one jacknife. - # - # Returns a dataset where each vector is an vector - # of length +cases+/+k+ containing the computed jacknife estimates. - # - # == Reference: - # * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife. 
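Since the jacknife logic is being removed along with the rest of this class, here is a minimal standalone sketch of the delete-one jacknife the comment above describes, written against plain Ruby arrays (the method and variable names are mine, for illustration only):

```ruby
# Sketch: delete-1 jacknife pseudovalues for an arbitrary estimator.
#   data      - Array of numeric observations
#   estimator - lambda taking an Array and returning a number
def jacknife_pseudovalues(data, estimator)
  n    = data.size
  full = estimator.call(data)                  # estimate on the full sample
  data.each_index.map do |i|
    rest    = data[0...i] + data[(i + 1)..-1]  # sample with the i-th case deleted
    partial = estimator.call(rest)
    n * full - (n - 1) * partial               # i-th pseudovalue
  end
end

mean = lambda { |a| a.inject(0.0) { |s, x| s + x } / a.size }
pseudovalues = jacknife_pseudovalues([1, 2, 3, 4, 5], mean)
# The jacknife estimate is the mean of the pseudovalues; their sample
# variance divided by n estimates the variance of the estimator.
```

This mirrors the pseudovalue computation in the deleted `jacknife` method (`nb * est_n[estimator] - (nb - 1) * h_est[estimator].call(other)`), specialized to block size k = 1.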
- def jacknife(estimators, k=1) - raise "n should be divisible by k:#{k}" unless n%k==0 - - nb=(n / k).to_i - - - h_est, es, ps= prepare_bootstrap(estimators) - - est_n=es.inject({}) {|h,v| - h[v]=h_est[v].call(self) - h - } - - - nb.times do |i| - other=@data_with_nils.dup - other.slice!(i*k,k) - other=other.to_scale - es.each do |estimator| - # Add pseudovalue - ps[estimator].push( nb * est_n[estimator] - (nb-1) * h_est[estimator].call(other)) - end - end - - - es.each do |est| - ps[est]=ps[est].to_scale - ps[est].type=:scale - end - ps.to_dataset - end - - - # For an array or hash of estimators methods, returns - # an array with three elements - # 1.- A hash with estimators names as keys and lambdas as values - # 2.- An array with estimators names - # 3.- A Hash with estimators names as keys and empty arrays as values - def prepare_bootstrap(estimators) - h_est=estimators - - h_est=[h_est] unless h_est.is_a? Array or h_est.is_a? Hash - - if h_est.is_a? Array - h_est=h_est.inject({}) {|h,est| - h[est]=lambda {|v| v.send(est)} - h - } - end - - bss=h_est.keys.inject({}) {|h,v| h[v]=[];h} - - [h_est,h_est.keys, bss] - - end - private :prepare_bootstrap - - # Returns an random sample of size n, with replacement, - # only with valid data. - # - # In all the trails, every item have the same probability - # of been selected. - def sample_with_replacement(sample=1) - vds=@valid_data.size - (0...sample).collect{ @valid_data[rand(vds)] } - end - # Returns an random sample of size n, without replacement, - # only with valid data. - # - # Every element could only be selected once. - # - # A sample of the same size of the vector is the vector itself. - - def sample_without_replacement(sample=1) - raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size - out=[] - size=@valid_data.size - while out.sizedir could be :horizontal or :vertical - def to_matrix(dir=:horizontal) - case dir - when :horizontal - Matrix[@data] - when :vertical - Matrix.columns([@data]) - end - end - def inspect - self.to_s - end - # Retrieves uniques values for data. - def factors - if @type==:scale - @scale_data.uniq.sort - elsif @type==:date - @date_data_with_nils.uniq.sort - else - @valid_data.uniq.sort - end - end - if Statsample::STATSAMPLE__.respond_to?(:frequencies) - # Returns a hash with the distribution of frecuencies for - # the sample - def frequencies - Statsample::STATSAMPLE__.frequencies(@valid_data) - end - else - def frequencies #:nodoc: - _frequencies - end - end - - - def _frequencies #:nodoc: - @valid_data.inject(Hash.new) {|a,x| - a[x]||=0 - a[x]=a[x]+1 - a - } - end - - # Returns the most frequent item. - def mode - frequencies.max{|a,b| a[1]<=>b[1]}.first - end - # The numbers of item with valid data. - def n_valid - @valid_data.size - end - # Returns a hash with the distribution of proportions of - # the sample. - def proportions - frequencies.inject({}){|a,v| - a[v[0]] = v[1].quo(n_valid) - a - } - end - # Proportion of a given value. - def proportion(v=1) - frequencies[v].quo(@valid_data.size) - end - def report_building(b) - b.section(:name=>name) do |s| - s.text _("n :%d") % n - s.text _("n valid:%d") % n_valid - if @type==:nominal - s.text _("factors:%s") % factors.join(",") - s.text _("mode: %s") % mode - - s.table(:name=>_("Distribution")) do |t| - frequencies.sort.each do |k,v| - key=labels.has_key?(k) ? 
labels[k]:k - t.row [key, v , ("%0.2f%%" % (v.quo(n_valid)*100))] - end - end - end - - s.text _("median: %s") % median.to_s if(@type==:ordinal or @type==:scale) - if(@type==:scale) - s.text _("mean: %0.4f") % mean - if sd - s.text _("std.dev.: %0.4f") % sd - s.text _("std.err.: %0.4f") % se - s.text _("skew: %0.4f") % skew - s.text _("kurtosis: %0.4f") % kurtosis - end - end - end - end - - # Variance of p, according to poblation size - def variance_proportion(n_poblation, v=1) - Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation) - end - # Variance of p, according to poblation size - def variance_total(n_poblation, v=1) - Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation) - end - def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1) - Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin) - end - def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1) - Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin) - end - - self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met| - met_or=met.gsub("_slow","") - if !self.method_defined?(met_or) - alias_method met_or, met - end - end - - ###### - ### Ordinal Methods - ###### - - # == Percentil - # Returns the value of the percentile q - # - # Accepts an optional second argument specifying the strategy to interpolate - # when the requested percentile lies between two data points a and b - # Valid strategies are: - # * :midpoint (Default): (a + b) / 2 - # * :linear : a + (b - a) * d where d is the decimal part of the index between a and b. - # This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method) - # - def percentil(q, strategy = :midpoint) - check_type :ordinal - sorted=@valid_data.sort - - case strategy - when :midpoint - v = (n_valid * q).quo(100) - if(v.to_i!=v) - sorted[v.to_i] - else - (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2) - end - when :linear - index = (q / 100.0) * (n_valid + 1) - - k = index.truncate - d = index % 1 - - if k == 0 - sorted[0] - elsif k >= sorted.size - sorted[-1] - else - sorted[k - 1] + d * (sorted[k] - sorted[k - 1]) - end - else - raise NotImplementedError.new "Unknown strategy #{strategy.to_s}" - end - end - - # Returns a ranked vector. - def ranked(type=:ordinal) - check_type :ordinal - i=0 - r=frequencies.sort.inject({}){|a,v| - a[v[0]]=(i+1 + i+v[1]).quo(2) - i+=v[1] - a - } - @data.collect {|c| r[c] }.to_vector(type) - end - # Return the median (percentil 50) - def median - check_type :ordinal - percentil(50) - end - # Minimun value - def min - check_type :ordinal - @valid_data.min - end - # Maximum value - def max - check_type :ordinal - @valid_data.max - end - - def set_date_data - @date_data_with_nils=@data.collect do|x| - if x.is_a? Date - x - elsif x.is_a? Time - Date.new(x.year, x.month, x.day) - elsif x.is_a? String and x=~/(\d{4,4})[-\/](\d{1,2})[-\/](\d{1,2})/ - Date.new($1.to_i,$2.to_i,$3.to_i) - elsif @today_values.include? x - Date.today() - elsif @missing_values.include? x or x.nil? - nil - end - end - end - - def set_scale_data - @scale_data=@valid_data.collect do|x| - if x.is_a? Numeric - x - elsif x.is_a? 
String and x.to_i==x.to_f - x.to_i - else - x.to_f - end - end - end - - private :set_date_data, :set_scale_data - - # The range of the data (max - min) - def range; - check_type :scale - @scale_data.max - @scale_data.min - end - # The sum of values for the data - def sum - check_type :scale - @scale_data.inject(0){|a,x|x+a} ; - end - # The arithmetical mean of data - def mean - check_type :scale - sum.to_f.quo(n_valid) - end - # Sum of squares for the data around a value. - # By default, this value is the mean - # ss= sum{(xi-m)^2} - # - def sum_of_squares(m=nil) - check_type :scale - m||=mean - @scale_data.inject(0){|a,x| a+(x-m).square} - end - # Sum of squared deviation - def sum_of_squared_deviation - check_type :scale - @scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid)) - end - - # Population variance (denominator N) - def variance_population(m=nil) - check_type :scale - m||=mean - squares=@scale_data.inject(0){|a,x| x.square+a} - squares.quo(n_valid) - m.square - end - - - # Population Standard deviation (denominator N) - def standard_deviation_population(m=nil) - check_type :scale - Math::sqrt( variance_population(m) ) - end - - # Population average deviation (denominator N) - # author: Al Chou - - def average_deviation_population( m = nil ) - check_type :scale - m ||= mean - ( @scale_data.inject( 0 ) { |a, x| ( x - m ).abs + a } ).quo( n_valid ) - end - def median_absolute_deviation - med=median - recode {|x| (x-med).abs}.median - end - alias :mad :median_absolute_deviation - # Sample Variance (denominator n-1) - def variance_sample(m=nil) - check_type :scale - m||=mean - sum_of_squares(m).quo(n_valid - 1) - end - - # Sample Standard deviation (denominator n-1) - def standard_deviation_sample(m=nil) - check_type :scale - m||=mean - Math::sqrt(variance_sample(m)) - end - # Skewness of the sample - def skew(m=nil) - check_type :scale - m||=mean - th=@scale_data.inject(0){|a,x| a+((x-m)**3)} - th.quo((@scale_data.size)*sd(m)**3) - end - # Kurtosis of the sample - def kurtosis(m=nil) - check_type :scale - m||=mean - fo=@scale_data.inject(0){|a,x| a+((x-m)**4)} - fo.quo((@scale_data.size)*sd(m)**4)-3 - - end - # Product of all values on the sample - # - def product - check_type :scale - @scale_data.inject(1){|a,x| a*x } - end - - # With a fixnum, creates X bins within the range of data - # With an Array, each value will be a cut point - def histogram(bins=10) - check_type :scale - - if bins.is_a? Array - #h=Statsample::Histogram.new(self, bins) - h=Statsample::Histogram.alloc(bins) - else - # ugly patch. 
The upper limit for a bin has the form - # x < range - #h=Statsample::Histogram.new(self, bins) - min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max) - # fix last data - if max==@valid_data.max - max+=1e-10 - end - h=Statsample::Histogram.alloc(bins,[min,max]) - # Fix last bin - - end - h.increment(@valid_data) - h - end - - # Coefficient of variation - # Calculed with the sample standard deviation - def coefficient_of_variation - check_type :scale - standard_deviation_sample.quo(mean) - end - # Standard error of the distribution mean - # Calculated using sd/sqrt(n) - def standard_error - standard_deviation_sample.quo(Math.sqrt(valid_data.size)) - end - alias :se :standard_error - - alias_method :sdp, :standard_deviation_population - alias_method :sds, :standard_deviation_sample - alias_method :adp, :average_deviation_population - alias_method :cov, :coefficient_of_variation - alias_method :variance, :variance_sample - alias_method :sd, :standard_deviation_sample - alias_method :ss, :sum_of_squares - include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl? - end -end diff --git a/lib/statsample/vector/gsl.rb b/lib/statsample/vector/gsl.rb deleted file mode 100644 index 9b12418..0000000 --- a/lib/statsample/vector/gsl.rb +++ /dev/null @@ -1,106 +0,0 @@ -module Statsample - class Vector - module GSL_ - def clear_gsl - @gsl=nil - end - - def set_valid_data - clear_gsl - set_valid_data_ruby - end - def push(v) - # If data is GSL::Vector, should be converted first to an Array - if @data.is_a? GSL::Vector - @data=@data.to_a - end - push_ruby(v) - end - - def gsl - @gsl||=GSL::Vector.alloc(@scale_data) if @scale_data.size>0 - end - - alias :to_gsl :gsl - def vector_standarized_compute(m,sd) - if flawed? - vector_standarized_compute_ruby(m,sd) - else - gsl.collect {|x| (x.to_f - m).quo(sd)}.to_scale - end - end - - def vector_centered_compute(m) - if flawed? - vector_centered_compute_ruby(m) - else - gsl.collect {|x| (x.to_f - m)}.to_scale - end - end - def sample_with_replacement(sample=1) - if(@type!=:scale) - sample_with_replacement_ruby(sample) - else - r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000)) - Statsample::Vector.new(r.sample(gsl, sample).to_a,:scale) - end - end - - def sample_without_replacement(sample=1) - if(@type!=:scale) - sample_without_replacement_ruby(sample) - else - r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000)) - r.choose(gsl, sample).to_a - end - end - def median - if @type!=:scale - median_ruby - else - sorted=GSL::Vector.alloc(@scale_data.sort) - GSL::Stats::median_from_sorted_data(sorted) - end - end - - def sum - check_type :scale - gsl.nil? ? nil : gsl.sum - end - def mean - check_type :scale - gsl.nil? ? nil : gsl.mean - end - def variance_sample(m=nil) - check_type :scale - m||=mean - gsl.nil? ? nil : gsl.variance_m - end - - def standard_deviation_sample(m=nil) - check_type :scale - m||=mean - gsl.nil? ? nil : gsl.sd(m) - end - - def variance_population(m=nil) # :nodoc: - check_type :scale - m||=mean - gsl.nil? ? nil : gsl.variance_with_fixed_mean(m) - end - def standard_deviation_population(m=nil) # :nodoc: - check_type :scale - m||=mean - gsl.nil? ? nil : gsl.sd_with_fixed_mean(m) - end - def skew # :nodoc: - check_type :scale - gsl.nil? ? nil : gsl.skew - end - def kurtosis # :nodoc: - check_type :scale - gsl.nil? ? 
nil : gsl.kurtosis - end - end - end -end diff --git a/lib/statsample/version.rb b/lib/statsample/version.rb index 4da66f2..b14c020 100644 --- a/lib/statsample/version.rb +++ b/lib/statsample/version.rb @@ -1,3 +1,3 @@ module Statsample - VERSION = '1.4.0' + VERSION = '2.1.0' end diff --git a/setup.rb b/setup.rb deleted file mode 100644 index 424a5f3..0000000 --- a/setup.rb +++ /dev/null @@ -1,1585 +0,0 @@ -# -# setup.rb -# -# Copyright (c) 2000-2005 Minero Aoki -# -# This program is free software. -# You can distribute/modify this program under the terms of -# the GNU LGPL, Lesser General Public License version 2.1. -# - -unless Enumerable.method_defined?(:map) # Ruby 1.4.6 - module Enumerable - alias map collect - end -end - -unless File.respond_to?(:read) # Ruby 1.6 - def File.read(fname) - open(fname) {|f| - return f.read - } - end -end - -unless Errno.const_defined?(:ENOTEMPTY) # Windows? - module Errno - class ENOTEMPTY - # We do not raise this exception, implementation is not needed. - end - end -end - -def File.binread(fname) - open(fname, 'rb') {|f| - return f.read - } -end - -# for corrupted Windows' stat(2) -def File.dir?(path) - File.directory?((path[-1,1] == '/') ? path : path + '/') -end - - -class ConfigTable - - include Enumerable - - def initialize(rbconfig) - @rbconfig = rbconfig - @items = [] - @table = {} - # options - @install_prefix = nil - @config_opt = nil - @verbose = true - @no_harm = false - end - - attr_accessor :install_prefix - attr_accessor :config_opt - - attr_writer :verbose - - def verbose? - @verbose - end - - attr_writer :no_harm - - def no_harm? - @no_harm - end - - def [](key) - lookup(key).resolve(self) - end - - def []=(key, val) - lookup(key).set val - end - - def names - @items.map {|i| i.name } - end - - def each(&block) - @items.each(&block) - end - - def key?(name) - @table.key?(name) - end - - def lookup(name) - @table[name] or setup_rb_error "no such config item: #{name}" - end - - def add(item) - @items.push item - @table[item.name] = item - end - - def remove(name) - item = lookup(name) - @items.delete_if {|i| i.name == name } - @table.delete_if {|name, i| i.name == name } - item - end - - def load_script(path, inst = nil) - if File.file?(path) - MetaConfigEnvironment.new(self, inst).instance_eval File.read(path), path - end - end - - def savefile - '.config' - end - - def load_savefile - begin - File.foreach(savefile()) do |line| - k, v = *line.split(/=/, 2) - self[k] = v.strip - end - rescue Errno::ENOENT - setup_rb_error $!.message + "\n#{File.basename($0)} config first" - end - end - - def save - @items.each {|i| i.value } - File.open(savefile(), 'w') {|f| - @items.each do |i| - f.printf "%s=%s\n", i.name, i.value if i.value? and i.value - end - } - end - - def load_standard_entries - standard_entries(@rbconfig).each do |ent| - add ent - end - end - - def standard_entries(rbconfig) - c = rbconfig - - rubypath = File.join(c['bindir'], c['ruby_install_name'] + c['EXEEXT']) - - major = c['MAJOR'].to_i - minor = c['MINOR'].to_i - teeny = c['TEENY'].to_i - version = "#{major}.#{minor}" - - # ruby ver. >= 1.4.4? 
- newpath_p = ((major >= 2) or - ((major == 1) and - ((minor >= 5) or - ((minor == 4) and (teeny >= 4))))) - - if c['rubylibdir'] - # V > 1.6.3 - libruby = "#{c['prefix']}/lib/ruby" - librubyver = c['rubylibdir'] - librubyverarch = c['archdir'] - siteruby = c['sitedir'] - siterubyver = c['sitelibdir'] - siterubyverarch = c['sitearchdir'] - elsif newpath_p - # 1.4.4 <= V <= 1.6.3 - libruby = "#{c['prefix']}/lib/ruby" - librubyver = "#{c['prefix']}/lib/ruby/#{version}" - librubyverarch = "#{c['prefix']}/lib/ruby/#{version}/#{c['arch']}" - siteruby = c['sitedir'] - siterubyver = "$siteruby/#{version}" - siterubyverarch = "$siterubyver/#{c['arch']}" - else - # V < 1.4.4 - libruby = "#{c['prefix']}/lib/ruby" - librubyver = "#{c['prefix']}/lib/ruby/#{version}" - librubyverarch = "#{c['prefix']}/lib/ruby/#{version}/#{c['arch']}" - siteruby = "#{c['prefix']}/lib/ruby/#{version}/site_ruby" - siterubyver = siteruby - siterubyverarch = "$siterubyver/#{c['arch']}" - end - parameterize = lambda {|path| - path.sub(/\A#{Regexp.quote(c['prefix'])}/, '$prefix') - } - - if arg = c['configure_args'].split.detect {|arg| /--with-make-prog=/ =~ arg } - makeprog = arg.sub(/'/, '').split(/=/, 2)[1] - else - makeprog = 'make' - end - - [ - ExecItem.new('installdirs', 'std/site/home', - 'std: install under libruby; site: install under site_ruby; home: install under $HOME')\ - {|val, table| - case val - when 'std' - table['rbdir'] = '$librubyver' - table['sodir'] = '$librubyverarch' - when 'site' - table['rbdir'] = '$siterubyver' - table['sodir'] = '$siterubyverarch' - when 'home' - setup_rb_error '$HOME was not set' unless ENV['HOME'] - table['prefix'] = ENV['HOME'] - table['rbdir'] = '$libdir/ruby' - table['sodir'] = '$libdir/ruby' - end - }, - PathItem.new('prefix', 'path', c['prefix'], - 'path prefix of target environment'), - PathItem.new('bindir', 'path', parameterize.call(c['bindir']), - 'the directory for commands'), - PathItem.new('libdir', 'path', parameterize.call(c['libdir']), - 'the directory for libraries'), - PathItem.new('datadir', 'path', parameterize.call(c['datadir']), - 'the directory for shared data'), - PathItem.new('mandir', 'path', parameterize.call(c['mandir']), - 'the directory for man pages'), - PathItem.new('sysconfdir', 'path', parameterize.call(c['sysconfdir']), - 'the directory for system configuration files'), - PathItem.new('localstatedir', 'path', parameterize.call(c['localstatedir']), - 'the directory for local state data'), - PathItem.new('libruby', 'path', libruby, - 'the directory for ruby libraries'), - PathItem.new('librubyver', 'path', librubyver, - 'the directory for standard ruby libraries'), - PathItem.new('librubyverarch', 'path', librubyverarch, - 'the directory for standard ruby extensions'), - PathItem.new('siteruby', 'path', siteruby, - 'the directory for version-independent aux ruby libraries'), - PathItem.new('siterubyver', 'path', siterubyver, - 'the directory for aux ruby libraries'), - PathItem.new('siterubyverarch', 'path', siterubyverarch, - 'the directory for aux ruby binaries'), - PathItem.new('rbdir', 'path', '$siterubyver', - 'the directory for ruby scripts'), - PathItem.new('sodir', 'path', '$siterubyverarch', - 'the directory for ruby extentions'), - PathItem.new('rubypath', 'path', rubypath, - 'the path to set to #! 
line'), - ProgramItem.new('rubyprog', 'name', rubypath, - 'the ruby program using for installation'), - ProgramItem.new('makeprog', 'name', makeprog, - 'the make program to compile ruby extentions'), - SelectItem.new('shebang', 'all/ruby/never', 'ruby', - 'shebang line (#!) editing mode'), - BoolItem.new('without-ext', 'yes/no', 'no', - 'does not compile/install ruby extentions') - ] - end - private :standard_entries - - def load_multipackage_entries - multipackage_entries().each do |ent| - add ent - end - end - - def multipackage_entries - [ - PackageSelectionItem.new('with', 'name,name...', '', 'ALL', - 'package names that you want to install'), - PackageSelectionItem.new('without', 'name,name...', '', 'NONE', - 'package names that you do not want to install') - ] - end - private :multipackage_entries - - ALIASES = { - 'std-ruby' => 'librubyver', - 'stdruby' => 'librubyver', - 'rubylibdir' => 'librubyver', - 'archdir' => 'librubyverarch', - 'site-ruby-common' => 'siteruby', # For backward compatibility - 'site-ruby' => 'siterubyver', # For backward compatibility - 'bin-dir' => 'bindir', - 'bin-dir' => 'bindir', - 'rb-dir' => 'rbdir', - 'so-dir' => 'sodir', - 'data-dir' => 'datadir', - 'ruby-path' => 'rubypath', - 'ruby-prog' => 'rubyprog', - 'ruby' => 'rubyprog', - 'make-prog' => 'makeprog', - 'make' => 'makeprog' - } - - def fixup - ALIASES.each do |ali, name| - @table[ali] = @table[name] - end - @items.freeze - @table.freeze - @options_re = /\A--(#{@table.keys.join('|')})(?:=(.*))?\z/ - end - - def parse_opt(opt) - m = @options_re.match(opt) or setup_rb_error "config: unknown option #{opt}" - m.to_a[1,2] - end - - def dllext - @rbconfig['DLEXT'] - end - - def value_config?(name) - lookup(name).value? - end - - class Item - def initialize(name, template, default, desc) - @name = name.freeze - @template = template - @value = default - @default = default - @description = desc - end - - attr_reader :name - attr_reader :description - - attr_accessor :default - alias help_default default - - def help_opt - "--#{@name}=#{@template}" - end - - def value? - true - end - - def value - @value - end - - def resolve(table) - @value.gsub(%r<\$([^/]+)>) { table[$1] } - end - - def set(val) - @value = check(val) - end - - private - - def check(val) - setup_rb_error "config: --#{name} requires argument" unless val - val - end - end - - class BoolItem < Item - def config_type - 'bool' - end - - def help_opt - "--#{@name}" - end - - private - - def check(val) - return 'yes' unless val - case val - when /\Ay(es)?\z/i, /\At(rue)?\z/i then 'yes' - when /\An(o)?\z/i, /\Af(alse)\z/i then 'no' - else - setup_rb_error "config: --#{@name} accepts only yes/no for argument" - end - end - end - - class PathItem < Item - def config_type - 'path' - end - - private - - def check(path) - setup_rb_error "config: --#{@name} requires argument" unless path - path[0,1] == '$' ? path : File.expand_path(path) - end - end - - class ProgramItem < Item - def config_type - 'program' - end - end - - class SelectItem < Item - def initialize(name, selection, default, desc) - super - @ok = selection.split('/') - end - - def config_type - 'select' - end - - private - - def check(val) - unless @ok.include?(val.strip) - setup_rb_error "config: use --#{@name}=#{@template} (#{val})" - end - val.strip - end - end - - class ExecItem < Item - def initialize(name, selection, desc, &block) - super name, selection, nil, desc - @ok = selection.split('/') - @action = block - end - - def config_type - 'exec' - end - - def value? 
- false - end - - def resolve(table) - setup_rb_error "$#{name()} wrongly used as option value" - end - - undef set - - def evaluate(val, table) - v = val.strip.downcase - unless @ok.include?(v) - setup_rb_error "invalid option --#{@name}=#{val} (use #{@template})" - end - @action.call v, table - end - end - - class PackageSelectionItem < Item - def initialize(name, template, default, help_default, desc) - super name, template, default, desc - @help_default = help_default - end - - attr_reader :help_default - - def config_type - 'package' - end - - private - - def check(val) - unless File.dir?("packages/#{val}") - setup_rb_error "config: no such package: #{val}" - end - val - end - end - - class MetaConfigEnvironment - def initialize(config, installer) - @config = config - @installer = installer - end - - def config_names - @config.names - end - - def config?(name) - @config.key?(name) - end - - def bool_config?(name) - @config.lookup(name).config_type == 'bool' - end - - def path_config?(name) - @config.lookup(name).config_type == 'path' - end - - def value_config?(name) - @config.lookup(name).config_type != 'exec' - end - - def add_config(item) - @config.add item - end - - def add_bool_config(name, default, desc) - @config.add BoolItem.new(name, 'yes/no', default ? 'yes' : 'no', desc) - end - - def add_path_config(name, default, desc) - @config.add PathItem.new(name, 'path', default, desc) - end - - def set_config_default(name, default) - @config.lookup(name).default = default - end - - def remove_config(name) - @config.remove(name) - end - - # For only multipackage - def packages - raise '[setup.rb fatal] multi-package metaconfig API packages() called for single-package; contact application package vendor' unless @installer - @installer.packages - end - - # For only multipackage - def declare_packages(list) - raise '[setup.rb fatal] multi-package metaconfig API declare_packages() called for single-package; contact application package vendor' unless @installer - @installer.packages = list - end - end - -end # class ConfigTable - - -# This module requires: #verbose?, #no_harm? -module FileOperations - - def mkdir_p(dirname, prefix = nil) - dirname = prefix + File.expand_path(dirname) if prefix - $stderr.puts "mkdir -p #{dirname}" if verbose? - return if no_harm? - - # Does not check '/', it's too abnormal. - dirs = File.expand_path(dirname).split(%r<(?=/)>) - if /\A[a-z]:\z/i =~ dirs[0] - disk = dirs.shift - dirs[0] = disk + dirs[0] - end - dirs.each_index do |idx| - path = dirs[0..idx].join('') - Dir.mkdir path unless File.dir?(path) - end - end - - def rm_f(path) - $stderr.puts "rm -f #{path}" if verbose? - return if no_harm? - force_remove_file path - end - - def rm_rf(path) - $stderr.puts "rm -rf #{path}" if verbose? - return if no_harm? - remove_tree path - end - - def remove_tree(path) - if File.symlink?(path) - remove_file path - elsif File.dir?(path) - remove_tree0 path - else - force_remove_file path - end - end - - def remove_tree0(path) - Dir.foreach(path) do |ent| - next if ent == '.' - next if ent == '..' 
- entpath = "#{path}/#{ent}" - if File.symlink?(entpath) - remove_file entpath - elsif File.dir?(entpath) - remove_tree0 entpath - else - force_remove_file entpath - end - end - begin - Dir.rmdir path - rescue Errno::ENOTEMPTY - # directory may not be empty - end - end - - def move_file(src, dest) - force_remove_file dest - begin - File.rename src, dest - rescue - File.open(dest, 'wb') {|f| - f.write File.binread(src) - } - File.chmod File.stat(src).mode, dest - File.unlink src - end - end - - def force_remove_file(path) - begin - remove_file path - rescue - end - end - - def remove_file(path) - File.chmod 0777, path - File.unlink path - end - - def install(from, dest, mode, prefix = nil) - $stderr.puts "install #{from} #{dest}" if verbose? - return if no_harm? - - realdest = prefix ? prefix + File.expand_path(dest) : dest - realdest = File.join(realdest, File.basename(from)) if File.dir?(realdest) - str = File.binread(from) - if diff?(str, realdest) - verbose_off { - rm_f realdest if File.exist?(realdest) - } - File.open(realdest, 'wb') {|f| - f.write str - } - File.chmod mode, realdest - - File.open("#{objdir_root()}/InstalledFiles", 'a') {|f| - if prefix - f.puts realdest.sub(prefix, '') - else - f.puts realdest - end - } - end - end - - def diff?(new_content, path) - return true unless File.exist?(path) - new_content != File.binread(path) - end - - def command(*args) - $stderr.puts args.join(' ') if verbose? - system(*args) or raise RuntimeError, - "system(#{args.map{|a| a.inspect }.join(' ')}) failed" - end - - def ruby(*args) - command config('rubyprog'), *args - end - - def make(task = nil) - command(*[config('makeprog'), task].compact) - end - - def extdir?(dir) - File.exist?("#{dir}/MANIFEST") or File.exist?("#{dir}/extconf.rb") - end - - def files_of(dir) - Dir.open(dir) {|d| - return d.select {|ent| File.file?("#{dir}/#{ent}") } - } - end - - DIR_REJECT = %w( . .. CVS SCCS RCS CVS.adm .svn ) - - def directories_of(dir) - Dir.open(dir) {|d| - return d.select {|ent| File.dir?("#{dir}/#{ent}") } - DIR_REJECT - } - end - -end - - -# This module requires: #srcdir_root, #objdir_root, #relpath -module HookScriptAPI - - def get_config(key) - @config[key] - end - - alias config get_config - - # obsolete: use metaconfig to change configuration - def set_config(key, val) - @config[key] = val - end - - # - # srcdir/objdir (works only in the package directory) - # - - def curr_srcdir - "#{srcdir_root()}/#{relpath()}" - end - - def curr_objdir - "#{objdir_root()}/#{relpath()}" - end - - def srcfile(path) - "#{curr_srcdir()}/#{path}" - end - - def srcexist?(path) - File.exist?(srcfile(path)) - end - - def srcdirectory?(path) - File.dir?(srcfile(path)) - end - - def srcfile?(path) - File.file?(srcfile(path)) - end - - def srcentries(path = '.') - Dir.open("#{curr_srcdir()}/#{path}") {|d| - return d.to_a - %w(. ..) 
- } - end - - def srcfiles(path = '.') - srcentries(path).select {|fname| - File.file?(File.join(curr_srcdir(), path, fname)) - } - end - - def srcdirectories(path = '.') - srcentries(path).select {|fname| - File.dir?(File.join(curr_srcdir(), path, fname)) - } - end - -end - - -class ToplevelInstaller - - Version = '3.4.1' - Copyright = 'Copyright (c) 2000-2005 Minero Aoki' - - TASKS = [ - [ 'all', 'do config, setup, then install' ], - [ 'config', 'saves your configurations' ], - [ 'show', 'shows current configuration' ], - [ 'setup', 'compiles ruby extentions and others' ], - [ 'install', 'installs files' ], - [ 'test', 'run all tests in test/' ], - [ 'clean', "does `make clean' for each extention" ], - [ 'distclean',"does `make distclean' for each extention" ] - ] - - def ToplevelInstaller.invoke - config = ConfigTable.new(load_rbconfig()) - config.load_standard_entries - config.load_multipackage_entries if multipackage? - config.fixup - klass = (multipackage?() ? ToplevelInstallerMulti : ToplevelInstaller) - klass.new(File.dirname($0), config).invoke - end - - def ToplevelInstaller.multipackage? - File.dir?(File.dirname($0) + '/packages') - end - - def ToplevelInstaller.load_rbconfig - if arg = ARGV.detect {|arg| /\A--rbconfig=/ =~ arg } - ARGV.delete(arg) - load File.expand_path(arg.split(/=/, 2)[1]) - $".push 'rbconfig.rb' - else - require 'rbconfig' - end - ::Config::CONFIG - end - - def initialize(ardir_root, config) - @ardir = File.expand_path(ardir_root) - @config = config - # cache - @valid_task_re = nil - end - - def config(key) - @config[key] - end - - def inspect - "#<#{self.class} #{__id__()}>" - end - - def invoke - run_metaconfigs - case task = parsearg_global() - when nil, 'all' - parsearg_config - init_installers - exec_config - exec_setup - exec_install - else - case task - when 'config', 'test' - ; - when 'clean', 'distclean' - @config.load_savefile if File.exist?(@config.savefile) - else - @config.load_savefile - end - __send__ "parsearg_#{task}" - init_installers - __send__ "exec_#{task}" - end - end - - def run_metaconfigs - @config.load_script "#{@ardir}/metaconfig" - end - - def init_installers - @installer = Installer.new(@config, @ardir, File.expand_path('.')) - end - - # - # Hook Script API bases - # - - def srcdir_root - @ardir - end - - def objdir_root - '.' - end - - def relpath - '.' - end - - # - # Option Parsing - # - - def parsearg_global - while arg = ARGV.shift - case arg - when /\A\w+\z/ - setup_rb_error "invalid task: #{arg}" unless valid_task?(arg) - return arg - when '-q', '--quiet' - @config.verbose = false - when '--verbose' - @config.verbose = true - when '--help' - print_usage $stdout - exit 0 - when '--version' - puts "#{File.basename($0)} version #{Version}" - exit 0 - when '--copyright' - puts Copyright - exit 0 - else - setup_rb_error "unknown global option '#{arg}'" - end - end - nil - end - - def valid_task?(t) - valid_task_re() =~ t - end - - def valid_task_re - @valid_task_re ||= /\A(?:#{TASKS.map {|task,desc| task }.join('|')})\z/ - end - - def parsearg_no_options - unless ARGV.empty? 
- task = caller(0).first.slice(%r<`parsearg_(\w+)'>, 1)
- setup_rb_error "#{task}: unknown options: #{ARGV.join(' ')}"
- end
- end
-
- alias parsearg_show parsearg_no_options
- alias parsearg_setup parsearg_no_options
- alias parsearg_test parsearg_no_options
- alias parsearg_clean parsearg_no_options
- alias parsearg_distclean parsearg_no_options
-
- def parsearg_config
- evalopt = []
- set = []
- @config.config_opt = []
- while i = ARGV.shift
- if /\A--?\z/ =~ i
- @config.config_opt = ARGV.dup
- break
- end
- name, value = *@config.parse_opt(i)
- if @config.value_config?(name)
- @config[name] = value
- else
- evalopt.push [name, value]
- end
- set.push name
- end
- evalopt.each do |name, value|
- @config.lookup(name).evaluate value, @config
- end
- # Check if configuration is valid
- set.each do |n|
- @config[n] if @config.value_config?(n)
- end
- end
-
- def parsearg_install
- @config.no_harm = false
- @config.install_prefix = ''
- while a = ARGV.shift
- case a
- when '--no-harm'
- @config.no_harm = true
- when /\A--prefix=/
- path = a.split(/=/, 2)[1]
- path = File.expand_path(path) unless path[0,1] == '/'
- @config.install_prefix = path
- else
- setup_rb_error "install: unknown option #{a}"
- end
- end
- end
-
- def print_usage(out)
- out.puts 'Typical Installation Procedure:'
- out.puts " $ ruby #{File.basename $0} config"
- out.puts " $ ruby #{File.basename $0} setup"
- out.puts " # ruby #{File.basename $0} install (may require root privilege)"
- out.puts
- out.puts 'Detailed Usage:'
- out.puts " ruby #{File.basename $0} <global option>"
- out.puts " ruby #{File.basename $0} [<global options>] <task> [<task options>]"
-
- fmt = " %-24s %s\n"
- out.puts
- out.puts 'Global options:'
- out.printf fmt, '-q,--quiet', 'suppress message outputs'
- out.printf fmt, ' --verbose', 'output messages verbosely'
- out.printf fmt, ' --help', 'print this message'
- out.printf fmt, ' --version', 'print version and quit'
- out.printf fmt, ' --copyright', 'print copyright and quit'
- out.puts
- out.puts 'Tasks:'
- TASKS.each do |name, desc|
- out.printf fmt, name, desc
- end
-
- fmt = " %-24s %s [%s]\n"
- out.puts
- out.puts 'Options for CONFIG or ALL:'
- @config.each do |item|
- out.printf fmt, item.help_opt, item.description, item.help_default
- end
- out.printf fmt, '--rbconfig=path', 'rbconfig.rb to load',"running ruby's"
- out.puts
- out.puts 'Options for INSTALL:'
- out.printf fmt, '--no-harm', 'only display what to do if given', 'off'
- out.printf fmt, '--prefix=path', 'install path prefix', ''
- out.puts
- end
-
- #
- # Task Handlers
- #
-
- def exec_config
- @installer.exec_config
- @config.save # must be final
- end
-
- def exec_setup
- @installer.exec_setup
- end
-
- def exec_install
- @installer.exec_install
- end
-
- def exec_test
- @installer.exec_test
- end
-
- def exec_show
- @config.each do |i|
- printf "%-20s %s\n", i.name, i.value if i.value?
- end
- end
-
- def exec_clean
- @installer.exec_clean
- end
-
- def exec_distclean
- @installer.exec_distclean
- end
-
-end # class ToplevelInstaller
-
-
-class ToplevelInstallerMulti < ToplevelInstaller
-
- include FileOperations
-
- def initialize(ardir_root, config)
- super
- @packages = directories_of("#{@ardir}/packages")
- raise 'no package exists' if @packages.empty?
- @root_installer = Installer.new(@config, @ardir, File.expand_path('.')) - end - - def run_metaconfigs - @config.load_script "#{@ardir}/metaconfig", self - @packages.each do |name| - @config.load_script "#{@ardir}/packages/#{name}/metaconfig" - end - end - - attr_reader :packages - - def packages=(list) - raise 'package list is empty' if list.empty? - list.each do |name| - raise "directory packages/#{name} does not exist"\ - unless File.dir?("#{@ardir}/packages/#{name}") - end - @packages = list - end - - def init_installers - @installers = {} - @packages.each do |pack| - @installers[pack] = Installer.new(@config, - "#{@ardir}/packages/#{pack}", - "packages/#{pack}") - end - with = extract_selection(config('with')) - without = extract_selection(config('without')) - @selected = @installers.keys.select {|name| - (with.empty? or with.include?(name)) \ - and not without.include?(name) - } - end - - def extract_selection(list) - a = list.split(/,/) - a.each do |name| - setup_rb_error "no such package: #{name}" unless @installers.key?(name) - end - a - end - - def print_usage(f) - super - f.puts 'Inluded packages:' - f.puts ' ' + @packages.sort.join(' ') - f.puts - end - - # - # Task Handlers - # - - def exec_config - run_hook 'pre-config' - each_selected_installers {|inst| inst.exec_config } - run_hook 'post-config' - @config.save # must be final - end - - def exec_setup - run_hook 'pre-setup' - each_selected_installers {|inst| inst.exec_setup } - run_hook 'post-setup' - end - - def exec_install - run_hook 'pre-install' - each_selected_installers {|inst| inst.exec_install } - run_hook 'post-install' - end - - def exec_test - run_hook 'pre-test' - each_selected_installers {|inst| inst.exec_test } - run_hook 'post-test' - end - - def exec_clean - rm_f @config.savefile - run_hook 'pre-clean' - each_selected_installers {|inst| inst.exec_clean } - run_hook 'post-clean' - end - - def exec_distclean - rm_f @config.savefile - run_hook 'pre-distclean' - each_selected_installers {|inst| inst.exec_distclean } - run_hook 'post-distclean' - end - - # - # lib - # - - def each_selected_installers - Dir.mkdir 'packages' unless File.dir?('packages') - @selected.each do |pack| - $stderr.puts "Processing the package `#{pack}' ..." if verbose? - Dir.mkdir "packages/#{pack}" unless File.dir?("packages/#{pack}") - Dir.chdir "packages/#{pack}" - yield @installers[pack] - Dir.chdir '../..' - end - end - - def run_hook(id) - @root_installer.run_hook id - end - - # module FileOperations requires this - def verbose? - @config.verbose? - end - - # module FileOperations requires this - def no_harm? - @config.no_harm? - end - -end # class ToplevelInstallerMulti - - -class Installer - - FILETYPES = %w( bin lib ext data conf man ) - - include FileOperations - include HookScriptAPI - - def initialize(config, srcroot, objroot) - @config = config - @srcdir = File.expand_path(srcroot) - @objdir = File.expand_path(objroot) - @currdir = '.' - end - - def inspect - "#<#{self.class} #{File.basename(@srcdir)}>" - end - - def noop(rel) - end - - # - # Hook Script API base methods - # - - def srcdir_root - @srcdir - end - - def objdir_root - @objdir - end - - def relpath - @currdir - end - - # - # Config Access - # - - # module FileOperations requires this - def verbose? - @config.verbose? - end - - # module FileOperations requires this - def no_harm? - @config.no_harm? 
- end - - def verbose_off - begin - save, @config.verbose = @config.verbose?, false - yield - ensure - @config.verbose = save - end - end - - # - # TASK config - # - - def exec_config - exec_task_traverse 'config' - end - - alias config_dir_bin noop - alias config_dir_lib noop - - def config_dir_ext(rel) - extconf if extdir?(curr_srcdir()) - end - - alias config_dir_data noop - alias config_dir_conf noop - alias config_dir_man noop - - def extconf - ruby "#{curr_srcdir()}/extconf.rb", *@config.config_opt - end - - # - # TASK setup - # - - def exec_setup - exec_task_traverse 'setup' - end - - def setup_dir_bin(rel) - files_of(curr_srcdir()).each do |fname| - update_shebang_line "#{curr_srcdir()}/#{fname}" - end - end - - alias setup_dir_lib noop - - def setup_dir_ext(rel) - make if extdir?(curr_srcdir()) - end - - alias setup_dir_data noop - alias setup_dir_conf noop - alias setup_dir_man noop - - def update_shebang_line(path) - return if no_harm? - return if config('shebang') == 'never' - old = Shebang.load(path) - if old - $stderr.puts "warning: #{path}: Shebang line includes too many args. It is not portable and your program may not work." if old.args.size > 1 - new = new_shebang(old) - return if new.to_s == old.to_s - else - return unless config('shebang') == 'all' - new = Shebang.new(config('rubypath')) - end - $stderr.puts "updating shebang: #{File.basename(path)}" if verbose? - open_atomic_writer(path) {|output| - File.open(path, 'rb') {|f| - f.gets if old # discard - output.puts new.to_s - output.print f.read - } - } - end - - def new_shebang(old) - if /\Aruby/ =~ File.basename(old.cmd) - Shebang.new(config('rubypath'), old.args) - elsif File.basename(old.cmd) == 'env' and old.args.first == 'ruby' - Shebang.new(config('rubypath'), old.args[1..-1]) - else - return old unless config('shebang') == 'all' - Shebang.new(config('rubypath')) - end - end - - def open_atomic_writer(path, &block) - tmpfile = File.basename(path) + '.tmp' - begin - File.open(tmpfile, 'wb', &block) - File.rename tmpfile, File.basename(path) - ensure - File.unlink tmpfile if File.exist?(tmpfile) - end - end - - class Shebang - def Shebang.load(path) - line = nil - File.open(path) {|f| - line = f.gets - } - return nil unless /\A#!/ =~ line - parse(line) - end - - def Shebang.parse(line) - cmd, *args = *line.strip.sub(/\A\#!/, '').split(' ') - new(cmd, args) - end - - def initialize(cmd, args = []) - @cmd = cmd - @args = args - end - - attr_reader :cmd - attr_reader :args - - def to_s - "#! #{@cmd}" + (@args.empty? ? 
'' : " #{@args.join(' ')}") - end - end - - # - # TASK install - # - - def exec_install - rm_f 'InstalledFiles' - exec_task_traverse 'install' - end - - def install_dir_bin(rel) - install_files targetfiles(), "#{config('bindir')}/#{rel}", 0755 - end - - def install_dir_lib(rel) - install_files libfiles(), "#{config('rbdir')}/#{rel}", 0644 - end - - def install_dir_ext(rel) - return unless extdir?(curr_srcdir()) - install_files rubyextentions('.'), - "#{config('sodir')}/#{File.dirname(rel)}", - 0555 - end - - def install_dir_data(rel) - install_files targetfiles(), "#{config('datadir')}/#{rel}", 0644 - end - - def install_dir_conf(rel) - # FIXME: should not remove current config files - # (rename previous file to .old/.org) - install_files targetfiles(), "#{config('sysconfdir')}/#{rel}", 0644 - end - - def install_dir_man(rel) - install_files targetfiles(), "#{config('mandir')}/#{rel}", 0644 - end - - def install_files(list, dest, mode) - mkdir_p dest, @config.install_prefix - list.each do |fname| - install fname, dest, mode, @config.install_prefix - end - end - - def libfiles - glob_reject(%w(*.y *.output), targetfiles()) - end - - def rubyextentions(dir) - ents = glob_select("*.#{@config.dllext}", targetfiles()) - if ents.empty? - setup_rb_error "no ruby extention exists: 'ruby #{$0} setup' first" - end - ents - end - - def targetfiles - mapdir(existfiles() - hookfiles()) - end - - def mapdir(ents) - ents.map {|ent| - if File.exist?(ent) - then ent # objdir - else "#{curr_srcdir()}/#{ent}" # srcdir - end - } - end - - # picked up many entries from cvs-1.11.1/src/ignore.c - JUNK_FILES = %w( - core RCSLOG tags TAGS .make.state - .nse_depinfo #* .#* cvslog.* ,* .del-* *.olb - *~ *.old *.bak *.BAK *.orig *.rej _$* *$ - - *.org *.in .* - ) - - def existfiles - glob_reject(JUNK_FILES, (files_of(curr_srcdir()) | files_of('.'))) - end - - def hookfiles - %w( pre-%s post-%s pre-%s.rb post-%s.rb ).map {|fmt| - %w( config setup install clean ).map {|t| sprintf(fmt, t) } - }.flatten - end - - def glob_select(pat, ents) - re = globs2re([pat]) - ents.select {|ent| re =~ ent } - end - - def glob_reject(pats, ents) - re = globs2re(pats) - ents.reject {|ent| re =~ ent } - end - - GLOB2REGEX = { - '.' => '\.', - '$' => '\$', - '#' => '\#', - '*' => '.*' - } - - def globs2re(pats) - /\A(?:#{ - pats.map {|pat| pat.gsub(/[\.\$\#\*]/) {|ch| GLOB2REGEX[ch] } }.join('|') - })\z/ - end - - # - # TASK test - # - - TESTDIR = 'test' - - def exec_test - unless File.directory?('test') - $stderr.puts 'no test in this package' if verbose? - return - end - $stderr.puts 'Running tests...' if verbose? - begin - require 'test/unit' - rescue LoadError - setup_rb_error 'test/unit cannot loaded. You need Ruby 1.8 or later to invoke this task.' 
- end - runner = Test::Unit::AutoRunner.new(true) - runner.to_run << TESTDIR - runner.run - end - - # - # TASK clean - # - - def exec_clean - exec_task_traverse 'clean' - rm_f @config.savefile - rm_f 'InstalledFiles' - end - - alias clean_dir_bin noop - alias clean_dir_lib noop - alias clean_dir_data noop - alias clean_dir_conf noop - alias clean_dir_man noop - - def clean_dir_ext(rel) - return unless extdir?(curr_srcdir()) - make 'clean' if File.file?('Makefile') - end - - # - # TASK distclean - # - - def exec_distclean - exec_task_traverse 'distclean' - rm_f @config.savefile - rm_f 'InstalledFiles' - end - - alias distclean_dir_bin noop - alias distclean_dir_lib noop - - def distclean_dir_ext(rel) - return unless extdir?(curr_srcdir()) - make 'distclean' if File.file?('Makefile') - end - - alias distclean_dir_data noop - alias distclean_dir_conf noop - alias distclean_dir_man noop - - # - # Traversing - # - - def exec_task_traverse(task) - run_hook "pre-#{task}" - FILETYPES.each do |type| - if type == 'ext' and config('without-ext') == 'yes' - $stderr.puts 'skipping ext/* by user option' if verbose? - next - end - traverse task, type, "#{task}_dir_#{type}" - end - run_hook "post-#{task}" - end - - def traverse(task, rel, mid) - dive_into(rel) { - run_hook "pre-#{task}" - __send__ mid, rel.sub(%r[\A.*?(?:/|\z)], '') - directories_of(curr_srcdir()).each do |d| - traverse task, "#{rel}/#{d}", mid - end - run_hook "post-#{task}" - } - end - - def dive_into(rel) - return unless File.dir?("#{@srcdir}/#{rel}") - - dir = File.basename(rel) - Dir.mkdir dir unless File.dir?(dir) - prevdir = Dir.pwd - Dir.chdir dir - $stderr.puts '---> ' + rel if verbose? - @currdir = rel - yield - Dir.chdir prevdir - $stderr.puts '<--- ' + rel if verbose? - @currdir = File.dirname(rel) - end - - def run_hook(id) - path = [ "#{curr_srcdir()}/#{id}", - "#{curr_srcdir()}/#{id}.rb" ].detect {|cand| File.file?(cand) } - return unless path - begin - instance_eval File.read(path), path, 1 - rescue - raise if $DEBUG - setup_rb_error "hook #{path} failed:\n" + $!.message - end - end - -end # class Installer - - -class SetupError < StandardError; end - -def setup_rb_error(msg) - raise SetupError, msg -end - -if $0 == __FILE__ - begin - ToplevelInstaller.invoke - rescue SetupError - raise if $DEBUG - $stderr.puts $!.message - $stderr.puts "Try 'ruby #{$0} --help' for detailed usage." 
- exit 1
- end
-end
diff --git a/statsample.gemspec b/statsample.gemspec
new file mode 100644
index 0000000..4ff94bf
--- /dev/null
+++ b/statsample.gemspec
@@ -0,0 +1,87 @@
+$:.unshift File.expand_path("../lib/", __FILE__)
+
+require 'statsample/version'
+require 'date'
+
+Statsample::DESCRIPTION = <<-EOF
+  A suite for basic and advanced statistics on Ruby.
+EOF
+
+Gem::Specification.new do |s|
+ s.name = 'statsample'
+ s.version = Statsample::VERSION
+ s.date = Date.today.to_s
+ s.description = Statsample::DESCRIPTION
+ # NOTE: the full description text and the remaining spec metadata
+ # (summary, authors, homepage, file lists) are truncated in this diff.
+
+ s.add_runtime_dependency 'daru', '~> 0.1.6'
+ s.add_runtime_dependency 'spreadsheet', '~> 1.1'
+ s.add_runtime_dependency 'reportbuilder', '~> 1.4'
+ s.add_runtime_dependency 'minimization', '~> 0.2'
+ s.add_runtime_dependency 'dirty-memoize', '~> 0.0.4'
+ s.add_runtime_dependency 'extendmatrix', '~> 0.4'
+ s.add_runtime_dependency 'rserve-client', '~> 0.3'
+ s.add_runtime_dependency 'rubyvis', '~> 0.6.1'
+ s.add_runtime_dependency 'distribution', '~> 0.7'
+ s.add_runtime_dependency 'awesome_print', '~> 1.6'
+
+ s.add_development_dependency 'bundler', '~> 1.10'
+ s.add_development_dependency 'rake', '~> 10.4'
+ s.add_development_dependency 'rdoc', '~> 4.2'
+ s.add_development_dependency 'shoulda', '~> 3.5'
+ s.add_development_dependency 'shoulda-matchers', '~> 2.2'
+ s.add_development_dependency 'minitest', '~> 5.7'
+ s.add_development_dependency 'gettext', '~> 3.1'
+ s.add_development_dependency 'mocha', '~> 1.1'
+ s.add_development_dependency 'nmatrix', '~> 0.2.1'
+ s.add_development_dependency 'gsl', '~> 2.1'
+ s.add_development_dependency 'pry'
+ s.add_development_dependency 'rubocop'
+ s.add_development_dependency 'activesupport', '~> 4.2'
+end
diff --git a/test/fixtures/df.csv b/test/fixtures/df.csv
new file mode 100644
index 0000000..4398132
--- /dev/null
+++ b/test/fixtures/df.csv
@@ -0,0 +1,15 @@
+y,a,b,c,d,e
+0,6,62.1,no,female,A
+1,18,34.7,yes,male,B
+1,6,29.7,no,female,C
+0,4,71,no,male,C
+1,5,36.9,yes,male,B
+0,11,58.7,no,female,B
+0,8,63.3,no,male,B
+1,21,20.4,yes,male,A
+1,2,20.5,yes,male,C
+0,11,59.2,no,male,B
+0,1,76.4,yes,female,A
+0,8,71.7,no,female,B
+1,2,77.5,no,male,C
+1,3,31.1,no,male,B
\ No newline at end of file
diff --git a/test/fixtures/repeated_fields.csv b/test/fixtures/repeated_fields.csv
deleted file mode 100644
index 90010dd..0000000
--- a/test/fixtures/repeated_fields.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-"id","name","age","city","a1","name","age"
-1,"Alex",20,"New York","a,b","a",3
-2,"Claude",23,"London","b,c","b",4
-3,"Peter",25,"London","a","c",5
-4,"Franz",27,"Paris",,"d",6
-5,"George","5,5","Tome","a,b,c","f",
-6,"Fernand",20,"London","c,b","f",8
diff --git a/test/fixtures/test_csv.csv b/test/fixtures/test_csv.csv
deleted file mode 100644
index 667beaf..0000000
--- a/test/fixtures/test_csv.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-"id","name","age","city","a1"
-1,"Alex",20,"New York","a,b"
-2,"Claude",23,"London","b,c"
-3,"Peter",25,"London","a"
-4,"Franz",27,"Paris",
-5,"George","5,5","Tome","a,b,c"
-6,"Fernand",,,
diff --git a/test/fixtures/test_xls.xls b/test/fixtures/test_xls.xls
deleted file mode 100644
index 043890d..0000000
Binary files a/test/fixtures/test_xls.xls and /dev/null differ
diff --git a/test/helpers_tests.rb b/test/helpers_tests.rb
index 47495e7..99d2d28 100644
--- a/test/helpers_tests.rb
+++ b/test/helpers_tests.rb
@@ -1,5 +1,6 @@
-$:.unshift(File.expand_path(File.dirname(__FILE__)+'/../lib/'))
-$:.unshift(File.expand_path(File.dirname(__FILE__)+'/'))
+$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib/'))
+$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + '/'))
+
 require 'minitest'
 require 'minitest/unit'
 require 'mocha/setup'
@@ -11,67 +12,72 @@ require 'statsample'
-
-module MiniTest
+module Minitest
 class Test
 include Shoulda::Context::Assertions
include Shoulda::Context::InstanceMethods extend Shoulda::Context::ClassMethods - def self.should_with_gsl(name,&block) - should(name) do - if Statsample.has_gsl? - instance_eval(&block) - else - skip("Requires GSL") - end - + + def self.should_with_gsl(name, &block) + should(name) do + if Statsample.has_gsl? + instance_eval(&block) + else + skip('Requires GSL') end - - + end end end module Assertions - def assert_similar_vector(exp, obs, delta=1e-10,msg=nil) - msg||="Different vectors #{exp} - #{obs}" + def assert_vectors_from_formula(formula, names) + model = Statsample::FitModel.new formula, @df + + model.df_for_regression.vectors.to_a.sort + .must_equal names.sort + end + + def assert_similar_vector(exp, obs, delta = 1e-10, msg = nil) + msg ||= "Different vectors #{exp} - #{obs}" assert_equal(exp.size, obs.size) - exp.data_with_nils.each_with_index {|v,i| - assert_in_delta(v,obs[i],delta) + exp.to_a.each_with_index {|v, i| + assert_in_delta(v, obs[i], delta) } end - def assert_equal_vector(exp,obs,delta=1e-10,msg=nil) + + def assert_equal_vector(exp, obs, delta = 1e-10, msg = nil) assert_equal(exp.size, obs.size, "Different size.#{msg}") exp.size.times {|i| - assert_in_delta(exp[i],obs[i],delta, "Different element #{i}. \nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}") + assert_in_delta(exp[i], obs[i], delta, "Different element #{i}. \nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}") } end - def assert_equal_matrix(exp,obs,delta=1e-10,msg=nil) - assert_equal(exp.row_size, obs.row_size, "Different row size.#{msg}") - assert_equal(exp.column_size, obs.column_size, "Different column size.#{msg}") - exp.row_size.times {|i| - exp.column_size.times {|j| - assert_in_delta(exp[i,j],obs[i,j], delta, "Different element #{i},#{j}\nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}") - } - } + + def assert_equal_matrix(exp, obs, delta = 1e-10, msg = nil) + assert_equal(exp.row_size, obs.row_size, "Different row size.#{msg}") + assert_equal(exp.column_size, obs.column_size, "Different column size.#{msg}") + exp.row_size.times {|i| + exp.column_size.times {|j| + assert_in_delta(exp[i, j], obs[i, j], delta, "Different element #{i},#{j}\nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}") + } + } end - alias :assert_raise :assert_raises unless method_defined? :assert_raise - alias :assert_not_equal :refute_equal unless method_defined? :assert_not_equal - alias :assert_not_same :refute_same unless method_defined? :assert_not_same + alias_method :assert_raise, :assert_raises unless method_defined? :assert_raise + alias_method :assert_not_equal, :refute_equal unless method_defined? :assert_not_equal + alias_method :assert_not_same, :refute_same unless method_defined? :assert_not_same unless method_defined? 
:assert_nothing_raised - def assert_nothing_raised(msg=nil) - msg||="Nothing should be raised, but raised %s" + def assert_nothing_raised(msg = nil) + msg ||= 'Nothing should be raised, but raised %s' begin yield - not_raised=true + not_raised = true rescue Exception => e - not_raised=false - msg=sprintf(msg,e) + not_raised = false + msg = sprintf(msg, e) end - assert(not_raised,msg) + assert(not_raised, msg) end end end end MiniTest.autorun - diff --git a/test/test_analysis.rb b/test/test_analysis.rb index 9799f57..20d8985 100644 --- a/test/test_analysis.rb +++ b/test/test_analysis.rb @@ -1,77 +1,77 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) -class StatsampleAnalysisTestCase < MiniTest::Unit::TestCase +class StatsampleAnalysisTestCase < Minitest::Test context(Statsample::Analysis) do setup do Statsample::Analysis.clear_analysis end - should "store() should create and store Statsample::Analysis::Suite" do + should 'store() should create and store Statsample::Analysis::Suite' do Statsample::Analysis.store(:first) do - a=1 + a = 1 end assert(Statsample::Analysis.stored_analysis[:first]) assert(Statsample::Analysis.stored_analysis[:first].is_a? Statsample::Analysis::Suite) end - - should "ss_analysis should create an Statsample::Analysis" do - ss_analysis(:first) {a=1} + + should 'ss_analysis should create an Statsample::Analysis' do + ss_analysis(:first) { a = 1 } end - should "store last created analysis" do - an=Statsample::Analysis.store(:first) do - a=1 + should 'store last created analysis' do + an = Statsample::Analysis.store(:first) do + a = 1 end - assert_equal(an,Statsample::Analysis.last) + assert_equal(an, Statsample::Analysis.last) end - - should "add_to_reportbuilder() add sections to reportbuilder object" do - rb=mock() - rb.expects(:add).with {|value| value.is_a? ReportBuilder::Section and value.name==:first} - rb.expects(:add).with {|value| value.is_a? ReportBuilder::Section and value.name==:second} - + + should 'add_to_reportbuilder() add sections to reportbuilder object' do + rb = mock + rb.expects(:add).with { |value| value.is_a? ReportBuilder::Section and value.name == :first } + rb.expects(:add).with { |value| value.is_a? 
ReportBuilder::Section and value.name == :second } + Statsample::Analysis.store(:first) do - echo "first","second" + echo 'first', 'second' end Statsample::Analysis.store(:second) do - echo "third" + echo 'third' end - Statsample::Analysis.add_to_reportbuilder(rb,:first,:second) + Statsample::Analysis.add_to_reportbuilder(rb, :first, :second) end - should "to_text returns the same as a normal ReportBuilder object" do - rb=ReportBuilder.new(:name=>:test) - section=ReportBuilder::Section.new(:name=>"first") - a=[1,2,3].to_scale - section.add("first") + should 'to_text returns the same as a normal ReportBuilder object' do + rb = ReportBuilder.new(name: :test) + section = ReportBuilder::Section.new(name: 'first') + a = Daru::Vector.new([1, 2, 3]) + section.add('first') section.add(a) rb.add(section) - exp=rb.to_text - an=ss_analysis(:first) { + exp = rb.to_text + an = ss_analysis(:first) { echo 'first' summary(a) } - obs=Statsample::Analysis.to_text(:first) - - assert_equal(exp.split("\n")[1,exp.size], obs.split("\n")[1,obs.size]) + obs = Statsample::Analysis.to_text(:first) + + assert_equal(exp.split("\n")[1, exp.size], obs.split("\n")[1, obs.size]) end - - should "run() execute all analysis by default" do - m1=mock() + + should 'run() execute all analysis by default' do + m1 = mock m1.expects(:run).once m1.expects(:hide).once - + Statsample::Analysis.store(:first) do m1.run end Statsample::Analysis.store(:second) do m1.hide end - + # Should run all test Statsample::Analysis.run end - - should "run() execute blocks specificed on parameters" do - m1=mock() - m1.expects(:run).once + + should 'run() execute blocks specificed on parameters' do + m1 = mock + m1.expects(:run).once m1.expects(:hide).never Statsample::Analysis.store(:first) do m1.run @@ -82,78 +82,78 @@ class StatsampleAnalysisTestCase < MiniTest::Unit::TestCase # Should run all test Statsample::Analysis.run(:first) end - + context(Statsample::Analysis::Suite) do - should "echo() uses output#puts with same arguments" do - an=Statsample::Analysis::Suite.new(:output) - obj=mock() - obj.expects(:puts).with(:first,:second).once - an.output=obj - an.echo(:first,:second) - end - should "summary() should call object.summary" do - an=Statsample::Analysis::Suite.new(:summary) - obj=stub('summarizable',:summary=>'summary') - assert_equal(obj.summary,an.summary(obj)) - end - should "attach() allows to call objects on objects which respond to fields" do - an=Statsample::Analysis::Suite.new(:summary) - ds={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)} - ds.expects(:fields).returns(%w{x y}).at_least_once + should 'echo() uses output#puts with same arguments' do + an = Statsample::Analysis::Suite.new(:output) + obj = mock + obj.expects(:puts).with(:first, :second).once + an.output = obj + an.echo(:first, :second) + end + should 'summary() should call object.summary' do + an = Statsample::Analysis::Suite.new(:summary) + obj = stub('summarizable', summary: 'summary') + assert_equal(obj.summary, an.summary(obj)) + end + should 'attach() allows to call objects on objects which respond to fields' do + an = Statsample::Analysis::Suite.new(:summary) + ds = { :x => stub(mean: 10), :y => stub(mean: 12) } + ds.expects(:vectors).returns([:x, :y]).at_least_once an.attach(ds) - assert_equal(10,an.x.mean) - assert_equal(12,an.y.mean) + assert_equal(10, an.x.mean) + assert_equal(12, an.y.mean) assert_raise(RuntimeError) { an.z } end - should "attached objects should be called LIFO" do - an=Statsample::Analysis::Suite.new(:summary) - 
ds1={'x'=>stub(:mean=>100),'y'=>stub(:mean=>120),'z'=>stub(:mean=>13)} - ds1.expects(:fields).returns(%w{x y z}).at_least_once - ds2={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)} - ds2.expects(:fields).returns(%w{x y}).at_least_once + should 'attached objects should be called LIFO' do + an = Statsample::Analysis::Suite.new(:summary) + ds1 = { :x => stub(mean: 100), :y => stub(mean: 120), :z => stub(mean: 13) } + ds1.expects(:vectors).returns([:x, :y, :z]).at_least_once + ds2 = { :x => stub(mean: 10), :y => stub(mean: 12) } + ds2.expects(:vectors).returns([:x, :y]).at_least_once an.attach(ds1) an.attach(ds2) - assert_equal(10,an.x.mean) - assert_equal(12,an.y.mean) - assert_equal(13,an.z.mean) - end - - should "detach() without arguments drop latest object" do - an=Statsample::Analysis::Suite.new(:summary) - ds1={'x'=>stub(:mean=>100),'y'=>stub(:mean=>120),'z'=>stub(:mean=>13)} - ds1.expects(:fields).returns(%w{x y z}).at_least_once - ds2={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)} - ds2.expects(:fields).returns(%w{x y}).at_least_once + assert_equal(10, an.x.mean) + assert_equal(12, an.y.mean) + assert_equal(13, an.z.mean) + end + + should 'detach() without arguments drop latest object' do + an = Statsample::Analysis::Suite.new(:summary) + ds1 = { :x => stub(mean: 100), :y => stub(mean: 120), :z => stub(mean: 13) } + ds1.expects(:vectors).returns([:x, :y, :z]).at_least_once + ds2 = { :x => stub(mean: 10), :y => stub(mean: 12) } + ds2.expects(:vectors).returns([:x, :y]).at_least_once an.attach(ds1) an.attach(ds2) - assert_equal(10,an.x.mean) + assert_equal(10, an.x.mean) an.detach assert_equal(100, an.x.mean) end - should "detach() with argument drop select object" do - an=Statsample::Analysis::Suite.new(:summary) - ds1={'x'=>1} - ds1.expects(:fields).returns(%w{x}).at_least_once - ds2={'x'=>2,'y'=>3} - ds2.expects(:fields).returns(%w{x y}).at_least_once - ds3={'y'=>4} - ds3.expects(:fields).returns(%w{y}).at_least_once - + should 'detach() with argument drop select object' do + an = Statsample::Analysis::Suite.new(:summary) + ds1 = { :x => 1 } + ds1.expects(:vectors).returns([:x]).at_least_once + ds2 = { :x => 2, :y => 3 } + ds2.expects(:vectors).returns([:x, :y]).at_least_once + ds3 = { :y => 4 } + ds3.expects(:vectors).returns([:y]).at_least_once + an.attach(ds3) an.attach(ds2) an.attach(ds1) - assert_equal(1,an.x) - assert_equal(3,an.y) + assert_equal(1, an.x) + assert_equal(3, an.y) an.detach(ds2) - assert_equal(4,an.y) + assert_equal(4, an.y) end - should "perform a simple analysis" do - output=mock() + should 'perform a simple analysis' do + output = mock output.expects(:puts).with(5.5) - an=Statsample::Analysis.store(:simple, :output=>output) do - ds=data_frame(:x=>vector(1..10),:y=>vector(1..10)) + an = Statsample::Analysis.store(:simple, output: output) do + ds = data_frame(x: vector(1..10), y: vector(1..10)) attach(ds) echo x.mean end @@ -161,17 +161,16 @@ class StatsampleAnalysisTestCase < MiniTest::Unit::TestCase end end context(Statsample::Analysis::SuiteReportBuilder) do - should "echo() use add on rb object" do - an=Statsample::Analysis::SuiteReportBuilder.new(:puts_to_add) + should 'echo() use add on rb object' do + an = Statsample::Analysis::SuiteReportBuilder.new(:puts_to_add) an.rb.expects(:add).with(:first).twice an.echo(:first, :first) end - should "summary() uses add on rb object" do - an=Statsample::Analysis::SuiteReportBuilder.new(:summary_to_add) + should 'summary() uses add on rb object' do + an = Statsample::Analysis::SuiteReportBuilder.new(:summary_to_add) 
an.rb.expects(:add).with(:first).once an.summary(:first) end end - end end diff --git a/test/test_anova_contrast.rb b/test/test_anova_contrast.rb index a335149..36ccc60 100644 --- a/test/test_anova_contrast.rb +++ b/test/test_anova_contrast.rb @@ -1,36 +1,36 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleAnovaContrastTestCase < MiniTest::Unit::TestCase +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleAnovaContrastTestCase < Minitest::Test context(Statsample::Anova::Contrast) do setup do - constant=[12,13,11,12,12].to_scale - frequent=[9,10,9,13,14].to_scale - infrequent=[15,16,17,16,16].to_scale - never=[17,18,12,18,20].to_scale - @vectors=[constant, frequent, infrequent, never] - @c=Statsample::Anova::Contrast.new(:vectors=>@vectors) + constant = Daru::Vector.new([12, 13, 11, 12, 12]) + frequent = Daru::Vector.new([9, 10, 9, 13, 14]) + infrequent = Daru::Vector.new([15, 16, 17, 16, 16]) + never = Daru::Vector.new([17, 18, 12, 18, 20]) + @vectors = [constant, frequent, infrequent, never] + @c = Statsample::Anova::Contrast.new(vectors: @vectors) end - should "return correct value using c" do - @c.c([1,-1.quo(3),-1.quo(3),-1.quo(3)]) - #@c.c([1,-0.333,-0.333,-0.333]) + should 'return correct value using c' do + @c.c([1, -1.quo(3), -1.quo(3), -1.quo(3)]) + # @c.c([1,-0.333,-0.333,-0.333]) assert_in_delta(-2.6667, @c.psi, 0.0001) assert_in_delta(1.0165, @c.se, 0.0001) assert_in_delta(-2.623, @c.t, 0.001) - assert_in_delta(-4.82, @c.confidence_interval[0],0.01) - assert_in_delta(-0.51, @c.confidence_interval[1],0.01) - assert(@c.summary.size>0) + assert_in_delta(-4.82, @c.confidence_interval[0], 0.01) + assert_in_delta(-0.51, @c.confidence_interval[1], 0.01) + assert(@c.summary.size > 0) end - should "return correct values using c_by_index" do - @c.c_by_index([0],[1,2,3]) + should 'return correct values using c_by_index' do + @c.c_by_index([0], [1, 2, 3]) assert_in_delta(-2.6667, @c.psi, 0.0001) assert_in_delta(1.0165, @c.se, 0.0001) assert_in_delta(-2.623, @c.t, 0.001) end - should "return correct values using incomplete c_by_index" do - c1=Statsample::Anova::Contrast.new(:vectors=>@vectors, :c=>[0.5,0.5,-1,0]) - c2=Statsample::Anova::Contrast.new(:vectors=>@vectors, :c1=>[0,1],:c2=>[2]) - assert_equal(c1.psi,c2.psi) - assert_equal(c1.se,c2.se) - assert_equal(c1.t,c2.t) + should 'return correct values using incomplete c_by_index' do + c1 = Statsample::Anova::Contrast.new(vectors: @vectors, c: [0.5, 0.5, -1, 0]) + c2 = Statsample::Anova::Contrast.new(vectors: @vectors, c1: [0, 1], c2: [2]) + assert_equal(c1.psi, c2.psi) + assert_equal(c1.se, c2.se) + assert_equal(c1.t, c2.t) end end end diff --git a/test/test_anovaoneway.rb b/test/test_anovaoneway.rb index 2f0e1e5..17c86cb 100644 --- a/test/test_anovaoneway.rb +++ b/test/test_anovaoneway.rb @@ -1,26 +1,26 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleAnovaOneWayTestCase < MiniTest::Unit::TestCase +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleAnovaOneWayTestCase < Minitest::Test context(Statsample::Anova::OneWay) do setup do - @ss_num=30.08 - @ss_den=87.88 - @df_num=2 - @df_den=21 - @anova=Statsample::Anova::OneWay.new(:ss_num=>@ss_num, :ss_den=>@ss_den, :df_num=>@df_num, :df_den=>@df_den) + @ss_num = 30.08 + @ss_den = 87.88 + @df_num = 2 + @df_den = 21 + @anova = Statsample::Anova::OneWay.new(ss_num: @ss_num, ss_den: @ss_den, df_num: @df_num, df_den: @df_den) end - should 
"Statsample::Anova.oneway respond to #oneway" do + should 'Statsample::Anova.oneway respond to #oneway' do assert(Statsample::Anova.respond_to? :oneway) end - should "return correct value for ms_num and ms_den" do + should 'return correct value for ms_num and ms_den' do assert_in_delta(15.04, @anova.ms_num, 0.01) assert_in_delta(4.18, @anova.ms_den, 0.01) end - should "return correct value for f" do + should 'return correct value for f' do assert_in_delta(3.59, @anova.f, 0.01) end - should "respond to summary" do + should 'respond to summary' do assert(@anova.respond_to? :summary) - assert(@anova.summary.size>0) + assert(@anova.summary.size > 0) end end end diff --git a/test/test_anovatwoway.rb b/test/test_anovatwoway.rb index aa88194..db110c4 100644 --- a/test/test_anovatwoway.rb +++ b/test/test_anovatwoway.rb @@ -1,38 +1,37 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleAnovaTwoWayTestCase < MiniTest::Unit::TestCase +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleAnovaTwoWayTestCase < Minitest::Test context(Statsample::Anova::TwoWay) do setup do - @ss_a=192.2 - @ss_b=57.8 - @ss_axb=168.2 - @ss_within=75.6 - @df_a=@df_b=1 - @df_within=16 - @anova=Statsample::Anova::TwoWay.new(:ss_a=>@ss_a, :ss_b=>@ss_b, :ss_axb=>@ss_axb, :ss_within=>@ss_within , :df_a=>@df_a, :df_b=>@df_b, :df_within=>@df_within) + @ss_a = 192.2 + @ss_b = 57.8 + @ss_axb = 168.2 + @ss_within = 75.6 + @df_a = @df_b = 1 + @df_within = 16 + @anova = Statsample::Anova::TwoWay.new(ss_a: @ss_a, ss_b: @ss_b, ss_axb: @ss_axb, ss_within: @ss_within, df_a: @df_a, df_b: @df_b, df_within: @df_within) end - should "Statsample::Anova.twoway respond to #twoway" do - assert(Statsample::Anova.respond_to? :twoway) + should 'Statsample::Anova.twoway respond to #twoway' do + assert(Statsample::Anova.respond_to? :twoway) end - should "return correct value for ms_a, ms_b and ms_axb" do + should 'return correct value for ms_a, ms_b and ms_axb' do assert_in_delta(192.2, @anova.ms_a, 0.01) assert_in_delta(57.8, @anova.ms_b, 0.01) assert_in_delta(168.2, @anova.ms_axb, 0.01) - end - should "return correct value for f " do + should 'return correct value for f ' do assert_in_delta(40.68, @anova.f_a, 0.01) assert_in_delta(12.23, @anova.f_b, 0.01) assert_in_delta(35.60, @anova.f_axb, 0.01) end - should "return correct value for probability for f " do + should 'return correct value for probability for f ' do assert(@anova.f_a_probability < 0.05) assert(@anova.f_b_probability < 0.05) assert(@anova.f_axb_probability < 0.05) end - should "respond to summary" do + should 'respond to summary' do assert(@anova.respond_to? 
:summary) - assert(@anova.summary.size>0) + assert(@anova.summary.size > 0) end end end diff --git a/test/test_anovatwowaywithdataset.rb b/test/test_anovatwowaywithdataset.rb index a08eb7d..ee69a3e 100644 --- a/test/test_anovatwowaywithdataset.rb +++ b/test/test_anovatwowaywithdataset.rb @@ -1,49 +1,47 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) # Reference: # * http://www.uwsp.edu/psych/Stat/13/anova-2w.htm#III -class StatsampleAnovaTwoWayWithVectorsTestCase < MiniTest::Unit::TestCase +class StatsampleAnovaTwoWayWithVectorsTestCase < Minitest::Test context(Statsample::Anova::TwoWayWithVectors) do setup do - @pa=[5,4,3,4,2,18,19,14,12,15,6,7,5,8,4,6,9,5,9,3].to_scale - @pa.name="Passive Avoidance" - @a=[0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,1,1,1,1,1].to_vector - @a.labels={0=>'0%',1=>'35%'} - @a.name='Diet' - @b=[0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1].to_vector - @b.labels={0=>'Young',1=>'Older'} - @b.name="Age" - @anova=Statsample::Anova::TwoWayWithVectors.new(:a=>@a,:b=>@b, :dependent=>@pa) + @pa = Daru::Vector.new [5, 4, 3, 4, 2, 18, 19, 14, 12, 15, 6, 7, 5, 8, 4, 6, 9, 5, 9, 3] + @pa.rename 'Passive Avoidance' + @a = Daru::Vector.new [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1] + # @a.labels = { 0 => '0%', 1 => '35%' } + @a.rename 'Diet' + @b = Daru::Vector.new [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + # @b.labels = { 0 => 'Young', 1 => 'Older' } + @b.rename 'Age' + @anova = Statsample::Anova::TwoWayWithVectors.new(a: @a, b: @b, dependent: @pa) end - should "Statsample::Anova respond to #twoway_with_vectors" do - assert(Statsample::Anova.respond_to? :twoway_with_vectors) + should 'Statsample::Anova respond to #twoway_with_vectors' do + assert(Statsample::Anova.respond_to? :twoway_with_vectors) end - should "#new returns the same as Statsample::Anova.twoway_with_vectors" do - @anova2=Statsample::Anova.twoway_with_vectors(:a=>@a,:b=>@b, :dependent=>@pa) + should '#new returns the same as Statsample::Anova.twoway_with_vectors' do + @anova2 = Statsample::Anova.twoway_with_vectors(a: @a, b: @b, dependent: @pa) assert_equal(@anova.summary, @anova2.summary) end - should "return correct value for ms_a, ms_b and ms_axb" do + should 'return correct value for ms_a, ms_b and ms_axb' do assert_in_delta(192.2, @anova.ms_a, 0.01) assert_in_delta(57.8, @anova.ms_b, 0.01) assert_in_delta(168.2, @anova.ms_axb, 0.01) - end - should "return correct value for f " do + should 'return correct value for f ' do assert_in_delta(40.68, @anova.f_a, 0.01) assert_in_delta(12.23, @anova.f_b, 0.01) assert_in_delta(35.60, @anova.f_axb, 0.01) end - should "return correct value for probability for f " do + should 'return correct value for probability for f ' do assert(@anova.f_a_probability < 0.05) assert(@anova.f_b_probability < 0.05) assert(@anova.f_axb_probability < 0.05) end - should "respond to summary" do - - @anova.summary_descriptives=true - @anova.summary_levene=true + should 'respond to summary' do + @anova.summary_descriptives = true + @anova.summary_levene = true assert(@anova.respond_to? 
:summary) - assert(@anova.summary.size>0) + assert(@anova.summary.size > 0) end end end diff --git a/test/test_anovawithvectors.rb b/test/test_anovawithvectors.rb index b85c074..9da0380 100644 --- a/test/test_anovawithvectors.rb +++ b/test/test_anovawithvectors.rb @@ -1,102 +1,100 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleAnovaOneWayWithVectorsTestCase < MiniTest::Unit::TestCase +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleAnovaOneWayWithVectorsTestCase < Minitest::Test context(Statsample::Anova::OneWayWithVectors) do - - context("when initializing") do + context('when initializing') do setup do - @v1=10.times.map {rand(100)}.to_scale - @v2=10.times.map {rand(100)}.to_scale - @v3=10.times.map {rand(100)}.to_scale + @v1 = Daru::Vector.new(10.times.map { rand(100) }) + @v2 = Daru::Vector.new(10.times.map { rand(100) }) + @v3 = Daru::Vector.new(10.times.map { rand(100) }) end - should "be the same using [] or args*" do - a1=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3) - a2=Statsample::Anova::OneWayWithVectors.new([@v1,@v2,@v3]) - assert_equal(a1.f,a2.f) + should 'be the same using [] or args*' do + a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3) + a2 = Statsample::Anova::OneWayWithVectors.new([@v1, @v2, @v3]) + assert_equal(a1.f, a2.f) end - should "be the same using module method or object instantiation" do - a1=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3) - a2=Statsample::Anova.oneway_with_vectors(@v1,@v2,@v3) - assert_equal(a1.f,a2.f) + should 'be the same using module method or object instantiation' do + a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3) + a2 = Statsample::Anova.oneway_with_vectors(@v1, @v2, @v3) + assert_equal(a1.f, a2.f) end - should "detect optional hash" do - a1=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3, {:name=>'aaa'}) + should 'detect optional hash' do + a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3, name: 'aaa') assert_equal('aaa', a1.name) end - should "omit incorrect arguments" do - a1=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3, {:name=>'aaa'}) - a2=Statsample::Anova::OneWayWithVectors.new(@v1,nil,nil,@v2,@v3, {:name=>'aaa'}) - assert_equal(a1.f,a2.f) + should 'omit incorrect arguments' do + a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3, name: 'aaa') + a2 = Statsample::Anova::OneWayWithVectors.new(@v1, nil, nil, @v2, @v3, name: 'aaa') + assert_equal(a1.f, a2.f) end end setup do - @v1=[3,3,2,3,6].to_vector(:scale) - @v2=[7,6,5,6,7].to_vector(:scale) - @v3=[9,8,9,7,8].to_vector(:scale) - @name="Anova testing" - @anova=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3, :name=>@name) - end - should "store correctly contrasts" do - c1=Statsample::Anova::Contrast.new(:vectors=>[@v1,@v2,@v3], :c=>[1,-0.5, -0.5]) - - c2=@anova.contrast(:c=>[1,-0.5,-0.5]) - assert_equal(c1.t,c2.t) - - end - should "respond to #summary" do + @v1 = Daru::Vector.new([3, 3, 2, 3, 6]) + @v2 = Daru::Vector.new([7, 6, 5, 6, 7]) + @v3 = Daru::Vector.new([9, 8, 9, 7, 8]) + @name = 'Anova testing' + @anova = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3, name: @name) + end + should 'store correctly contrasts' do + c1 = Statsample::Anova::Contrast.new(vectors: [@v1, @v2, @v3], c: [1, -0.5, -0.5]) + + c2 = @anova.contrast(c: [1, -0.5, -0.5]) + assert_equal(c1.t, c2.t) + end + should 'respond to #summary' do assert(@anova.respond_to? 
:summary) end - should "have correct name of analysis on #summary" do + should 'have correct name of analysis on #summary' do assert_match(/#{@name}/, @anova.summary) end - should "returns same levene values as direct Levene creation" do - assert_equal(@anova.levene.f, Statsample::Test.levene([@v1,@v2,@v3]).f) + should 'returns same levene values as direct Levene creation' do + assert_equal(@anova.levene.f, Statsample::Test.levene([@v1, @v2, @v3]).f) end - should "have correct value for levene" do - assert_in_delta(0.604,@anova.levene.f, 0.001) - assert_in_delta(0.562,@anova.levene.probability, 0.001) + should 'have correct value for levene' do + assert_in_delta(0.604, @anova.levene.f, 0.001) + assert_in_delta(0.562, @anova.levene.probability, 0.001) end - should "have correct value for sst" do - assert_in_delta(72.933, @anova.sst,0.001) + should 'have correct value for sst' do + assert_in_delta(72.933, @anova.sst, 0.001) end - should "have correct value for sswg" do - assert_in_delta(14.8,@anova.sswg,0.001) + should 'have correct value for sswg' do + assert_in_delta(14.8, @anova.sswg, 0.001) end - should "have correct value for ssb" do - assert_in_delta(58.133,@anova.ssbg,0.001) + should 'have correct value for ssb' do + assert_in_delta(58.133, @anova.ssbg, 0.001) end - should "sst=sswg+ssbg" do - assert_in_delta(@anova.sst,@anova.sswg+@anova.ssbg,0.00001) + should 'sst=sswg+ssbg' do + assert_in_delta(@anova.sst, @anova.sswg + @anova.ssbg, 0.00001) end - should "df total equal to number of n-1" do - assert_equal(@v1.n+@v2.n+@v3.n-1,@anova.df_total) + should 'df total equal to number of n-1' do + assert_equal(@v1.size + @v2.size + @v3.size - 1, @anova.df_total) end - should "df wg equal to number of n-k" do - assert_equal(@v1.n+@v2.n+@v3.n-3,@anova.df_wg) + should 'df wg equal to number of n-k' do + assert_equal(@v1.size + @v2.size + @v3.size - 3, @anova.df_wg) end - should "df bg equal to number of k-1" do - assert_equal(2,@anova.df_bg) + should 'df bg equal to number of k-1' do + assert_equal(2, @anova.df_bg) end - should "f=(ssbg/df_bg)/(sswt/df_wt)" do - assert_in_delta((@anova.ssbg.quo(@anova.df_bg)).quo( @anova.sswg.quo(@anova.df_wg)), @anova.f, 0.001) + should 'f=(ssbg/df_bg)/(sswt/df_wt)' do + assert_in_delta((@anova.ssbg.quo(@anova.df_bg)).quo(@anova.sswg.quo(@anova.df_wg)), @anova.f, 0.001) end - should "p be correct" do - assert(@anova.probability<0.01) + should 'p be correct' do + assert(@anova.probability < 0.01) end - should "be correct using different test values" do - anova2=Statsample::Anova::OneWayWithVectors.new([@v1,@v1,@v1,@v1,@v2]) - assert_in_delta(3.960, anova2.f,0.001) - assert_in_delta(0.016, anova2.probability,0.001) + should 'be correct using different test values' do + anova2 = Statsample::Anova::OneWayWithVectors.new([@v1, @v1, @v1, @v1, @v2]) + assert_in_delta(3.960, anova2.f, 0.001) + assert_in_delta(0.016, anova2.probability, 0.001) end - context "with extra information on summary" do + context 'with extra information on summary' do setup do - @anova.summary_descriptives=true - @anova.summary_levene=true - @summary=@anova.summary + @anova.summary_descriptives = true + @anova.summary_levene = true + @summary = @anova.summary end - should "have section with levene statistics" do + should 'have section with levene statistics' do assert_match(/Levene/, @summary) end - should "have section with descriptives" do + should 'have section with descriptives' do assert_match(/Min/, @summary) end end diff --git a/test/test_awesome_print_bug.rb 
b/test/test_awesome_print_bug.rb index 065d3e7..ceccd72 100644 --- a/test/test_awesome_print_bug.rb +++ b/test/test_awesome_print_bug.rb @@ -1,14 +1,14 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleAwesomePrintBug < MiniTest::Test - context("Awesome Print integration") do +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleAwesomePrintBug < Minitest::Test + context('Awesome Print integration') do setup do - require "awesome_print" + require 'awesome_print' end - should "should be flawless" do - a=[1,2,3].to_scale - - assert(a!=[1,2,3]) - assert_nothing_raised do + should 'should be flawless' do + a = Daru::Vector.new([1, 2, 3]) + + assert(a != [1, 2, 3]) + assert_nothing_raised do ap a end end diff --git a/test/test_bartlettsphericity.rb b/test/test_bartlettsphericity.rb index 02f43ce..3865259 100644 --- a/test/test_bartlettsphericity.rb +++ b/test/test_bartlettsphericity.rb @@ -1,25 +1,25 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) -class StatsampleBartlettSphericityTestCase < MiniTest::Test +class StatsampleBartlettSphericityTestCase < Minitest::Test include Statsample::Test context Statsample::Test::BartlettSphericity do setup do - @v1=[1 ,2 ,3 ,4 ,7 ,8 ,9 ,10,14,15,20,50,60,70].to_scale - @v2=[5 ,6 ,11,12,13,16,17,18,19,20,30,0,0,0].to_scale - @v3=[10,3 ,20,30,40,50,80,10,20,30,40,2,3,4].to_scale + @v1 = Daru::Vector.new([1, 2, 3, 4, 7, 8, 9, 10, 14, 15, 20, 50, 60, 70]) + @v2 = Daru::Vector.new([5, 6, 11, 12, 13, 16, 17, 18, 19, 20, 30, 0, 0, 0]) + @v3 = Daru::Vector.new([10, 3, 20, 30, 40, 50, 80, 10, 20, 30, 40, 2, 3, 4]) # KMO: 0.490 - ds={'v1'=>@v1,'v2'=>@v2,'v3'=>@v3}.to_dataset - cor=Statsample::Bivariate.correlation_matrix(ds) - @bs=Statsample::Test::BartlettSphericity.new(cor, 14) + ds = Daru::DataFrame.new({ :v1 => @v1, :v2 => @v2, :v3 => @v3 }) + cor = Statsample::Bivariate.correlation_matrix(ds) + @bs = Statsample::Test::BartlettSphericity.new(cor, 14) end - should "have correct value for chi" do - assert_in_delta(9.477, @bs.value,0.001) + should 'have correct value for chi' do + assert_in_delta(9.477, @bs.value, 0.001) end - should "have correct value for df" do + should 'have correct value for df' do assert_equal(3, @bs.df) end - should "have correct value for probability" do - assert_in_delta(0.024,@bs.probability,0.001) + should 'have correct value for probability' do + assert_in_delta(0.024, @bs.probability, 0.001) end end end diff --git a/test/test_bivariate.rb b/test/test_bivariate.rb index 2b745cd..8d20917 100644 --- a/test/test_bivariate.rb +++ b/test/test_bivariate.rb @@ -1,163 +1,164 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleBivariateTestCase < MiniTest::Test - should "method sum of squares should be correct" do - v1=[1,2,3,4,5,6].to_vector(:scale) - v2=[6,2,4,10,12,8].to_vector(:scale) - assert_equal(23.0, Statsample::Bivariate.sum_of_squares(v1,v2)) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleBivariateTestCase < Minitest::Test + should 'method sum of squares should be correct' do + v1 = Daru::Vector.new([1, 2, 3, 4, 5, 6]) + v2 = Daru::Vector.new([6, 2, 4, 10, 12, 8]) + assert_equal(23.0, Statsample::Bivariate.sum_of_squares(v1, v2)) end - should_with_gsl "return same covariance with ruby and gls implementation" do - v1=20.times.collect {|a| rand()}.to_scale - v2=20.times.collect {|a| 
rand()}.to_scale - assert_in_delta(Statsample::Bivariate.covariance(v1,v2), Statsample::Bivariate.covariance_slow(v1,v2), 0.001) + should_with_gsl 'return same covariance with ruby and gls implementation' do + v1 = Daru::Vector.new(20.times.collect { |_a| rand }) + v2 = Daru::Vector.new(20.times.collect { |_a| rand }) + assert_in_delta(Statsample::Bivariate.covariance(v1, v2), Statsample::Bivariate.covariance_slow(v1, v2), 0.001) end - should_with_gsl "return same correlation with ruby and gls implementation" do - v1=20.times.collect {|a| rand()}.to_scale - v2=20.times.collect {|a| rand()}.to_scale + should_with_gsl 'return same correlation with ruby and gls implementation' do + v1 = Daru::Vector.new(20.times.collect { |_a| rand }) + v2 = Daru::Vector.new(20.times.collect { |_a| rand }) - assert_in_delta(GSL::Stats::correlation(v1.gsl, v2.gsl), Statsample::Bivariate.pearson_slow(v1,v2), 1e-10) + assert_in_delta(GSL::Stats.correlation(v1.to_gsl, v2.to_gsl), Statsample::Bivariate.pearson_slow(v1, v2), 1e-10) end - should "return correct pearson correlation" do - v1=[6,5,4,7,8,4,3,2].to_vector(:scale) - v2=[2,3,7,8,6,4,3,2].to_vector(:scale) - assert_in_delta(0.525,Statsample::Bivariate.pearson(v1,v2), 0.001) - assert_in_delta(0.525,Statsample::Bivariate.pearson_slow(v1,v2), 0.001) + should 'return correct pearson correlation' do + v1 = Daru::Vector.new([6, 5, 4, 7, 8, 4, 3, 2]) + v2 = Daru::Vector.new([2, 3, 7, 8, 6, 4, 3, 2]) + assert_in_delta(0.525, Statsample::Bivariate.pearson(v1, v2), 0.001) + assert_in_delta(0.525, Statsample::Bivariate.pearson_slow(v1, v2), 0.001) - v3=[6,2, 1000,1000,5,4,7,8,4,3,2,nil].to_vector(:scale) - v4=[2,nil,nil,nil, 3,7,8,6,4,3,2,500].to_vector(:scale) - assert_in_delta(0.525,Statsample::Bivariate.pearson(v3,v4),0.001) + v3 = Daru::Vector.new([6, 2, 1000, 1000, 5, 4, 7, 8, 4, 3, 2, nil]) + v4 = Daru::Vector.new([2, nil, nil, nil, 3, 7, 8, 6, 4, 3, 2, 500]) + assert_in_delta(0.525, Statsample::Bivariate.pearson(v3, v4), 0.001) # Test ruby method - v3a,v4a=Statsample.only_valid v3, v4 - assert_in_delta(0.525, Statsample::Bivariate.pearson_slow(v3a,v4a),0.001) + v3a, v4a = Statsample.only_valid v3, v4 + assert_in_delta(0.525, Statsample::Bivariate.pearson_slow(v3a, v4a), 0.001) end - should "return correct values for t_pearson and prop_pearson" do - v1=[6,5,4,7,8,4,3,2].to_vector(:scale) - v2=[2,3,7,8,6,4,3,2].to_vector(:scale) - r=Statsample::Bivariate::Pearson.new(v1,v2) - assert_in_delta(0.525,r.r, 0.001) - assert_in_delta(Statsample::Bivariate.t_pearson(v1,v2), r.t, 0.001) - assert_in_delta(Statsample::Bivariate.prop_pearson(r.t,8,:both), r.probability, 0.001) - assert(r.summary.size>0) + should 'return correct values for t_pearson and prop_pearson' do + v1 = Daru::Vector.new([6, 5, 4, 7, 8, 4, 3, 2]) + v2 = Daru::Vector.new([2, 3, 7, 8, 6, 4, 3, 2]) + r = Statsample::Bivariate::Pearson.new(v1, v2) + assert_in_delta(0.525, r.r, 0.001) + assert_in_delta(Statsample::Bivariate.t_pearson(v1, v2), r.t, 0.001) + assert_in_delta(Statsample::Bivariate.prop_pearson(r.t, 8, :both), r.probability, 0.001) + assert(r.summary.size > 0) end - should "return correct correlation_matrix with nils values" do - v1=[6,5,4,7,8,4,3,2].to_vector(:scale) - v2=[2,3,7,8,6,4,3,2].to_vector(:scale) - v3=[6,2, 1000,1000,5,4,7,8].to_vector(:scale) - v4=[2,nil,nil,nil, 3,7,8,6].to_vector(:scale) - ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4}.to_dataset - c=Proc.new {|n1,n2|Statsample::Bivariate.pearson(n1,n2)} - expected=Matrix[ [c.call(v1,v1),c.call(v1,v2),c.call(v1,v3),c.call(v1,v4)], 
[c.call(v2,v1),c.call(v2,v2),c.call(v2,v3),c.call(v2,v4)], [c.call(v3,v1),c.call(v3,v2),c.call(v3,v3),c.call(v3,v4)], - [c.call(v4,v1),c.call(v4,v2),c.call(v4,v3),c.call(v4,v4)] + should 'return correct correlation_matrix with nils values' do + v1 = Daru::Vector.new([6, 5, 4, 7, 8, 4, 3, 2]) + v2 = Daru::Vector.new([2, 3, 7, 8, 6, 4, 3, 2]) + v3 = Daru::Vector.new([6, 2, 1000, 1000, 5, 4, 7, 8]) + v4 = Daru::Vector.new([2, nil, nil, nil, 3, 7, 8, 6]) + ds = Daru::DataFrame.new({ :v1 => v1, :v2 => v2, :v3 => v3, :v4 => v4 }) + c = proc { |n1, n2| Statsample::Bivariate.pearson(n1, n2) } + expected = Matrix[[c.call(v1, v1), c.call(v1, v2), c.call(v1, v3), c.call(v1, v4)], [c.call(v2, v1), c.call(v2, v2), c.call(v2, v3), c.call(v2, v4)], [c.call(v3, v1), c.call(v3, v2), c.call(v3, v3), c.call(v3, v4)], + [c.call(v4, v1), c.call(v4, v2), c.call(v4, v3), c.call(v4, v4)] ] - obt=Statsample::Bivariate.correlation_matrix(ds) + obt = Statsample::Bivariate.correlation_matrix(ds) for i in 0...expected.row_size for j in 0...expected.column_size - #puts expected[i,j].inspect - #puts obt[i,j].inspect - assert_in_delta(expected[i,j], obt[i,j],0.0001, "#{expected[i,j].class}!=#{obt[i,j].class} ") + # puts expected[i,j].inspect + # puts obt[i,j].inspect + assert_in_delta(expected[i, j], obt[i, j], 0.0001, "#{expected[i, j].class}!=#{obt[i, j].class} ") end end - #assert_equal(expected,obt) + # assert_equal(expected,obt) end - should_with_gsl "return same values for optimized and pairwise covariance matrix" do - cases=100 - v1=Statsample::Vector.new_scale(cases) {rand()} - v2=Statsample::Vector.new_scale(cases) {rand()} - v3=Statsample::Vector.new_scale(cases) {rand()} - v4=Statsample::Vector.new_scale(cases) {rand()} - v5=Statsample::Vector.new_scale(cases) {rand()} + should_with_gsl 'return same values for optimized and pairwise covariance matrix' do + cases = 100 + v1 = Daru::Vector.new_with_size(cases) { rand } + v2 = Daru::Vector.new_with_size(cases) { rand } + v3 = Daru::Vector.new_with_size(cases) { rand } + v4 = Daru::Vector.new_with_size(cases) { rand } + v5 = Daru::Vector.new_with_size(cases) { rand } - ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'v5'=>v5}.to_dataset - - cor_opt=Statsample::Bivariate.covariance_matrix_optimized(ds) - - cor_pw =Statsample::Bivariate.covariance_matrix_pairwise(ds) - assert_equal_matrix(cor_opt,cor_pw,1e-15) + ds = Daru::DataFrame.new({ :v1 => v1, :v2 => v2, :v3 => v3, :v4 => v4, :v5 => v5 }) + + cor_opt = Statsample::Bivariate.covariance_matrix_optimized(ds) + + cor_pw = Statsample::Bivariate.covariance_matrix_pairwise(ds) + assert_equal_matrix(cor_opt, cor_pw, 1e-15) end - should_with_gsl "return same values for optimized and pairwise correlation matrix" do - - cases=100 - v1=Statsample::Vector.new_scale(cases) {rand()} - v2=Statsample::Vector.new_scale(cases) {rand()} - v3=Statsample::Vector.new_scale(cases) {rand()} - v4=Statsample::Vector.new_scale(cases) {rand()} - v5=Statsample::Vector.new_scale(cases) {rand()} + should_with_gsl 'return same values for optimized and pairwise correlation matrix' do + cases = 100 + v1 = Daru::Vector.new_with_size(cases) { rand } + v2 = Daru::Vector.new_with_size(cases) { rand } + v3 = Daru::Vector.new_with_size(cases) { rand } + v4 = Daru::Vector.new_with_size(cases) { rand } + v5 = Daru::Vector.new_with_size(cases) { rand } + + ds = Daru::DataFrame.new({ + :v1 => v1, :v2 => v2, :v3 => v3, :v4 => v4, :v5 => v5 }) + + cor_opt = Statsample::Bivariate.correlation_matrix_optimized(ds) - 
ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'v5'=>v5}.to_dataset - - cor_opt=Statsample::Bivariate.correlation_matrix_optimized(ds) - - cor_pw =Statsample::Bivariate.correlation_matrix_pairwise(ds) - assert_equal_matrix(cor_opt,cor_pw,1e-15) - + cor_pw = Statsample::Bivariate.correlation_matrix_pairwise(ds) + assert_equal_matrix(cor_opt, cor_pw, 1e-15) end - should "return correct correlation_matrix without nils values" do - v1=[6,5,4,7,8,4,3,2].to_vector(:scale) - v2=[2,3,7,8,6,4,3,2].to_vector(:scale) - v3=[6,2, 1000,1000,5,4,7,8].to_vector(:scale) - v4=[2,4,6,7, 3,7,8,6].to_vector(:scale) - ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4}.to_dataset - c=Proc.new {|n1,n2|Statsample::Bivariate.pearson(n1,n2)} - expected=Matrix[ [c.call(v1,v1),c.call(v1,v2),c.call(v1,v3),c.call(v1,v4)], [c.call(v2,v1),c.call(v2,v2),c.call(v2,v3),c.call(v2,v4)], [c.call(v3,v1),c.call(v3,v2),c.call(v3,v3),c.call(v3,v4)], - [c.call(v4,v1),c.call(v4,v2),c.call(v4,v3),c.call(v4,v4)] + should 'return correct correlation_matrix without nil values' do + v1 = Daru::Vector.new([6, 5, 4, 7, 8, 4, 3, 2]) + v2 = Daru::Vector.new([2, 3, 7, 8, 6, 4, 3, 2]) + v3 = Daru::Vector.new([6, 2, 1000, 1000, 5, 4, 7, 8]) + v4 = Daru::Vector.new([2, 4, 6, 7, 3, 7, 8, 6]) + ds = Daru::DataFrame.new({ :v1 => v1, :v2 => v2, :v3 => v3, :v4 => v4 }) + c = proc { |n1, n2| Statsample::Bivariate.pearson(n1, n2) } + expected = Matrix[[c.call(v1, v1), c.call(v1, v2), c.call(v1, v3), c.call(v1, v4)], [c.call(v2, v1), c.call(v2, v2), c.call(v2, v3), c.call(v2, v4)], [c.call(v3, v1), c.call(v3, v2), c.call(v3, v3), c.call(v3, v4)], + [c.call(v4, v1), c.call(v4, v2), c.call(v4, v3), c.call(v4, v4)] ] - obt=Statsample::Bivariate.correlation_matrix(ds) + obt = Statsample::Bivariate.correlation_matrix(ds) for i in 0...expected.row_size for j in 0...expected.column_size - #puts expected[i,j].inspect - #puts obt[i,j].inspect - assert_in_delta(expected[i,j], obt[i,j],0.0001, "#{expected[i,j].class}!=#{obt[i,j].class} ") + # puts expected[i,j].inspect + # puts obt[i,j].inspect + assert_in_delta(expected[i, j], obt[i, j], 0.0001, "#{expected[i, j].class}!=#{obt[i, j].class} ") end end - #assert_equal(expected,obt) + # assert_equal(expected,obt) end - - should "return correct value for prop pearson" do - assert_in_delta(0.42, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.084,94), 94),0.01) - assert_in_delta(0.65, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.046,95), 95),0.01) - r=0.9 - n=100 - t=Statsample::Bivariate.t_r(r,n) - assert(Statsample::Bivariate.prop_pearson(t,n,:both)<0.05) - assert(Statsample::Bivariate.prop_pearson(t,n,:right)<0.05) - assert(Statsample::Bivariate.prop_pearson(t,n,:left)>0.05) + should 'return correct value for prop pearson' do + assert_in_delta(0.42, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.084, 94), 94), 0.01) + assert_in_delta(0.65, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.046, 95), 95), 0.01) + r = 0.9 + n = 100 + t = Statsample::Bivariate.t_r(r, n) + assert(Statsample::Bivariate.prop_pearson(t, n, :both) < 0.05) + assert(Statsample::Bivariate.prop_pearson(t, n, :right) < 0.05) + assert(Statsample::Bivariate.prop_pearson(t, n, :left) > 0.05) - r=-0.9 - n=100 - t=Statsample::Bivariate.t_r(r,n) +
assert(Statsample::Bivariate.prop_pearson(t, n, :both) < 0.05) + assert(Statsample::Bivariate.prop_pearson(t, n, :right) > 0.05) + assert(Statsample::Bivariate.prop_pearson(t, n, :left) < 0.05) end should "return correct value for Spearman's rho" do - v1=[86,97,99,100,101,103,106,110,112,113].to_vector(:scale) - v2=[0,20,28,27,50,29,7,17,6,12].to_vector(:scale) - assert_in_delta(-0.175758,Statsample::Bivariate.spearman(v1,v2),0.0001) - + v1 = Daru::Vector.new([86, 97, 99, 100, 101, 103, 106, 110, 112, 113]) + v2 = Daru::Vector.new([0, 20, 28, 27, 50, 29, 7, 17, 6, 12]) + assert_in_delta(-0.175758, Statsample::Bivariate.spearman(v1, v2), 0.0001) end - should "return correct value for point_biserial correlation" do - c=[1,3,5,6,7,100,200,300,400,300].to_vector(:scale) - d=[1,1,1,1,1,0,0,0,0,0].to_vector(:scale) + should 'return correct value for point_biserial correlation' do + c = Daru::Vector.new([1, 3, 5, 6, 7, 100, 200, 300, 400, 300]) + d = Daru::Vector.new([1, 1, 1, 1, 1, 0, 0, 0, 0, 0]) assert_raises TypeError do - Statsample::Bivariate.point_biserial(c,d) + Statsample::Bivariate.point_biserial(c, d) end - assert_in_delta(Statsample::Bivariate.point_biserial(d,c), Statsample::Bivariate.pearson(d,c), 0.0001) + assert_in_delta(Statsample::Bivariate.point_biserial(d, c), Statsample::Bivariate.pearson(d, c), 0.0001) + end + should 'return correct value for tau_a and tau_b' do + v1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) + v2 = Daru::Vector.new([1, 3, 4, 5, 7, 8, 2, 9, 10, 6, 11]) + assert_in_delta(0.6727, Statsample::Bivariate.tau_a(v1, v2), 0.001) + assert_in_delta(0.6727, Statsample::Bivariate.tau_b((Statsample::Crosstab.new(v1, v2).to_matrix)), 0.001) + v1 = Daru::Vector.new([12, 14, 14, 17, 19, 19, 19, 19, 19, 20, 21, 21, 21, 21, 21, 22, 23, 24, 24, 24, 26, 26, 27]) + v2 = Daru::Vector.new([11, 4, 4, 2, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0]) + assert_in_delta(-0.376201540231705, Statsample::Bivariate.tau_b(Statsample::Crosstab.new(v1, v2).to_matrix), 0.001) end - should "return correct value for tau_a and tau_b" do - v1=[1,2,3,4,5,6,7,8,9,10,11].to_vector(:ordinal) - v2=[1,3,4,5,7,8,2,9,10,6,11].to_vector(:ordinal) - assert_in_delta(0.6727,Statsample::Bivariate.tau_a(v1,v2),0.001) - assert_in_delta(0.6727,Statsample::Bivariate.tau_b((Statsample::Crosstab.new(v1,v2).to_matrix)),0.001) - v1=[12,14,14,17,19,19,19,19,19,20,21,21,21,21,21,22,23,24,24,24,26,26,27].to_vector(:ordinal) - v2=[11,4,4,2,0,0,0,0,0,0,4,0,4,0,0,0,0,4,0,0,0,0,0].to_vector(:ordinal) - assert_in_delta(-0.376201540231705, Statsample::Bivariate.tau_b(Statsample::Crosstab.new(v1,v2).to_matrix),0.001) + should 'return correct value for gamma correlation' do + m = Matrix[[10, 5, 2], [10, 15, 20]] + assert_in_delta(0.636, Statsample::Bivariate.gamma(m), 0.001) + m2 = Matrix[[15, 12, 6, 5], [12, 8, 10, 8], [4, 6, 9, 10]] + assert_in_delta(0.349, Statsample::Bivariate.gamma(m2), 0.001) end - should "return correct value for gamma correlation" do - m=Matrix[[10,5,2],[10,15,20]] - assert_in_delta(0.636,Statsample::Bivariate.gamma(m),0.001) - m2=Matrix[[15,12,6,5],[12,8,10,8],[4,6,9,10]] - assert_in_delta(0.349,Statsample::Bivariate.gamma(m2),0.001) + + should 'return correct residuals' do + # TODO: test Statsample::Bivariate.residuals + end end diff --git a/test/test_codification.rb b/test/test_codification.rb index 2049d06..21121bc 100644 --- a/test/test_codification.rb +++ b/test/test_codification.rb @@ -1,76 +1,78 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleCodificationTestCase < MiniTest::Unit::TestCase - +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleCodificationTestCase < Minitest::Test def initialize(*args) - v1=%w{run walk,run walking running sleep sleeping,dreaming sleep,dream}.to_vector - @dict={'run'=>'r','walk'=>'w','walking'=>'w','running'=>'r','sleep'=>'s', 'sleeping'=>'s', 'dream'=>'d', 'dreaming'=>'d'} - @ds={"v1"=>v1}.to_dataset + v1 = Daru::Vector.new(%w(run walk,run walking running sleep sleeping,dreaming sleep,dream)) + @dict = { 'run' => 'r', 'walk' => 'w', 'walking' => 'w', 'running' => 'r', 'sleep' => 's', 'sleeping' => 's', 'dream' => 'd', 'dreaming' => 'd' } + @ds = Daru::DataFrame.new({ :v1 => v1 }) super end + def test_create_hash - expected_keys_v1=%w{run walk walking running sleep sleeping dream dreaming}.sort - hash=Statsample::Codification.create_hash(@ds,['v1']) - assert_equal(['v1'],hash.keys) - assert_equal(expected_keys_v1,hash['v1'].keys.sort) - assert_equal(expected_keys_v1,hash['v1'].values.sort) + expected_keys_v1 = %w(run walk walking running sleep sleeping dream dreaming).sort + hash = Statsample::Codification.create_hash(@ds, [:v1]) + assert_equal([:v1], hash.keys) + assert_equal(expected_keys_v1, hash[:v1].keys.sort) + assert_equal(expected_keys_v1, hash[:v1].values.sort) end + def test_create_excel - filename=Dir::tmpdir+"/test_excel"+Time.now().to_s+".xls" - #filename = Tempfile.new("test_codification_"+Time.now().to_s) + filename = Dir.tmpdir + '/test_excel' + Time.now.to_s + '.xls' + # filename = Tempfile.new("test_codification_"+Time.now().to_s) Statsample::Codification.create_excel(@ds, ['v1'], filename) - field=(["v1"]*8).to_vector - keys=%w{dream dreaming run running sleep sleeping walk walking}.to_vector - ds=Statsample::Excel.read(filename) - assert_equal(field, ds['field']) - assert_equal(keys, ds['original']) - assert_equal(keys, ds['recoded']) - hash=Statsample::Codification.excel_to_recoded_hash(filename) - assert_equal(keys.data, hash['v1'].keys.sort) - assert_equal(keys.data, hash['v1'].values.sort) - + field = Daru::Vector.new(['v1'] * 8, name: :field) + keys = Daru::Vector.new(%w(dream dreaming run running sleep sleeping walk walking)) + ds = Daru::DataFrame.from_excel(filename) + assert_equal(field, ds[:field]) + assert_equal(keys, ds[:original]) + assert_equal(keys, ds[:recoded]) + hash = Statsample::Codification.excel_to_recoded_hash(filename) + assert_equal(keys.to_a, hash[:v1].keys.sort) + assert_equal(keys.to_a, hash[:v1].values.sort) end + def test_create_yaml - assert_raise ArgumentError do - Statsample::Codification.create_yaml(@ds,[]) + assert_raise ArgumentError do + Statsample::Codification.create_yaml(@ds, []) end - expected_keys_v1=%w{run walk walking running sleep sleeping dream dreaming}.sort - yaml_hash=Statsample::Codification.create_yaml(@ds,['v1']) - h=YAML::load(yaml_hash) - assert_equal(['v1'],h.keys) - assert_equal(expected_keys_v1,h['v1'].keys.sort) - tf = Tempfile.new("test_codification") - yaml_hash=Statsample::Codification.create_yaml(@ds,['v1'],tf, Statsample::SPLIT_TOKEN) + expected_keys_v1 = %w(run walk walking running sleep sleeping dream dreaming).sort + yaml_hash = Statsample::Codification.create_yaml(@ds, [:v1]) + h = YAML.load(yaml_hash) + assert_equal([:v1], h.keys) + assert_equal(expected_keys_v1, h[:v1].keys.sort) + tf = Tempfile.new('test_codification') + yaml_hash = Statsample::Codification.create_yaml(@ds, [:v1], tf, Statsample::SPLIT_TOKEN) tf.close tf.open - h=YAML::load(tf) - 
assert_equal(['v1'],h.keys) - assert_equal(expected_keys_v1,h['v1'].keys.sort) + h = YAML.load(tf) + assert_equal([:v1], h.keys) + assert_equal(expected_keys_v1, h[:v1].keys.sort) tf.close(true) end + def test_recodification - expected=[['r'],['w','r'],['w'],['r'],['s'],['s','d'], ['s','d']] - assert_equal(expected,Statsample::Codification.recode_vector(@ds['v1'],@dict)) - v2=['run','walk,dreaming',nil,'walk,dream,dreaming,walking'].to_vector - expected=[['r'],['w','d'],nil,['w','d']] - assert_equal(expected,Statsample::Codification.recode_vector(v2,@dict)) + expected = [['r'], %w(w r), ['w'], ['r'], ['s'], %w(s d), %w(s d)] + assert_equal(expected, Statsample::Codification.recode_vector(@ds[:v1], @dict)) + v2 = Daru::Vector.new(['run', 'walk,dreaming', nil, 'walk,dream,dreaming,walking']) + expected = [['r'], %w(w d), nil, %w(w d)] + assert_equal(expected, Statsample::Codification.recode_vector(v2, @dict)) end + def test_recode_dataset_simple - Statsample::Codification.recode_dataset_simple!(@ds,{'v1'=>@dict}) - expected_vector=['r','w,r','w','r','s','s,d', 's,d'].to_vector - assert_not_equal(expected_vector,@ds['v1']) - assert_equal(expected_vector,@ds['v1_recoded']) + Statsample::Codification.recode_dataset_simple!(@ds, :v1 => @dict) + expected_vector = Daru::Vector.new(['r', 'w,r', 'w', 'r', 's', 's,d', 's,d']) + assert_not_equal(expected_vector, @ds[:v1]) + assert_equal(expected_vector, @ds[:v1_recoded]) end - def test_recode_dataset_split - Statsample::Codification.recode_dataset_split!(@ds,{'v1'=>@dict}) - e={} - e['r']=[1,1,0,1,0,0,0].to_vector - e['w']=[0,1,1,0,0,0,0].to_vector - e['s']=[0,0,0,0,1,1,1].to_vector - e['d']=[0,0,0,0,0,1,1].to_vector - e.each{|k,expected| - assert_equal(expected,@ds['v1_'+k],"Error on key #{k}") + def test_recode_dataset_split + Statsample::Codification.recode_dataset_split!(@ds, :v1 => @dict) + e = {} + e['r'] = Daru::Vector.new([1, 1, 0, 1, 0, 0, 0]) + e['w'] = Daru::Vector.new([0, 1, 1, 0, 0, 0, 0]) + e['s'] = Daru::Vector.new([0, 0, 0, 0, 1, 1, 1]) + e['d'] = Daru::Vector.new([0, 0, 0, 0, 0, 1, 1]) + e.each { |k, expected| + assert_equal(expected, @ds[('v1_' + k).to_sym], "Error on key #{k}") } end - end diff --git a/test/test_crosstab.rb b/test/test_crosstab.rb index 2eef2b1..8f39460 100644 --- a/test/test_crosstab.rb +++ b/test/test_crosstab.rb @@ -1,63 +1,67 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleCrosstabTestCase < MiniTest::Unit::TestCase - +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleCrosstabTestCase < Minitest::Test def initialize(*args) - @v1=%w{black blonde black black red black brown black blonde black red black blonde}.to_vector - @v2=%w{woman man man woman man man man woman man woman woman man man}.to_vector - @ct=Statsample::Crosstab.new(@v1,@v2) + @v1 = Daru::Vector.new(%w(black blonde black black red black brown black blonde black red black blonde)) + @v2 = Daru::Vector.new(%w(woman man man woman man man man woman man woman woman man man)) + @ct = Statsample::Crosstab.new(@v1, @v2) super end + def test_crosstab_errors - e1=%w{black blonde black black red black brown black blonde black} + e1 = %w(black blonde black black red black brown black blonde black) assert_raise ArgumentError do - Statsample::Crosstab.new(e1,@v2) + Statsample::Crosstab.new(e1, @v2) end - e2=%w{black blonde black black red black brown black blonde black black}.to_vector + e2 = Daru::Vector.new(%w(black blonde black black red black brown black blonde black black))
assert_raise ArgumentError do - Statsample::Crosstab.new(e2,@v2) + Statsample::Crosstab.new(e2, @v2) end assert_nothing_raised do - Statsample::Crosstab.new(@v1,@v2) + Statsample::Crosstab.new(@v1, @v2) end end + def test_crosstab_basic - assert_equal(%w{black blonde brown red}, @ct.rows_names) - assert_equal(%w{man woman}, @ct.cols_names) - assert_equal({'black'=>7,'blonde'=>3,'red'=>2,'brown'=>1}, @ct.rows_total) - assert_equal({'man'=>8,'woman'=>5}, @ct.cols_total) + assert_equal(Daru::Vector.new(%w(black blonde brown red)), @ct.rows_names) + assert_equal(Daru::Vector.new(%w(man woman)), @ct.cols_names) + assert_equal({ 'black' => 7, 'blonde' => 3, 'red' => 2, 'brown' => 1 }, @ct.rows_total) + assert_equal({ 'man' => 8, 'woman' => 5 }, @ct.cols_total) end + def test_crosstab_frequencies - fq=@ct.frequencies - assert_equal(8,fq.size) - sum=fq.inject(0) {|s,x| s+x[1]} - assert_equal(13,sum) - fr=@ct.frequencies_by_row - assert_equal(4,fr.size) - assert_equal(%w{black blonde brown red},fr.keys.sort) - fc=@ct.frequencies_by_col - assert_equal(2,fc.size) - assert_equal(%w{man woman},fc.keys.sort) - assert_equal(Matrix.rows([[3,4],[3,0],[1,0],[1,1]]),@ct.to_matrix) + fq = @ct.frequencies + assert_equal(8, fq.size) + sum = fq.inject(0) { |s, x| s + x[1] } + assert_equal(13, sum) + fr = @ct.frequencies_by_row + assert_equal(4, fr.size) + assert_equal(%w(black blonde brown red), fr.keys.sort) + fc = @ct.frequencies_by_col + assert_equal(2, fc.size) + assert_equal(%w(man woman), fc.keys.sort) + assert_equal(Matrix.rows([[3, 4], [3, 0], [1, 0], [1, 1]]), @ct.to_matrix) end + def test_summary - @ct.percentage_row=true - @ct.percentage_column=true - @ct.percentage_total=true - assert(@ct.summary.size>0) + @ct.percentage_row = true + @ct.percentage_column = true + @ct.percentage_total = true + assert(@ct.summary.size > 0) end + def test_expected - v1=%w{1 1 1 1 1 0 0 0 0 0}.to_vector - v2=%w{0 0 0 0 0 1 1 1 1 1}.to_vector - ct=Statsample::Crosstab.new(v1,v2) - assert_equal(Matrix[[2.5,2.5],[2.5,2.5]],ct.matrix_expected) + v1 = Daru::Vector.new(%w(1 1 1 1 1 0 0 0 0 0)) + v2 = Daru::Vector.new(%w(0 0 0 0 0 1 1 1 1 1)) + ct = Statsample::Crosstab.new(v1, v2) + assert_equal(Matrix[[2.5, 2.5], [2.5, 2.5]], ct.matrix_expected) end + def test_crosstab_with_scale - v1=%w{1 1 1 1 1 0 0 0 0 0}.to_scale - v2=%w{0 0 0 0 0 1 1 1 1 1}.to_scale - ct=Statsample::Crosstab.new(v1,v2) - assert_equal(Matrix[[0,5],[5,0]],ct.to_matrix) - assert_nothing_raised { ct.summary } + v1 = Daru::Vector.new(%w(1 1 1 1 1 0 0 0 0 0)) + v2 = Daru::Vector.new(%w(0 0 0 0 0 1 1 1 1 1)) + ct = Statsample::Crosstab.new(v1, v2) + assert_equal(Matrix[[0, 5], [5, 0]], ct.to_matrix) + assert_nothing_raised { ct.summary } end - end diff --git a/test/test_csv.rb b/test/test_csv.rb deleted file mode 100644 index 283dadd..0000000 --- a/test/test_csv.rb +++ /dev/null @@ -1,81 +0,0 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleCSVTestCase < MiniTest::Unit::TestCase - def setup - @ds=Statsample::CSV.read(File.dirname(__FILE__)+"/fixtures/test_csv.csv") - end - def test_read - assert_equal(6,@ds.cases) - assert_equal(%w{id name age city a1}, @ds.fields) - id=[1,2,3,4,5,6].to_vector(:scale) - name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal) - age=[20,23,25,27,5.5,nil].to_vector(:scale) - city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal) - a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal) - 
ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1}) - ds_exp.fields.each{|f| - assert_equal(ds_exp[f],@ds[f]) - } - assert_equal(ds_exp,@ds) - end - def test_nil - assert_equal(nil,@ds['age'][5]) - end - def test_repeated - ds=Statsample::CSV.read(File.dirname(__FILE__)+"/fixtures/repeated_fields.csv") - assert_equal(%w{id name_1 age_1 city a1 name_2 age_2},ds.fields) - age=[3,4,5,6,nil,8].to_vector(:scale) - assert_equal(age,ds['age_2']) - end - def test_write - filename=Tempfile.new("afile") - # filename=Dir::tmpdir+"/test_write.csv" - Statsample::CSV.write(@ds, filename.path) - ds2=Statsample::CSV.read(filename.path) - i=0 - ds2.each_array{|row| - assert_equal(@ds.case_as_array(i),row) - i+=1 - } - end -end -=begin -class StatsampleCSVTestCase2 < MiniTest::Unit::TestCase - def setup - @ds=Statsample::CSV.read19(File.dirname(__FILE__)+"/fixtures/test_csv.csv") - end - def test_read - assert_equal(6,@ds.cases) - assert_equal(%w{id name age city a1}, @ds.fields) - id=[1,2,3,4,5,6].to_vector(:scale) - name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal) - age=[20,23,25,27,5.5,nil].to_vector(:scale) - city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal) - a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal) - ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1}) - ds_exp.fields.each{|f| - assert_equal(ds_exp[f],@ds[f]) - } - assert_equal(ds_exp,@ds) - end - def test_nil - assert_equal(nil,@ds['age'][5]) - end - def test_repeated - ds=Statsample::CSV.read19(File.dirname(__FILE__)+"/fixtures/repeated_fields.csv") - assert_equal(%w{id name_1 age_1 city a1 name_2 age_2},ds.fields) - age=[3,4,5,6,nil,8].to_vector(:scale) - assert_equal(age,ds['age_2']) - end - def test_write - filename=Tempfile.new("afile") - # filename=Dir::tmpdir+"/test_write.csv" - Statsample::CSV.write(@ds, filename.path) - ds2=Statsample::CSV.read19(filename.path) - i=0 - ds2.each_array{|row| - assert_equal(@ds.case_as_array(i),row) - i+=1 - } - end -end -=end diff --git a/test/test_dataset.rb b/test/test_dataset.rb deleted file mode 100644 index c6fb979..0000000 --- a/test/test_dataset.rb +++ /dev/null @@ -1,462 +0,0 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleDatasetTestCase < MiniTest::Unit::TestCase - def setup - @ds=Statsample::Dataset.new({'id' => Statsample::Vector.new([1,2,3,4,5]), 'name'=>Statsample::Vector.new(%w{Alex Claude Peter Franz George}), 'age'=>Statsample::Vector.new([20,23,25,27,5]), - 'city'=>Statsample::Vector.new(['New York','London','London','Paris','Tome']), - 'a1'=>Statsample::Vector.new(['a,b','b,c','a',nil,'a,b,c'])}, ['id','name','age','city','a1']) - end - def test_nest - ds={ - 'a'=>%w{a a a b b b}.to_vector, - 'b'=>%w{c c d d e e}.to_vector, - 'c'=>%w{f g h i j k}.to_vector - }.to_dataset - nest=ds.nest('a','b') - assert_equal([{'c'=>'f'},{'c'=>'g'}], nest['a']['c']) - assert_equal([{'c'=>'h'}], nest['a']['d']) - assert_equal([{'c'=>'j'},{'c'=>'k'}], nest['b']['e']) - - end - def test_should_have_summary - assert(@ds.summary.size>0) - end - def test_basic - assert_equal(5,@ds.cases) - assert_equal(%w{id name age city a1}, @ds.fields) - end - def test_saveload - outfile=Tempfile.new("dataset.ds") - @ds.save(outfile.path) - a=Statsample.load(outfile.path) - assert_equal(@ds,a) - end - def test_gsl - if Statsample.has_gsl? 
- matrix=GSL::Matrix[[1,2],[3,4],[5,6]] - ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector) - assert_equal(matrix,ds.to_gsl) - else - skip("Gsl needed") - end - end - def test_matrix - matrix=Matrix[[1,2],[3,4],[5,6]] - ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector) - assert_equal(matrix,ds.to_matrix) - end - - def test_fields - @ds.fields=%w{name a1 id age city} - assert_equal(%w{name a1 id age city}, @ds.fields) - @ds.fields=%w{id name age} - assert_equal(%w{id name age a1 city}, @ds.fields) - end - def test_merge - a=[1,2,3].to_scale - b=[3,4,5].to_vector - c=[4,5,6].to_scale - d=[7,8,9].to_vector - e=[10,20,30].to_vector - ds1={'a'=>a,'b'=>b}.to_dataset - ds2={'c'=>c,'d'=>d}.to_dataset - exp={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset - - assert_equal(exp,ds1.merge(ds2)) - exp.fields=%w{c d a b} - assert_equal(exp,ds2.merge(ds1)) - ds3={'a'=>e}.to_dataset - exp={'a_1'=>a,'b'=>b,'a_2'=>e}.to_dataset - exp.fields=%w{a_1 b a_2} - assert_equal(exp,ds1.merge(ds3)) - end - def test_each_vector - a=[1,2,3].to_vector - b=[3,4,5].to_vector - fields=["a","b"] - ds=Statsample::Dataset.new({'a'=>a,'b'=>b},fields) - res=[] - ds.each_vector{|k,v| - res.push([k,v]) - } - assert_equal([["a",a],["b",b]],res) - ds.fields=["b","a"] - res=[] - ds.each_vector{|k,v| - res.push([k,v]) - } - assert_equal([["b",b],["a",a]],res) - end - def test_equality - v1=[1,2,3,4].to_vector - v2=[5,6,7,8].to_vector - ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1}) - v3=[1,2,3,4].to_vector - v4=[5,6,7,8].to_vector - ds2=Statsample::Dataset.new({'v1'=>v3,'v2'=>v4}, %w{v2 v1}) - assert_equal(ds1,ds2) - ds2.fields=%w{v1 v2} - assert_not_equal(ds1,ds2) - end - def test_add_vector - v=Statsample::Vector.new(%w{a b c d e}) - @ds.add_vector('new',v) - assert_equal(%w{id name age city a1 new},@ds.fields) - x=Statsample::Vector.new(%w{a b c d e f g}) - assert_raise ArgumentError do - @ds.add_vector('new2',x) - end - end - def test_vector_by_calculation - a1=[1,2,3,4,5,6,7].to_vector(:scale) - a2=[10,20,30,40,50,60,70].to_vector(:scale) - a3=[100,200,300,400,500,600,700].to_vector(:scale) - ds={'a1'=>a1,'a2'=>a2,'a3'=>a3}.to_dataset - total=ds.vector_by_calculation() {|row| - row['a1']+row['a2']+row['a3'] - } - expected=[111,222,333,444,555,666,777].to_vector(:scale) - assert_equal(expected,total) - end - def test_vector_sum - a1=[1 ,2 ,3 ,4 , 5,nil].to_vector(:scale) - a2=[10 ,10,20,20 ,20,30].to_vector(:scale) - b1=[nil,1 ,1 ,1 ,1 ,2].to_vector(:scale) - b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale) - ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2}.to_dataset - total=ds.vector_sum - a=ds.vector_sum(['a1','a2']) - b=ds.vector_sum(['b1','b2']) - expected_a=[11,12,23,24,25,nil].to_vector(:scale) - expected_b=[nil,3,3,nil,3,5].to_vector(:scale) - expected_total=[nil,15,26,nil,28,nil].to_vector(:scale) - assert_equal(expected_a, a) - assert_equal(expected_b, b) - assert_equal(expected_total, total) - end - def test_vector_missing_values - a1=[1 ,nil ,3 ,4 , 5,nil].to_vector(:scale) - a2=[10 ,nil ,20,20 ,20,30].to_vector(:scale) - b1=[nil,nil ,1 ,1 ,1 ,2].to_vector(:scale) - b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale) - c= [nil,2 , 4,2 ,2 ,2].to_vector(:scale) - ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset - mva=[2,3,0,1,0,1].to_vector(:scale) - assert_equal(mva,ds.vector_missing_values) - end - - def test_has_missing_values - a1=[1 ,nil ,3 ,4 , 5,nil].to_vector(:scale) - a2=[10 ,nil ,20,20 ,20,30].to_vector(:scale) - b1=[nil,nil ,1 ,1 ,1 ,2].to_vector(:scale) - b2=[2 ,2 ,2 
,nil,2 ,3].to_vector(:scale) - c= [nil,2 , 4,2 ,2 ,2].to_vector(:scale) - ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset - assert(ds.has_missing_data?) - clean=ds.dup_only_valid - assert(!clean.has_missing_data?) - end - - - def test_vector_count_characters - a1=[1 ,"abcde" ,3 ,4 , 5,nil].to_vector(:scale) - a2=[10 ,20.3 ,20 ,20 ,20,30].to_vector(:scale) - b1=[nil,"343434" ,1 ,1 ,1 ,2].to_vector(:scale) - b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale) - c= [nil,2 ,"This is a nice example",2 ,2 ,2].to_vector(:scale) - ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset - exp=[4,17,27,5,6,5].to_vector(:scale) - assert_equal(exp,ds.vector_count_characters) - - end - def test_vector_mean - a1=[1 ,2 ,3 ,4 , 5,nil].to_vector(:scale) - a2=[10 ,10,20,20 ,20,30].to_vector(:scale) - b1=[nil,1 ,1 ,1 ,1 ,2].to_vector(:scale) - b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale) - c= [nil,2, 4,2 ,2 ,2].to_vector(:scale) - ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset - total=ds.vector_mean - a=ds.vector_mean(['a1','a2'],1) - b=ds.vector_mean(['b1','b2'],1) - c=ds.vector_mean(['b1','b2','c'],1) - expected_a=[5.5,6,11.5,12,12.5,30].to_vector(:scale) - expected_b=[2,1.5,1.5,1,1.5,2.5].to_vector(:scale) - expected_c=[nil, 5.0/3,7.0/3,1.5,5.0/3,7.0/3].to_vector(:scale) - expected_total=[nil,3.4,6,nil,6.0,nil].to_vector(:scale) - assert_equal(expected_a, a) - assert_equal(expected_b, b) - assert_equal(expected_c, c) - assert_equal(expected_total, total) - end - - def test_each_array - expected=[[1,'Alex',20,'New York','a,b'], [2,'Claude',23,'London','b,c'], [3,'Peter',25,'London','a'],[4,'Franz', 27,'Paris',nil],[5,'George',5,'Tome','a,b,c']] - out=[] - @ds.each_array{ |a| - out.push(a) - } - assert_equal(expected,out) - end - def test_recode - @ds['age'].type=:scale - @ds.recode!("age") {|c| c['id']*2} - expected=[2,4,6,8,10].to_vector(:scale) - assert_equal(expected,@ds['age']) - end - def test_case_as - assert_equal({'id'=>1,'name'=>'Alex','city'=>'New York','age'=>20,'a1'=>'a,b'},@ds.case_as_hash(0)) - assert_equal([5,'George',5,'Tome','a,b,c'],@ds.case_as_array(4)) - # Native methods - assert_equal({'id'=>1,'name'=>'Alex','city'=>'New York','age'=>20,'a1'=>'a,b'},@ds._case_as_hash(0)) - assert_equal([5,'George',5,'Tome','a,b,c'],@ds._case_as_array(4)) - - - - end - def test_delete_vector - @ds.delete_vector('name') - assert_equal(%w{id age city a1},@ds.fields) - assert_equal(%w{a1 age city id},@ds.vectors.keys.sort) - end - def test_change_type - @ds.col('age').type=:scale - assert_equal(:scale,@ds.col('age').type) - end - def test_split_by_separator_recode - @ds.add_vectors_by_split_recode("a1","_") - assert_equal(%w{id name age city a1 a1_1 a1_2 a1_3},@ds.fields) - assert_equal([1,0,1,nil,1],@ds.col('a1_1').to_a) - assert_equal([1,1,0,nil,1],@ds.col('a1_2').to_a) - assert_equal([0,1,0,nil,1],@ds.col('a1_3').to_a) - {'a1_1'=>'a1:a', 'a1_2'=>'a1:b', 'a1_3'=>'a1:c'}.each do |k,v| - assert_equal(v, @ds[k].name) - end - end - def test_split_by_separator - @ds.add_vectors_by_split("a1","_") - assert_equal(%w{id name age city a1 a1_a a1_b a1_c},@ds.fields) - assert_equal([1,0,1,nil,1],@ds.col('a1_a').to_a) - assert_equal([1,1,0,nil,1],@ds.col('a1_b').to_a) - assert_equal([0,1,0,nil,1],@ds.col('a1_c').to_a) - end - def test_percentiles - v1=(1..100).to_a.to_scale - assert_equal(50.5,v1.median) - assert_equal(25.5, v1.percentil(25)) - v2=(1..99).to_a.to_scale - assert_equal(50,v2.median) - assert_equal(25,v2.percentil(25)) - v3=(1..50).to_a.to_scale - assert_equal(25.5, v3.median) - 
assert_equal(13, v3.percentil(25)) - - end - def test_add_case - ds=Statsample::Dataset.new({'a'=>[].to_vector, 'b'=>[].to_vector, 'c'=>[].to_vector}) - ds.add_case([1,2,3]) - ds.add_case({'a'=>4,'b'=>5,'c'=>6}) - ds.add_case([[7,8,9],%w{a b c}]) - assert_equal({'a'=>1,'b'=>2,'c'=>3},ds.case_as_hash(0)) - assert_equal([4,5,6],ds.case_as_array(1)) - assert_equal([7,8,9],ds.case_as_array(2)) - assert_equal(['a','b','c'],ds.case_as_array(3)) - ds.add_case_array([6,7,1]) - ds.update_valid_data - assert_equal([6,7,1],ds.case_as_array(4)) - - end - def test_marshaling - ds_marshal=Marshal.load(Marshal.dump(@ds)) - assert_equal(ds_marshal,@ds) - end - def test_range - v1=[1,2,3,4].to_vector - v2=[5,6,7,8].to_vector - v3=[9,10,11,12].to_vector - ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3}, %w{v3 v2 v1}) - assert_same(v1,ds1['v1']) - ds2=ds1["v2".."v1"] - assert_equal(%w{v2 v1},ds2.fields) - assert_same(ds1['v1'],ds2['v1']) - assert_same(ds1['v2'],ds2['v2']) - - - end - def test_clone - v1=[1,2,3,4].to_vector - v2=[5,6,7,8].to_vector - ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1}) - ds2=ds1.clone - assert_equal(ds1,ds2) - assert_not_same(ds1,ds2) - assert_equal(ds1['v1'],ds2['v1']) - assert_same(ds1['v1'], ds2['v1']) - assert_equal(ds1.fields,ds2.fields) - assert_not_same(ds1.fields,ds2.fields) - assert_equal(ds1.cases,ds2.cases) - - # partial clone - ds3=ds1.clone('v1') - ds_exp=Statsample::Dataset.new({'v1'=>v1},%w{v1}) - assert_equal(ds_exp,ds3) - assert_not_same(ds_exp,ds3) - assert_equal(ds3['v1'],ds_exp['v1']) - assert_same(ds3['v1'],ds_exp['v1']) - assert_equal(ds3.fields,ds_exp.fields) - assert_equal(ds3.cases,ds_exp.cases) - - assert_not_same(ds3.fields,ds_exp.fields) - - end - def test_dup - v1=[1,2,3,4].to_vector - v2=[5,6,7,8].to_vector - ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1}) - ds2=ds1.dup - assert_equal(ds1,ds2) - assert_not_same(ds1,ds2) - assert_equal(ds1['v1'],ds2['v1']) - assert_not_same(ds1['v1'],ds2['v1']) - assert_equal(ds1.cases,ds2.cases) - - assert_equal(ds1.fields,ds2.fields) - assert_not_same(ds1.fields,ds2.fields) - ds1['v1'].type=:scale - # dup partial - ds3=ds1.dup('v1') - ds_exp=Statsample::Dataset.new({'v1'=>v1},%w{v1}) - assert_equal(ds_exp,ds3) - assert_not_same(ds_exp,ds3) - assert_equal(ds3['v1'],ds_exp['v1']) - assert_not_same(ds3['v1'],ds_exp['v1']) - assert_equal(ds3.fields,ds_exp.fields) - assert_equal(ds3.cases,ds_exp.cases) - - assert_not_same(ds3.fields,ds_exp.fields) - - - # empty - ds3=ds1.dup_empty - assert_not_equal(ds1,ds3) - assert_not_equal(ds1['v1'],ds3['v1']) - assert_equal([],ds3['v1'].data) - assert_equal([],ds3['v2'].data) - assert_equal(:scale,ds3['v1'].type) - assert_equal(ds1.fields,ds2.fields) - assert_not_same(ds1.fields,ds2.fields) - end - def test_from_to - assert_equal(%w{name age city}, @ds.from_to("name","city")) - assert_raise ArgumentError do - @ds.from_to("name","a2") - end - end - def test_each_array_with_nils - v1=[1,-99,3,4,"na"].to_vector(:scale,:missing_values=>[-99,"na"]) - v2=[5,6,-99,8,20].to_vector(:scale,:missing_values=>[-99]) - v3=[9,10,11,12,20].to_vector(:scale,:missing_values=>[-99]) - ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3}) - ds2=ds1.dup_empty - ds1.each_array_with_nils {|row| - ds2.add_case_array(row) - } - ds2.update_valid_data - assert_equal([1,nil,3,4,nil],ds2['v1'].data) - assert_equal([5,6,nil,8,20],ds2['v2'].data) - end - def test_dup_only_valid - v1=[1,nil,3,4].to_vector(:scale) - v2=[5,6,nil,8].to_vector(:scale) - 
v3=[9,10,11,12].to_vector(:scale) - ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3}) - ds2=ds1.dup_only_valid - expected=Statsample::Dataset.new({'v1'=>[1,4].to_vector(:scale), 'v2'=> [5,8].to_vector(:scale), 'v3'=>[9, 12].to_vector(:scale)}) - assert_equal(expected,ds2) - assert_equal(expected.vectors.values,Statsample::only_valid(v1,v2,v3)) - expected_partial=Statsample::Dataset.new({'v1'=>[1,3,4].to_vector(:scale), 'v3'=>[9, 11,12].to_vector(:scale)}) - assert_equal(expected_partial, ds1.dup_only_valid(%w{v1 v3})) - - - end - def test_filter - @ds['age'].type=:scale - filtered=@ds.filter{|c| c['id']==2 or c['id']==4} - expected=Statsample::Dataset.new({'id' => Statsample::Vector.new([2,4]), 'name'=>Statsample::Vector.new(%w{Claude Franz}), 'age'=>Statsample::Vector.new([23,27],:scale), - 'city'=>Statsample::Vector.new(['London','Paris']), - 'a1'=>Statsample::Vector.new(['b,c',nil,])}, ['id','name','age','city','a1']) - assert_equal(expected,filtered) - end - def test_filter_field - @ds['age'].type=:scale - filtered=@ds.filter_field('id') {|c| c['id']==2 or c['id']==4} - expected=[2,4].to_vector - assert_equal(expected,filtered) - - end - def test_verify - name=%w{r1 r2 r3 r4}.to_vector(:nominal) - v1=[1,2,3,4].to_vector(:scale) - v2=[4,3,2,1].to_vector(:scale) - v3=[10,20,30,40].to_vector(:scale) - v4=%w{a b a b}.to_vector(:nominal) - ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'id'=>name}.to_dataset - ds.fields=%w{v1 v2 v3 v4 id} - #Correct - t1=create_test("If v4=a, v1 odd") {|r| r['v4']=='b' or (r['v4']=='a' and r['v1']%2==1)} - t2=create_test("v3=v1*10") {|r| r['v3']==r['v1']*10} - # Fail! - t3=create_test("v4='b'") {|r| r['v4']=='b'} - exp1=["1 [1]: v4='b'", "3 [3]: v4='b'"] - exp2=["1 [r1]: v4='b'", "3 [r3]: v4='b'"] - res=ds.verify(t3,t1,t2) - assert_equal(exp1,res) - res=ds.verify('id',t1,t2,t3) - assert_equal(exp2,res) - end - def test_compute_operation - v1=[1,2,3,4].to_vector(:scale) - v2=[4,3,2,1].to_vector(:scale) - v3=[10,20,30,40].to_vector(:scale) - vscale=[1.quo(2),1,3.quo(2),2].to_vector(:scale) - vsum=[1+4+10.0,2+3+20.0,3+2+30.0,4+1+40.0].to_vector(:scale) - vmult=[1*4,2*3,3*2,4*1].to_vector(:scale) - ds={'v1'=>v1,'v2'=>v2,'v3'=>v3}.to_dataset - assert_equal(vscale,ds.compute("v1/2")) - assert_equal(vsum,ds.compute("v1+v2+v3")) - assert_equal(vmult,ds.compute("v1*v2")) - - end - def test_crosstab_with_asignation - v1=%w{a a a b b b c c c}.to_vector - v2=%w{a b c a b c a b c}.to_vector - v3=%w{0 1 0 0 1 1 0 0 1}.to_scale - ds=Statsample::Dataset.crosstab_by_asignation(v1,v2,v3) - assert_equal(:nominal, ds['_id'].type) - assert_equal(:scale, ds['a'].type) - assert_equal(:scale, ds['b'].type) - ev_id=%w{a b c}.to_vector - ev_a =%w{0 0 0}.to_scale - ev_b =%w{1 1 0}.to_scale - ev_c =%w{0 1 1}.to_scale - ds2={'_id'=>ev_id, 'a'=>ev_a, 'b'=>ev_b, 'c'=>ev_c}.to_dataset - assert_equal(ds, ds2) - end - def test_one_to_many - cases=[ - ['1','george','red',10,'blue',20,nil,nil], - ['2','fred','green',15,'orange',30,'white',20], - ['3','alfred',nil,nil,nil,nil,nil,nil] - ] - ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3}) - cases.each {|c| ds.add_case_array c } - ds.update_valid_data - ids=%w{1 1 2 2 2}.to_vector - colors=%w{red blue green orange white}.to_vector - values=[10,20,15,30,20].to_vector - col_ids=[1,2,1,2,3].to_scale - ds_expected={'id'=>ids, '_col_id'=>col_ids, 'color'=>colors, 'value'=>values}.to_dataset(['id','_col_id', 'color','value']) - assert_equal(ds_expected, ds.one_to_many(%w{id}, "car_%v%n")) - - 
end - -end diff --git a/test/test_dominance_analysis.rb b/test/test_dominance_analysis.rb index 803262a..012d1a6 100644 --- a/test/test_dominance_analysis.rb +++ b/test/test_dominance_analysis.rb @@ -1,41 +1,39 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleDominanceAnalysisTestCase < MiniTest::Unit::TestCase +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleDominanceAnalysisTestCase < Minitest::Test def test_dominance_univariate # Example from Budescu (1993) - m=Matrix[[1, 0.683, 0.154, 0.460, 0.618],[0.683, 1, -0.050, 0.297, 0.461], [0.154, -0.050, 1, 0.006, 0.262],[0.460, 0.297, 0.006, 1, 0.507],[0.618, 0.461, 0.262, 0.507, 1]] + m = Matrix[[1, 0.683, 0.154, 0.460, 0.618], [0.683, 1, -0.050, 0.297, 0.461], [0.154, -0.050, 1, 0.006, 0.262], [0.460, 0.297, 0.006, 1, 0.507], [0.618, 0.461, 0.262, 0.507, 1]] m.extend Statsample::CovariateMatrix - m.fields=%w{x1 x2 x3 x4 y} - da=Statsample::DominanceAnalysis.new(m,'y') + m.fields = %w(x1 x2 x3 x4 y) + da = Statsample::DominanceAnalysis.new(m, 'y') - contr_x1={'x2'=>0.003, 'x3'=>0.028, 'x4'=>0.063} - contr_x1.each do |k,v| + contr_x1 = { 'x2' => 0.003, 'x3' => 0.028, 'x4' => 0.063 } + contr_x1.each do |k, v| assert_in_delta(v, da.models_data[['x1']].contributions[k], 0.001) end - assert_in_delta(0.052, da.models_data[['x2','x3','x4']].contributions['x1'], 0.001) - expected_dominances=[1, 1, 0.5, 0.5, 0,0] - expected_g_dominances=[1, 1, 1, 1, 0,0] + assert_in_delta(0.052, da.models_data[%w(x2 x3 x4)].contributions['x1'], 0.001) + expected_dominances = [1, 1, 0.5, 0.5, 0, 0] + expected_g_dominances = [1, 1, 1, 1, 0, 0] - da.pairs.each_with_index do |a,i| - assert_equal(expected_dominances[i], da.total_dominance_pairwise(a[0],a[1])) - assert_equal(expected_dominances[i], da.conditional_dominance_pairwise(a[0],a[1])) - assert_equal(expected_g_dominances[i], da.general_dominance_pairwise(a[0],a[1])) + da.pairs.each_with_index do |a, i| + assert_equal(expected_dominances[i], da.total_dominance_pairwise(a[0], a[1])) + assert_equal(expected_dominances[i], da.conditional_dominance_pairwise(a[0], a[1])) + assert_equal(expected_g_dominances[i], da.general_dominance_pairwise(a[0], a[1])) end - assert(da.summary.size>0) + assert(da.summary.size > 0) end + def test_dominance_multivariate - m=Matrix[[1.0, -0.19, -0.358, -0.343, 0.359, 0.257], [-0.19, 1.0, 0.26, 0.29, -0.11, -0.11], [-0.358, 0.26, 1.0, 0.54, -0.49, -0.23], [-0.343, 0.29, 0.54, 1.0, -0.22, -0.41], [0.359, -0.11, -0.49, -0.22, 1.0, 0.62], [0.257, -0.11, -0.23, -0.41, 0.62, 1]] + m = Matrix[[1.0, -0.19, -0.358, -0.343, 0.359, 0.257], [-0.19, 1.0, 0.26, 0.29, -0.11, -0.11], [-0.358, 0.26, 1.0, 0.54, -0.49, -0.23], [-0.343, 0.29, 0.54, 1.0, -0.22, -0.41], [0.359, -0.11, -0.49, -0.22, 1.0, 0.62], [0.257, -0.11, -0.23, -0.41, 0.62, 1]] m.extend Statsample::CovariateMatrix - m.fields=%w{y1 y2 x1 x2 x3 x4} - m2=m.submatrix(%w{y1 x1 x2 x3 x4}) - + m.fields = %w(y1 y2 x1 x2 x3 x4) + m2 = m.submatrix(%w(y1 x1 x2 x3 x4)) - da=Statsample::DominanceAnalysis.new(m, ['y1','y2'], :cases=>683, :method_association=>:p2yx) + da = Statsample::DominanceAnalysis.new(m, %w(y1 y2), cases: 683, method_association: :p2yx) - contr_x1={'x2'=>0.027, 'x3'=>0.024, 'x4'=>0.017} - contr_x1.each do |k,v| + contr_x1 = { 'x2' => 0.027, 'x3' => 0.024, 'x4' => 0.017 } + contr_x1.each do |k, v| assert_in_delta(v, da.models_data[['x1']].contributions[k], 0.003) end - - end end diff --git a/test/test_factor.rb b/test/test_factor.rb index 
1884f4e..b724091 100644 --- a/test/test_factor.rb +++ b/test/test_factor.rb @@ -1,222 +1,228 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -#require 'rserve' -#require 'statsample/rserve_extension' +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +# require 'rserve' +# require 'statsample/rserve_extension' -class StatsampleFactorTestCase < MiniTest::Unit::TestCase +class StatsampleFactorTestCase < Minitest::Test include Statsample::Fixtures # Based on Hardle and Simar def setup - @fixtures_dir=File.expand_path(File.dirname(__FILE__)+"/fixtures") + @fixtures_dir = File.expand_path(File.dirname(__FILE__) + '/fixtures') end + # Based on Hardle example def test_covariance_matrix - ds=Statsample::PlainText.read(@fixtures_dir+"/bank2.dat", %w{v1 v2 v3 v4 v5 v6}) - ds.fields.each {|f| - ds[f]=ds[f].centered + ds = Daru::DataFrame.from_plaintext(@fixtures_dir + '/bank2.dat', [:v1, :v2, :v3, :v4, :v5, :v6]) + ds.vectors.each {|f| + ds[f] = ds[f].center } - cm=ds.covariance_matrix - pca =Statsample::Factor::PCA.new( cm, :m=>6) - #puts pca.summary - #puts pca.feature_matrix - exp_eig=[2.985, 0.931,0.242, 0.194, 0.085, 0.035].to_scale - assert_similar_vector(exp_eig, pca.eigenvalues.to_scale, 0.1) - pcs=pca.principal_components(ds) - k=6 - comp_matrix=pca.component_matrix() + cm = Statsample::Bivariate.covariance_matrix ds + pca = Statsample::Factor::PCA.new(cm, m: 6) + # puts pca.summary + # puts pca.feature_matrix + exp_eig = Daru::Vector.new([2.985, 0.931, 0.242, 0.194, 0.085, 0.035]) + assert_similar_vector(exp_eig, Daru::Vector.new(pca.eigenvalues), 0.1) + pcs = pca.principal_components(ds) + k = 6 + comp_matrix = pca.component_matrix k.times {|i| - pc_id="PC_#{i+1}" + pc_id = "PC_#{i + 1}".to_sym k.times {|j| # variable - ds_id="v#{j+1}" - r= Statsample::Bivariate.correlation(ds[ds_id], pcs[pc_id]) - assert_in_delta( r, comp_matrix[j,i]) - } + ds_id = "v#{j + 1}".to_sym + r = Statsample::Bivariate.correlation(ds[ds_id], pcs[pc_id]) + assert_in_delta(r, comp_matrix[j, i]) + } } - end + def test_principalcomponents_ruby_gsl - - ran=Distribution::Normal.rng - -# @r=::Rserve::Connection.new - - samples=20 - [3,5,7].each {|k| - v={} - v["x0"]=samples.times.map { ran.call()}.to_scale.centered - (1...k).each {|i| - v["x#{i}"]=samples.times.map {|ii| ran.call()*0.5+v["x#{i-1}"][ii]*0.5}.to_scale.centered - } - - ds=v.to_dataset - cm=ds.covariance_matrix -# @r.assign('ds',ds) -# @r.eval('cm<-cor(ds);sm<-eigen(cm, sym=TRUE);v<-sm$vectors') -# puts "eigenvalues" -# puts @r.eval('v').to_ruby.to_s - pca_ruby=Statsample::Factor::PCA.new( cm, :m=>k, :use_gsl=>false ) - pca_gsl =Statsample::Factor::PCA.new( cm, :m=>k, :use_gsl=>true ) - pc_ruby = pca_ruby.principal_components(ds) - pc_gsl = pca_gsl.principal_components(ds) - # Test component matrix correlation! - cm_ruby=pca_ruby.component_matrix - #puts cm_ruby.summary - k.times {|i| - pc_id="PC_#{i+1}" - assert_in_delta(pca_ruby.eigenvalues[i], pca_gsl.eigenvalues[i],1e-10) - # Revert gsl component values - pc_gsl_data= (pc_gsl[pc_id][0]-pc_ruby[pc_id][0]).abs>1e-6 ? pc_gsl[pc_id].recode {|v| -v} : pc_gsl[pc_id] - assert_similar_vector(pc_gsl_data, pc_ruby[pc_id], 1e-6,"PC for #{k} variables") - if false - k.times {|j| # variable - ds_id="x#{j}" - r= Statsample::Bivariate.correlation(ds[ds_id],pc_ruby[pc_id]) - puts "#{pc_id}-#{ds_id}:#{r}" + if Statsample.has_gsl?
+ ran = Distribution::Normal.rng + + # @r=::Rserve::Connection.new + + samples = 20 + [3, 5, 7].each {|k| + v = {} + v[:x0] = Daru::Vector.new(samples.times.map { ran.call }).center + (1...k).each { |i| + v["x#{i}".to_sym] = Daru::Vector.new(samples.times.map { |ii| ran.call * 0.5 + v["x#{i - 1}".to_sym][ii] * 0.5 }).center + } + + ds = Daru::DataFrame.new(v) + cm = Statsample::Bivariate.covariance_matrix ds + # @r.assign('ds',ds) + # @r.eval('cm<-cor(ds);sm<-eigen(cm, sym=TRUE);v<-sm$vectors') + # puts "eigenvalues" + # puts @r.eval('v').to_ruby.to_s + pca_ruby = Statsample::Factor::PCA.new(cm, m: k, use_gsl: false) + pca_gsl = Statsample::Factor::PCA.new(cm, m: k, use_gsl: true) + pc_ruby = pca_ruby.principal_components(ds) + pc_gsl = pca_gsl.principal_components(ds) + # Test component matrix correlation! + cm_ruby = pca_ruby.component_matrix + # puts cm_ruby.summary + k.times {|i| + pc_id = "PC_#{i + 1}".to_sym + assert_in_delta(pca_ruby.eigenvalues[i], pca_gsl.eigenvalues[i], 1e-10) + # Revert gsl component values + pc_gsl_data = (pc_gsl[pc_id][0] - pc_ruby[pc_id][0]).abs > 1e-6 ? pc_gsl[pc_id].recode(&:-@) : pc_gsl[pc_id] + assert_similar_vector(pc_gsl_data, pc_ruby[pc_id], 1e-6, "PC for #{k} variables") + if false + k.times {|j| # variable + ds_id = "x#{j}".to_sym + r = Statsample::Bivariate.correlation(ds[ds_id], pc_ruby[pc_id]) + puts "#{pc_id}-#{ds_id}:#{r}" + } + end } - end } - } - #@r.close + end + # @r.close end - def test_principalcomponents() - principalcomponents(true) - principalcomponents(false) - - end + + def test_principalcomponents + principalcomponents(false) + if Statsample.has_gsl? + principalcomponents(true) + else + skip 'Requires GSL' + end + end + def principalcomponents(gsl) - ran=Distribution::Normal.rng - samples=50 - x1=samples.times.map { ran.call()}.to_scale - x2=samples.times.map {|i| ran.call()*0.5+x1[i]*0.5}.to_scale - ds={'x1'=>x1,'x2'=>x2}.to_dataset - - cm=ds.correlation_matrix - r=cm[0,1] - pca=Statsample::Factor::PCA.new(cm,:m=>2,:use_gsl=>gsl) - assert_in_delta(1+r,pca.eigenvalues[0],1e-10) - assert_in_delta(1-r,pca.eigenvalues[1],1e-10) - hs=1.0 / Math.sqrt(2) - assert_equal_vector(Vector[1, 1]*hs, pca.eigenvectors[0]) - m_1=gsl ? Vector[-1,1] : Vector[1,-1] - - assert_equal_vector(hs*m_1, pca.eigenvectors[1]) - - pcs=pca.principal_components(ds) - exp_pc_1=ds.collect_with_index {|row,i| - hs*(row['x1']+row['x2']) - } - exp_pc_2=ds.collect_with_index {|row,i| - gsl ? hs*(row['x2']-row['x1']) : hs*(row['x1']-row['x2']) } - assert_similar_vector(exp_pc_1, pcs["PC_1"]) - assert_similar_vector(exp_pc_2, pcs["PC_2"]) + ran = Distribution::Normal.rng + samples = 50 + x1 = Daru::Vector.new(samples.times.map { ran.call }) + x2 = Daru::Vector.new(samples.times.map { |i| ran.call * 0.5 + x1[i] * 0.5 }) + ds = Daru::DataFrame.new({ :x1 => x1, :x2 => x2 }) + + cm = Statsample::Bivariate.correlation_matrix ds + r = cm[0, 1] + pca = Statsample::Factor::PCA.new(cm, m: 2, use_gsl: gsl) + assert_in_delta(1 + r, pca.eigenvalues[0], 1e-10) + assert_in_delta(1 - r, pca.eigenvalues[1], 1e-10) + hs = 1.0 / Math.sqrt(2) + assert_equal_vector(Vector[1, 1] * hs, pca.eigenvectors[0]) + m_1 = gsl ? Vector[-1, 1] : Vector[1, -1] + assert_equal_vector(hs * m_1, pca.eigenvectors[1]) + + pcs = pca.principal_components(ds) + exp_pc_1 = ds.collect_row_with_index {|row, _i| + hs * (row[:x1] + row[:x2]) + } + exp_pc_2 = ds.collect_row_with_index {|row, _i| + gsl ?
hs * (row[:x2] - row[:x1]) : hs * (row[:x1] - row[:x2]) } - assert_similar_vector(exp_pc_1, pcs["PC_1"]) - assert_similar_vector(exp_pc_2, pcs["PC_2"]) + assert_similar_vector(exp_pc_1, pcs[:PC_1]) + assert_similar_vector(exp_pc_2, pcs[:PC_2]) end + def test_antiimage - cor=Matrix[[1,0.964, 0.312],[0.964,1,0.411],[0.312,0.411,1]] - expected=Matrix[[0.062,-0.057, 0.074],[-0.057, 0.057, -0.089], [0.074, -0.089, 0.729]] - ai=Statsample::Factor.anti_image_covariance_matrix(cor) - assert(Matrix.equal_in_delta?(expected, ai, 0.01), "#{expected.to_s} not equal to #{ai.to_s}") + cor = Matrix[[1, 0.964, 0.312], [0.964, 1, 0.411], [0.312, 0.411, 1]] + expected = Matrix[[0.062, -0.057, 0.074], [-0.057, 0.057, -0.089], [0.074, -0.089, 0.729]] + ai = Statsample::Factor.anti_image_covariance_matrix(cor) + assert(Matrix.equal_in_delta?(expected, ai, 0.01), "#{expected} not equal to #{ai}") end + def test_kmo - @v1=[1 ,2 ,3 ,4 ,7 ,8 ,9 ,10,14,15,20,50,60,70].to_scale - @v2=[5 ,6 ,11,12,13,16,17,18,19,20,30,0,0,0].to_scale - @v3=[10,3 ,20,30,40,50,80,10,20,30,40,2,3,4].to_scale - # KMO: 0.490 - ds={'v1'=>@v1,'v2'=>@v2,'v3'=>@v3}.to_dataset - cor=Statsample::Bivariate.correlation_matrix(ds) - kmo=Statsample::Factor.kmo(cor) - assert_in_delta(0.667, kmo,0.001) - assert_in_delta(0.81, Statsample::Factor.kmo(harman_817),0.01) - + @v1 = Daru::Vector.new([1, 2, 3, 4, 7, 8, 9, 10, 14, 15, 20, 50, 60, 70]) + @v2 = Daru::Vector.new([5, 6, 11, 12, 13, 16, 17, 18, 19, 20, 30, 0, 0, 0]) + @v3 = Daru::Vector.new([10, 3, 20, 30, 40, 50, 80, 10, 20, 30, 40, 2, 3, 4]) + # KMO: 0.490 + ds = Daru::DataFrame.new({ :v1 => @v1, :v2 => @v2, :v3 => @v3 }) + cor = Statsample::Bivariate.correlation_matrix(ds) + kmo = Statsample::Factor.kmo(cor) + assert_in_delta(0.667, kmo, 0.001) + assert_in_delta(0.81, Statsample::Factor.kmo(harman_817), 0.01) end + def test_kmo_univariate - m=harman_817 - expected=[0.73,0.76,0.84,0.87,0.53,0.93,0.78,0.86] + m = harman_817 + expected = [0.73, 0.76, 0.84, 0.87, 0.53, 0.93, 0.78, 0.86] m.row_size.times.map {|i| - assert_in_delta(expected[i], Statsample::Factor.kmo_univariate(m,i),0.01) + assert_in_delta(expected[i], Statsample::Factor.kmo_univariate(m, i), 0.01) } end # Tested with SPSS and R def test_pca - a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale - b=[2.4, 0.7, 2.9, 2.2, 3.0, 2.7, 1.6, 1.1, 1.6, 0.9].to_scale - a.recode! {|c| c-a.mean} - b.recode! {|c| c-b.mean} - ds={'a'=>a,'b'=>b}.to_dataset - cov_matrix=Statsample::Bivariate.covariance_matrix(ds) - if Statsample.has_gsl? - pca=Statsample::Factor::PCA.new(cov_matrix,:use_gsl=>true) - pca_set(pca,"gsl") - else - skip("Eigenvalues could be calculated with GSL (requires gsl)") - end - pca=Statsample::Factor::PCA.new(cov_matrix,:use_gsl=>false) - pca_set(pca,"ruby") + dtype = Statsample.has_gsl? ? :gsl : :array + a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1], dtype: dtype) + b = Daru::Vector.new([2.4, 0.7, 2.9, 2.2, 3.0, 2.7, 1.6, 1.1, 1.6, 0.9], dtype: dtype) + a = a - a.mean + b = b - b.mean + ds = Daru::DataFrame.new({ :a => a, :b => b }) + + cov_matrix = Statsample::Bivariate.covariance_matrix(ds) + if Statsample.has_gsl? 
+ pca = Statsample::Factor::PCA.new(cov_matrix, use_gsl: true) + pca_set(pca, 'gsl') + else + skip('Eigenvalues could be calculated with GSL (requires gsl)') + end + pca = Statsample::Factor::PCA.new(cov_matrix, use_gsl: false) + pca_set(pca, 'ruby') end - def pca_set(pca,type) - expected_eigenvalues=[1.284, 0.0490] - expected_eigenvalues.each_with_index{|ev,i| - assert_in_delta(ev,pca.eigenvalues[i],0.001) - } - expected_communality=[0.590, 0.694] - expected_communality.each_with_index{|ev,i| - assert_in_delta(ev,pca.communalities[i],0.001) - } - expected_cm=[0.768, 0.833] - obs=pca.component_matrix_correlation(1).column(0).to_a - expected_cm.each_with_index{|ev,i| - assert_in_delta(ev,obs[i],0.001) - } - assert(pca.summary) + def pca_set(pca, _type) + expected_eigenvalues = [1.284, 0.0490] + expected_eigenvalues.each_with_index{|ev, i| + assert_in_delta(ev, pca.eigenvalues[i], 0.001) + } + expected_communality = [0.590, 0.694] + expected_communality.each_with_index{|ev, i| + assert_in_delta(ev, pca.communalities[i], 0.001) + } + expected_cm = [0.768, 0.833] + obs = pca.component_matrix_correlation(1).column(0).to_a + expected_cm.each_with_index{|ev, i| + assert_in_delta(ev, obs[i], 0.001) + } + + assert(pca.summary) end # Tested with R def test_principalaxis - matrix=::Matrix[ - [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]] - - - fa=Statsample::Factor::PrincipalAxis.new(matrix,:m=>1, :max_iterations=>50) - - cm=::Matrix[[0.923],[0.912],[0.507],[0.483]] - - assert_equal_matrix(cm,fa.component_matrix,0.001) - - h2=[0.852,0.832,0.257,0.233] - h2.each_with_index{|ev,i| - assert_in_delta(ev,fa.communalities[i],0.001) - } - eigen1=2.175 - assert_in_delta(eigen1, fa.eigenvalues[0],0.001) - assert(fa.summary.size>0) - fa=Statsample::Factor::PrincipalAxis.new(matrix,:smc=>false) - - assert_raise RuntimeError do - fa.iterate - end + matrix = ::Matrix[ + [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]] - end + fa = Statsample::Factor::PrincipalAxis.new(matrix, m: 1, max_iterations: 50) + + cm = ::Matrix[[0.923], [0.912], [0.507], [0.483]] + assert_equal_matrix(cm, fa.component_matrix, 0.001) + + h2 = [0.852, 0.832, 0.257, 0.233] + h2.each_with_index{|ev, i| + assert_in_delta(ev, fa.communalities[i], 0.001) + } + eigen1 = 2.175 + assert_in_delta(eigen1, fa.eigenvalues[0], 0.001) + assert(fa.summary.size > 0) + fa = Statsample::Factor::PrincipalAxis.new(matrix, smc: false) + + assert_raise RuntimeError do + fa.iterate + end + end def test_rotation_varimax - a = Matrix[ [ 0.4320, 0.8129, 0.3872] , - [0.7950, -0.5416, 0.2565] , - [0.5944, 0.7234, -0.3441], - [0.8945, -0.3921, -0.1863] ] - - expected= Matrix[[-0.0204423, 0.938674, -0.340334], - [0.983662, 0.0730206, 0.134997], - [0.0826106, 0.435975, -0.893379], - [0.939901, -0.0965213, -0.309596]] - varimax=Statsample::Factor::Varimax.new(a) + a = Matrix[[0.4320, 0.8129, 0.3872], + [0.7950, -0.5416, 0.2565], + [0.5944, 0.7234, -0.3441], + [0.8945, -0.3921, -0.1863]] + + expected = Matrix[[-0.0204423, 0.938674, -0.340334], + [0.983662, 0.0730206, 0.134997], + [0.0826106, 0.435975, -0.893379], + [0.939901, -0.0965213, -0.309596]] + varimax = 
Statsample::Factor::Varimax.new(a) assert(!varimax.rotated.nil?, "Rotated shouldn't be empty") assert(!varimax.component_transformation_matrix.nil?, "Component matrix shouldn't be empty") assert(!varimax.h2.nil?, "H2 shouldn't be empty") - - assert_equal_matrix(expected,varimax.rotated,1e-6) - assert(varimax.summary.size>0) - end - + assert_equal_matrix(expected, varimax.rotated, 1e-6) + assert(varimax.summary.size > 0) + end end diff --git a/test/test_factor_map.rb b/test/test_factor_map.rb index 05c94d5..69610bc 100644 --- a/test/test_factor_map.rb +++ b/test/test_factor_map.rb @@ -1,43 +1,38 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -#require 'rserve' -#require 'statsample/rserve_extension' +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +# require 'rserve' +# require 'statsample/rserve_extension' -class StatsampleFactorMpaTestCase < MiniTest::Unit::TestCase +class StatsampleFactorMapTestCase < Minitest::Test context Statsample::Factor::MAP do setup do - m=Matrix[ - [ 1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382], - [ 0.846, 1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.415], - [ 0.805, 0.881, 1, 0.801, 0.38, 0.319, 0.237, 0.345], - [ 0.859, 0.826, 0.801, 1, 0.436, 0.329, 0.327, 0.365], - [ 0.473, 0.376, 0.38, 0.436, 1, 0.762, 0.73, 0.629], - [ 0.398, 0.326, 0.319, 0.329, 0.762, 1, 0.583, 0.577], - [ 0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 1, 0.539], - [ 0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 1] + m = Matrix[ + [1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382], + [0.846, 1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.415], + [0.805, 0.881, 1, 0.801, 0.38, 0.319, 0.237, 0.345], + [0.859, 0.826, 0.801, 1, 0.436, 0.329, 0.327, 0.365], + [0.473, 0.376, 0.38, 0.436, 1, 0.762, 0.73, 0.629], + [0.398, 0.326, 0.319, 0.329, 0.762, 1, 0.583, 0.577], + [0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 1, 0.539], + [0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 1] ] - @map=Statsample::Factor::MAP.new(m) + @map = Statsample::Factor::MAP.new(m) end - should "return correct values with pure ruby" do - @map.use_gsl=false + should 'return correct values with pure ruby' do + @map.use_gsl = false map_assertions(@map) end - should_with_gsl "return correct values with gsl" do - #require 'ruby-prof' + should_with_gsl 'return correct values with gsl' do + # require 'ruby-prof' - @map.use_gsl=true - map_assertions(@map) + @map.use_gsl = true + map_assertions(@map) end - - end - + def map_assertions(map) - assert_in_delta(map.minfm, 0.066445,0.00001) - assert_equal(map.number_of_factors, 2) - assert_in_delta(map.fm[0], 0.312475,0.00001) - assert_in_delta(map.fm[1], 0.245121,0.00001) + assert_in_delta(map.minfm, 0.066445, 0.00001) + assert_equal(map.number_of_factors, 2) + assert_in_delta(map.fm[0], 0.312475, 0.00001) + assert_in_delta(map.fm[1], 0.245121, 0.00001) end - - end - diff --git a/test/test_factor_pa.rb b/test/test_factor_pa.rb index b1332ba..e2df935 100644 --- a/test/test_factor_pa.rb +++ b/test/test_factor_pa.rb @@ -1,52 +1,56 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -#require 'rserve' -#require 'statsample/rserve_extension' +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +# require 'rserve' +# require 'statsample/rserve_extension' -class StatsampleFactorTestCase < MiniTest::Unit::TestCase +class StatsampleFactorTestCase < Minitest::Test include Statsample::Fixtures # Based on Hardle and Simar def setup - @fixtures_dir=File.expand_path(File.dirname(__FILE__)+"/fixtures") +
@fixtures_dir = File.expand_path(File.dirname(__FILE__) + '/fixtures') end + def test_parallelanalysis_with_data if Statsample.has_gsl? - samples=100 - variables=10 - iterations=50 + samples = 100 + variables = 10 + iterations = 50 rng = Distribution::Normal.rng - f1=samples.times.collect {rng.call}.to_scale - f2=samples.times.collect {rng.call}.to_scale - vectors={} + f1 = Daru::Vector.new(samples.times.collect { rng.call }) + f2 = Daru::Vector.new(samples.times.collect { rng.call }) + vectors = {} variables.times do |i| - if i<5 - vectors["v#{i}"]=samples.times.collect {|nv| - f1[nv]*5+f2[nv]*2+rng.call - }.to_scale + if i < 5 + vectors["v#{i}".to_sym] = Daru::Vector.new( + samples.times.collect { |nv| + f1[nv] * 5 + f2[nv] * 2 + rng.call + } + ) else - vectors["v#{i}"]=samples.times.collect {|nv| - f2[nv]*5+f1[nv]*2+rng.call - }.to_scale + vectors["v#{i}".to_sym] = Daru::Vector.new( + samples.times.collect { |nv| + f2[nv] * 5 + f1[nv] * 2 + rng.call + } + ) end - end - ds=vectors.to_dataset - - pa1=Statsample::Factor::ParallelAnalysis.new(ds, :bootstrap_method=>:data, :iterations=>iterations) - pa2=Statsample::Factor::ParallelAnalysis.with_random_data(samples,variables,:iterations=>iterations,:percentil=>95) + ds = Daru::DataFrame.new(vectors) + + pa1 = Statsample::Factor::ParallelAnalysis.new(ds, bootstrap_method: :data, iterations: iterations) + pa2 = Statsample::Factor::ParallelAnalysis.with_random_data(samples, variables, iterations: iterations, percentil: 95) 3.times do |n| - var="ev_0000#{n+1}" - assert_in_delta(pa1.ds_eigenvalues[var].mean, pa2.ds_eigenvalues[var].mean,0.05) + var = "ev_0000#{n + 1}".to_sym + assert_in_delta(pa1.ds_eigenvalues[var].mean, pa2.ds_eigenvalues[var].mean, 0.07) end else - skip("Too slow without GSL") + skip('Too slow without GSL') end - end + def test_parallelanalysis - pa=Statsample::Factor::ParallelAnalysis.with_random_data(305,8,:iterations=>100,:percentil=>95) - assert_in_delta(1.2454, pa.ds_eigenvalues['ev_00001'].mean, 0.01) - assert_in_delta(1.1542, pa.ds_eigenvalues['ev_00002'].mean, 0.01) - assert_in_delta(1.0836, pa.ds_eigenvalues['ev_00003'].mean, 0.01) - assert(pa.summary.size>0) - end + pa = Statsample::Factor::ParallelAnalysis.with_random_data(305, 8, iterations: 100, percentil: 95) + assert_in_delta(1.2454, pa.ds_eigenvalues[:ev_00001].mean, 0.05) + assert_in_delta(1.1542, pa.ds_eigenvalues[:ev_00002].mean, 0.01) + assert_in_delta(1.0836, pa.ds_eigenvalues[:ev_00003].mean, 0.01) + assert(pa.summary.size > 0) + end end diff --git a/test/test_fit_model.rb b/test/test_fit_model.rb new file mode 100644 index 0000000..e7be554 --- /dev/null +++ b/test/test_fit_model.rb @@ -0,0 +1,88 @@ +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +require 'minitest/autorun' + +describe Statsample::FitModel do + before do + @df = Daru::DataFrame.from_csv 'test/fixtures/df.csv' + @df.to_category 'c', 'd', 'e' + end + context '#df_for_regression' do + context 'no interaction' do + it { assert_vectors_from_formula 'y~a+e', %w[a e_B e_C y] } + end + + context '2-way interaction' do + context 'interaction of numerical with numerical' do + context 'none reoccur' do + it { assert_vectors_from_formula 'y~a:b', %w[a:b y] } + end + + context 'one reoccur' do + it { assert_vectors_from_formula 'y~a+a:b', %w[a a:b y] } + end + + context 'both reoccur' do + it { assert_vectors_from_formula 'y~a+b+a:b', %w[a a:b b y] } + end + end + + context 'interaction of category with numerical' do + context 'none reoccur' do + it { 
assert_vectors_from_formula 'y~a:e', %w[e_A:a e_B:a e_C:a y] } + end + + context 'one reoccur' do + context 'numeric occur' do + it { assert_vectors_from_formula 'y~a+a:e', %w[a e_B:a e_C:a y] } + end + + context 'category occur' do + it { assert_vectors_from_formula 'y~e+a:e', + %w[e_B e_C e_A:a e_B:a e_C:a y] } + end + end + + context 'both reoccur' do + it { assert_vectors_from_formula 'y~a+e+a:e', + %w[a e_B e_C e_B:a e_C:a y] } + end + end + + context 'interaction of category with category' do + context 'none reoccur' do + it { assert_vectors_from_formula 'y~c:e', + %w[e_B e_C c_yes:e_A c_yes:e_B c_yes:e_C y] } + end + + context 'one reoccur' do + it { assert_vectors_from_formula 'y~e+c:e', + %w[e_B e_C c_yes:e_A c_yes:e_B c_yes:e_C y] } + end + + context 'both reoccur' do + it { assert_vectors_from_formula 'y~c+e+c:e', + %w[c_yes e_B e_C c_yes:e_B c_yes:e_C y] } + end + end + end + + context 'corner case' do + context 'example 1' do + it { assert_vectors_from_formula 'y~d:a+d:e', + %w[e_B e_C d_male:e_A d_male:e_B d_male:e_C d_female:a d_male:a y] } + end + end + + context 'complex examples' do + context 'random example 1' do + it { assert_vectors_from_formula 'y~a+e+c:d+e:d', + %w[e_B e_C d_male c_yes:d_female c_yes:d_male e_B:d_male e_C:d_male a y] } + end + + context 'random example 2' do + it { assert_vectors_from_formula 'y~e+b+c+d:e+b:e+a:e+0', + %w[e_A e_B e_C c_yes d_male:e_A d_male:e_B d_male:e_C b e_B:b e_C:b e_A:a e_B:a e_C:a y] } + end + end + end +end diff --git a/test/test_ggobi.rb b/test/test_ggobi.rb index ecef32c..6f1724a 100644 --- a/test/test_ggobi.rb +++ b/test/test_ggobi.rb @@ -1,24 +1,25 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) require 'ostruct' -class StatsampleGGobiTestCase < MiniTest::Unit::TestCase - +class StatsampleGGobiTestCase < Minitest::Test def setup - v1=([10.2,20.3,10,20,30,40,30,20,30,40]*10).to_vector(:scale) - @v2=(%w{a b c a a a b b c d}*10).to_vector(:nominal) - @v2.labels={"a"=>"letter a","d"=>"letter d"} - v3=([1,2,3,4,5,4,3,2,1,2]*10).to_vector(:ordinal) - @ds={'v1'=>v1,'v2'=>@v2,'v3'=>v3}.to_dataset + v1 = Daru::Vector.new([10.2, 20.3, 10, 20, 30, 40, 30, 20, 30, 40] * 10) + @v2 = Daru::Vector.new(%w(a b c a a a b b c d) * 10) + @v2.labels = { 'a' => 'letter a', 'd' => 'letter d' } + v3 = Daru::Vector.new([1, 2, 3, 4, 5, 4, 3, 2, 1, 2] * 10) + @ds = Daru::DataFrame.new({ :v1 => v1, :v2 => @v2, :v3 => v3 }) end + def test_values_definition - a=[1.0,2,"a",nil] - assert_equal("1.0 2 a NA", Statsample::GGobi.values_definition(a,"NA")) + a = [1.0, 2, 'a', nil] + assert_equal('1.0 2 a NA', Statsample::GGobi.values_definition(a, 'NA')) end + def test_variable_definition - carrier=OpenStruct.new - carrier.categorials=[] - carrier.conversions={} - real_var_definition=Statsample::GGobi.variable_definition(carrier,@v2,'variable 2',"v2") - expected=<<-EOS + carrier = OpenStruct.new + carrier.categorials = [] + carrier.conversions = {} + real_var_definition = Statsample::GGobi.variable_definition(carrier, @v2, 'variable 2', 'v2') + expected = <<-EOS letter a @@ -27,8 +28,8 @@ def test_variable_definition letter d EOS - assert_equal(expected.gsub(/\s/," "),real_var_definition.gsub(/\s/," ")) - assert_equal({'variable 2'=>{'a'=>1,'b'=>2,'c'=>3,'d'=>4}},carrier.conversions) - assert_equal(['variable 2'],carrier.categorials) + assert_equal(expected.gsub(/\s/, ' '), real_var_definition.gsub(/\s/, ' ')) + assert_equal({ 'variable 2' => { 'a' => 1, 'b' => 
2, 'c' => 3, 'd' => 4 } }, carrier.conversions) + assert_equal(['variable 2'], carrier.categorials) end end diff --git a/test/test_gsl.rb b/test/test_gsl.rb index 2d841aa..261b9cf 100644 --- a/test/test_gsl.rb +++ b/test/test_gsl.rb @@ -1,17 +1,15 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleGSLTestCase < MiniTest::Unit::TestCase - should_with_gsl "matrix with gsl" do - a=[1,2,3,4,20].to_vector(:scale) - b=[3,2,3,4,50].to_vector(:scale) - c=[6,2,3,4,3].to_vector(:scale) - ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset - gsl=ds.to_matrix.to_gsl - assert_equal(5,gsl.size1) - assert_equal(3,gsl.size2) - matrix=gsl.to_matrix - assert_equal(5,matrix.row_size) - assert_equal(3,matrix.column_size) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleGSLTestCase < Minitest::Test + should_with_gsl 'matrix with gsl' do + a = Daru::Vector.new([1, 2, 3, 4, 20]) + b = Daru::Vector.new([3, 2, 3, 4, 50]) + c = Daru::Vector.new([6, 2, 3, 4, 3]) + ds = Daru::DataFrame.new({ :a => a, :b => b, :c => c }) + gsl = ds.to_matrix.to_gsl + assert_equal(5, gsl.size1) + assert_equal(3, gsl.size2) + matrix = gsl.to_matrix + assert_equal(5, matrix.row_size) + assert_equal(3, matrix.column_size) end end - - diff --git a/test/test_histogram.rb b/test/test_histogram.rb index 1a086e0..5db9101 100644 --- a/test/test_histogram.rb +++ b/test/test_histogram.rb @@ -1,112 +1,109 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) - -class StatsampleHistogramTestCase < MiniTest::Unit::TestCase +class StatsampleHistogramTestCase < Minitest::Test context Statsample::Histogram do - should "alloc correctly with integer" do + should 'alloc correctly with integer' do h = Statsample::Histogram.alloc(4) - assert_equal([0.0]*4, h.bin) - assert_equal([0.0]*5, h.range) + assert_equal([0.0] * 4, h.bin) + assert_equal([0.0] * 5, h.range) end - should "alloc correctly with array" do + should 'alloc correctly with array' do h = Statsample::Histogram.alloc([1, 3, 7, 9, 20]) - assert_equal([0.0]*4, h.bin) - assert_equal([1,3,7,9,20], h.range) + assert_equal([0.0] * 4, h.bin) + assert_equal([1, 3, 7, 9, 20], h.range) end - should "alloc correctly with integer and min, max array" do + should 'alloc correctly with integer and min, max array' do h = Statsample::Histogram.alloc(5, [0, 5]) - assert_equal([0.0,1.0,2.0,3.0,4.0,5.0], h.range) - assert_equal([0.0]*5,h.bin) + assert_equal([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], h.range) + assert_equal([0.0] * 5, h.bin) end - should "bin() method return correct number of bins" do + should 'bin() method return correct number of bins' do h = Statsample::Histogram.alloc(4) - assert_equal(4,h.bins) + assert_equal(4, h.bins) end - should "increment correctly" do - h = Statsample::Histogram.alloc(5, [0, 5]) + should 'increment correctly' do + h = Statsample::Histogram.alloc(5, [0, 5]) h.increment 2.5 - assert_equal([0.0,0.0,1.0,0.0,0.0], h.bin) - h.increment [0.5,0.5,3.5,3.5] - assert_equal([2.0,0.0,1.0,2.0,0.0], h.bin) + assert_equal([0.0, 0.0, 1.0, 0.0, 0.0], h.bin) + h.increment [0.5, 0.5, 3.5, 3.5] + assert_equal([2.0, 0.0, 1.0, 2.0, 0.0], h.bin) h.increment 0 - assert_equal([3.0,0.0,1.0,2.0,0.0], h.bin) + assert_equal([3.0, 0.0, 1.0, 2.0, 0.0], h.bin) h.increment 5 - assert_equal([3.0,0.0,1.0,2.0,0.0], h.bin) + assert_equal([3.0, 0.0, 1.0, 2.0, 0.0], h.bin) end - - should "alloc_uniform correctly with n, min,max" do - h = 
Statsample::Histogram.alloc_uniform(5,0,10)
- assert_equal(5,h.bins)
- assert_equal([0.0]*5,h.bin)
- assert_equal([0.0,2.0,4.0,6.0,8.0,10.0], h.range)
+
+ should 'alloc_uniform correctly with n, min,max' do
+ h = Statsample::Histogram.alloc_uniform(5, 0, 10)
+ assert_equal(5, h.bins)
+ assert_equal([0.0] * 5, h.bin)
+ assert_equal([0.0, 2.0, 4.0, 6.0, 8.0, 10.0], h.range)
 end
- should "alloc_uniform correctly with n, [min,max]" do
+ should 'alloc_uniform correctly with n, [min,max]' do
 h = Statsample::Histogram.alloc_uniform(5, [0, 10])
- assert_equal(5,h.bins)
- assert_equal([0.0]*5,h.bin)
- assert_equal([0.0,2.0,4.0,6.0,8.0,10.0], h.range)
+ assert_equal(5, h.bins)
+ assert_equal([0.0] * 5, h.bin)
+ assert_equal([0.0, 2.0, 4.0, 6.0, 8.0, 10.0], h.range)
 end
- should "get_range()" do
- h = Statsample::Histogram.alloc_uniform(5,2,12)
+ should 'get_range()' do
+ h = Statsample::Histogram.alloc_uniform(5, 2, 12)
 5.times {|i|
- assert_equal([2+i*2, 4+i*2], h.get_range(i))
+ assert_equal([2 + i * 2, 4 + i * 2], h.get_range(i))
+ }
+ end
+ should 'min() and max()' do
+ h = Statsample::Histogram.alloc_uniform(5, 2, 12)
+ assert_equal(2, h.min)
+ assert_equal(12, h.max)
+ end
+ should 'max_val()' do
+ h = Statsample::Histogram.alloc(5, [0, 5])
+ 100.times { h.increment(rand * 5) }
+ max = h.bin[0]
+ (1..4).each {|i|
+ max = h.bin[i] if h.bin[i] > max
+ }
+ assert_equal(max, h.max_val)
 end
- should "min() and max()" do
- h=Statsample::Histogram.alloc_uniform(5,2,12)
- assert_equal(2,h.min)
- assert_equal(12,h.max)
- end
- should "max_val()" do
- h = Statsample::Histogram.alloc(5, [0, 5])
- 100.times {h.increment(rand*5)}
- max=h.bin[0]
- (1..4).each {|i|
- max = h.bin[i] if h.bin[i] > max
- }
- assert_equal(max,h.max_val)
- end
- should "min_val()" do
- h = Statsample::Histogram.alloc(5, [0, 5])
- 100.times {h.increment(rand*5)}
- min=h.bin[0]
- (1..4).each {|i|
- min = h.bin[i] if h.bin[i] < min
- }
- assert_equal(min,h.min_val)
- end
+ should 'min_val()' do
+ h = Statsample::Histogram.alloc(5, [0, 5])
+ 100.times { h.increment(rand * 5) }
+ min = h.bin[0]
+ (1..4).each {|i|
+ min = h.bin[i] if h.bin[i] < min
+ }
+ assert_equal(min, h.min_val)
+ end
 end
end
diff --git a/test/test_matrix.rb b/test/test_matrix.rb
--- a/test/test_matrix.rb
+++ b/test/test_matrix.rb
- x1=[1,2,3].to_vector(:scale)
- x2=[4,5,6].to_vector(:scale)
- ds={'x1'=>x1,'x2'=>x2}.to_dataset
- ds.name="test"
- obs=m.to_dataset
- assert_equal(ds['x1'],obs['x1'])
- assert_equal(ds['x2'],obs['x2'])
- assert_equal(ds['x1'].mean,obs['x1'].mean)
-
-
+ m.fields_y = [:x1, :x2]
+ m.name = 'test'
+ samples = 100
+ x1 = Daru::Vector.new([1, 2, 3])
+ x2 = Daru::Vector.new([4, 5, 6])
+ ds = Daru::DataFrame.new({ :x1 => x1, :x2 => x2 })
+ ds.rename 'test'
+ obs = m.to_dataframe
+ assert_equal(ds[:x1], obs[:x1])
+ assert_equal(ds[:x2], obs[:x2])
+ assert_equal(ds[:x1].mean, obs[:x1].mean)
 end
+
 def test_covariate
- a=Matrix[[1.0, 0.3, 0.2], [0.3, 1.0, 0.5], [0.2, 0.5, 1.0]]
+ a = Matrix[[1.0, 0.3, 0.2], [0.3, 1.0, 0.5], [0.2, 0.5, 1.0]]
 a.extend Statsample::CovariateMatrix
- a.fields=%w{a b c}
+ a.fields = %w(a b c)
 assert_equal(:correlation, a._type)
- assert_equal(Matrix[[0.5],[0.3]], a.submatrix(%w{c a}, %w{b}))
- assert_equal(Matrix[[1.0, 0.2] , [0.2, 1.0]], a.submatrix(%w{c a}))
- assert_equal(:correlation, a.submatrix(%w{c a})._type)
+ assert_equal(Matrix[[0.5], [0.3]], a.submatrix(%w(c a), %w(b)))
+ assert_equal(Matrix[[1.0, 0.2], [0.2, 1.0]], a.submatrix(%w(c a)))
+ assert_equal(:correlation, a.submatrix(%w(c a))._type)
- a=Matrix[[20,30,10], [30,60,50], [10,50,50]]
+ a = Matrix[[20, 30, 10], [30, 60, 50], [10, 50, 50]]
 a.extend Statsample::CovariateMatrix
 assert_equal(:covariance, a._type)
- a=50.times.collect {rand()}.to_scale
- b=50.times.collect {rand()}.to_scale
- c=50.times.collect {rand()}.to_scale
- ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
- corr=Statsample::Bivariate.correlation_matrix(ds)
- real=Statsample::Bivariate.covariance_matrix(ds).correlation
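+ # the correlation matrix computed directly should match the
+ # covariance matrix converted to a correlation matrix
+ a = 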
Daru::Vector.new(50.times.collect { rand }) + b = Daru::Vector.new(50.times.collect { rand }) + c = Daru::Vector.new(50.times.collect { rand }) + ds = Daru::DataFrame.new({ :a => a, :b => b, :c => c }) + corr = Statsample::Bivariate.correlation_matrix(ds) + real = Statsample::Bivariate.covariance_matrix(ds).correlation corr.row_size.times do |i| corr.column_size.times do |j| - assert_in_delta(corr[i,j], real[i,j],1e-15) + assert_in_delta(corr[i, j], real[i, j], 1e-15) end end - end + end end diff --git a/test/test_multiset.rb b/test/test_multiset.rb index 2c5487c..0e47477 100644 --- a/test/test_multiset.rb +++ b/test/test_multiset.rb @@ -1,158 +1,176 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) - -class StatsampleMultisetTestCase < MiniTest::Unit::TestCase +class StatsampleMultisetTestCase < Minitest::Test def setup - @x=%w{a a a a b b b b}.to_vector - @y=[1,2,3,4,5,6,7,8].to_scale - @z=[10,11,12,13,14,15,16,17].to_scale - @ds={'x'=>@x,'y'=>@y,'z'=>@z}.to_dataset - @ms=@ds.to_multiset_by_split('x') + @x = Daru::Vector.new(%w(a a a a b b b b)) + @y = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7, 8]) + @z = Daru::Vector.new([10, 11, 12, 13, 14, 15, 16, 17]) + @ds = Daru::DataFrame.new({ :x => @x, :y => @y, :z => @z }) + @ms = @ds.to_multiset_by_split(:x) end + def test_creation - v1a=[1,2,3,4,5].to_vector - v2b=[11,21,31,41,51].to_vector - v3c=[21,23,34,45,56].to_vector - ds1={'v1'=>v1a,'v2'=>v2b,'v3'=>v3c}.to_dataset - v1b=[15,25,35,45,55].to_vector - v2b=[11,21,31,41,51].to_vector - v3b=[21,23,34,45,56].to_vector - ds2={'v1'=>v1b,'v2'=>v2b,'v3'=>v3b}.to_dataset - ms=Statsample::Multiset.new(['v1','v2','v3']) - ms.add_dataset('ds1',ds1) - ms.add_dataset('ds2',ds2) - assert_equal(ds1,ms['ds1']) - assert_equal(ds2,ms['ds2']) - assert_equal(v1a,ms['ds1']['v1']) - assert_not_equal(v1b,ms['ds1']['v1']) - ds3={'v1'=>v1b,'v2'=>v2b}.to_dataset + v1a = Daru::Vector.new([1, 2, 3, 4, 5]) + v2b = Daru::Vector.new([11, 21, 31, 41, 51]) + v3c = Daru::Vector.new([21, 23, 34, 45, 56]) + ds1 = Daru::DataFrame.new({ :v1 => v1a, :v2 => v2b, :v3 => v3c }) + v1b = Daru::Vector.new([15, 25, 35, 45, 55]) + v2b = Daru::Vector.new([11, 21, 31, 41, 51]) + v3b = Daru::Vector.new([21, 23, 34, 45, 56]) + ds2 = Daru::DataFrame.new({ :v1 => v1b, :v2 => v2b, :v3 => v3b }) + ms = Statsample::Multiset.new([:v1, :v2, :v3]) + ms.add_dataset(:ds1, ds1) + ms.add_dataset(:ds2, ds2) + assert_equal(ds1, ms[:ds1]) + assert_equal(ds2, ms[:ds2]) + assert_equal(v1a, ms[:ds1][:v1]) + assert_not_equal(v1b, ms[:ds1][:v1]) + ds3 = Daru::DataFrame.new({ :v1 => v1b, :v2 => v2b }) assert_raise ArgumentError do ms.add_dataset(ds3) end end + def test_creation_empty - ms=Statsample::Multiset.new_empty_vectors(%w{id age name},%w{male female}) - ds_male={'id'=>[].to_vector,'age'=>[].to_vector, 'name'=>[].to_vector}.to_dataset(%w{id age name}) - ds_female={'id'=>[].to_vector,'age'=>[].to_vector, 'name'=>[].to_vector}.to_dataset(%w{id age name}) - ms2=Statsample::Multiset.new(%w{id age name}) - ms2.add_dataset('male',ds_male) - ms2.add_dataset('female',ds_female) - assert_equal(ms2.fields,ms.fields) - assert_equal(ms2['male'],ms['male']) - assert_equal(ms2['female'],ms['female']) + ms = Statsample::Multiset.new_empty_vectors([:id, :age, :name], [:male, :female]) + ds_male = Daru::DataFrame.new({ + :id => Daru::Vector.new([]), + :age => Daru::Vector.new([]), + :name => Daru::Vector.new([]) + }, order: [:id, :age, :name]) + + ds_female = 
Daru::DataFrame.new({ + :id => Daru::Vector.new([]), + :age => Daru::Vector.new([]), + :name => Daru::Vector.new([]) + }, order: [:id, :age, :name]) + + ms2 = Statsample::Multiset.new([:id, :age, :name]) + ms2.add_dataset(:male, ds_male) + ms2.add_dataset(:female, ds_female) + assert_equal(ms2.fields, ms.fields) + assert_equal(ms2[:male], ms[:male]) + assert_equal(ms2[:female], ms[:female]) end + def test_to_multiset_by_split_one - sex=%w{m m m m m f f f f m}.to_vector(:nominal) - city=%w{London Paris NY London Paris NY London Paris NY Tome}.to_vector(:nominal) - age=[10,10,20,30,34,34,33,35,36,40].to_vector(:scale) - ds={'sex'=>sex,'city'=>city,'age'=>age}.to_dataset - ms=ds.to_multiset_by_split('sex') - assert_equal(2,ms.n_datasets) - assert_equal(%w{f m},ms.datasets.keys.sort) - assert_equal(6,ms['m'].cases) - assert_equal(4,ms['f'].cases) - assert_equal(%w{London Paris NY London Paris Tome},ms['m']['city'].to_a) - assert_equal([34,33,35,36],ms['f']['age'].to_a) + sex = Daru::Vector.new(%w(m m m m m f f f f m)) + city = Daru::Vector.new(%w(London Paris NY London Paris NY London Paris NY Tome)) + age = Daru::Vector.new([10, 10, 20, 30, 34, 34, 33, 35, 36, 40]) + ds = Daru::DataFrame.new({ :sex => sex, :city => city, :age => age }) + ms = ds.to_multiset_by_split(:sex) + assert_equal(2, ms.n_datasets) + assert_equal(%w(f m), ms.datasets.keys.sort) + assert_equal(6, ms['m'].nrows) + assert_equal(4, ms['f'].nrows) + assert_equal(%w(London Paris NY London Paris Tome), ms['m'][:city].to_a) + assert_equal([34, 33, 35, 36], ms['f'][:age].to_a) end + def test_to_multiset_by_split_multiple - sex=%w{m m m m m m m m m m f f f f f f f f f f}.to_vector(:nominal) - city=%w{London London London Paris Paris London London London Paris Paris London London London Paris Paris London London London Paris Paris}.to_vector(:nominal) - hair=%w{blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black}.to_vector(:nominal) - age=[10,10,20,30,34,34,33,35,36,40, 10,10,20,30,34,34,33,35,36,40].to_vector(:scale) - ds={'sex'=>sex,'city'=>city,'hair'=>hair,'age'=>age}.to_dataset(%w{sex city hair age}) - ms=ds.to_multiset_by_split('sex','city','hair') - assert_equal(8,ms.n_datasets) - assert_equal(3,ms[%w{m London blonde}].cases) - assert_equal(3,ms[%w{m London blonde}].cases) - assert_equal(1,ms[%w{m Paris black}].cases) + sex = Daru::Vector.new(%w(m m m m m m m m m m f f f f f f f f f f)) + city = Daru::Vector.new(%w(London London London Paris Paris London London London Paris Paris London London London Paris Paris London London London Paris Paris)) + hair = Daru::Vector.new(%w(blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black)) + age = Daru::Vector.new([10, 10, 20, 30, 34, 34, 33, 35, 36, 40, 10, 10, 20, 30, 34, 34, 33, 35, 36, 40]) + ds = Daru::DataFrame.new({ + :sex => sex, :city => city, :hair => hair, :age => age + }, order: [:sex, :city, :hair, :age]) + ms = ds.to_multiset_by_split(:sex, :city, :hair) + assert_equal(8, ms.n_datasets) + assert_equal(3, ms[%w(m London blonde)].nrows) + assert_equal(3, ms[%w(m London blonde)].nrows) + assert_equal(1, ms[%w(m Paris black)].nrows) end def test_stratum_proportion - ds1={'q1'=>[1,1,1,1,1,0,0,0,0,0,0,0].to_vector}.to_dataset - ds2={'q1'=>[1,1,1,1,1,1,1,0,0].to_vector}.to_dataset - assert_equal(5.0/12, ds1['q1'].proportion ) - assert_equal(7.0/9, ds2['q1'].proportion ) - ms=Statsample::Multiset.new(['q1']) - 
ms.add_dataset('d1',ds1)
- ms.add_dataset('d2',ds2)
- ss=Statsample::StratifiedSample.new(ms,{'d1'=>50,'d2'=>100})
- assert_in_delta(0.655, ss.proportion('q1'),0.01)
- assert_in_delta(0.345, ss.proportion('q1',0),0.01)
-
+ ds1 = Daru::DataFrame.new({ :q1 => Daru::Vector.new([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]) })
+ ds2 = Daru::DataFrame.new({ :q1 => Daru::Vector.new([1, 1, 1, 1, 1, 1, 1, 0, 0]) })
+ assert_equal(5.0 / 12, ds1[:q1].proportion)
+ assert_equal(7.0 / 9, ds2[:q1].proportion)
+ ms = Statsample::Multiset.new([:q1])
+ ms.add_dataset(:d1, ds1)
+ ms.add_dataset(:d2, ds2)
+ ss = Statsample::StratifiedSample.new(ms, :d1 => 50, :d2 => 100)
+ assert_in_delta(0.655, ss.proportion(:q1), 0.01)
+ assert_in_delta(0.345, ss.proportion(:q1, 0), 0.01)
 end
+
 def test_stratum_scale
- boys={'test'=>[50, 55, 60, 62, 62, 65, 67, 67, 70, 70, 73, 73, 75, 78, 78, 80, 85, 90].to_vector(:scale)}.to_dataset
- girls={'test'=>[70, 70, 72, 72, 75, 75, 78, 78, 80, 80, 82, 82, 85, 85, 88, 88, 90, 90].to_vector(:scale)}.to_dataset
- ms=Statsample::Multiset.new(['test'])
- ms.add_dataset('boys',boys)
- ms.add_dataset('girls',girls)
- ss=Statsample::StratifiedSample.new(ms,{'boys'=>10000,'girls'=>10000})
- assert_equal(2,ss.strata_number)
- assert_equal(20000,ss.population_size)
- assert_equal(10000,ss.stratum_size('boys'))
- assert_equal(10000,ss.stratum_size('girls'))
- assert_equal(36,ss.sample_size)
- assert_equal(75,ss.mean('test'))
- assert_in_delta(1.45,ss.standard_error_wor('test'),0.01)
- assert_in_delta(ss.standard_error_wor('test'), ss.standard_error_wor_2('test'),0.00001)
+ boys = Daru::DataFrame.new({ :test => Daru::Vector.new([50, 55, 60, 62, 62, 65, 67, 67, 70, 70, 73, 73, 75, 78, 78, 80, 85, 90]) })
+ girls = Daru::DataFrame.new({ :test => Daru::Vector.new([70, 70, 72, 72, 75, 75, 78, 78, 80, 80, 82, 82, 85, 85, 88, 88, 90, 90]) })
+ ms = Statsample::Multiset.new([:test])
+ ms.add_dataset(:boys, boys)
+ ms.add_dataset(:girls, girls)
+ ss = Statsample::StratifiedSample.new(ms, :boys => 10_000, :girls => 10_000)
+ assert_equal(2, ss.strata_number)
+ assert_equal(20_000, ss.population_size)
+ assert_equal(10_000, ss.stratum_size(:boys))
+ assert_equal(10_000, ss.stratum_size(:girls))
+ assert_equal(36, ss.sample_size)
+ assert_equal(75, ss.mean(:test))
+ assert_in_delta(1.45, ss.standard_error_wor(:test), 0.01)
+ assert_in_delta(ss.standard_error_wor(:test), ss.standard_error_wor_2(:test), 0.00001)
 end
+
 def test_each
- xpe={
- 'a'=>%w{a a a a}.to_vector,
- 'b'=>%w{b b b b}.to_vector
+ xpe = {
+ 'a' => Daru::Vector.new(%w(a a a a)),
+ 'b' => Daru::Vector.new(%w(b b b b))
 }
- ype={
- 'a'=>[1,2,3,4].to_scale,
- 'b'=>[5,6,7,8].to_scale,
+ ype = {
+ 'a' => Daru::Vector.new([1, 2, 3, 4]),
+ 'b' => Daru::Vector.new([5, 6, 7, 8])
 }
- zpe={
- 'a'=>[10,11,12,13].to_scale,
- 'b'=>[14,15,16,17].to_scale,
+ zpe = {
+ 'a' => Daru::Vector.new([10, 11, 12, 13]),
+ 'b' => Daru::Vector.new([14, 15, 16, 17])
 }
- xp,yp,zp=Hash.new(),Hash.new(),Hash.new()
- @ms.each {|k,ds|
- xp[k]=ds['x']
- yp[k]=ds['y']
- zp[k]=ds['z']
+ xp, yp, zp = {}, {}, {}
+ @ms.each {|k, ds|
+ xp[k] = ds[:x]
+ yp[k] = ds[:y]
+ zp[k] = ds[:z]
 }
- assert_equal(xpe,xp)
- assert_equal(ype,yp)
- assert_equal(zpe,zp)
-
+ assert_equal(xpe, xp)
+ assert_equal(ype, yp)
+ assert_equal(zpe, zp)
 end
+
 def test_multiset_union_with_block
-
- r1=rand()
- r2=rand()
- ye=[1*r1,2*r1,3*r1,4*r1,5*r2,6*r2,7*r2,8*r2].to_scale
-
- ze=[10*r1,11*r1,12*r1,13*r1, 14*r2,15*r2,16*r2,17*r2].to_scale
-
- ds2=@ms.union {|k,ds|
- ds['y'].recode!{|v|
- k=='a' ? 
v*r1 : v*r2} - ds['z'].recode!{|v| - k=='a' ? v*r1 : v*r2} + r1 = rand + r2 = rand + ye = Daru::Vector.new([1 * r1, 2 * r1, 3 * r1, 4 * r1, 5 * r2, 6 * r2, 7 * r2, 8 * r2]) + + ze = Daru::Vector.new([10 * r1, 11 * r1, 12 * r1, 13 * r1, 14 * r2, 15 * r2, 16 * r2, 17 * r2]) + + ds2 = @ms.union {|k, ds| + ds[:y].recode!{|v| + k == 'a' ? v * r1 : v * r2 + } + ds[:z].recode!{|v| + k == 'a' ? v * r1 : v * r2 + } } - assert_equal(ye,ds2['y']) - assert_equal(ze,ds2['z']) + assert_equal(ye, ds2[:y]) + assert_equal(ze, ds2[:z]) end + def test_multiset_union - r1=rand() - r2=rand() - ye=[1*r1,2*r1,3*r1,4*r1,5*r2,6*r2,7*r2,8*r2].to_scale - - ze=[10*r1,11*r1,12*r1,13*r1, 14*r2,15*r2,16*r2,17*r2].to_scale - @ms.each {|k,ds| - ds['y'].recode!{|v| - k=='a' ? v*r1 : v*r2} - ds['z'].recode!{|v| - k=='a' ? v*r1 : v*r2} - - } - ds2=@ms.union - assert_equal(ye,ds2['y']) - assert_equal(ze,ds2['z']) - + r1 = rand + r2 = rand + ye = Daru::Vector.new([1 * r1, 2 * r1, 3 * r1, 4 * r1, 5 * r2, 6 * r2, 7 * r2, 8 * r2]) + ze = Daru::Vector.new([10 * r1, 11 * r1, 12 * r1, 13 * r1, 14 * r2, 15 * r2, 16 * r2, 17 * r2]) + + @ms.each do |k, ds| + ds[:y].recode! { |v| + k == 'a' ? v * r1 : v * r2 + } + ds[:z].recode! {|v| + k == 'a' ? v * r1 : v * r2 + } + end + ds2 = @ms.union + assert_equal(ye, ds2[:y]) + assert_equal(ze, ds2[:z]) end end diff --git a/test/test_regression.rb b/test/test_regression.rb index 8405703..8c23bc0 100644 --- a/test/test_regression.rb +++ b/test/test_regression.rb @@ -1,215 +1,215 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) -class StatsampleRegressionTestCase < MiniTest::Unit::TestCase - context "Example with missing data" do - setup do - @x=[0.285714285714286, 0.114285714285714, 0.314285714285714, 0.2, 0.2, 0.228571428571429, 0.2, 0.4, 0.714285714285714, 0.285714285714286, 0.285714285714286, 0.228571428571429, 0.485714285714286, 0.457142857142857, 0.257142857142857, 0.228571428571429, 0.285714285714286, 0.285714285714286, 0.285714285714286, 0.142857142857143, 0.285714285714286, 0.514285714285714, 0.485714285714286, 0.228571428571429, 0.285714285714286, 0.342857142857143, 0.285714285714286, 0.0857142857142857].to_scale +class StatsampleRegressionTestCase < Minitest::Test + context 'Example with missing data' do + setup do + @x = Daru::Vector.new([0.285714285714286, 0.114285714285714, 0.314285714285714, 0.2, 0.2, 0.228571428571429, 0.2, 0.4, 0.714285714285714, 0.285714285714286, 0.285714285714286, 0.228571428571429, 0.485714285714286, 0.457142857142857, 0.257142857142857, 0.228571428571429, 0.285714285714286, 0.285714285714286, 0.285714285714286, 0.142857142857143, 0.285714285714286, 0.514285714285714, 0.485714285714286, 0.228571428571429, 0.285714285714286, 0.342857142857143, 0.285714285714286, 0.0857142857142857]) - @y=[nil, 0.233333333333333, nil, 0.266666666666667, 0.366666666666667, nil, 0.333333333333333, 0.3, 0.666666666666667, 0.0333333333333333, 0.333333333333333, nil, nil, 0.533333333333333, 0.433333333333333, 0.4, 0.4, 0.5, 0.4, 0.266666666666667, 0.166666666666667, 0.666666666666667, 0.433333333333333, 0.166666666666667, nil, 0.4, 0.366666666666667, nil].to_scale - @ds={'x'=>@x,'y'=>@y}.to_dataset - @lr=Statsample::Regression::Multiple::RubyEngine.new(@ds,'y') + @y = Daru::Vector.new([nil, 0.233333333333333, nil, 0.266666666666667, 0.366666666666667, nil, 0.333333333333333, 0.3, 0.666666666666667, 0.0333333333333333, 0.333333333333333, nil, nil, 0.533333333333333, 0.433333333333333, 
0.4, 0.4, 0.5, 0.4, 0.266666666666667, 0.166666666666667, 0.666666666666667, 0.433333333333333, 0.166666666666667, nil, 0.4, 0.366666666666667, nil])
+ @ds = Daru::DataFrame.new({ :x => @x, :y => @y })
+ @lr = Statsample::Regression::Multiple::RubyEngine.new(@ds, :y)
+ end
+ should 'have correct values' do
+ assert_in_delta(0.455, @lr.r2, 0.001)
+ assert_in_delta(0.427, @lr.r2_adjusted, 0.001)
+ assert_in_delta(0.1165, @lr.se_estimate, 0.001)
+ assert_in_delta(15.925, @lr.f, 0.0001)
+ assert_in_delta(0.675, @lr.standarized_coeffs[:x], 0.001)
+ assert_in_delta(0.778, @lr.coeffs[:x], 0.001, 'coeff x')
+ assert_in_delta(0.132, @lr.constant, 0.001, 'constant')
+ assert_in_delta(0.195, @lr.coeffs_se[:x], 0.001, 'coeff x se')
+ assert_in_delta(0.064, @lr.constant_se, 0.001, 'constant se')
 end
- should "have correct values" do
- assert_in_delta(0.455,@lr.r2,0.001)
- assert_in_delta(0.427,@lr.r2_adjusted, 0.001)
- assert_in_delta(0.1165,@lr.se_estimate,0.001)
- assert_in_delta(15.925,@lr.f,0.0001)
- assert_in_delta(0.675, @lr.standarized_coeffs['x'],0.001)
- assert_in_delta(0.778, @lr.coeffs['x'],0.001, "coeff x")
- assert_in_delta(0.132, @lr.constant,0.001,"constant")
- assert_in_delta(0.195, @lr.coeffs_se['x'],0.001,"coeff x se")
- assert_in_delta(0.064, @lr.constant_se,0.001,"constant se")
- end
 end
- should "return an error if data is linearly dependent" do
- samples=100
-
- a,b=rand,rand
-
- x1=samples.times.map { rand}.to_scale
- x2=samples.times.map {rand}.to_scale
- x3=samples.times.map {|i| x1[i]*(1+a)+x2[i]*(1+b)}.to_scale
- y=samples.times.map {|i| x1[i]+x2[i]+x3[i]+rand}.to_scale
-
- ds={'x1'=>x1,'x2'=>x2,'x3'=>x3,'y'=>y}.to_dataset
+ should 'return an error if data is linearly dependent' do
+ samples = 100
+
+ a, b = rand, rand
+
+ x1 = Daru::Vector.new(samples.times.map { rand })
+ x2 = Daru::Vector.new(samples.times.map { rand })
+ x3 = Daru::Vector.new(samples.times.map { |i| x1[i] * (1 + a) + x2[i] * (1 + b) })
+ y = Daru::Vector.new(samples.times.map { |i| x1[i] + x2[i] + x3[i] + rand })
+
+ ds = Daru::DataFrame.new({ :x1 => x1, :x2 => x2, :x3 => x3, :y => y })
 assert_raise(Statsample::Regression::LinearDependency) {
- Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
+ Statsample::Regression::Multiple::RubyEngine.new(ds, :y)
 }
 end
 def test_parameters
- @x=[13,20,10,33,15].to_vector(:scale)
- @y=[23,18,35,10,27 ].to_vector(:scale)
- reg=Statsample::Regression::Simple.new_from_vectors(@x,@y)
+ @x = Daru::Vector.new([13, 20, 10, 33, 15])
+ @y = Daru::Vector.new([23, 18, 35, 10, 27])
+ reg = Statsample::Regression::Simple.new_from_vectors(@x, @y)
 _test_simple_regression(reg)
- ds={'x'=>@x,'y'=>@y}.to_dataset
- reg=Statsample::Regression::Simple.new_from_dataset(ds,'x','y')
+ ds = Daru::DataFrame.new({ :x => @x, :y => @y })
+ reg = Statsample::Regression::Simple.new_from_dataset(ds, :x, :y)
 _test_simple_regression(reg)
- reg=Statsample::Regression.simple(@x,@y)
+ reg = Statsample::Regression.simple(@x, @y)
 _test_simple_regression(reg)
- end
+
 def _test_simple_regression(reg)
-
- assert_in_delta(40.009, reg.a,0.001)
- assert_in_delta(-0.957, reg.b,0.001)
- assert_in_delta(4.248,reg.standard_error,0.002)
+ assert_in_delta(40.009, reg.a, 0.001)
+ assert_in_delta(-0.957, reg.b, 0.001)
+ assert_in_delta(4.248, reg.standard_error, 0.002)
 assert(reg.summary)
 end
-
+
 def test_summaries
- a=10.times.map{rand(100)}.to_scale
- b=10.times.map{rand(100)}.to_scale
- y=10.times.map{rand(100)}.to_scale
- ds={'a'=>a,'b'=>b,'y'=>y}.to_dataset
- lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
- assert(lr.summary.size>0)
+ a = Daru::Vector.new(10.times.map { rand(100) })
+ b = Daru::Vector.new(10.times.map { rand(100) })
+ y = Daru::Vector.new(10.times.map { rand(100) })
+ ds = Daru::DataFrame.new({ :a => a, :b => b, :y => y })
+ lr = Statsample::Regression::Multiple::RubyEngine.new(ds, :y)
+ assert(lr.summary.size > 0)
 end
+
 def test_multiple_dependent
- complete=Matrix[
- [1,0.53,0.62,0.19,-0.09,0.08,0.02,-0.12,0.08],
- [0.53,1,0.61,0.23,0.1,0.18,0.02,-0.1,0.15],
- [0.62,0.61,1,0.03,0.1,0.12,0.03,-0.06,0.12],
- [0.19,0.23,0.03,1,-0.02,0.02,0,-0.02,-0.02],
- [-0.09,0.1,0.1,-0.02,1,0.05,0.06,0.18,0.02],
- [0.08,0.18,0.12,0.02,0.05,1,0.22,-0.07,0.36],
- [0.02,0.02,0.03,0,0.06,0.22,1,-0.01,-0.05],
- [-0.12,-0.1,-0.06,-0.02,0.18,-0.07,-0.01,1,-0.03],
- [0.08,0.15,0.12,-0.02,0.02,0.36,-0.05,-0.03,1]]
+ complete = Matrix[
+ [1, 0.53, 0.62, 0.19, -0.09, 0.08, 0.02, -0.12, 0.08],
+ [0.53, 1, 0.61, 0.23, 0.1, 0.18, 0.02, -0.1, 0.15],
+ [0.62, 0.61, 1, 0.03, 0.1, 0.12, 0.03, -0.06, 0.12],
+ [0.19, 0.23, 0.03, 1, -0.02, 0.02, 0, -0.02, -0.02],
+ [-0.09, 0.1, 0.1, -0.02, 1, 0.05, 0.06, 0.18, 0.02],
+ [0.08, 0.18, 0.12, 0.02, 0.05, 1, 0.22, -0.07, 0.36],
+ [0.02, 0.02, 0.03, 0, 0.06, 0.22, 1, -0.01, -0.05],
+ [-0.12, -0.1, -0.06, -0.02, 0.18, -0.07, -0.01, 1, -0.03],
+ [0.08, 0.15, 0.12, -0.02, 0.02, 0.36, -0.05, -0.03, 1]]
 complete.extend Statsample::CovariateMatrix
- complete.fields=%w{adhd cd odd sex age monly mwork mage poverty}
+ complete.fields = %w(adhd cd odd sex age monly mwork mage poverty)
- lr=Statsample::Regression::Multiple::MultipleDependent.new(complete, %w{adhd cd odd})
-
-
- assert_in_delta(0.197, lr.r2yx,0.001)
- assert_in_delta(0.197, lr.r2yx_covariance,0.001)
- assert_in_delta(0.07, lr.p2yx,0.001)
+ lr = Statsample::Regression::Multiple::MultipleDependent.new(complete, %w(adhd cd odd))
+ assert_in_delta(0.197, lr.r2yx, 0.001)
+ assert_in_delta(0.197, lr.r2yx_covariance, 0.001)
+ assert_in_delta(0.07, lr.p2yx, 0.001)
 end
-
+
 def test_multiple_regression_pairwise_2
- @a=[1,3,2,4,3,5,4,6,5,7,3,nil,3,nil,3].to_vector(:scale)
- @b=[3,3,4,4,5,5,6,6,4,4,2,2,nil,6,2].to_vector(:scale)
- @c=[11,22,30,40,50,65,78,79,99,100,nil,3,7,nil,7].to_vector(:scale)
- @y=[3,4,5,6,7,8,9,10,20,30,30,40,nil,50,nil].to_vector(:scale)
- ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
- lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
- assert_in_delta(2407.436,lr.sst,0.001)
- assert_in_delta(0.752,lr.r,0.001, "pairwise r")
- assert_in_delta(0.565,lr.r2,0.001)
- assert_in_delta(1361.130,lr.ssr,0.001)
- assert_in_delta(1046.306,lr.sse,0.001)
- assert_in_delta(3.035,lr.f,0.001)
+ @a = Daru::Vector.new([1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 3, nil, 3, nil, 3])
+ @b = Daru::Vector.new([3, 3, 4, 4, 5, 5, 6, 6, 4, 4, 2, 2, nil, 6, 2])
+ @c = Daru::Vector.new([11, 22, 30, 40, 50, 65, 78, 79, 99, 100, nil, 3, 7, nil, 7])
+ @y = Daru::Vector.new([3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 30, 40, nil, 50, nil])
+ ds = Daru::DataFrame.new({ :a => @a, :b => @b, :c => @c, :y => @y })
+ lr = Statsample::Regression::Multiple::RubyEngine.new(ds, :y)
+ assert_in_delta(2407.436, lr.sst, 0.001)
+ assert_in_delta(0.752, lr.r, 0.001, 'pairwise r')
+ assert_in_delta(0.565, lr.r2, 0.001)
+ assert_in_delta(1361.130, lr.ssr, 0.001)
+ assert_in_delta(1046.306, lr.sse, 0.001)
+ assert_in_delta(3.035, lr.f, 0.001)
 end
-
 def test_multiple_regression_gsl
 if Statsample.has_gsl?
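+ # the GSL-backed engine is checked against the same expected model values as the pure-Ruby engine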
- @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
- @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
- @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
- @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
- ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
- lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y')
- assert(lr.summary.size>0)
- model_test(lr,'gsl')
- predicted=[1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198]
- c_predicted=lr.predicted
+ @a = Daru::Vector.new([1, 3, 2, 4, 3, 5, 4, 6, 5, 7])
+ @b = Daru::Vector.new([3, 3, 4, 4, 5, 5, 6, 6, 4, 4])
+ @c = Daru::Vector.new([11, 22, 30, 40, 50, 65, 78, 79, 99, 100])
+ @y = Daru::Vector.new([3, 4, 5, 6, 7, 8, 9, 10, 20, 30])
+ ds = Daru::DataFrame.new({ :a => @a, :b => @b, :c => @c, :y => @y })
+ lr = Statsample::Regression::Multiple::GslEngine.new(ds, :y)
+ assert(lr.summary.size > 0)
+ model_test(lr, 'gsl')
+ predicted = [1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198]
+ c_predicted = lr.predicted
 predicted.each_index{|i|
- assert_in_delta(predicted[i],c_predicted[i],0.001)
+ assert_in_delta(predicted[i], c_predicted[i], 0.001)
 }
- residuals=[1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801]
- c_residuals=lr.residuals
+ residuals = [1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801]
+ c_residuals = lr.residuals
 residuals.each_index{|i|
- assert_in_delta(residuals[i],c_residuals[i],0.001)
+ assert_in_delta(residuals[i], c_residuals[i], 0.001)
 }
 else
- skip "Regression::Multiple::GslEngine not tested (no Gsl)"
+ skip 'Regression::Multiple::GslEngine not tested (no Gsl)'
 end
 end
-
-
- def model_test_matrix(lr,name='undefined')
-
- stan_coeffs={'a'=>0.151,'b'=>-0.547,'c'=>0.997}
- unstan_coeffs={'a'=>0.695, 'b'=>-4.286, 'c'=>0.266}
+ def model_test_matrix(lr, name = 'undefined')
+ stan_coeffs = { :a => 0.151, :b => -0.547, :c => 0.997 }
+ unstan_coeffs = { :a => 0.695, :b => -4.286, :c => 0.266 }
 unstan_coeffs.each_key{|k|
- assert_in_delta(unstan_coeffs[k], lr.coeffs[k],0.001,"b coeffs - #{name}")
+ assert_in_delta(unstan_coeffs[k], lr.coeffs[k], 0.001, "b coeffs - #{name}")
 }
 stan_coeffs.each_key{|k|
- assert_in_delta(stan_coeffs[k], lr.standarized_coeffs[k],0.001, "beta coeffs - #{name}")
+ assert_in_delta(stan_coeffs[k], lr.standarized_coeffs[k], 0.001, "beta coeffs - #{name}")
 }
- assert_in_delta(11.027,lr.constant,0.001)
+ assert_in_delta(11.027, lr.constant, 0.001)
- assert_in_delta(0.955,lr.r,0.001)
- assert_in_delta(0.913,lr.r2,0.001)
+ assert_in_delta(0.955, lr.r, 0.001)
+ assert_in_delta(0.913, lr.r2, 0.001)
- assert_in_delta(20.908, lr.f,0.001)
+ assert_in_delta(20.908, lr.f, 0.001)
 assert_in_delta(0.001, lr.probability, 0.001)
- assert_in_delta(0.226,lr.tolerance("a"),0.001)
-
- coeffs_se={"a"=>1.171,"b"=>1.129,"c"=>0.072}
+ assert_in_delta(0.226, lr.tolerance(:a), 0.001)
+ coeffs_se = { :a => 1.171, :b => 1.129, :c => 0.072 }
-
- ccoeffs_se=lr.coeffs_se
+ ccoeffs_se = lr.coeffs_se
 coeffs_se.each_key{|k|
- assert_in_delta(coeffs_se[k],ccoeffs_se[k],0.001)
+ assert_in_delta(coeffs_se[k], ccoeffs_se[k], 0.001)
 }
- coeffs_t={"a"=>0.594,"b"=>-3.796,"c"=>3.703}
- ccoeffs_t=lr.coeffs_t
+ coeffs_t = { :a => 0.594, :b => -3.796, :c => 3.703 }
+ ccoeffs_t = lr.coeffs_t
 coeffs_t.each_key{|k|
- assert_in_delta(coeffs_t[k], ccoeffs_t[k],0.001)
+ assert_in_delta(coeffs_t[k], ccoeffs_t[k], 0.001)
 }
- assert_in_delta(639.6,lr.sst,0.001)
- assert_in_delta(583.76,lr.ssr,0.001)
- 
assert_in_delta(55.840,lr.sse,0.001) - assert(lr.summary.size>0, "#{name} without summary") + assert_in_delta(639.6, lr.sst, 0.001) + assert_in_delta(583.76, lr.ssr, 0.001) + assert_in_delta(55.840, lr.sse, 0.001) + assert(lr.summary.size > 0, "#{name} without summary") end - def model_test(lr,name='undefined') - model_test_matrix(lr,name) - assert_in_delta(4.559, lr.constant_se,0.001) - assert_in_delta(2.419, lr.constant_t,0.001) - assert_in_delta(1.785,lr.process([1,3,11]),0.001) + def model_test(lr, name = 'undefined') + model_test_matrix(lr, name) + assert_in_delta(4.559, lr.constant_se, 0.001) + assert_in_delta(2.419, lr.constant_t, 0.001) + + assert_in_delta(1.785, lr.process([1, 3, 11]), 0.001) end + def test_regression_matrix - @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale) - @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale) - @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale) - @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale) - ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset - cor=Statsample::Bivariate.correlation_matrix(ds) - - lr=Statsample::Regression::Multiple::MatrixEngine.new(cor,'y', :y_mean=>@y.mean, :x_mean=>{'a'=>ds['a'].mean, 'b'=>ds['b'].mean, 'c'=>ds['c'].mean}, :cases=>@a.size, :y_sd=>@y.sd , :x_sd=>{'a' => @a.sd, 'b' => @b.sd, 'c' => @c.sd}) + @a = Daru::Vector.new([1, 3, 2, 4, 3, 5, 4, 6, 5, 7]) + @b = Daru::Vector.new([3, 3, 4, 4, 5, 5, 6, 6, 4, 4]) + @c = Daru::Vector.new([11, 22, 30, 40, 50, 65, 78, 79, 99, 100]) + @y = Daru::Vector.new([3, 4, 5, 6, 7, 8, 9, 10, 20, 30]) + ds = Daru::DataFrame.new({ :a => @a, :b => @b, :c => @c, :y => @y }) + cor = Statsample::Bivariate.correlation_matrix(ds) + + lr = Statsample::Regression::Multiple::MatrixEngine.new( + cor, :y, y_mean: @y.mean, + x_mean: { :a => ds[:a].mean, :b => ds[:b].mean, :c => ds[:c].mean }, + cases: @a.size, y_sd: @y.sd, x_sd: { :a => @a.sd, :b => @b.sd, :c => @c.sd }) assert_nil(lr.constant_se) assert_nil(lr.constant_t) - model_test_matrix(lr, "correlation matrix") + model_test_matrix(lr, 'correlation matrix') - covariance=Statsample::Bivariate.covariance_matrix(ds) - lr=Statsample::Regression::Multiple::MatrixEngine.new(covariance,'y', :y_mean=>@y.mean, :x_mean=>{'a'=>ds['a'].mean, 'b'=>ds['b'].mean, 'c'=>ds['c'].mean}, :cases=>@a.size) - assert(lr.summary.size>0) + covariance = Statsample::Bivariate.covariance_matrix(ds) + lr = Statsample::Regression::Multiple::MatrixEngine.new( + covariance, :y, y_mean: @y.mean, + x_mean: { :a => ds[:a].mean, :b => ds[:b].mean, :c => ds[:c].mean }, cases: @a.size) + assert(lr.summary.size > 0) - model_test(lr , "covariance matrix") + model_test(lr, 'covariance matrix') end + def test_regression_rubyengine - @a=[nil,1,3,2,4,3,5,4,6,5,7].to_vector(:scale) - @b=[nil,3,3,4,4,5,5,6,6,4,4].to_vector(:scale) - @c=[nil,11,22,30,40,50,65,78,79,99,100].to_vector(:scale) - @y=[nil,3,4,5,6,7,8,9,10,20,30].to_vector(:scale) - ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset - lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y') + @a = Daru::Vector.new([nil, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7]) + @b = Daru::Vector.new([nil, 3, 3, 4, 4, 5, 5, 6, 6, 4, 4]) + @c = Daru::Vector.new([nil, 11, 22, 30, 40, 50, 65, 78, 79, 99, 100]) + @y = Daru::Vector.new([nil, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30]) + ds = Daru::DataFrame.new({ :a => @a, :b => @b, :c => @c, :y => @y }) + lr = Statsample::Regression::Multiple::RubyEngine.new(ds, :y) assert_equal(11, lr.total_cases) assert_equal(10, lr.valid_cases) model_test(lr, 'rubyengine with missing data') - predicted=[nil,1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 
10.3428, 8.8158, 10.4717, 23.6639, 25.3198] + predicted = [nil, 1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198] c_predicted = lr.predicted predicted.each_index do |i| if c_predicted[i].nil? @@ -218,15 +218,14 @@ def test_regression_rubyengine assert_in_delta(predicted[i], c_predicted[i], 0.001) end end - residuals=[nil,1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801] - c_residuals=lr.residuals + residuals = [nil, 1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801] + c_residuals = lr.residuals residuals.each_index do |i| if c_residuals[i].nil? assert(residuals[i].nil?) else - assert_in_delta(residuals[i],c_residuals[i],0.001) + assert_in_delta(residuals[i], c_residuals[i], 0.001) end end - end end diff --git a/test/test_reliability.rb b/test/test_reliability.rb index c7730e6..d0e284d 100644 --- a/test/test_reliability.rb +++ b/test/test_reliability.rb @@ -1,229 +1,223 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) -class StatsampleReliabilityTestCase < MiniTest::Unit::TestCase +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) +class StatsampleReliabilityTestCase < Minitest::Test context Statsample::Reliability do - should "return correct r according to Spearman-Brown prophecy" do - r=0.6849 - n=62.quo(15) - assert_in_delta(0.9, Statsample::Reliability.sbp(r,n), 0.001) + should 'return correct r according to Spearman-Brown prophecy' do + r = 0.6849 + n = 62.quo(15) + assert_in_delta(0.9, Statsample::Reliability.sbp(r, n), 0.001) end - should "return correct n for desired realiability" do - r=0.6849 - r_d=0.9 - assert_in_delta(62, Statsample::Reliability.n_for_desired_reliability(r, r_d, 15),0.5) - end - context "Cronbach's alpha" do + should 'return correct n for desired realiability' do + r = 0.6849 + r_d = 0.9 + assert_in_delta(62, Statsample::Reliability.n_for_desired_reliability(r, r_d, 15), 0.5) + end + context "Cronbach's alpha" do setup do - @samples=40 - @n_variables=rand(10)+2 - @ds=Statsample::Dataset.new() - base=@samples.times.collect {|a| rand()}.to_scale + @samples = 40 + @n_variables = rand(10) + 2 + @ds = Daru::DataFrame.new({}, index: @samples) + base = Daru::Vector.new(@samples.times.collect { |_a| rand }) @n_variables.times do |i| - @ds[i]=base.collect {|v| v+rand()}.to_scale + @ds[i] = Daru::Vector.new(base.collect { |v| v + rand }) end - - @ds.update_valid_data - @k=@ds.fields.size - @cm=Statsample::Bivariate.covariance_matrix(@ds) - @dse=@ds.dup - @dse.fields.each do |f| - @dse[f]=@dse[f].standarized + + @k = @ds.ncols + @cm = Statsample::Bivariate.covariance_matrix(@ds) + @dse = @ds.dup + @dse.vectors.each do |f| + @dse[f] = @dse[f].standardize end - @cme=Statsample::Bivariate.covariance_matrix(@dse) - @a=Statsample::Reliability.cronbach_alpha(@ds) - @as=Statsample::Reliability.cronbach_alpha_standarized(@ds) - end - should "alpha will be equal to sum of matrix covariance less the individual variances" do - total_sum=@cm.total_sum - ind_var=@ds.fields.inject(0) {|ac,v| ac+@ds[v].variance} - expected = @k.quo(@k-1) * (1-(ind_var.quo(total_sum))) - assert_in_delta(expected, @a,1e-10) - end - should "method cronbach_alpha_from_n_s2_cov return correct values" do - sa=Statsample::Reliability::ScaleAnalysis.new(@ds) + @cme = Statsample::Bivariate.covariance_matrix(@dse) + @a = Statsample::Reliability.cronbach_alpha(@ds) + @as = Statsample::Reliability.cronbach_alpha_standarized(@ds) + end + should 'alpha will be 
equal to sum of matrix covariance less the individual variances' do + total_sum = @cm.total_sum + ind_var = @ds.vectors.to_a.inject(0) { |ac, v| ac + @ds[v].variance } + expected = @k.quo(@k - 1) * (1 - (ind_var.quo(total_sum))) + assert_in_delta(expected, @a, 1e-10) + end + should 'method cronbach_alpha_from_n_s2_cov return correct values' do + sa = Statsample::Reliability::ScaleAnalysis.new(@ds) vm, cm = sa.variances_mean, sa.covariances_mean - assert_in_delta(sa.alpha, Statsample::Reliability.cronbach_alpha_from_n_s2_cov(@n_variables, vm,cm), 1e-10) + assert_in_delta(sa.alpha, Statsample::Reliability.cronbach_alpha_from_n_s2_cov(@n_variables, vm, cm), 1e-10) end - should "method cronbach_alpha_from_covariance_matrix returns correct value" do - cov=Statsample::Bivariate.covariance_matrix(@ds) - assert_in_delta(@a, Statsample::Reliability.cronbach_alpha_from_covariance_matrix(cov),0.0000001) + should 'method cronbach_alpha_from_covariance_matrix returns correct value' do + cov = Statsample::Bivariate.covariance_matrix(@ds) + assert_in_delta(@a, Statsample::Reliability.cronbach_alpha_from_covariance_matrix(cov), 0.0000001) end - should "return correct n for desired alpha, covariance and variance" do - sa=Statsample::Reliability::ScaleAnalysis.new(@ds) + should 'return correct n for desired alpha, covariance and variance' do + sa = Statsample::Reliability::ScaleAnalysis.new(@ds) vm, cm = sa.variances_mean, sa.covariances_mean - n_obtained=Statsample::Reliability.n_for_desired_alpha(@a, vm,cm) - #p n_obtained - assert_in_delta(Statsample::Reliability.cronbach_alpha_from_n_s2_cov(n_obtained, vm,cm) ,@a,0.001) - end - - should "standarized alpha will be equal to sum of matrix covariance less the individual variances on standarized values" do - total_sum=@cme.total_sum - ind_var=@dse.fields.inject(0) {|ac,v| ac+@dse[v].variance} - expected = @k.quo(@k-1) * (1-(ind_var.quo(total_sum))) + n_obtained = Statsample::Reliability.n_for_desired_alpha(@a, vm, cm) + # p n_obtained + assert_in_delta(Statsample::Reliability.cronbach_alpha_from_n_s2_cov(n_obtained, vm, cm), @a, 0.001) + end + + should 'standarized alpha will be equal to sum of matrix covariance less the individual variances on standarized values' do + total_sum = @cme.total_sum + ind_var = @dse.vectors.to_a.inject(0) { |ac, v| ac + @dse[v].variance } + expected = @k.quo(@k - 1) * (1 - (ind_var.quo(total_sum))) assert_in_delta(expected, @as, 1e-10) end end context Statsample::Reliability::ItemCharacteristicCurve do setup do - @samples=100 - @points=rand(10)+3 - @max_point=(@points-1)*3 - @x1=@samples.times.map{rand(@points)}.to_scale - @x2=@samples.times.map{rand(@points)}.to_scale - @x3=@samples.times.map{rand(@points)}.to_scale - @ds={'a'=>@x1,'b'=>@x2,'c'=>@x3}.to_dataset - @icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds) - end - should "have a correct automatic vector_total" do + @samples = 100 + @points = rand(10) + 3 + @max_point = (@points - 1) * 3 + @x1 = Daru::Vector.new(@samples.times.map { rand(@points) }) + @x2 = Daru::Vector.new(@samples.times.map { rand(@points) }) + @x3 = Daru::Vector.new(@samples.times.map { rand(@points) }) + @ds = Daru::DataFrame.new({ :a => @x1, :b => @x2, :c => @x3 }) + @icc = Statsample::Reliability::ItemCharacteristicCurve.new(@ds) + end + should 'have a correct automatic vector_total' do assert_equal(@ds.vector_sum, @icc.vector_total) end - should "have a correct different vector_total" do - x2=@samples.times.map{rand(10)}.to_scale - 
@icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds,x2) + should 'have a correct different vector_total' do + x2 = Daru::Vector.new(@samples.times.map { rand(10) }) + @icc = Statsample::Reliability::ItemCharacteristicCurve.new(@ds, x2) assert_equal(x2, @icc.vector_total) assert_raises(ArgumentError) do - inc=(@samples+10).times.map{rand(10)}.to_scale - @icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds,inc) + inc = Daru::Vector.new((@samples + 10).times.map { rand(10) }) + @icc = Statsample::Reliability::ItemCharacteristicCurve.new(@ds, inc) end end - should "have 0% for 0 points on maximum value values" do - max=@icc.curve_field('a',0)[@max_point.to_f] - max||=0 + should 'have 0% for 0 points on maximum value values' do + max = @icc.curve_field(:a, 0)[@max_point.to_f] + max ||= 0 assert_in_delta(0, max) end - should "have 0 for max value on minimum value" do - max=@icc.curve_field('a',@max_point)[0.0] - max||=0 + should 'have 0 for max value on minimum value' do + max = @icc.curve_field(:a, @max_point)[0.0] + max ||= 0 assert_in_delta(0, max) end - should "have correct values of % for any value" do - sum=@icc.vector_total - total={} - total_g=sum.frequencies - index=rand(@points) - @x1.each_with_index do |v,i| - total[sum[i]]||=0 - total[sum[i]]+=1 if v==index + should 'have correct values of % for any value' do + sum = @icc.vector_total + total = {} + total_g = sum.frequencies + index = rand(@points) + @x1.each_with_index do |v, i| + total[sum[i]] ||= 0 + total[sum[i]] += 1 if v == index end - expected=total.each {|k,v| - total[k]=v.quo(total_g[k]) + expected = total.each {|k, v| + total[k] = v.quo(total_g[k]) } - assert_equal(expected, @icc.curve_field('a',index)) - + assert_equal(expected, @icc.curve_field(:a, index)) end - end - + context Statsample::Reliability::MultiScaleAnalysis do - setup do - size=100 - @scales=3 - @items_per_scale=10 - h={} + size = 100 + @scales = 3 + @items_per_scale = 10 + h = {} @scales.times {|s| @items_per_scale.times {|i| - h["#{s}_#{i}"] = (size.times.map {(s*2)+rand}).to_scale + h["#{s}_#{i}".to_sym] = Daru::Vector.new((size.times.map { (s * 2) + rand })) } } - @ds=h.to_dataset - @msa=Statsample::Reliability::MultiScaleAnalysis.new(:name=>'Multiple Analysis') do |m| - m.scale "complete", @ds + @ds = Daru::DataFrame.new(h) + @msa = Statsample::Reliability::MultiScaleAnalysis.new(name: 'Multiple Analysis') do |m| + m.scale 'complete', @ds @scales.times {|s| - m.scale "scale_#{s}", @ds.clone(@items_per_scale.times.map {|i| "#{s}_#{i}"}), {:name=>"Scale #{s}"} + m.scale "scale_#{s}", @ds.clone(*@items_per_scale.times.map { |i| "#{s}_#{i}".to_sym }), name: "Scale #{s}" } end end - should "Retrieve correct ScaleAnalysis for whole scale" do - sa=Statsample::Reliability::ScaleAnalysis.new(@ds, :name=>"Scale complete") - assert_equal(sa.variances_mean, @msa.scale("complete").variances_mean) + + should 'Retrieve correct ScaleAnalysis for whole scale' do + sa = Statsample::Reliability::ScaleAnalysis.new(@ds, name: 'Scale complete') + assert_equal(sa.variances_mean, @msa.scale('complete').variances_mean) end - should "Retrieve correct ScaleAnalysis for each scale" do + should 'Retrieve correct ScaleAnalysis for each scale' do @scales.times {|s| - sa=Statsample::Reliability::ScaleAnalysis.new(@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}), :name=>"Scale #{s}") - assert_equal(sa.variances_mean,@msa.scale("scale_#{s}").variances_mean) + sa = Statsample::Reliability::ScaleAnalysis.new(@ds.dup(@items_per_scale.times.map { |i| 
"#{s}_#{i}".to_sym }), name: "Scale #{s}") + assert_equal(sa.variances_mean, @msa.scale("scale_#{s}").variances_mean) } end - should "retrieve correct correlation matrix for each scale" do - vectors={'complete' => @ds.vector_sum} + should 'retrieve correct correlation matrix for each scale' do + vectors = { :complete => @ds.vector_sum } @scales.times {|s| - vectors["scale_#{s}"]=@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}).vector_sum + vectors["scale_#{s}".to_sym] = @ds.dup(@items_per_scale.times.map { |i| "#{s}_#{i}".to_sym }).vector_sum } - ds2=vectors.to_dataset + ds2 = Daru::DataFrame.new(vectors) assert_equal(Statsample::Bivariate.correlation_matrix(ds2), @msa.correlation_matrix) end - should "delete scale using delete_scale" do - @msa.delete_scale("complete") - assert_equal(@msa.scales.keys.sort, @scales.times.map {|s| "scale_#{s}"}) + should 'delete scale using delete_scale' do + @msa.delete_scale('complete') + assert_equal(@msa.scales.keys.sort, @scales.times.map { |s| "scale_#{s}" }) end - should "retrieve pca for scales" do - @msa.delete_scale("complete") - vectors=Hash.new + should 'retrieve pca for scales' do + @msa.delete_scale('complete') + vectors = {} @scales.times {|s| - vectors["scale_#{s}"]=@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}).vector_sum + vectors["scale_#{s}".to_sym] = @ds.dup(@items_per_scale.times.map { |i| "#{s}_#{i}".to_sym }).vector_sum } - ds2=vectors.to_dataset - cor_matrix=Statsample::Bivariate.correlation_matrix(ds2) - m=3 - pca=Statsample::Factor::PCA.new(cor_matrix, :m=>m) - assert_equal(pca.component_matrix, @msa.pca(:m=>m).component_matrix) - end - should "retrieve acceptable summary" do - @msa.delete_scale("scale_0") - @msa.delete_scale("scale_1") - @msa.delete_scale("scale_2") - - - #@msa.summary_correlation_matrix=true - #@msa.summary_pca=true - - - assert(@msa.summary.size>0) + ds2 = Daru::DataFrame.new(vectors) + cor_matrix = Statsample::Bivariate.correlation_matrix(ds2) + m = 3 + pca = Statsample::Factor::PCA.new(cor_matrix, m: m) + assert_equal(pca.component_matrix, @msa.pca(m: m).component_matrix) + end + should 'retrieve acceptable summary' do + @msa.delete_scale('scale_0') + @msa.delete_scale('scale_1') + @msa.delete_scale('scale_2') + + # @msa.summary_correlation_matrix=true + # @msa.summary_pca=true + + assert(@msa.summary.size > 0) end end context Statsample::Reliability::ScaleAnalysis do - setup do - @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_scale - @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_scale - @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_scale - @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_scale - @ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset - @ia=Statsample::Reliability::ScaleAnalysis.new(@ds) - @cov_matrix=@ia.cov_m - end - should "return correct values for item analysis" do - assert_in_delta(0.980,@ia.alpha,0.001) - assert_in_delta(0.999,@ia.alpha_standarized,0.001) - var_mean=4.times.map{|m| @cov_matrix[m,m]}.to_scale.mean + setup do + @x1 = Daru::Vector.new([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 30]) + @x2 = Daru::Vector.new([1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 50]) + @x3 = Daru::Vector.new([2, 2, 1, 1, 1, 2, 2, 2, 3, 4, 5, 40]) + @x4 = Daru::Vector.new([1, 2, 3, 4, 4, 4, 4, 3, 4, 4, 5, 30]) + @ds = Daru::DataFrame.new({ :x1 => @x1, :x2 => @x2, :x3 => @x3, :x4 => @x4 }) + @ia = Statsample::Reliability::ScaleAnalysis.new(@ds) + @cov_matrix = @ia.cov_m + end + should 'return correct values for item analysis' do + assert_in_delta(0.980, @ia.alpha, 0.001) + assert_in_delta(0.999, @ia.alpha_standarized, 0.001) + var_mean = 
Daru::Vector.new(4.times.map { |m| @cov_matrix[m, m] }).mean assert_in_delta(var_mean, @ia.variances_mean) - assert_equal(@x1.mean, @ia.item_statistics['x1'][:mean]) - assert_equal(@x4.mean, @ia.item_statistics['x4'][:mean]) - assert_in_delta(@x1.sds, @ia.item_statistics['x1'][:sds],1e-14) - assert_in_delta(@x4.sds, @ia.item_statistics['x4'][:sds],1e-14) - ds2=@ds.clone - ds2.delete_vector('x1') - vector_sum=ds2.vector_sum - assert_equal(vector_sum.mean, @ia.stats_if_deleted['x1'][:mean]) - assert_equal(vector_sum.sds, @ia.stats_if_deleted['x1'][:sds]) - assert_in_delta(vector_sum.variance, @ia.stats_if_deleted['x1'][:variance_sample],1e-10) + assert_equal(@x1.mean, @ia.item_statistics[:x1][:mean]) + assert_equal(@x4.mean, @ia.item_statistics[:x4][:mean]) + assert_in_delta(@x1.sds, @ia.item_statistics[:x1][:sds], 1e-14) + assert_in_delta(@x4.sds, @ia.item_statistics[:x4][:sds], 1e-14) + ds2 = @ds.clone + ds2.delete_vector(:x1) + vector_sum = ds2.vector_sum + assert_equal(vector_sum.mean, @ia.stats_if_deleted[:x1][:mean]) + assert_equal(vector_sum.sds, @ia.stats_if_deleted[:x1][:sds]) + assert_in_delta(vector_sum.variance, @ia.stats_if_deleted[:x1][:variance_sample], 1e-10) + + assert_equal(Statsample::Reliability.cronbach_alpha(ds2), @ia.stats_if_deleted[:x1][:alpha]) - assert_equal(Statsample::Reliability.cronbach_alpha(ds2), @ia.stats_if_deleted['x1'][:alpha]) - - covariances=[] + covariances = [] 4.times.each {|i| 4.times.each {|j| - if i!=j - covariances.push(@cov_matrix[i,j]) + if i != j + covariances.push(@cov_matrix[i, j]) end } } - assert_in_delta(covariances.to_scale.mean, @ia.covariances_mean) - assert_in_delta(0.999,@ia.item_total_correlation()['x1'],0.001) - assert_in_delta(1050.455,@ia.stats_if_deleted()['x1'][:variance_sample],0.001) + assert_in_delta(Daru::Vector.new(covariances).mean, @ia.covariances_mean) + assert_in_delta(0.999, @ia.item_total_correlation[:x1], 0.001) + assert_in_delta(1050.455, @ia.stats_if_deleted[:x1][:variance_sample], 0.001) end - should "return a summary" do - assert(@ia.summary.size>0) + should 'return a summary' do + assert(@ia.summary.size > 0) end - end end end diff --git a/test/test_reliability_icc.rb b/test/test_reliability_icc.rb index d413cc9..25f5e2a 100644 --- a/test/test_reliability_icc.rb +++ b/test/test_reliability_icc.rb @@ -1,140 +1,138 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) -$reliability_icc=nil +$reliability_icc = nil -class StatsampleReliabilityIccTestCase < MiniTest::Test +class StatsampleReliabilityIccTestCase < Minitest::Test context Statsample::Reliability::ICC do setup do - a=[9,6,8,7,10,6].to_scale - b=[2,1,4,1,5,2].to_scale - c=[5,3,6,2,6,4].to_scale - d=[8,2,8,6,9,7].to_scale - @ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset - @icc=Statsample::Reliability::ICC.new(@ds) + a = Daru::Vector.new([9, 6, 8, 7, 10, 6]) + b = Daru::Vector.new([2, 1, 4, 1, 5, 2]) + c = Daru::Vector.new([5, 3, 6, 2, 6, 4]) + d = Daru::Vector.new([8, 2, 8, 6, 9, 7]) + @ds = Daru::DataFrame.new({ :a => a, :b => b, :c => c, :d => d }) + @icc = Statsample::Reliability::ICC.new(@ds) end - should "basic method be correct" do - assert_equal(6,@icc.n) - assert_equal(4,@icc.k) + should 'basic method be correct' do + assert_equal(6, @icc.n) + assert_equal(4, @icc.k) end - should "total mean be correct" do + should 'total mean be correct' do assert_in_delta(5.291, @icc.total_mean, 0.001) end - should "df methods be correct" do + should 'df methods be 
correct' do assert_equal(5, @icc.df_bt) assert_equal(18, @icc.df_wt) assert_equal(3, @icc.df_bj) assert_equal(15, @icc.df_residual) end - should "ms between targets be correct" do + should 'ms between targets be correct' do assert_in_delta(11.24, @icc.ms_bt, 0.01) end - should "ms within targets be correct" do + should 'ms within targets be correct' do assert_in_delta(6.26, @icc.ms_wt, 0.01) end - should "ms between judges be correct" do + should 'ms between judges be correct' do assert_in_delta(32.49, @icc.ms_bj, 0.01) end - should "ms residual be correct" do + should 'ms residual be correct' do assert_in_delta(1.02, @icc.ms_residual, 0.01) end - context "with McGraw and Wong denominations," do - + context 'with McGraw and Wong denominations,' do end - context "with Shrout & Fleiss denominations, " do - should "icc(1,1) method be correct" do + context 'with Shrout & Fleiss denominations, ' do + should 'icc(1,1) method be correct' do assert_in_delta(0.17, @icc.icc_1_1, 0.01) end # Verified on SPSS and R - should "icc(2,1) method be correct" do + should 'icc(2,1) method be correct' do assert_in_delta(0.29, @icc.icc_2_1, 0.01) end - should "icc(3,1) method be correct" do + should 'icc(3,1) method be correct' do assert_in_delta(0.71, @icc.icc_3_1, 0.01) end - should "icc(1,k) method be correct" do + should 'icc(1,k) method be correct' do assert_in_delta(0.44, @icc.icc_1_k, 0.01) end # Verified on SPSS and R - should "icc(2,k) method be correct" do + should 'icc(2,k) method be correct' do assert_in_delta(0.62, @icc.icc_2_k, 0.01) - end - should "icc(3,k) method be correct" do + end + should 'icc(3,k) method be correct' do assert_in_delta(0.91, @icc.icc_3_k, 0.01) end - - should "icc(1,1) F be correct" do + + should 'icc(1,1) F be correct' do assert_in_delta(1.795, @icc.icc_1_f.f) end - should "icc(1,1) confidence interval should be correct" do + should 'icc(1,1) confidence interval should be correct' do assert_in_delta(-0.133, @icc.icc_1_1_ci[0], 0.001) assert_in_delta(0.723, @icc.icc_1_1_ci[1], 0.001) end - should "icc(1,k) confidence interval should be correct" do + should 'icc(1,k) confidence interval should be correct' do assert_in_delta(-0.884, @icc.icc_1_k_ci[0], 0.001) assert_in_delta(0.912, @icc.icc_1_k_ci[1], 0.001) end - - should "icc(2,1) F be correct" do + + should 'icc(2,1) F be correct' do assert_in_delta(11.027, @icc.icc_2_f.f) end - should "icc(2,1) confidence interval should be correct" do - #skip("Not yet operational") + should 'icc(2,1) confidence interval should be correct' do + # skip("Not yet operational") assert_in_delta(0.019, @icc.icc_2_1_ci[0], 0.001) assert_in_delta(0.761, @icc.icc_2_1_ci[1], 0.001) end - - # Verified on SPSS and R - should "icc(2,k) confidence interval should be correct" do - #skip("Not yet operational") - #p @icc.icc_2_k_ci + + # Verified on SPSS and R + should 'icc(2,k) confidence interval should be correct' do + # skip("Not yet operational") + # p @icc.icc_2_k_ci assert_in_delta(0.039, @icc.icc_2_k_ci[0], 0.001) assert_in_delta(0.929, @icc.icc_2_k_ci[1], 0.001) - end - #should "Shrout icc(2,k) and McGraw icc(a,k) ci be equal" do + # should "Shrout icc(2,k) and McGraw icc(a,k) ci be equal" do # assert_in_delta(@icc.icc_2_k_ci_shrout[0], @icc.icc_2_k_ci_mcgraw[0], 10e-5) - #end - - should "icc(3,1) F be correct" do + # end + + should 'icc(3,1) F be correct' do assert_in_delta(11.027, @icc.icc_3_f.f) end - - should "icc(3,1) confidence interval should be correct" do + + should 'icc(3,1) confidence interval should be correct' do 
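+ # lower and upper bounds of the ICC(3,1) confidence interval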
assert_in_delta(0.342, @icc.icc_3_1_ci[0], 0.001) assert_in_delta(0.946, @icc.icc_3_1_ci[1], 0.001) end - should "icc(3,k) confidence interval should be correct" do + should 'icc(3,k) confidence interval should be correct' do assert_in_delta(0.676, @icc.icc_3_k_ci[0], 0.001) assert_in_delta(0.986, @icc.icc_3_k_ci[1], 0.001) end - should "incorrect type raises an error" do - assert_raise(::RuntimeError) do - @icc.type=:nonexistant_type + should 'incorrect type raises an error' do + assert_raise(::RuntimeError) do + @icc.type = :nonexistant_type end end end - + begin require 'rserve' - require 'statsample/rserve_extension' - context "McGraw and Wong" do + require 'daru/extensions/rserve' + context 'McGraw and Wong' do teardown do - @r=$reliability_icc[:r].close unless $reliability_icc[:r].nil? + @r = $reliability_icc[:r].close unless $reliability_icc[:r].nil? end setup do - if($reliability_icc.nil?) - size=100 - a=size.times.map {rand(10)}.to_scale - b=a.recode{|i|i+rand(4)-2} - c=a.recode{|i|i+rand(4)-2} - d=a.recode{|i|i+rand(4)-2} - @ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset - - @icc=Statsample::Reliability::ICC.new(@ds) - @r=Rserve::Connection.new - - @r.assign('ds',@ds) - + if $reliability_icc.nil? + size = 100 + a = Daru::Vector.new(size.times.map { rand(10) }) + b = a.recode { |i| i + rand(4) - 2 } + c = a.recode { |i| i + rand(4) - 2 } + d = a.recode { |i| i + rand(4) - 2 } + @ds = Daru::DataFrame.new({ :a => a, :b => b, :c => c, :d => d }) + + @icc = Statsample::Reliability::ICC.new(@ds) + @r = Rserve::Connection.new + + @r.assign('ds', @ds) + @r.void_eval("library(irr); iccs=list( icc_1=icc(ds,'o','c','s'), @@ -144,59 +142,57 @@ class StatsampleReliabilityIccTestCase < MiniTest::Test icc_a_1=icc(ds,'t','a','s'), icc_a_k=icc(ds,'t','a','a')) ") - @iccs=@r.eval('iccs').to_ruby - $reliability_icc={ :icc=>@icc, :iccs=>@iccs, :r=>@r + @iccs = @r.eval('iccs').to_ruby + $reliability_icc = { icc: @icc, iccs: @iccs, r: @r } - - end - @icc=$reliability_icc[:icc] - @iccs=$reliability_icc[:iccs] - @r=$reliability_icc[:r] + end + @icc = $reliability_icc[:icc] + @iccs = $reliability_icc[:iccs] + @r = $reliability_icc[:r] end [:icc_1, :icc_k, :icc_c_1, :icc_c_k, :icc_a_1, :icc_a_k].each do |t| context "ICC Type #{t} " do - should "value be correct" do - @icc.type=t - @r_icc=@iccs[t.to_s] - assert_in_delta(@r_icc['value'],@icc.r) + should 'value be correct' do + @icc.type = t + @r_icc = @iccs[t.to_s] + assert_in_delta(@r_icc['value'], @icc.r) end - should "fvalue be correct" do - @icc.type=t - @r_icc=@iccs[t.to_s] - assert_in_delta(@r_icc['Fvalue'],@icc.f.f) + should 'fvalue be correct' do + @icc.type = t + @r_icc = @iccs[t.to_s] + assert_in_delta(@r_icc['Fvalue'], @icc.f.f) end - should "num df be correct" do - @icc.type=t - @r_icc=@iccs[t.to_s] - assert_in_delta(@r_icc['df1'],@icc.f.df_num) + should 'num df be correct' do + @icc.type = t + @r_icc = @iccs[t.to_s] + assert_in_delta(@r_icc['df1'], @icc.f.df_num) end - should "den df be correct" do - @icc.type=t - @r_icc=@iccs[t.to_s] - assert_in_delta(@r_icc['df2'],@icc.f.df_den) + should 'den df be correct' do + @icc.type = t + @r_icc = @iccs[t.to_s] + assert_in_delta(@r_icc['df2'], @icc.f.df_den) end - should "f probability be correct" do - @icc.type=t - @r_icc=@iccs[t.to_s] - assert_in_delta(@r_icc['p.value'],@icc.f.probability) + should 'f probability be correct' do + @icc.type = t + @r_icc = @iccs[t.to_s] + assert_in_delta(@r_icc['p.value'], @icc.f.probability) end - should "bounds be equal" do - @icc.type=t - @r_icc=@iccs[t.to_s] - 
assert_in_delta(@r_icc['lbound'],@icc.lbound) - assert_in_delta(@r_icc['ubound'],@icc.ubound) + should 'bounds be equal' do + @icc.type = t + @r_icc = @iccs[t.to_s] + assert_in_delta(@r_icc['lbound'], @icc.lbound, 0.1) + assert_in_delta(@r_icc['ubound'], @icc.ubound, 0.1) end - should "summary generated" do - assert(@icc.summary.size>0) + should 'summary generated' do + assert(@icc.summary.size > 0) end end end end rescue - puts "requires rserve" + puts 'requires rserve' end - end end diff --git a/test/test_reliability_skillscale.rb b/test/test_reliability_skillscale.rb index 456c808..831740b 100644 --- a/test/test_reliability_skillscale.rb +++ b/test/test_reliability_skillscale.rb @@ -1,57 +1,57 @@ -require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb')) +require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb')) - -class StatsampleReliabilitySkillScaleTestCase < MiniTest::Unit::TestCase +class StatsampleReliabilitySkillScaleTestCase < Minitest::Test context Statsample::Reliability::SkillScaleAnalysis do setup do - options=%w{a b c d e} - cases=20 - @id=cases.times.map {|v| v}.to_scale - @a=cases.times.map {options[rand(5)]}.to_vector - @b=cases.times.map {options[rand(5)]}.to_vector - @c=cases.times.map {options[rand(5)]}.to_vector - @d=cases.times.map {options[rand(5)]}.to_vector - @e=cases.times.map {|i| - i==0 ? options[rand(0)] : - rand()>0.8 ? nil : options[rand(5)] - }.to_vector - @ds={'id'=>@id,'a'=>@a,'b'=>@b,'c'=>@c,'d'=>@d,'e'=>@e}.to_dataset - @key={'a'=>"a", 'b'=>options[rand(5)], 'c'=>options[rand(5)], 'd'=>options[rand(5)],'e'=>options[rand(5)]} - @ssa=Statsample::Reliability::SkillScaleAnalysis.new(@ds, @key) - @ac=@a.map {|v| v==@key['a'] ? 1 : 0}.to_scale - @bc=@b.map {|v| v==@key['b'] ? 1 : 0}.to_scale - @cc=@c.map {|v| v==@key['c'] ? 1 : 0}.to_scale - @dc=@d.map {|v| v==@key['d'] ? 1 : 0}.to_scale - @ec=@e.map {|v| v.nil? ? nil : (v==@key['e'] ? 1 : 0)}.to_scale - + options = %w(a b c d e) + cases = 20 + @id = Daru::Vector.new(cases.times.map { |v| v }) + @a = Daru::Vector.new(cases.times.map { options[rand(5)] }) + @b = Daru::Vector.new(cases.times.map { options[rand(5)] }) + @c = Daru::Vector.new(cases.times.map { options[rand(5)] }) + @d = Daru::Vector.new(cases.times.map { options[rand(5)] }) + @e = Daru::Vector.new( + cases.times.map do |i| + i == 0 ? options[rand(0)] : + rand > 0.8 ? nil : options[rand(5)] + end + ) + @ds = Daru::DataFrame.new({ :id => @id, :a => @a, :b => @b, :c => @c, :d => @d, :e => @e }) + @key = { :a => 'a', :b => options[rand(5)], :c => options[rand(5)], :d => options[rand(5)], :e => options[rand(5)] } + @ssa = Statsample::Reliability::SkillScaleAnalysis.new(@ds, @key) + @ac = Daru::Vector.new(@a.map { |v| v == @key[:a] ? 1 : 0 }) + @bc = Daru::Vector.new(@b.map { |v| v == @key[:b] ? 1 : 0 }) + @cc = Daru::Vector.new(@c.map { |v| v == @key[:c] ? 1 : 0 }) + @dc = Daru::Vector.new(@d.map { |v| v == @key[:d] ? 1 : 0 }) + @ec = Daru::Vector.new(@e.map { |v| v.nil? ? nil : (v == @key[:e] ? 
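# --------------------------------------------------------------------------
# Usage sketch, not part of the changeset: the migrated ICC API exercised by
# the tests above, driven standalone on the same six-target/four-judge
# fixture. Assumes statsample with daru installed, as in this branch.
require 'statsample'

ratings = Daru::DataFrame.new(
  :a => Daru::Vector.new([9, 6, 8, 7, 10, 6]),
  :b => Daru::Vector.new([2, 1, 4, 1, 5, 2]),
  :c => Daru::Vector.new([5, 3, 6, 2, 6, 4]),
  :d => Daru::Vector.new([8, 2, 8, 6, 9, 7])
)
icc = Statsample::Reliability::ICC.new(ratings)
icc.type = :icc_1                    # one of the McGraw & Wong types tested above
puts icc.r                           # point estimate (~0.17 for this fixture)
puts [icc.lbound, icc.ubound].inspect  # bounds checked against R's irr package above
# --------------------------------------------------------------------------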
diff --git a/test/test_reliability_skillscale.rb b/test/test_reliability_skillscale.rb
index 456c808..831740b 100644
--- a/test/test_reliability_skillscale.rb
+++ b/test/test_reliability_skillscale.rb
@@ -1,57 +1,57 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
 
-
-class StatsampleReliabilitySkillScaleTestCase < MiniTest::Unit::TestCase
+class StatsampleReliabilitySkillScaleTestCase < Minitest::Test
   context Statsample::Reliability::SkillScaleAnalysis do
     setup do
-      options=%w{a b c d e}
-      cases=20
-      @id=cases.times.map {|v| v}.to_scale
-      @a=cases.times.map {options[rand(5)]}.to_vector
-      @b=cases.times.map {options[rand(5)]}.to_vector
-      @c=cases.times.map {options[rand(5)]}.to_vector
-      @d=cases.times.map {options[rand(5)]}.to_vector
-      @e=cases.times.map {|i|
-        i==0 ? options[rand(0)] :
-        rand()>0.8 ? nil : options[rand(5)]
-      }.to_vector
-      @ds={'id'=>@id,'a'=>@a,'b'=>@b,'c'=>@c,'d'=>@d,'e'=>@e}.to_dataset
-      @key={'a'=>"a", 'b'=>options[rand(5)], 'c'=>options[rand(5)], 'd'=>options[rand(5)],'e'=>options[rand(5)]}
-      @ssa=Statsample::Reliability::SkillScaleAnalysis.new(@ds, @key)
-      @ac=@a.map {|v| v==@key['a'] ? 1 : 0}.to_scale
-      @bc=@b.map {|v| v==@key['b'] ? 1 : 0}.to_scale
-      @cc=@c.map {|v| v==@key['c'] ? 1 : 0}.to_scale
-      @dc=@d.map {|v| v==@key['d'] ? 1 : 0}.to_scale
-      @ec=@e.map {|v| v.nil? ? nil : (v==@key['e'] ? 1 : 0)}.to_scale
-
+      options = %w(a b c d e)
+      cases = 20
+      @id = Daru::Vector.new(cases.times.map { |v| v })
+      @a = Daru::Vector.new(cases.times.map { options[rand(5)] })
+      @b = Daru::Vector.new(cases.times.map { options[rand(5)] })
+      @c = Daru::Vector.new(cases.times.map { options[rand(5)] })
+      @d = Daru::Vector.new(cases.times.map { options[rand(5)] })
+      @e = Daru::Vector.new(
+        cases.times.map do |i|
+          i == 0 ? options[rand(0)] :
+            rand > 0.8 ? nil : options[rand(5)]
+        end
+      )
+      @ds = Daru::DataFrame.new({ :id => @id, :a => @a, :b => @b, :c => @c, :d => @d, :e => @e })
+      @key = { :a => 'a', :b => options[rand(5)], :c => options[rand(5)], :d => options[rand(5)], :e => options[rand(5)] }
+      @ssa = Statsample::Reliability::SkillScaleAnalysis.new(@ds, @key)
+      @ac = Daru::Vector.new(@a.map { |v| v == @key[:a] ? 1 : 0 })
+      @bc = Daru::Vector.new(@b.map { |v| v == @key[:b] ? 1 : 0 })
+      @cc = Daru::Vector.new(@c.map { |v| v == @key[:c] ? 1 : 0 })
+      @dc = Daru::Vector.new(@d.map { |v| v == @key[:d] ? 1 : 0 })
+      @ec = Daru::Vector.new(@e.map { |v| v.nil? ? nil : (v == @key[:e] ? 1 : 0) })
    end
-    should "return proper corrected dataset" do
-      cds={'id'=>@id, 'a'=>@ac,'b'=>@bc,'c'=>@cc,'d'=>@dc, 'e'=>@ec}.to_dataset
+    should 'return proper corrected dataset' do
+      cds = Daru::DataFrame.new({ :id => @id, :a => @ac, :b => @bc, :c => @cc, :d => @dc, :e => @ec })
       assert_equal(cds, @ssa.corrected_dataset)
    end
-    should "return proper corrected minimal dataset" do
-      cdsm={'a'=>@ac,'b'=>@bc,'c'=>@cc,'d'=>@dc, 'e'=>@ec}.to_dataset
+    should 'return proper corrected minimal dataset' do
+      cdsm = Daru::DataFrame.new({ :a => @ac, :b => @bc, :c => @cc, :d => @dc, :e => @ec })
       assert_equal(cdsm, @ssa.corrected_dataset_minimal)
    end
-    should "return correct vector_sum and vector_sum" do
-      cdsm=@ssa.corrected_dataset_minimal
+    should 'return correct vector_sum and vector_mean' do
+      cdsm = @ssa.corrected_dataset_minimal
       assert_equal(cdsm.vector_sum, @ssa.vector_sum)
       assert_equal(cdsm.vector_mean, @ssa.vector_mean)
    end
-    should "not crash on rare case" do
-      a=Statsample::Vector["c","c","a","a","c","a","b","c","c","b","a","d","a","d","a","a","d","e","c","d"]
-      b=Statsample::Vector["e","b","e","b","c","d","a","e","e","c","b","e","e","b","d","c","e","b","b","d"]
-      c=Statsample::Vector["e","b","e","c","e","c","b","d","e","c","a","a","b","d","e","c","b","a","a","e"]
-      d=Statsample::Vector["a","b","d","d","e","b","e","b","d","c","e","a","c","d","c","c","e","d","d","b"]
-      e=Statsample::Vector["a","b",nil,"d","c","c","d",nil,"d","d","e","e",nil,nil,nil,"d","c",nil,"e","d"]
-      key={"a"=>"a", "b"=>"e", "c"=>"d", "d"=>"c", "e"=>"d"}
-      ds=Statsample::Dataset.new("a"=>a,"b"=>b,"c"=>c,"d"=>d,"e"=>e)
-      ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds, key)
+    should 'not crash on rare case' do
+      a = Daru::Vector.new(['c', 'c', 'a', 'a', 'c', 'a', 'b', 'c', 'c', 'b', 'a', 'd', 'a', 'd', 'a', 'a', 'd', 'e', 'c', 'd'])
+      b = Daru::Vector.new(['e', 'b', 'e', 'b', 'c', 'd', 'a', 'e', 'e', 'c', 'b', 'e', 'e', 'b', 'd', 'c', 'e', 'b', 'b', 'd'])
+      c = Daru::Vector.new(['e', 'b', 'e', 'c', 'e', 'c', 'b', 'd', 'e', 'c', 'a', 'a', 'b', 'd', 'e', 'c', 'b', 'a', 'a', 'e'])
+      d = Daru::Vector.new(['a', 'b', 'd', 'd', 'e', 'b', 'e', 'b', 'd', 'c', 'e', 'a', 'c', 'd', 'c', 'c', 'e', 'd', 'd', 'b'])
+      e = Daru::Vector.new(['a', 'b', nil, 'd', 'c', 'c', 'd', nil, 'd', 'd', 'e', 'e', nil, nil, nil, 'd', 'c', nil, 'e', 'd'])
+      key = { :a => 'a', :b => 'e', :c => 'd', :d => 'c', :e => 'd' }
+      ds = Daru::DataFrame.new({:a => a, :b => b, :c => c, :d => d, :e => e})
+      ssa = Statsample::Reliability::SkillScaleAnalysis.new(ds, key)
       assert(ssa.summary)
    end
-
-    should "return valid summary" do
-      assert(@ssa.summary.size>0)
+
+    should 'return valid summary' do
+      assert(@ssa.summary.size > 0)
    end
  end
end
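# --------------------------------------------------------------------------
# Usage sketch, not part of the changeset: the corrected-dataset flow tested
# above, with a hypothetical three-item quiz. The key maps each item (column)
# to its right answer; answers are recoded to 1 (right) / 0 (wrong), and nil
# stays nil, exactly as the setup block above does by hand.
answers = Daru::DataFrame.new(
  :q1 => Daru::Vector.new(%w(a b a a)),
  :q2 => Daru::Vector.new(%w(c c d c)),
  :q3 => Daru::Vector.new(['b', nil, 'b', 'a'])
)
key = { :q1 => 'a', :q2 => 'c', :q3 => 'b' }
ssa = Statsample::Reliability::SkillScaleAnalysis.new(answers, key)
p ssa.corrected_dataset_minimal  # 0/1 DataFrame, one column per item
p ssa.vector_sum.to_a            # per-case total scores
puts ssa.summary
# --------------------------------------------------------------------------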
diff --git a/test/test_resample.rb b/test/test_resample.rb
index c1821e1..ce8701b 100644
--- a/test/test_resample.rb
+++ b/test/test_resample.rb
@@ -1,22 +1,24 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
 
-class StatsampleResampleTestCase < MiniTest::Unit::TestCase
+class StatsampleResampleTestCase < Minitest::Test
   def initialize(*args)
     super
   end
+
   def test_basic
-    r=Statsample::Resample.generate(20,1,10)
-    assert_equal(20,r.size)
-    assert(r.min>=1)
-    assert(r.max<=10)
+    r = Statsample::Resample.generate(20, 1, 10)
+    assert_equal(20, r.size)
+    assert(r.min >= 1)
+    assert(r.max <= 10)
   end
+
   def test_repeat_and_save
-    r=Statsample::Resample.repeat_and_save(400) {
-      Statsample::Resample.generate(20,1,10).count(1)
+    r = Statsample::Resample.repeat_and_save(400) {
+      Statsample::Resample.generate(20, 1, 10).count(1)
     }
-    assert_equal(400,r.size)
-    v=Statsample::Vector.new(r,:scale)
-    a=v.count {|x| x > 3}
-    assert(a>=30 && a<=70)
+    assert_equal(400, r.size)
+    v = Daru::Vector.new(r)
+    a = v.count { |x| x > 3 }
+    assert(a >= 30 && a <= 70)
   end
 end
diff --git a/test/test_rserve_extension.rb b/test/test_rserve_extension.rb
deleted file mode 100644
index e718978..0000000
--- a/test/test_rserve_extension.rb
+++ /dev/null
@@ -1,42 +0,0 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-begin
-  require 'rserve'
-  require 'statsample/rserve_extension'
-
-class StatsampleRserveExtensionTestCase < MiniTest::Unit::TestCase
-  context "Statsample Rserve extensions" do
-    setup do
-      @r=Rserve::Connection.new
-    end
-    teardown do
-      @r.close
-    end
-    should "return a valid rexp for numeric vector" do
-      a=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
-      rexp=a.to_REXP
-      assert(rexp.is_a? Rserve::REXP::Double)
-      assert_equal(rexp.to_ruby,a.data_with_nils)
-      @r.assign 'a',rexp
-      assert_equal(a.data_with_nils, @r.eval('a').to_ruby)
-    end
-    should "return a valid rserve dataframe for statsample datasets" do
-      a=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
-      b=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
-      c=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
-      ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
-      rexp=ds.to_REXP
-      assert(rexp.is_a? Rserve::REXP::GenericVector)
-      ret=rexp.to_ruby
-      assert_equal(a.data_with_nils, ret['a'])
-      @r.assign 'df', rexp
-      out_df=@r.eval('df').to_ruby
-      assert_equal('data.frame', out_df.attributes['class'])
-      assert_equal(['a','b','c'], out_df.attributes['names'])
-      assert_equal(a.data_with_nils, out_df['a'])
-    end
-  end
-end
-
-rescue LoadError
-  puts "Require rserve extension"
-end
diff --git a/test/test_srs.rb b/test/test_srs.rb
index 1d18cf9..c9d5abd 100644
--- a/test/test_srs.rb
+++ b/test/test_srs.rb
@@ -1,9 +1,9 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
 
-class StatsampleSrsTestCase < MiniTest::Unit::TestCase
+class StatsampleSrsTestCase < Minitest::Test
   def test_std_error
-    assert_equal(384,Statsample::SRS.estimation_n0(0.05,0.5,0.95).to_i)
-    assert_equal(108,Statsample::SRS.estimation_n(0.05,0.5,150,0.95).to_i)
-    assert_in_delta(0.0289,Statsample::SRS.proportion_sd_kp_wor(0.5,100,150),0.001)
+    assert_equal(384, Statsample::SRS.estimation_n0(0.05, 0.5, 0.95).to_i)
+    assert_equal(108, Statsample::SRS.estimation_n(0.05, 0.5, 150, 0.95).to_i)
+    assert_in_delta(0.0289, Statsample::SRS.proportion_sd_kp_wor(0.5, 100, 150), 0.001)
   end
 end
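# --------------------------------------------------------------------------
# Usage sketch, not part of the changeset: the sample-size estimators pinned
# down by test_srs.rb, spelled out. For an expected proportion of 0.5, a 0.05
# margin of error and 95% confidence, the classic n0 is 384; correcting for a
# finite population of 150 brings it down to 108, per the assertions above.
n0 = Statsample::SRS.estimation_n0(0.05, 0.5, 0.95).to_i       # => 384
n  = Statsample::SRS.estimation_n(0.05, 0.5, 150, 0.95).to_i   # => 108
p [n0, n]
# --------------------------------------------------------------------------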
diff --git a/test/test_statistics.rb b/test/test_statistics.rb
index 7fe47d3..f8b9372 100644
--- a/test/test_statistics.rb
+++ b/test/test_statistics.rb
@@ -1,77 +1,69 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleStatisicsTestCase < MiniTest::Unit::TestCase
-
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleStatisicsTestCase < Minitest::Test
   def initialize(*args)
     super
   end
+
   def test_p_using_cdf
-    assert_equal(0.25, Statsample::Test.p_using_cdf(0.25, tails=:left))
-    assert_equal(0.75, Statsample::Test.p_using_cdf(0.25, tails=:right))
-    assert_equal(0.50, Statsample::Test.p_using_cdf(0.25, tails=:both))
-    assert_equal(1, Statsample::Test.p_using_cdf(0.50, tails=:both))
-    assert_equal(0.05, Statsample::Test.p_using_cdf(0.025, tails=:both))
-    assert_in_delta(0.05, Statsample::Test.p_using_cdf(0.975, tails=:both),0.0001)
-
+    assert_equal(0.25, Statsample::Test.p_using_cdf(0.25, tails = :left))
+    assert_equal(0.75, Statsample::Test.p_using_cdf(0.25, tails = :right))
+    assert_equal(0.50, Statsample::Test.p_using_cdf(0.25, tails = :both))
+    assert_equal(1, Statsample::Test.p_using_cdf(0.50, tails = :both))
+    assert_equal(0.05, Statsample::Test.p_using_cdf(0.025, tails = :both))
+    assert_in_delta(0.05, Statsample::Test.p_using_cdf(0.975, tails = :both), 0.0001)
   end
+
   def test_recode_repeated
-    a=%w{a b c c d d d e}
-    exp=["a","b","c_1","c_2","d_1","d_2","d_3","e"]
-    assert_equal(exp,a.recode_repeated)
+    a = %w(a b c c d d d e)
+    exp = %w(a b c_1 c_2 d_1 d_2 d_3 e)
+    assert_equal(exp, Daru::ArrayHelper.recode_repeated(a))
   end
-  def test_is_number
-    assert("10".is_number?)
-    assert("-10".is_number?)
-    assert("0.1".is_number?)
-    assert("-0.1".is_number?)
-    assert("10e3".is_number?)
-    assert("10e-3".is_number?)
-    assert(!"1212-1212-1".is_number?)
-    assert(!"a10".is_number?)
-    assert(!"".is_number?)
+
+  def test_is_number
+    assert('10'.is_number?)
+    assert('-10'.is_number?)
+    assert('0.1'.is_number?)
+    assert('-0.1'.is_number?)
+    assert('10e3'.is_number?)
+    assert('10e-3'.is_number?)
+    assert(!'1212-1212-1'.is_number?)
+    assert(!'a10'.is_number?)
+    assert(!''.is_number?)
   end
+
   def test_estimation_mean
-    v=([42]*23+[41]*4+[36]*1+[32]*1+[29]*1+[27]*2+[23]*1+[19]*1+[16]*2+[15]*2+[14,11,10,9,7]+ [6]*3+[5]*2+[4,3]).to_vector(:scale)
-    assert_equal(50,v.size)
-    assert_equal(1471,v.sum())
-    #limits=Statsample::SRS.mean_confidence_interval_z(v.mean(), v.sds(), v.size,676,0.80)
+    v = Daru::Vector.new([42] * 23 + [41] * 4 + [36] * 1 + [32] * 1 + [29] * 1 + [27] * 2 + [23] * 1 + [19] * 1 + [16] * 2 + [15] * 2 + [14, 11, 10, 9, 7] + [6] * 3 + [5] * 2 + [4, 3])
+    assert_equal(50, v.size)
+    assert_equal(1471, v.sum)
+    # limits=Statsample::SRS.mean_confidence_interval_z(v.mean(), v.sds(), v.size,676,0.80)
   end
+
   def test_estimation_proportion
     # total
-    pop=3042
-    sam=200
-    prop=0.19
+    pop = 3042
+    sam = 200
+    prop = 0.19
     assert_in_delta(81.8, Statsample::SRS.proportion_total_sd_ep_wor(prop, sam, pop), 0.1)
     # confidence limits
-    pop=500
-    sam=100
-    prop=0.37
-    a=0.95
-    l= Statsample::SRS.proportion_confidence_interval_z(prop, sam, pop, a)
-    assert_in_delta(0.28,l[0],0.01)
-    assert_in_delta(0.46,l[1],0.01)
+    pop = 500
+    sam = 100
+    prop = 0.37
+    a = 0.95
+    l = Statsample::SRS.proportion_confidence_interval_z(prop, sam, pop, a)
+    assert_in_delta(0.28, l[0], 0.01)
+    assert_in_delta(0.46, l[1], 0.01)
   end
-  def test_ml
-    if(true)
-      #real=[1,1,1,1].to_vector(:scale)
-
-      #pred=[0.0001,0.0001,0.0001,0.0001].to_vector(:scale)
-      # puts Statsample::Bivariate.maximum_likehood_dichotomic(pred,real)
-
-    end
-  end
+
   def test_simple_linear_regression
-    a=[1,2,3,4,5,6].to_vector(:scale)
-    b=[6,2,4,10,12,8].to_vector(:scale)
-    reg = Statsample::Regression::Simple.new_from_vectors(a,b)
-    assert_in_delta((reg.ssr+reg.sse).to_f,reg.sst,0.001)
-    assert_in_delta(Statsample::Bivariate.pearson(a,b),reg.r,0.001)
-    assert_in_delta(2.4,reg.a,0.01)
-    assert_in_delta(1.314,reg.b,0.001)
-    assert_in_delta(0.657,reg.r,0.001)
-    assert_in_delta(0.432,reg.r2,0.001)
+    a = Daru::Vector.new([1, 2, 3, 4, 5, 6])
+    b = Daru::Vector.new([6, 2, 4, 10, 12, 8])
+    reg = Statsample::Regression::Simple.new_from_vectors(a, b)
+    assert_in_delta((reg.ssr + reg.sse).to_f, reg.sst, 0.001)
+    assert_in_delta(Statsample::Bivariate.pearson(a, b), reg.r, 0.001)
+    assert_in_delta(2.4, reg.a, 0.01)
+    assert_in_delta(1.314, reg.b, 0.001)
+    assert_in_delta(0.657, reg.r, 0.001)
+    assert_in_delta(0.432, reg.r2, 0.001)
  end
end
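# --------------------------------------------------------------------------
# Usage sketch, not part of the changeset: the regression fixture above, run
# standalone. With x = 1..6 and y as below, the fitted line is roughly
# y = 2.4 + 1.314x with r2 ~ 0.432, which is exactly what the assertions pin.
x = Daru::Vector.new([1, 2, 3, 4, 5, 6])
y = Daru::Vector.new([6, 2, 4, 10, 12, 8])
reg = Statsample::Regression::Simple.new_from_vectors(x, y)
puts reg.a    # intercept, ~2.4
puts reg.b    # slope, ~1.314
puts reg.r2   # coefficient of determination, ~0.432
# --------------------------------------------------------------------------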
diff --git a/test/test_stest.rb b/test/test_stest.rb
index e13c580..aa375b4 100644
--- a/test/test_stest.rb
+++ b/test/test_stest.rb
@@ -1,56 +1,69 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
 
-class StatsampleTestTestCase < MiniTest::Unit::TestCase
+class StatsampleTestTestCase < Minitest::Test
   def test_chi_square_matrix_with_expected
-    real=Matrix[[95,95],[45,155]]
-    expected=Matrix[[68,122],[72,128]]
+    real = Matrix[[95, 95], [45, 155]]
+    expected = Matrix[[68, 122], [72, 128]]
     assert_nothing_raised do
-      Statsample::Test.chi_square(real,expected)
+      Statsample::Test.chi_square(real, expected)
     end
-    chi=Statsample::Test.chi_square(real,expected).chi_square
-    assert_in_delta(32.53,chi,0.1)
-
+    chi = Statsample::Test.chi_square(real, expected).chi_square
+    assert_in_delta(32.53, chi, 0.1)
   end
+
   def test_chi_square_matrix_only_observed
-    observed=Matrix[[20,30,40],[30,40,50],[60,70,80],[10,20,40]]
+    observed = Matrix[[20, 30, 40], [30, 40, 50], [60, 70, 80], [10, 20, 40]]
     assert_nothing_raised do
       Statsample::Test.chi_square(observed)
     end
-    chi=Statsample::Test.chi_square(observed)
+    chi = Statsample::Test.chi_square(observed)
     assert_in_delta(9.5602, chi.chi_square, 0.0001)
     assert_in_delta(0.1444, chi.probability, 0.0001)
     assert_equal(6, chi.df)
-  end
-
-  def test_u_mannwhitney
-    a=[1,2,3,4,5,6].to_scale
-    b=[0,5,7,9,10,11].to_scale
-    assert_equal(7.5, Statsample::Test.u_mannwhitney(a,b).u)
-    assert_equal(7.5, Statsample::Test.u_mannwhitney(b,a).u)
-    a=[1, 7,8,9,10,11].to_scale
-    b=[2,3,4,5,6,12].to_scale
-    assert_equal(11, Statsample::Test.u_mannwhitney(a,b).u)
+  end
+
+  def test_chi_square_vector
+    observed = Vector[20, 30, 15]
+    expected = Vector[20, 20, 20]
+    assert_nothing_raised do
+      Statsample::Test.chi_square(observed, expected)
+    end
+    chi = Statsample::Test.chi_square(observed, expected)
+
+    assert_in_delta(6.25, chi.chi_square, 0.0001)
+    assert_in_delta(0.04393, chi.probability, 0.00001)
+
+    assert_equal(2, chi.df)
   end
 
+  def test_u_mannwhitney
+    a = Daru::Vector.new([1, 2, 3, 4, 5, 6])
+    b = Daru::Vector.new([0, 5, 7, 9, 10, 11])
+    assert_equal(7.5, Statsample::Test.u_mannwhitney(a, b).u)
+    assert_equal(7.5, Statsample::Test.u_mannwhitney(b, a).u)
+    a = Daru::Vector.new([1, 7, 8, 9, 10, 11])
+    b = Daru::Vector.new([2, 3, 4, 5, 6, 12])
+    assert_equal(11, Statsample::Test.u_mannwhitney(a, b).u)
+  end
+
   def test_levene
-    a=[1,2,3,4,5,6,7,8,100,10].to_scale
-    b=[30,40,50,60,70,80,90,100,110,120].to_scale
-    levene=Statsample::Test::Levene.new([a,b])
+    a = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7, 8, 100, 10])
+    b = Daru::Vector.new([30, 40, 50, 60, 70, 80, 90, 100, 110, 120])
+    levene = Statsample::Test::Levene.new([a, b])
     assert_levene(levene)
   end
+
   def test_levene_dataset
-    a=[1,2,3,4,5,6,7,8,100,10].to_scale
-    b=[30,40,50,60,70,80,90,100,110,120].to_scale
-    ds={'a'=>a,'b'=>b}.to_dataset
-    levene=Statsample::Test::Levene.new(ds)
+    a = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7, 8, 100, 10])
+    b = Daru::Vector.new([30, 40, 50, 60, 70, 80, 90, 100, 110, 120])
+    ds = Daru::DataFrame.new({ :a => a, :b => b })
+    levene = Statsample::Test::Levene.new(ds)
     assert_levene(levene)
   end
+
   def assert_levene(levene)
     assert_in_delta(0.778, levene.f, 0.001)
     assert_in_delta(0.389, levene.probability, 0.001)
   end
-
 end
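# --------------------------------------------------------------------------
# Usage sketch, not part of the changeset: the new goodness-of-fit path added
# by test_chi_square_vector, standalone. Note that this path takes stdlib
# Matrix/Vector objects (require 'matrix'), not Daru vectors.
require 'matrix'

observed = Vector[20, 30, 15]
expected = Vector[20, 20, 20]
chi = Statsample::Test.chi_square(observed, expected)
puts chi.chi_square   # ~6.25
puts chi.df           # 2
puts chi.probability  # ~0.044
# --------------------------------------------------------------------------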
diff --git a/test/test_stratified.rb b/test/test_stratified.rb
index eb8ef45..3e619fe 100644
--- a/test/test_stratified.rb
+++ b/test/test_stratified.rb
@@ -1,17 +1,17 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-
-class StatsampleStratifiedTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
 
+class StatsampleStratifiedTestCase < Minitest::Test
   def initialize(*args)
     super
   end
+
   def test_mean
-    a=[10,20,30,40,50]
-    b=[110,120,130,140]
-    pop=a+b
-    av=a.to_vector(:scale)
-    bv=b.to_vector(:scale)
-    popv=pop.to_vector(:scale)
-    assert_equal(popv.mean,Statsample::StratifiedSample.mean(av,bv))
+    a = [10, 20, 30, 40, 50]
+    b = [110, 120, 130, 140]
+    pop = a + b
+    av = Daru::Vector.new(a)
+    bv = Daru::Vector.new(b)
+    popv = Daru::Vector.new(pop)
+    assert_equal(popv.mean, Statsample::StratifiedSample.mean(av, bv))
   end
 end
diff --git a/test/test_test_f.rb b/test/test_test_f.rb
index b7cc4a8..0ef0650 100644
--- a/test/test_test_f.rb
+++ b/test/test_test_f.rb
@@ -1,32 +1,32 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleTestFTestCase < MiniTest::Unit::TestCase
-  context(Statsample::Test::F) do
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleTestFTestCase < Minitest::Test
+  context(Statsample::Test::F) do
     setup do
-      @ssb=84
-      @ssw=68
-      @df_num=2
-      @df_den=15
-      @f=Statsample::Test::F.new(@ssb.quo(@df_num),@ssw.quo(@df_den), @df_num, @df_den)
+      @ssb = 84
+      @ssw = 68
+      @df_num = 2
+      @df_den = 15
+      @f = Statsample::Test::F.new(@ssb.quo(@df_num), @ssw.quo(@df_den), @df_num, @df_den)
     end
-    should "have #f equal to msb/msw" do
+    should 'have #f equal to msb/msw' do
       assert_equal((@ssb.quo(@df_num)).quo(@ssw.quo(@df_den)), @f.f)
     end
-    should "have df total equal to df_num+df_den" do
+    should 'have df total equal to df_num+df_den' do
       assert_equal(@df_num + @df_den, @f.df_total)
     end
-    should "have probability near 0.002" do
+    should 'have probability near 0.002' do
       assert_in_delta(0.002, @f.probability, 0.0005)
     end
-    should "be coerced into float" do
+    should 'be coerced into float' do
       assert_equal(@f.to_f, @f.f)
     end
-
-    context("method summary") do
+
+    context('method summary') do
       setup do
-        @summary=@f.summary
+        @summary = @f.summary
      end
-      should "have size > 0" do
-        assert(@summary.size>0)
+      should 'have size > 0' do
+        assert(@summary.size > 0)
      end
    end
  end
end
diff --git a/test/test_test_kolmogorovsmirnov.rb b/test/test_test_kolmogorovsmirnov.rb
index 409d25d..7b698a1 100644
--- a/test/test_test_kolmogorovsmirnov.rb
+++ b/test/test_test_kolmogorovsmirnov.rb
@@ -1,28 +1,28 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleTestKolmogorovSmirnovTestCase < MiniTest::Unit::TestCase
-  context(Statsample::Test::KolmogorovSmirnov) do
-    should "calculate correctly D for two given samples" do
-      a=[1.1,2.5,5.6,9]
-      b=[1,2.3,5.8,10]
-      ks=Statsample::Test::KolmogorovSmirnov.new(a,b)
-      assert_equal(0.25,ks.d)
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleTestKolmogorovSmirnovTestCase < Minitest::Test
+  context(Statsample::Test::KolmogorovSmirnov) do
+    should 'calculate correctly D for two given samples' do
+      a = [1.1, 2.5, 5.6, 9]
+      b = [1, 2.3, 5.8, 10]
+      ks = Statsample::Test::KolmogorovSmirnov.new(a, b)
+      assert_equal(0.25, ks.d)
     end
-    should "calculate correctly D for a normal sample and Normal Distribution" do
-      a=[0.30022510,-0.36664035,0.08593404,1.29881130,-0.49878633,-0.63056010, 0.28397638, -0.04913700,0.03566644,-1.33414346]
-      ks=Statsample::Test::KolmogorovSmirnov.new(a,Distribution::Normal)
-      assert_in_delta(0.282, ks.d,0.001)
+    should 'calculate correctly D for a normal sample and Normal Distribution' do
+      a = [0.30022510, -0.36664035, 0.08593404, 1.29881130, -0.49878633, -0.63056010, 0.28397638, -0.04913700, 0.03566644, -1.33414346]
+      ks = Statsample::Test::KolmogorovSmirnov.new(a, Distribution::Normal)
+      assert_in_delta(0.282, ks.d, 0.001)
     end
-    should "calculate correctly D for a variable normal and Normal Distribution" do
-      rng=Distribution::Normal.rng
-      a=100.times.map {rng.call}
-      ks=Statsample::Test::KolmogorovSmirnov.new(a,Distribution::Normal)
-      assert(ks.d<0.15)
+    should 'calculate correctly D for a variable normal and Normal Distribution' do
+      rng = Distribution::Normal.rng
+      a = 100.times.map { rng.call }
+      ks = Statsample::Test::KolmogorovSmirnov.new(a, Distribution::Normal)
+      assert(ks.d < 0.15)
     end
-
+
     context(Statsample::Test::KolmogorovSmirnov::EmpiricDistribution) do
-      should "Create a correct empirical distribution for an array" do
-        a=[10,9,8,7,6,5,4,3,2,1]
-        ed=Statsample::Test::KolmogorovSmirnov::EmpiricDistribution.new(a)
+      should 'Create a correct empirical distribution for an array' do
+        a = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+        ed = Statsample::Test::KolmogorovSmirnov::EmpiricDistribution.new(a)
         assert_equal(0, ed.cdf(-2))
         assert_equal(0.5, ed.cdf(5))
         assert_equal(0.5, ed.cdf(5.5))
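# --------------------------------------------------------------------------
# Usage sketch, not part of the changeset: the two ways the KS test is driven
# above -- sample vs. sample, or sample vs. a Distribution module.
ks_two = Statsample::Test::KolmogorovSmirnov.new([1.1, 2.5, 5.6, 9], [1, 2.3, 5.8, 10])
puts ks_two.d   # 0.25, as asserted above

rng = Distribution::Normal.rng
sample = 100.times.map { rng.call }
ks_one = Statsample::Test::KolmogorovSmirnov.new(sample, Distribution::Normal)
puts ks_one.d   # should stay small (< 0.15) for a genuinely normal sample
# --------------------------------------------------------------------------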
diff --git a/test/test_test_t.rb b/test/test_test_t.rb
index 1c39a6b..3b8cce6 100644
--- a/test/test_test_t.rb
+++ b/test/test_test_t.rb
@@ -1,62 +1,62 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleTestTTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleTestTTestCase < Minitest::Test
   include Statsample::Test
   include Math
-  context T do
+  context T do
     setup do
-      @a=[30.02, 29.99, 30.11, 29.97, 30.01, 29.99].to_scale
-      @b=[29.89, 29.93, 29.72, 29.98, 30.02, 29.98].to_scale
-      @x1=@a.mean
-      @x2=@b.mean
-      @s1=@a.sd
-      @s2=@b.sd
-      @n1=@a.n
-      @n2=@b.n
-    end
-    should "calculate correctly standard t" do
-      t=Statsample::Test::T.new(@x1, @s1.quo(Math.sqrt(@a.n)), @a.n-1)
-      assert_equal((@x1).quo(@s1.quo(Math.sqrt(@a.n))), t.t)
-      assert_equal(@a.n-1, t.df)
-      assert(t.summary.size>0)
-    end
-    should "calculate correctly t for one sample" do
-      t1=[6, 4, 6, 7, 4,5,5,12,6,1].to_scale
-      t2=[9, 6, 5,10,10,8,7,10,6,5].to_scale
-      d=t1-t2
-      t=Statsample::Test::T::OneSample.new(d)
+      @a = Daru::Vector.new([30.02, 29.99, 30.11, 29.97, 30.01, 29.99])
+      @b = Daru::Vector.new([29.89, 29.93, 29.72, 29.98, 30.02, 29.98])
+      @x1 = @a.mean
+      @x2 = @b.mean
+      @s1 = @a.sd
+      @s2 = @b.sd
+      @n1 = @a.size
+      @n2 = @b.size
+    end
+    should 'calculate correctly standard t' do
+      t = Statsample::Test::T.new(@x1, @s1.quo(Math.sqrt(@a.size)), @a.size - 1)
+      assert_equal((@x1).quo(@s1.quo(Math.sqrt(@a.size))), t.t)
+      assert_equal(@a.size - 1, t.df)
+      assert(t.summary.size > 0)
+    end
+    should 'calculate correctly t for one sample' do
+      t1 = Daru::Vector.new([6, 4, 6, 7, 4, 5, 5, 12, 6, 1])
+      t2 = Daru::Vector.new([9, 6, 5, 10, 10, 8, 7, 10, 6, 5])
+      d = t1 - t2
+      t = Statsample::Test::T::OneSample.new(d)
       assert_in_delta(-2.631, t.t, 0.001)
-      assert_in_delta( 0.027, t.probability, 0.001)
-      assert_in_delta( 0.76012, t.se, 0.0001)
-      assert(t.summary.size>0)
-    end
-    should "calculate correctly t for two samples" do
-      assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2),0.001)
-      assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2,true),0.001)
-    end
-    should "calculate correctly df for equal and unequal variance" do
-      assert_equal(10, T.df_equal_variance(@n1,@n2))
-      assert_in_delta(7.03, T.df_not_equal_variance(@s1,@s2,@n1,@n2),0.001)
-    end
-    should "calculate all values for T object" do
-      t=Statsample::Test.t_two_samples_independent(@a,@b)
-      assert(t.summary.size>0)
-      assert_in_delta(1.959, t.t_equal_variance,0.001)
-      assert_in_delta(1.959, t.t_not_equal_variance,0.001)
-      assert_in_delta(10, t.df_equal_variance,0.001)
-      assert_in_delta(7.03, t.df_not_equal_variance,0.001)
-      assert_in_delta(0.07856, t.probability_equal_variance,0.001)
-      assert_in_delta(0.09095, t.probability_not_equal_variance,0.001)
-    end
-    should "be the same using shorthand" do
-      v=100.times.map {rand(100)}.to_scale
+      assert_in_delta(0.027, t.probability, 0.001)
+      assert_in_delta(0.76012, t.se, 0.0001)
+      assert(t.summary.size > 0)
+    end
+    should 'calculate correctly t for two samples' do
+      assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2), 0.001)
+      assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2, true), 0.001)
+    end
+    should 'calculate correctly df for equal and unequal variance' do
+      assert_equal(10, T.df_equal_variance(@n1, @n2))
+      assert_in_delta(7.03, T.df_not_equal_variance(@s1, @s2, @n1, @n2), 0.001)
+    end
+    should 'calculate all values for T object' do
+      t = Statsample::Test.t_two_samples_independent(@a, @b)
+      assert(t.summary.size > 0)
+      assert_in_delta(1.959, t.t_equal_variance, 0.001)
+      assert_in_delta(1.959, t.t_not_equal_variance, 0.001)
+      assert_in_delta(10, t.df_equal_variance, 0.001)
+      assert_in_delta(7.03, t.df_not_equal_variance, 0.001)
+      assert_in_delta(0.07856, t.probability_equal_variance, 0.001)
+      assert_in_delta(0.09095, t.probability_not_equal_variance, 0.001)
+    end
+    should 'be the same using shorthand' do
+      v = Daru::Vector.new(100.times.map { rand(100) })
       assert_equal(Statsample::Test.t_one_sample(v).t, T::OneSample.new(v).t)
     end
-    should "calculate all values for one sample T test" do
-      u=@a.mean+(1-rand*2)
-      tos=T::OneSample.new(@a,{:u=>u})
-      assert_equal((@a.mean-u).quo(@a.sd.quo(sqrt(@a.n))), tos.t)
-      assert_equal(@a.n-1, tos.df)
-      assert(tos.summary.size>0)
+    should 'calculate all values for one sample T test' do
+      u = @a.mean + (1 - rand * 2)
+      tos = T::OneSample.new(@a, u: u)
+      assert_equal((@a.mean - u).quo(@a.sd.quo(sqrt(@a.size))), tos.t)
+      assert_equal(@a.size - 1, tos.df)
+      assert(tos.summary.size > 0)
     end
   end
 end
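# --------------------------------------------------------------------------
# Usage sketch, not part of the changeset: the two-sample helper checked
# above, standalone. The summary carries both the equal- and unequal-variance
# results that the individual assertions pick apart.
a = Daru::Vector.new([30.02, 29.99, 30.11, 29.97, 30.01, 29.99])
b = Daru::Vector.new([29.89, 29.93, 29.72, 29.98, 30.02, 29.98])
t = Statsample::Test.t_two_samples_independent(a, b)
puts t.t_equal_variance             # ~1.959
puts t.probability_equal_variance   # ~0.079
puts t.summary
# --------------------------------------------------------------------------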
diff --git a/test/test_umannwhitney.rb b/test/test_umannwhitney.rb
index 82817af..69a34f9 100644
--- a/test/test_umannwhitney.rb
+++ b/test/test_umannwhitney.rb
@@ -1,27 +1,27 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
 
-class StatsampleUMannWhitneyTestCase < MiniTest::Unit::TestCase
+class StatsampleUMannWhitneyTestCase < Minitest::Test
   include Statsample::Test
   context Statsample::Test::UMannWhitney do
     setup do
-      @v1=[1,2,3,4,7,8,9,10,14,15].to_scale
-      @v2=[5,6,11,12,13,16,17,18,19].to_scale
-      @u=Statsample::Test::UMannWhitney.new(@v1,@v2)
+      @v1 = Daru::Vector.new([1, 2, 3, 4, 7, 8, 9, 10, 14, 15])
+      @v2 = Daru::Vector.new([5, 6, 11, 12, 13, 16, 17, 18, 19])
+      @u = Statsample::Test::UMannWhitney.new(@v1, @v2)
     end
-    should "have same result using class or Test#u_mannwhitney" do
-      assert_equal(Statsample::Test.u_mannwhitney(@v1,@v2).u, @u.u)
+    should 'have same result using class or Test#u_mannwhitney' do
+      assert_equal(Statsample::Test.u_mannwhitney(@v1, @v2).u, @u.u)
     end
-    should "have correct U values" do
-      assert_equal(73,@u.r1)
-      assert_equal(117,@u.r2)
-      assert_equal(18,@u.u)
+    should 'have correct U values' do
+      assert_equal(73, @u.r1)
+      assert_equal(117, @u.r2)
+      assert_equal(18, @u.u)
     end
-    should "have correct value for z" do
-      assert_in_delta(-2.205,@u.z,0.001)
+    should 'have correct value for z' do
+      assert_in_delta(-2.205, @u.z, 0.001)
     end
-    should "have correct value for z and exact probability" do
-      assert_in_delta(0.027,@u.probability_z,0.001)
-      assert_in_delta(0.028,@u.probability_exact,0.001)
+    should 'have correct value for z and exact probability' do
+      assert_in_delta(0.027, @u.probability_z, 0.001)
+      assert_in_delta(0.028, @u.probability_exact, 0.001)
     end
   end
 end
diff --git a/test/test_vector.rb b/test/test_vector.rb
index 2a00252..7685121 100644
--- a/test/test_vector.rb
+++ b/test/test_vector.rb
@@ -1,644 +1,12 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
 
-class StatsampleTestVector < MiniTest::Unit::TestCase
-  include Statsample::Shorthand
-
-  def setup
-    @c = Statsample::Vector.new([5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99], :nominal)
-    @c.name="Test Vector"
-    @c.missing_values=[-99]
-  end
-  def assert_counting_tokens(b)
-    assert_equal([1,1,0,1,0,nil],b['a'].to_a)
-    assert_equal([0,1,0,0,0,nil],b['b'].to_a)
-    assert_equal([0,0,1,0,0,nil],b['c'].to_a)
-    assert_equal([0,0,1,1,0,nil],b['d'].to_a)
-    assert_equal([0,0,0,0,1,nil],b[10].to_a)
-  end
-  context Statsample do
-    setup do
-      @sample=100
-      @a=@sample.times.map{|i| (i+rand(10)) %10 ==0 ? nil : rand(100)}.to_scale
-      @b=@sample.times.map{|i| (i+rand(10)) %10 ==0 ? nil : rand(100)}.to_scale
-      @correct_a=Array.new
-      @correct_b=Array.new
-      @a.each_with_index do |v,i|
-        if !@a[i].nil? and !@b[i].nil?
-          @correct_a.push(@a[i])
-          @correct_b.push(@b[i])
-        end
-      end
-      @correct_a=@correct_a.to_scale
-      @correct_b=@correct_b.to_scale
-
-      @common=lambda do |av,bv|
-        assert_equal(@correct_a, av, "A no es esperado")
-        assert_equal(@correct_b, bv, "B no es esperado")
-        assert(!av.has_missing_data?, "A tiene datos faltantes")
-        assert(!bv.has_missing_data?, "b tiene datos faltantes")
-      end
-    end
-    should "return correct only_valid" do
-      av,bv=Statsample.only_valid @a,@b
-      av2,bv2=Statsample.only_valid av,bv
-      @common.call(av,bv)
-      assert_equal(av,av2)
-      assert_not_same(av,av2)
-      assert_not_same(bv,bv2)
-    end
-    should "return correct only_valid_clone" do
-      av,bv=Statsample.only_valid_clone @a,@b
-      @common.call(av,bv)
-      av2,bv2=Statsample.only_valid_clone av,bv
-      assert_equal(av,av2)
-      assert_same(av,av2)
-      assert_same(bv,bv2)
-    end
-  end
-  context Statsample::Vector do
-    setup do
-      @c = Statsample::Vector.new([5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99], :nominal)
-      @c.name="Test Vector"
-      @c.missing_values=[-99]
-    end
-    should_with_gsl "be created with GSL::Vector" do
-      gsl=GSL::Vector[1,2,3,4,5]
-      v=Statsample::Vector.new(gsl)
-      assert_equal([1,2,3,4,5], v.to_a)
-      refute(v.flawed?)
-
-    end
-
-    context "using matrix operations" do
-      setup do
-        @a=[1,2,3,4,5].to_scale
-      end
-      should "to_matrix returns a matrix with 1 row" do
-        mh=Matrix[[1,2,3,4,5]]
-        assert_equal(mh,@a.to_matrix)
-      end
-      should "to_matrix(:vertical) returns a matrix with 1 column" do
-        mv=Matrix.columns([[1,2,3,4,5]])
-        assert_equal(mv,@a.to_matrix(:vertical))
-      end
-      should "returns valid submatrixes" do
-        # 3*4 + 2*5 = 22
-        a=[3,2].to_vector(:scale)
-        b=[4,5].to_vector(:scale)
-        assert_equal(22,(a.to_matrix*b.to_matrix(:vertical))[0,0])
-      end
-    end
-    context "when initializing" do
-      setup do
-        @data=(10.times.map{rand(100)})+[nil]
-        @original=Statsample::Vector.new(@data, :scale)
-      end
-      should "be the sample using []" do
-        second=Statsample::Vector[*@data]
-        assert_equal(@original, second)
-      end
-      should "[] returns same results as R-c()" do
-        reference=[0,4,5,6,10].to_scale
-        assert_equal(reference, Statsample::Vector[0,4,5,6,10])
-        assert_equal(reference, Statsample::Vector[0,4..6,10])
-        assert_equal(reference, Statsample::Vector[[0],[4,5,6],[10]])
-        assert_equal(reference, Statsample::Vector[[0],[4,[5,[6]]],[10]])
-
-        assert_equal(reference, Statsample::Vector[[0],[4,5,6].to_vector,[10]])
-
-      end
-      should "be the same usign #to_vector" do
-        lazy1=@data.to_vector(:scale)
-        assert_equal(@original,lazy1)
-      end
-      should "be the same using #to_scale" do
-        lazy2=@data.to_scale
-        assert_equal(@original,lazy2)
-        assert_equal(:scale,lazy2.type)
-        assert_equal(@data.find_all{|v| !v.nil?},lazy2.valid_data)
-      end
-      should "could use new_scale with size only" do
-        v1=10.times.map {nil}.to_scale
-        v2=Statsample::Vector.new_scale(10)
-        assert_equal(v1,v2)
-
-      end
-      should "could use new_scale with size and value" do
-        a=rand
-        v1=10.times.map {a}.to_scale
-        v2=Statsample::Vector.new_scale(10,a)
-        assert_equal(v1,v2)
-      end
-      should "could use new_scale with func" do
-        v1=10.times.map {|i| i*2}.to_scale
-        v2=Statsample::Vector.new_scale(10) {|i| i*2}
-        assert_equal(v1,v2)
-      end
-
-    end
-
-    context "#split_by_separator" do
-
-      setup do
-        @a = Statsample::Vector.new(["a","a,b","c,d","a,d",10,nil],:nominal)
-        @b=@a.split_by_separator(",")
-      end
-      should "returns a Hash" do
-        assert_kind_of(Hash, @b)
-      end
-      should "return a Hash with keys with different values of @a" do
-        expected=['a','b','c','d',10]
-        assert_equal(expected, @b.keys)
-      end
-
-      should "returns a Hash, which values are Statsample::Vector" do
-        @b.each_key {|k| assert_instance_of(Statsample::Vector, @b[k])}
-      end
-      should "hash values are n times the tokens appears" do
-        assert_counting_tokens(@b)
-      end
-      should "#split_by_separator_freq returns the number of ocurrences of tokens" do
-        assert_equal({'a'=>3,'b'=>1,'c'=>1,'d'=>2,10=>1}, @a.split_by_separator_freq())
-      end
-      should "using a different separator give the same values" do
-        a = Statsample::Vector.new(["a","a*b","c*d","a*d",10,nil],:nominal)
-        b=a.split_by_separator("*")
-        assert_counting_tokens(b)
-      end
-    end
-    should "return correct median_absolute_deviation" do
-      a=[1, 1, 2, 2, 4, 6, 9].to_scale
-      assert_equal(1, a.median_absolute_deviation)
-    end
-    should "return correct histogram" do
-      a=10.times.map {|v| v}.to_scale
-      hist=a.histogram(2)
-      assert_equal([5,5], hist.bin)
-      3.times do |i|
-        assert_in_delta(i*4.5, hist.get_range(i)[0], 1e-9)
-      end
-
-    end
-    should "have a name" do
-      @c.name=="Test Vector"
-    end
-    should "without explicit name, returns vector with succesive numbers" do
-      a=10.times.map{rand(100)}.to_scale
-      b=10.times.map{rand(100)}.to_scale
-      assert_match(/Vector \d+/, a.name)
-      a.name=~/Vector (\d+)/
-      next_number=$1.to_i+1
-      assert_equal("Vector #{next_number}",b.name)
-    end
-    should "save to a file and load the same Vector" do
-      outfile=Tempfile.new("vector.vec")
-      @c.save(outfile.path)
-      a=Statsample.load(outfile.path)
-      assert_equal(@c,a)
-    end
-    should "#collect returns an array" do
-      val=@c.collect {|v| v}
-      assert_equal(val,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99])
-    end
-
-    should "#recode returns a recoded array" do
-      a=@c.recode{|v| @c.is_valid?(v) ? 0 : 1 }
-      exp=[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1].to_vector
-      assert_equal(exp,a)
-      exp.recode!{|v| v==0 ? 1:0}
-      exp2=(([1]*15)+([0]*3)).to_vector
-      assert_equal(exp2,exp)
-    end
-    should "#product returns the * of all values" do
-      a=[1,2,3,4,5].to_vector(:scale)
-      assert_equal(120,a.product)
-    end
-
-    should "missing values" do
-      @c.missing_values=[10]
-      assert_equal([-99,-99,1,2,3,4,5,5,5,5,5,6,6,7,8,9], @c.valid_data.sort)
-      assert_equal([5,5,5,5,5,6,6,7,8,9,nil,1,2,3,4,nil,-99,-99], @c.data_with_nils)
-      @c.missing_values=[-99]
-      assert_equal(@c.valid_data.sort,[1,2,3,4,5,5,5,5,5,6,6,7,8,9,10])
-      assert_equal(@c.data_with_nils,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,nil,nil])
-      @c.missing_values=[]
-      assert_equal(@c.valid_data.sort,[-99,-99,1,2,3,4,5,5,5,5,5,6,6,7,8,9,10])
-      assert_equal(@c.data_with_nils,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99])
-
-    end
-    should "correct has_missing_data? with missing data" do
-      a=[1,2,3,nil].to_vector
-      assert(a.has_missing_data?)
-    end
-    should "correct has_missing_data? without missing data" do
-      a=[1,2,3,4,10].to_vector
-      assert(!a.has_missing_data?)
-    end
-    should "with explicit missing_values, should respond has_missing_data?" do
-      a=[1,2,3,4,10].to_vector
-      a.missing_values=[10]
-      assert(a.has_missing_data?)
-    end
-    should "label correctly fields" do
-      @c.labels={5=>'FIVE'}
-      assert_equal(["FIVE","FIVE","FIVE","FIVE","FIVE",6,6,7,8,9,10,1,2,3,4,nil,-99, -99],@c.vector_labeled.to_a)
-    end
-    should "verify" do
-      h=@c.verify{|d| !d.nil? and d>0}
-      e={15=>nil,16=>-99,17=>-99}
-      assert_equal(e,h)
-    end
-    should "have a summary with name on it" do
-      assert_match(/#{@c.name}/, @c.summary)
-    end
-
-    should "GSL::Vector based should push correcty" do
-      if Statsample.has_gsl?
-        v=GSL::Vector[1,2,3,4,5].to_scale
-        v.push(nil)
-        assert_equal([1,2,3,4,5,nil], v.to_a)
-        assert(v.flawed?)
-      else
-        skip("Requires GSL")
-      end
-    end
-
-
-    should "split correctly" do
-      a = Statsample::Vector.new(["a","a,b","c,d","a,d","d",10,nil],:nominal)
-      assert_equal([%w{a},%w{a b},%w{c d},%w{a d},%w{d},[10],nil], a.splitted)
-    end
-    should "multiply correct for scalar" do
-      a = [1,2,3].to_scale
-      assert_equal([5,10,15].to_scale, a*5)
-    end
-    should "multiply correct with other vector" do
-      a = [1,2,3].to_scale
-      b = [2,4,6].to_scale
-
-      assert_equal([2,8,18].to_scale, a*b)
-    end
-    should "sum correct for scalar" do
-      a = [1,2,3].to_scale
-      assert_equal([11,12,13].to_scale, a+10)
-    end
-
-    should "raise NoMethodError when method requires ordinal and vector is nominal" do
-      @c.type=:nominal
-      assert_raise(::NoMethodError) { @c.median }
-    end
-
-    should "raise NoMethodError when method requires scalar and vector is ordinal" do
-      @c.type=:ordinal
-      assert_raise(::NoMethodError) { @c.mean }
-    end
-    should "jacknife correctly with named method" do
-      # First example
-      a=[1,2,3,4].to_scale
-      ds=a.jacknife(:mean)
-      assert_equal(a.mean, ds[:mean].mean)
-      ds=a.jacknife([:mean,:sd])
-      assert_equal(a.mean, ds[:mean].mean)
-      assert_equal(a.sd, ds[:mean].sd)
-    end
-    should "jacknife correctly with custom method" do
-      # Second example
-      a=[17.23, 18.71,13.93,18.81,15.78,11.29,14.91,13.39, 18.21, 11.57, 14.28, 10.94, 18.83, 15.52,13.45,15.25].to_scale
-      ds=a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance) })
-      exp=[1.605, 2.972, 1.151, 3.097, 0.998, 3.308, 0.942, 1.393, 2.416, 2.951, 1.043, 3.806, 3.122, 0.958, 1.362, 0.937].to_scale
-
-      assert_similar_vector(exp, ds[:log_s2], 0.001)
-      assert_in_delta(2.00389, ds[:log_s2].mean, 0.00001)
-      assert_in_delta(1.091, ds[:log_s2].variance, 0.001)
-    end
-    should "jacknife correctly with k>1" do
-      a=rnorm(6)
-      ds=a.jacknife(:mean,2)
-      mean=a.mean
-      exp=[3*mean-2*(a[2]+a[3]+a[4]+a[5]) / 4, 3*mean-2*(a[0]+a[1]+a[4]+a[5]) / 4, 3*mean-2*(a[0]+a[1]+a[2]+a[3]) / 4].to_scale
-      assert_similar_vector(exp, ds[:mean], 1e-13)
+class StatsampleTestVector < Minitest::Test
+  should 'return correct histogram' do
+    a = Daru::Vector.new(10.times.map { |v| v })
+    hist = a.histogram(2)
+    assert_equal([5, 5], hist.bin)
+    3.times do |i|
+      assert_in_delta(i * 4.5, hist.get_range(i)[0], 1e-9)
     end
-    should "bootstrap should return a vector with mean=mu and sd=se" do
-      a=rnorm(100)
-      ds=a.bootstrap([:mean,:sd],200)
-      se=1/Math.sqrt(a.size)
-      assert_in_delta(0, ds[:mean].mean, 0.3)
-      assert_in_delta(se, ds[:mean].sd, 0.02)
-    end
-
-
-  end
-
-
-
-  def test_nominal
-    assert_equal(@c[1],5)
-    assert_equal({ 1=>1,2=>1,3=>1,4=>1,5=>5,6=>2,7=>1,8=>1, 9=>1,10=>1},@c.frequencies)
-    assert_equal({ 1=>1,2=>1,3=>1,4=>1,5=>5,6=>2,7=>1,8=>1, 9=>1,10=>1},@c._frequencies)
-    assert_equal({ 1 => 1.quo(15) ,2=>1.quo(15), 3=>1.quo(15),4=>1.quo(15),5=>5.quo(15),6=>2.quo(15),7=>1.quo(15), 8=>1.quo(15), 9=>1.quo(15),10=>1.quo(15)}, @c.proportions)
-    assert_equal(@c.proportion, 1.quo(15))
-    assert_equal(@c.proportion(2), 1.quo(15))
-    assert_equal([1,2,3,4,5,6,7,8,9,10], @c.factors.sort)
-    assert_equal(@c.mode,5)
-    assert_equal(@c.n_valid,15)
-  end
-  def test_equality
-    v1=[1,2,3].to_vector
-    v2=[1,2,3].to_vector
-    assert_equal(v1,v2)
-    v1=[1,2,3].to_vector(:nominal)
-    v2=[1,2,3].to_vector(:ordinal)
-    assert_not_equal(v1,v2)
-    v2=[1,2,3]
-    assert_not_equal(v1,v2)
-    v1=[1,2,3].to_vector()
-    v2=[1,2,3].to_vector()
-    assert_equal(v1,v2)
-    assert_equal(false, v1 == Object.new)
-  end
-  def test_vector_percentil
-    a=[1,2,2,3,4,5,5,5,6,10].to_scale
-    expected=[10,25,25,40,50,70,70,70,90,100].to_scale
-    assert_equal(expected, a.vector_percentil)
-    a=[1,nil,nil,2,2,3,4,nil,nil,5,5,5,6,10].to_scale
-    expected=[10,nil,nil,25,25,40,50,nil,nil,70,70,70,90,100].to_scale
-    assert_equal(expected, a.vector_percentil)
-  end
-  def test_ordinal
-    @c.type=:ordinal
-    assert_equal(5,@c.median)
-    assert_equal(4,@c.percentil(25))
-    assert_equal(7,@c.percentil(75))
-
-    v=[200000, 200000, 210000, 220000, 230000, 250000, 250000, 250000, 270000, 300000, 450000, 130000, 140000, 140000, 140000, 145000, 148000, 165000, 170000, 180000, 180000, 180000, 180000, 180000, 180000 ].to_scale
-    assert_equal(180000,v.median)
-    a=[7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, 12.0, 12.0, 13.0, 14.0, 14.0, 2.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0].to_scale
-    assert_equal(4.5, a.percentil(25))
-    assert_equal(6.5, a.percentil(50))
-    assert_equal(9.5, a.percentil(75))
-    assert_equal(3.0, a.percentil(10))
-  end
-  def test_linear_percentil_strategy
-    values = [102, 104, 105, 107, 108, 109, 110, 112, 115, 116].shuffle.to_scale
-    assert_equal 102, values.percentil(0, :linear)
-    assert_equal 104.75, values.percentil(25, :linear)
-    assert_equal 108.5, values.percentil(50, :linear)
-    assert_equal 112.75, values.percentil(75, :linear)
-    assert_equal 116, values.percentil(100, :linear)
-
-    values = [102, 104, 105, 107, 108, 109, 110, 112, 115, 116, 118].shuffle.to_scale
-    assert_equal 102, values.percentil(0, :linear)
-    assert_equal 105, values.percentil(25, :linear)
-    assert_equal 109, values.percentil(50, :linear)
-    assert_equal 115, values.percentil(75, :linear)
-    assert_equal 118, values.percentil(100, :linear)
-  end
-  def test_ranked
-    v1=[0.8,1.2,1.2,2.3,18].to_vector(:ordinal)
-    expected=[1,2.5,2.5,4,5].to_vector(:ordinal)
-    assert_equal(expected,v1.ranked)
-    v1=[nil,0.8,1.2,1.2,2.3,18,nil].to_vector(:ordinal)
-    expected=[nil,1,2.5,2.5,4,5,nil].to_vector(:ordinal)
-    assert_equal(expected,v1.ranked)
-  end
-  def test_scale
-    a=Statsample::Vector.new([1,2,3,4,"STRING"], :scale)
-    assert_equal(10, a.sum)
-    i=0
-    factors=a.factors.sort
-    [0,1,2,3,4].each{|v|
-      assert(v==factors[i])
-      assert(v.class==factors[i].class,"#{v} - #{v.class} != #{factors[i]} - #{factors[i].class}")
-      i+=1
-    }
-  end
-  def test_vector_centered
-    mean=rand()
-    samples=11
-    centered=samples.times.map {|i| i-((samples/2).floor).to_i}.to_scale
-    not_centered=centered.recode {|v| v+mean}
-    obs=not_centered.centered
-    centered.each_with_index do |v,i|
-      assert_in_delta(v,obs[i],0.0001)
-    end
-  end
-  def test_vector_standarized
-    v1=[1,2,3,4,nil].to_vector(:scale)
-    sds=v1.sds
-    expected=[((1-2.5).quo(sds)),((2-2.5).quo(sds)),((3-2.5).quo(sds)),((4-2.5).quo(sds)), nil].to_vector(:scale)
-    vs=v1.vector_standarized
-    assert_equal(expected, vs)
-    assert_equal(0,vs.mean)
-    assert_equal(1,vs.sds)
-  end
-
-  def test_vector_standarized_with_zero_variance
-    v1=100.times.map {|i| 1}.to_scale
-    exp=100.times.map {nil}.to_scale
-    assert_equal(exp,v1.standarized)
-  end
-
-  def test_check_type
-    v=Statsample::Vector.new
-    v.type=:nominal
-    assert_raise(NoMethodError) { v.check_type(:scale)}
-    assert_raise(NoMethodError) { v.check_type(:ordinal)}
-    assert(v.check_type(:nominal).nil?)
-
-    v.type=:ordinal
-
-    assert_raise(NoMethodError) { v.check_type(:scale)}
-
-    assert(v.check_type(:ordinal).nil?)
-    assert(v.check_type(:nominal).nil?)
-
-
-    v.type=:scale
-    assert(v.check_type(:scale).nil?)
-    assert(v.check_type(:ordinal).nil?)
-    assert(v.check_type(:nominal).nil?)
-
-    v.type=:date
-    assert_raise(NoMethodError) { v.check_type(:scale)}
-    assert_raise(NoMethodError) { v.check_type(:ordinal)}
-    assert_raise(NoMethodError) { v.check_type(:nominal)}
-  end
-
-  def test_add
-    a=Statsample::Vector.new([1,2,3,4,5], :scale)
-    b=Statsample::Vector.new([11,12,13,14,15], :scale)
-    assert_equal([3,4,5,6,7], (a+2).to_a)
-    assert_equal([12,14,16,18,20], (a+b).to_a)
-    assert_raise ArgumentError do
-      a + @c
-    end
-    assert_raise TypeError do
-      a+"string"
-    end
-    a=Statsample::Vector.new([nil,1, 2 ,3 ,4 ,5], :scale)
-    b=Statsample::Vector.new([11, 12,nil,13,14,15], :scale)
-    assert_equal([nil,13,nil,16,18,20], (a+b).to_a)
-    assert_equal([nil,13,nil,16,18,20], (a+b.to_a).to_a)
-  end
-  def test_minus
-    a=Statsample::Vector.new([1,2,3,4,5], :scale)
-    b=Statsample::Vector.new([11,12,13,14,15], :scale)
-    assert_equal([-1,0,1,2,3], (a-2).to_a)
-    assert_equal([10,10,10,10,10], (b-a).to_a)
-    assert_raise ArgumentError do
-      a-@c
-    end
-    assert_raise TypeError do
-      a-"string"
-    end
-    a=Statsample::Vector.new([nil,1, 2 ,3 ,4 ,5], :scale)
-    b=Statsample::Vector.new([11, 12,nil,13,14,15], :scale)
-    assert_equal([nil,11,nil,10,10,10], (b-a).to_a)
-    assert_equal([nil,11,nil,10,10,10], (b-a.to_a).to_a)
-  end
-  def test_sum_of_squares
-    a=[1,2,3,4,5,6].to_vector(:scale)
-    assert_equal(17.5, a.sum_of_squared_deviation)
-  end
-  def test_average_deviation
-    a=[1,2,3,4,5,6,7,8,9].to_scale
-    assert_equal(20.quo(9), a.average_deviation_population)
-  end
-  def test_samples
-    srand(1)
-    assert_equal(100,@c.sample_with_replacement(100).size)
-    assert_equal(@c.valid_data.to_a.sort, @c.sample_without_replacement(15).sort)
-    assert_raise ArgumentError do
-      @c.sample_without_replacement(20)
-    end
-    @c.type=:scale
-    srand(1)
-    assert_equal(100, @c.sample_with_replacement(100).size)
-    assert_equal(@c.valid_data.to_a.sort, @c.sample_without_replacement(15).sort)
-
-  end
-  def test_valid_data
-    a=Statsample::Vector.new([1,2,3,4,"STRING"])
-    a.missing_values=[-99]
-    a.add(1,false)
-    a.add(2,false)
-    a.add(-99,false)
-    a.set_valid_data
-    exp_valid_data=[1,2,3,4,"STRING",1,2]
-    assert_equal(exp_valid_data,a.valid_data)
-    a.add(20,false)
-    a.add(30,false)
-    assert_equal(exp_valid_data,a.valid_data)
-    a.set_valid_data
-    exp_valid_data_2=[1,2,3,4,"STRING",1,2,20,30]
-    assert_equal(exp_valid_data_2,a.valid_data)
-  end
-  def test_set_value
-    @c[2]=10
-    expected=[5,5,10,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99].to_vector
-    assert_equal(expected.data,@c.data)
-  end
-  def test_gsl
-    if Statsample.has_gsl?
-      a=Statsample::Vector.new([1,2,3,4,"STRING"], :scale)
-
-      assert_equal(2,a.mean)
-      assert_equal(a.variance_sample_ruby,a.variance_sample)
-      assert_equal(a.standard_deviation_sample_ruby,a.sds)
-      assert_equal(a.variance_population_ruby,a.variance_population)
-      assert_equal(a.standard_deviation_population_ruby,a.standard_deviation_population)
-      assert_nothing_raised do
-        a=[].to_vector(:scale)
-      end
-      a.add(1,false)
-      a.add(2,false)
-      a.set_valid_data
-      assert_equal(3,a.sum)
-      b=[1,2,nil,3,4,5,nil,6].to_vector(:scale)
-      assert_equal(21, b.sum)
-      assert_equal(3.5, b.mean)
-      assert_equal(6,b.gsl.size)
-      c=[10,20,30,40,50,100,1000,2000,5000].to_scale
-      assert_in_delta(c.skew, c.skew_ruby ,0.0001)
-      assert_in_delta(c.kurtosis, c.kurtosis_ruby ,0.0001)
-    end
-  end
-  def test_vector_matrix
-    v1=%w{a a a b b b c c}.to_vector
-    v2=%w{1 3 4 5 6 4 3 2}.to_vector
-    v3=%w{1 0 0 0 1 1 1 0}.to_vector
-    ex=Matrix.rows([["a", "1", "1"], ["a", "3", "0"], ["a", "4", "0"], ["b", "5", "0"], ["b", "6", "1"], ["b", "4", "1"], ["c", "3", "1"], ["c", "2", "0"]])
-    assert_equal(ex,Statsample.vector_cols_matrix(v1,v2,v3))
-  end
-  def test_marshalling
-    v1=(0..100).to_a.collect{|n| rand(100)}.to_vector(:scale)
-    v2=Marshal.load(Marshal.dump(v1))
-    assert_equal(v1,v2)
-  end
-  def test_dup
-    v1=%w{a a a b b b c c}.to_vector
-    v2=v1.dup
-    assert_equal(v1.data,v2.data)
-    assert_not_same(v1.data,v2.data)
-    assert_equal(v1.type,v2.type)
-
-    v1.type=:ordinal
-    assert_not_equal(v1.type,v2.type)
-    assert_equal(v1.missing_values,v2.missing_values)
-    assert_not_same(v1.missing_values,v2.missing_values)
-    assert_equal(v1.labels,v2.labels)
-    assert_not_same(v1.labels,v2.labels)
-
-    v3=v1.dup_empty
-    assert_equal([],v3.data)
-    assert_not_equal(v1.data,v3.data)
-    assert_not_same(v1.data,v3.data)
-    assert_equal(v1.type,v3.type)
-    v1.type=:ordinal
-    v3.type=:nominal
-    assert_not_equal(v1.type,v3.type)
-    assert_equal(v1.missing_values,v3.missing_values)
-    assert_not_same(v1.missing_values,v3.missing_values)
-    assert_equal(v1.labels,v3.labels)
-    assert_not_same(v1.labels,v3.labels)
-  end
-  def test_paired_ties
-    a=[0,0,0,1,1,2,3,3,4,4,4].to_vector(:ordinal)
-    expected=[2,2,2,4.5,4.5,6,7.5,7.5,10,10,10].to_vector(:ordinal)
-    assert_equal(expected,a.ranked)
-  end
-  def test_dichotomize
-    a= [0,0,0,1,2,3,nil].to_vector
-    exp=[0,0,0,1,1,1,nil].to_scale
-    assert_equal(exp,a.dichotomize)
-    a= [1,1,1,2,2,2,3].to_vector
-    exp=[0,0,0,1,1,1,1].to_scale
-    assert_equal(exp,a.dichotomize)
-    a= [0,0,0,1,2,3,nil].to_vector
-    exp=[0,0,0,0,1,1,nil].to_scale
-    assert_equal(exp,a.dichotomize(1))
-    a= %w{a a a b c d}.to_vector
-    exp=[0,0,0,1,1,1].to_scale
-    assert_equal(exp, a.dichotomize)
-  end
-  def test_can_be_methods
-    a= [0,0,0,1,2,3,nil].to_vector
-    assert(a.can_be_scale?)
-    a=[0,"s",0,1,2,3,nil].to_vector
-    assert(!a.can_be_scale?)
-    a.missing_values=["s"]
-    assert(a.can_be_scale?)
-
-    a=[Date.new(2009,10,10), Date.today(), "2009-10-10", "2009-1-1", nil, "NOW"].to_vector
-    assert(a.can_be_date?)
-    a=[Date.new(2009,10,10), Date.today(),nil,"sss"].to_vector
-    assert(!a.can_be_date?)
-  end
-  def test_date_vector
-    a=[Date.new(2009,10,10), :NOW, "2009-10-10", "2009-1-1", nil, "NOW","MISSING"].to_vector(:date, :missing_values=>["MISSING"])
-
-    assert(a.type==:date)
-    expected=[Date.new(2009,10,10), Date.today(), Date.new(2009,10,10), Date.new(2009,1,1), nil, Date.today(), nil ]
-    assert_equal(expected, a.date_data_with_nils)
   end
 end
diff --git a/test/test_wilcoxonsignedrank.rb b/test/test_wilcoxonsignedrank.rb
index f10b492..c32e341 100644
--- a/test/test_wilcoxonsignedrank.rb
+++ b/test/test_wilcoxonsignedrank.rb
@@ -1,67 +1,64 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
 
-class StatsampleUMannWhitneyTestCase < MiniTest::Unit::TestCase
+class StatsampleUMannWhitneyTestCase < Minitest::Test
   include Statsample::Test
   context Statsample::Test::WilcoxonSignedRank do
-    context "Example 1" do
-      setup do
-        @v1=[110,122,125,120,140,124,123,137,135,145].to_scale
-        @v2=[125,115,130,140,140,115,140,125,140,135].to_scale
-        @u=Statsample::Test::WilcoxonSignedRank.new(@v1,@v2)
-      end
-      should "have same result using class or Test#u_mannwhitney" do
-        assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1,@v2).w, @u.w)
-      end
-      should "have correct W values" do
-        assert_equal(9,@u.w)
-      end
-      should "have correct nr values" do
-        assert_equal(9,@u.nr)
-      end
-      should "have correct value for z" do
-        assert_in_delta(0.503,@u.z,0.001)
-      end
-      should "have correct value for probability_z" do
-        assert_in_delta(0.614,@u.probability_z,0.001)
-      end
-      should "have correct value for probability_exact" do
-        assert_in_delta(0.652,@u.probability_exact,0.001)
-      end
-      should "have summary" do
-        assert(@u.summary!="")
-      end
-    end
-
-    context "Example 2" do
-      setup do
-        @v2=[78,24,64,45,64,52,30,50,64,50,78,22,84,40,90,72].to_scale
-        @v1=[78,24,62,48,68,56,25,44,56,40,68,36,68,20,58,32].to_scale
-        @u=Statsample::Test::WilcoxonSignedRank.new(@v1,@v2)
-      end
-      should "have same result using class or Test#u_mannwhitney" do
-        assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1,@v2).w, @u.w)
-      end
-      should "have correct W values" do
-        assert_equal(67,@u.w)
-      end
-      should "have correct nr values" do
-        assert_equal(14,@u.nr)
-      end
-      should "have correct value for z" do
-        assert_in_delta(2.087,@u.z,0.001)
-      end
-      should "have correct value for probability_z" do
-        assert_in_delta(0.036,@u.probability_z,0.001)
-      end
-      should "have correct value for probability_exact" do
-        assert_in_delta(0.036,@u.probability_exact,0.001)
-      end
-      should "have summary" do
-        assert(@u.summary!="")
-      end
-    end
-
-
-  end
-
+    context 'Example 1' do
+      setup do
+        @v1 = Daru::Vector.new([110, 122, 125, 120, 140, 124, 123, 137, 135, 145])
+        @v2 = Daru::Vector.new([125, 115, 130, 140, 140, 115, 140, 125, 140, 135])
+        @u = Statsample::Test::WilcoxonSignedRank.new(@v1, @v2)
+      end
+      should 'have same result using class or Test#u_mannwhitney' do
+        assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1, @v2).w, @u.w)
+      end
+      should 'have correct W values' do
+        assert_equal(9, @u.w)
+      end
+      should 'have correct nr values' do
+        assert_equal(9, @u.nr)
+      end
+      should 'have correct value for z' do
+        assert_in_delta(0.503, @u.z, 0.001)
+      end
+      should 'have correct value for probability_z' do
+        assert_in_delta(0.614, @u.probability_z, 0.001)
+      end
+      should 'have correct value for probability_exact' do
+        assert_in_delta(0.652, @u.probability_exact, 0.001)
+      end
+      should 'have summary' do
+        assert(@u.summary != '')
+      end
+    end
+
+    context 'Example 2' do
+      setup do
+        @v2 = Daru::Vector.new([78, 24, 64, 45, 64, 52, 30, 50, 64, 50, 78, 22, 84, 40, 90, 72])
+        @v1 = Daru::Vector.new([78, 24, 62, 48, 68, 56, 25, 44, 56, 40, 68, 36, 68, 20, 58, 32])
+        @u = Statsample::Test::WilcoxonSignedRank.new(@v1, @v2)
+      end
+      should 'have same result using class or Test#u_mannwhitney' do
+        assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1, @v2).w, @u.w)
+      end
+      should 'have correct W values' do
+        assert_equal(67, @u.w)
+      end
+      should 'have correct nr values' do
+        assert_equal(14, @u.nr)
+      end
+      should 'have correct value for z' do
+        assert_in_delta(2.087, @u.z, 0.001)
+      end
+      should 'have correct value for probability_z' do
+        assert_in_delta(0.036, @u.probability_z, 0.001)
+      end
+      should 'have correct value for probability_exact' do
+        assert_in_delta(0.036, @u.probability_exact, 0.001)
+      end
+      should 'have summary' do
+        assert(@u.summary != '')
+      end
+    end
+  end
 end
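# --------------------------------------------------------------------------
# Usage sketch, not part of the changeset: Example 1 above, standalone. W, the
# number of non-zero differences nr, and both the normal-approximation and
# exact p-values are all exposed on the test object.
v1 = Daru::Vector.new([110, 122, 125, 120, 140, 124, 123, 137, 135, 145])
v2 = Daru::Vector.new([125, 115, 130, 140, 140, 115, 140, 125, 140, 135])
w = Statsample::Test::WilcoxonSignedRank.new(v1, v2)
puts w.w                  # 9
puts w.nr                 # 9
puts w.probability_z      # ~0.614
puts w.probability_exact  # ~0.652
# --------------------------------------------------------------------------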
diff --git a/test/test_xls.rb b/test/test_xls.rb
deleted file mode 100644
index 0a2584d..0000000
--- a/test/test_xls.rb
+++ /dev/null
@@ -1,52 +0,0 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleExcelTestCase < MiniTest::Unit::TestCase
-  context "Excel reader" do
-    setup do
-      @ds=Statsample::Excel.read(File.dirname(__FILE__)+"/fixtures/test_xls.xls")
-    end
-    should "set the number of cases" do
-      assert_equal(6,@ds.cases)
-    end
-    should "set correct field names" do
-      assert_equal(%w{id name age city a1},@ds.fields)
-    end
-    should "set a dataset equal to expected" do
-      id=[1,2,3,4,5,6].to_vector(:scale)
-      name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal)
-      age=[20,23,25,nil,5.5,nil].to_vector(:scale)
-      city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal)
-      a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal)
-      ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1})
-      ds_exp.fields.each{|f|
-        assert_equal(ds_exp[f],@ds[f])
-      }
-      assert_equal(ds_exp,@ds)
-    end
-    should "set to nil empty cells" do
-      assert_equal(nil,@ds['age'][5])
-    end
-  end
-  context "Excel writer" do
-    setup do
-      a=100.times.map{rand(100)}.to_scale
-      b=(["b"]*100).to_vector
-      @ds={'b'=>b, 'a'=>a}.to_dataset(%w{b a})
-      tempfile=Tempfile.new("test_write.xls")
-      Statsample::Excel.write(@ds,tempfile.path)
-      @ds2=Statsample::Excel.read(tempfile.path)
-    end
-    should "return same fields as original" do
-      assert_equal(@ds.fields ,@ds2.fields)
-    end
-    should "return same number of cases as original" do
-      assert_equal(@ds.cases, @ds2.cases)
-    end
-    should "return same cases as original" do
-      i=0
-      @ds2.each_array do |row|
-        assert_equal(@ds.case_as_array(i),row)
-        i+=1
-      end
-    end
-  end
-end
diff --git a/web/Rakefile b/web/Rakefile
deleted file mode 100644
index b2f4127..0000000
--- a/web/Rakefile
+++ /dev/null
@@ -1,39 +0,0 @@
-# -*- ruby -*-
-require 'rake'
-require 'fileutils'
-directory "examples"
-
-def get_base(f)
-  f.sub(File.dirname(__FILE__)+"/../examples/","").gsub("/","_").gsub(".rb","")
-end
-
-
-EXAMPLES=Dir.glob(File.dirname(__FILE__)+"/../examples/**/*.rb").map {|v| [v, get_base(v)]
-}.find_all{|v| !v[0].include?"_data"}
-
-EXAMPLES_BASE=EXAMPLES.map {|v| v[1]}
-
-
-desc "Build all html, rtf and pdf files"
-task :build_site do
-  ruby "build_site.rb"
-end
-
-
-task :clean do
-  Dir.glob(File.dirname(__FILE__)+"/examples/*.pdf").each do |t|
-    FileUtils.rm t
-  end
-  Dir.glob(File.dirname(__FILE__)+"/examples/*.html").each do |t|
-    FileUtils.rm t
-  end
-  Dir.glob(File.dirname(__FILE__)+"/examples/*.rtf").each do |t|
-    FileUtils.rm t
-  end
-  Dir.glob(File.dirname(__FILE__)+"/examples/images/*.*").each do |t|
-    FileUtils.rm t
-  end
-end
-
-
-load 'upload_task.rb' if File.exists? "upload_task.rb"
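# --------------------------------------------------------------------------
# Usage sketch, not part of the changeset: the one migration pattern that
# repeats through every test file touched above, side by side (data made up):
#
#   [1, 2, 3].to_scale / .to_vector(:scale)  ->  Daru::Vector.new([1, 2, 3])
#   { 'a' => a, 'b' => b }.to_dataset        ->  Daru::DataFrame.new(:a => a, :b => b)
#   class Foo < MiniTest::Unit::TestCase     ->  class Foo < Minitest::Test
#   require 'statsample/rserve_extension'    ->  require 'daru/extensions/rserve'
v  = Daru::Vector.new([1, 2, 3])
ds = Daru::DataFrame.new(:v => v)
puts v.mean  # daru vectors answer #mean much as the old :scale vectors did
# --------------------------------------------------------------------------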