diff --git a/.gitignore b/.gitignore
index ce64169..f96e6d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+Gemfile.lock
doc.yaml
*.swp
*.rbc
@@ -11,3 +12,4 @@ examples/images/*
examples/*.html
web/upload_task.rb
.idea
+*.gem
diff --git a/.travis.yml b/.travis.yml
index f4a0791..4741681 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,13 +2,22 @@ language:
ruby
rvm:
- - '1.9.3'
- - '2.0.0'
- - '2.1.1'
+ - '2.0'
+ - '2.1'
+ - '2.2'
+ - '2.3.0'
+ - '2.4'
+
+matrix:
+ fast_finish:
+ true
+
+script: "bundle exec rake test"
+
+install:
+ - gem install bundler
+ - bundle install
-script:
- bundle exec rake test
-
before_install:
- sudo apt-get update -qq
- sudo apt-get install -y libgsl0-dev r-base r-base-dev
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..3365674
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,17 @@
+# Contributing guide
+
+## Installing statsample development dependencies
+
+Keep in mind that either nmatrix OR rb-gsl are NOT NECESSARY for using statsample. They are just required for an optional speed up.
+
+Statsample also works with [rb-gsl](https://github.com/sciruby/rb-gsl).
+
+Install dependencies:
+
+ `bundle install`
+
+And run the test suite (should be all green):
+
+ `bundle exec rake test`
+
+If you have problems installing nmatrix, please consult the [nmatrix installation wiki](https://github.com/SciRuby/nmatrix/wiki/Installation) or the [mailing list](https://groups.google.com/forum/#!forum/sciruby-dev).
\ No newline at end of file
diff --git a/Gemfile b/Gemfile
index ea8fc56..38eb365 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,18 +1,2 @@
source "https://www.rubygems.org"
-gem 'minitest'
-gem 'rdoc'
-gem 'mocha', '0.14.0' #:require=>'mocha/setup'
-gem 'shoulda','3.5.0'
-gem 'shoulda-matchers','2.2.0'
-gem 'hoe'
-#gem 'bio-statsample-timeseries'
-gem 'reportbuilder'
-gem 'dirty-memoize'
-gem 'distribution'
-gem 'extendmatrix'
-gem 'minimization'
-gem 'rserve-client'
-gem 'rubyvis'
-gem 'spreadsheet'
-gem 'rb-gsl'
-gem 'awesome_print'
+gemspec
diff --git a/Gemfile.lock b/Gemfile.lock
deleted file mode 100644
index ef5d88d..0000000
--- a/Gemfile.lock
+++ /dev/null
@@ -1,81 +0,0 @@
-GEM
- remote: https://www.rubygems.org/
- specs:
- activesupport (4.1.6)
- i18n (~> 0.6, >= 0.6.9)
- json (~> 1.7, >= 1.7.7)
- minitest (~> 5.1)
- thread_safe (~> 0.1)
- tzinfo (~> 1.1)
- awesome_print (1.2.0)
- clbustos-rtf (0.4.2)
- dirty-memoize (0.0.4)
- distribution (0.7.1)
- extendmatrix (0.3.1)
- hoe (3.13.0)
- rake (>= 0.8, < 11.0)
- i18n (0.6.11)
- json (1.8.1)
- metaclass (0.0.4)
- minimization (0.2.1)
- rb-gsl (~> 1.2)
- text-table (~> 1.2)
- minitest (5.4.2)
- mocha (0.14.0)
- metaclass (~> 0.0.1)
- narray (0.6.0.9)
- prawn (0.8.4)
- prawn-core (>= 0.8.4, < 0.9)
- prawn-layout (>= 0.8.4, < 0.9)
- prawn-security (>= 0.8.4, < 0.9)
- prawn-core (0.8.4)
- prawn-layout (0.8.4)
- prawn-security (0.8.4)
- prawn-svg (0.9.1.11)
- prawn (>= 0.8.4)
- rake (10.3.2)
- rb-gsl (1.16.0.2)
- narray (>= 0.5.9)
- rdoc (4.1.2)
- json (~> 1.4)
- reportbuilder (1.4.2)
- clbustos-rtf (~> 0.4.0)
- prawn (~> 0.8.4)
- prawn-svg (~> 0.9.1)
- text-table (~> 1.2)
- rserve-client (0.3.1)
- ruby-ole (1.2.11.7)
- rubyvis (0.6.1)
- shoulda (3.5.0)
- shoulda-context (~> 1.0, >= 1.0.1)
- shoulda-matchers (>= 1.4.1, < 3.0)
- shoulda-context (1.2.1)
- shoulda-matchers (2.2.0)
- activesupport (>= 3.0.0)
- spreadsheet (1.0.0)
- ruby-ole (>= 1.0)
- text-table (1.2.3)
- thread_safe (0.3.4)
- tzinfo (1.2.2)
- thread_safe (~> 0.1)
-
-PLATFORMS
- ruby
-
-DEPENDENCIES
- awesome_print
- dirty-memoize
- distribution
- extendmatrix
- hoe
- minimization
- minitest
- mocha (= 0.14.0)
- rb-gsl
- rdoc
- reportbuilder
- rserve-client
- rubyvis
- shoulda (= 3.5.0)
- shoulda-matchers (= 2.2.0)
- spreadsheet
diff --git a/History.txt b/History.txt
index a438896..40bc0db 100644
--- a/History.txt
+++ b/History.txt
@@ -1,9 +1,47 @@
+=== 2.1.0 / 2017-08-10
+ * Update documentation to reflect methods that have been removed (@lokeshh)
+ * Update daru dependency to v0.1.6 (@lokeshh)
+ * Remove pre-daru legacy methods like n_valid, missing value functions (@lokeshh)
+ * Update test suite with rubocop and rake. New tests for methods like Regression (@lokeshh)
+ * Introduce fitting a regression using string formulas (@lokeshh)
+
+=== 2.0.2 / 2016-03-11
+ * Update dependencies (spreadsheet, GSL)
+
+=== 2.0.1 / 2015-08-19
+ * Cleaned legacy containers in favor of `Daru::DataFrame` and `Daru::Vector`.
+
+=== 2.0.0 / 2015-06-20
+ * Added dependency on daru and replaced Statsample::Vector and Dataset with
+ Daru::Vector and Daru::DataFrame.
+ * NMatrix and gsl-nmatrix are used as development dependencies.
+
+=== 1.5.0 / 2015-06-11
+ * Made sure all methods work properly with and without GSL.
+ * Statsample works with either rb-gsl or gsl-nmatrix.
+ * Changed the data types of Statsample::Vector from :ordinal, :scale and
+ :nominal to only :numeric and :object. :numeric replaces :ordinal/:scale
+ and :object replaces :nominal. Methods for creating the older data types still
+ exist, but throw a warning prodding the user to use the new methods.
+
+=== 1.4.3 / 2015-04-27
+ * Removed rb-gsl dependency.
+
+=== 1.4.2 / 2015-04-07
+ * Statsample::CSV.read accepts numbers in scientific notation.
+ * Test on Ruby 2.2 via Travis CI.
+
+=== 1.4.1 / 2015-03-26
+ * Removed Hoe gem in order to use `statsample.gemspec`.
+ * Improved readability of some files by using rubocop.
+ * Removed a bad check in `cronbach_alpha` (#10).
+
=== 1.4.0 / 2014-10-11
* Replaced README.txt for README.md
* Replace File.exists? for File.exist?
+ New Dataset.join to join two dataset based on some fields
* Deleted MLE based regression (Probit and logistic). Now all GML methods are on statsample-glm
-
+
=== 1.3.1 / 2014-06-26
* Example referred to a SimpleRegression class which doesn't exist. Updated to working example.
@@ -23,7 +61,7 @@
* open svg on mac osx
=== 1.2.0 / 2011-12-15
-
+
* Added support for time series (TimeSeries object): MA, EMA, MACD, acf, lag and delta. [Rob Britton]
* Changed summary attribute to properly display 'b' value for simple linear regression [hstove]
* Merge pull request #6 from hstove/patch-1Changed summary attribute to properly display 'b' value for simple linear regression [Claudio Bustos]
@@ -34,9 +72,9 @@
* New Statsample::Anova::Contrast
* Jacknife and bootstrap for Vector. Thanks to John Firebaugh for the idea
* Improved Statsample::Analysis API
-* Updated CSV.read. Third argument is a Hash with options to CSV class
+* Updated CSV.read. Third argument is a Hash with options to CSV class
* Added restriction on Statsample::Excel.read
-* Updated spanish po
+* Updated spanish po
* Better summary for Vector
* Improving summary of t related test (confidence interval and estimate output)
* Replaced c for vector on Statsample::Analysis examples
@@ -51,7 +89,7 @@
=== 1.0.0 / 2011-01-27
* Added Statsample::Analysis, a beautiful DSL to perform fast statistical analysis using statsample. See directory /examples
-* Created benchmarks directory
+* Created benchmarks directory
* Removed Distribution module from statsample and moved to a gem. Changes on code to reflect new API
* Optimized simple regression. Better library detection
* New 'should_with_gsl' to test methods with gsl. Refactored Factor::MAP
@@ -62,17 +100,17 @@
* Modified examples using Statsample::Analysis
* Simplified eigen calculations
* Updated some examples. Added correlation matrix speed suite
-* Correlation matrix optimized. Better specs
-* Optimized correlation matrix. Use gsl matrix algebra or pairwise correlations depending on empiric calculated equations. See benchmarks/correlation_matrix.rb to see implementation of calculation
+* Correlation matrix optimized. Better specs
+* Optimized correlation matrix. Use gsl matrix algebra or pairwise correlations depending on empiric calculated equations. See benchmarks/correlation_matrix.rb to see implementation of calculation
* Moved tests fixtures from data to test/fixtures
* Fixed some errors on tests
-* Bug fix: constant_se on binomial regression have an error
-* All test should work on ruby 1.9.3
+* Bug fix: constant_se on binomial regression have an error
+* All test should work on ruby 1.9.3
* New Vector.[] and Vector.new_scale
-* Detect linearly dependent predictors on OLS.
+* Detect linearly dependent predictors on OLS.
=== 0.18.0 / 2011-01-07
-* New Statsample.load_excel
+* New Statsample.load_excel
* New Statsample.load_csv
* Statsample::Dataset#[] accepts an array of fields and uses clone
* New Dataset#correlation_matrix and Statsample::Dataset#covariance_matrix
@@ -83,19 +121,19 @@
* Improved summary for PCA using covariance matrix
* New attribute :label_angle for Statsample::Graph::Boxplot
* Fixed Scatterplots scaling problems
-* New attributes for Scatterplots: groups, minimum_x, minimum_y, maximum_x,
+* New attributes for Scatterplots: groups, minimum_x, minimum_y, maximum_x,
* New Statsample::Multiset#union allows to create a new dataset based on a m
* New Statsample::Multiset#each to traverse through datasets
* Bug fix: Vector#standarized and Vector#percentile crash on nil data
* Bug fix: Vector#mean and Vector#sd crash on data without valid values
* Modified methods names on Statsample::Factor::PCA : feature_vector to feature_matrix, data_transformation to principal_components
* Added Statsample::Vector.vector_centered
-* Factor::MAP.with_dataset() implemented
-* Bug fix: Factor::MAP with correlation matrix with non-real eigenvalues crashes * Added documentation for Graph::Histogram
+* Factor::MAP.with_dataset() implemented
+* Bug fix: Factor::MAP with correlation matrix with non-real eigenvalues crashes * Added documentation for Graph::Histogram
* Added MPA to Reliability::MultiScaleAnalysis
-* Added custom names for returned vectors and datasets
-* Updated spanish traslation
-* Graph::Histogram updated. Custom x and y max and min, optional normal distribution drawing
+* Added custom names for returned vectors and datasets
+* Updated spanish traslation
+* Graph::Histogram updated. Custom x and y max and min, optional normal distribution drawing
* Updated Histogram class, with several new methods compatibles with GSL::Histogram
=== 0.17.0 / 2010-12-09
@@ -106,18 +144,18 @@
=== 0.16.0 / 2010-11-13
* Works on ruby 1.9.2 and HEAD. Updated Rakefile and manifest
-* Removed all graph based on Svg::Graph.
+* Removed all graph based on Svg::Graph.
* First operative version of Graph with Rubyvis
-* Corrected bug on Distribution::Normal.cdf.
+* Corrected bug on Distribution::Normal.cdf.
* Added reference on references.txt
* Ruby-based random gaussian distribution generator when gsl not available
* Added population average deviation [Al Chou]
=== 0.15.1 / 2010-10-20
-* Statsample::Excel and Statsample::PlainText add name to vectors equal to field name
+* Statsample::Excel and Statsample::PlainText add name to vectors equal to field name
* Statsample::Dataset.delete_vector accept multiple fields.
-* Statsample::Dataset.dup_only_valid allows duplication of specific fields
-* ScaleAnalysis doesn't crash on one-item scales
+* Statsample::Dataset.dup_only_valid allows duplication of specific fields
+* ScaleAnalysis doesn't crash on one-item scales
* Updated references
=== 0.15.0 / 2010-09-07
@@ -126,14 +164,14 @@
* Added Spearman-Brown prophecy on Reliability module
* Distribution::F uses Gsl when available
* Added mean r.p.b. and item sd on Scale Analysis
-* Corrected bug on Vector.ary_method and example of Anova Two Way using vector.
+* Corrected bug on Vector.ary_method and example of Anova Two Way using vector.
=== 0.14.1 / 2010-08-18
-* Added extra information on $DEBUG=true.
-* Changed ParallelAnalysis: with_random_data parameters, bootstrap_method options are data and random, resolve bug related to number of factors to preserve, resolved bug related to original eigenvalues, can support failed bootstrap of data for Tetrachoric correlation.
-* Optimized eigenpairs on Matrix when GSL is available.
+* Added extra information on $DEBUG=true.
+* Changed ParallelAnalysis: with_random_data parameters, bootstrap_method options are data and random, resolve bug related to number of factors to preserve, resolved bug related to original eigenvalues, can support failed bootstrap of data for Tetrachoric correlation.
+* Optimized eigenpairs on Matrix when GSL is available.
* Added test for parallel analysis using data bootstraping
* Updated .pot and Manifest.txt
* Added test for kmo(global and univariate), bartlett and anti-image. Kmo and Bartlett have test based on Dziuban and Shirkey with correct results
@@ -142,16 +180,16 @@
* Added reference for Statsample::Factor::MAP
=== 0.14.0 / 2010-08-16
-* Added Statsample::Factor::MAP, to execute Velicer's (1976) MAP to determine the number of factors to retain on EFA
+* Added Statsample::Factor::MAP, to execute Velicer's (1976) MAP to determine the number of factors to retain on EFA
* Bug fix on test suite on Ruby 1.8.7
* Horn's Parallel Analysis operational and tested for pure random data
-* Fixed bug on Excel writer on Ruby1.9 (frozen string on header raises an error).
+* Fixed bug on Excel writer on Ruby1.9 (frozen string on header raises an error).
* Extra information on Factorial Analysis on summaries
-* Fixed bug on Factor::Rotation when used ::Matrix without field method.
+* Fixed bug on Factor::Rotation when used ::Matrix without field method.
* Added Vector#vector_percentil method
-* Summaries for PCA, Rotation, MultiScale and ScaleAnalysis created or improved.
+* Summaries for PCA, Rotation, MultiScale and ScaleAnalysis created or improved.
* Factor::PCA could have rotation and parallel analysis on summary.
-* Cronbach's alpha from covariance matrix raise an error on size<2
+* Cronbach's alpha from covariance matrix raise an error on size<2
* MultiScaleAnalysis could have Parallel Analysis on summary.
* Added Chi Square test
* Added new information on README.txt
@@ -168,7 +206,7 @@
* Polychoric and Tetrachoric moved to gem statsample-bivariate-extension
* All classes left with summary method include Summarizable now. Every method which return localizable string is now parsed with _()
-* Correct implementation of Reliability::MultiScaleAnalysis.
+* Correct implementation of Reliability::MultiScaleAnalysis.
* Spanish translation for Mann-Whitney's U
* Added example for Mann-Whitney's U test
* Better summary for Mann-Whitney's U Test
@@ -179,10 +217,10 @@
* Modified Rakefile to remove dependencies based on C extensions. These are moved to statsample-optimization
* T test with unequal variance fixed on i686
-* API Change: Renamed Reliability::ItemAnalysis and moved to independent file
+* API Change: Renamed Reliability::ItemAnalysis and moved to independent file
* New Reliability::MultiScaleAnalysis for easy analysis of scales on a same survey, includind reliability, correlation matrix and Factor Analysis
* Updated README to reflect changes on Reliability module
-* SvgGraph works with reportbuilder.
+* SvgGraph works with reportbuilder.
* Added methods on Polychoric based on Olsson(1979): the idea is estimate using second derivatives.
* Distribution test changed (reduced precision on 32 bits system
@@ -196,7 +234,7 @@
New features:
* Added Statsample::Anova::TwoWay and Statsample::Anova::TwoWayWithVectors
* Added Statsample.clone_only valid and Statsample::Dataset.clone_only_valid, for cheap copy on already clean vectors
-Optimizations and bug fix
+Optimizations and bug fix
* Removed library statistics2 from package. Used gem statistics2 instead, because have a extension version
* Added example for Reliability class
* Bug fix on Statsample::DominanceAnalysis
@@ -204,7 +242,7 @@
=== 0.10.0 / 2010-04-13
API modifications
-* Refactoring of Statsample::Anova module.
+* Refactoring of Statsample::Anova module.
* Statsample::Anova::OneWay :implementation of generic ANOVA One-Way, used by Multiple Regression, for example.
* Statsample::Anova::OneWayWithVectors: implementation of ANOVA One-Way to test differences of means.
@@ -228,7 +266,7 @@
=== 0.8.1 / 2010-03-29
* Fixed Regression summaries
=== 0.8.0 / 2010-03-29
-* New Statsample::Test::T module, with classes and methods to do Student's t tests for one and two samples.
+* New Statsample::Test::T module, with classes and methods to do Student's t tests for one and two samples.
* Statsample::PromiseAfter module to set a number of variables without explicitly call the compute or iterate method
* All tests ported to MiniUnit
* Directory 'demo' renamed to 'examples'
@@ -266,7 +304,7 @@
=== 0.6.4 / 2010-02-19
-* Dominance Analysis and Dominance Analysis Bootstrap allows multivariate dependent analysis.
+* Dominance Analysis and Dominance Analysis Bootstrap allows multivariate dependent analysis.
* Test suite for Dominance Analysis, using Azen and Budescu papers as references
* X^2 for polychoric correlation
@@ -285,12 +323,12 @@
* New Statsample::Factor module. Include classes for extracting factors (Statsample::Factor::PCA and Statsample::Factor::PrincipalAxis) and rotate component matrix ( Statsample::Factor::Rotation subclasses). For now, only orthogonal rotations
* New Statsample::Dataset.crosstab_with_asignation, Statsample::Dataset.one_to_many
* New class Statsample::Permutation to produce permutations of a given array
-* New class Statsample::Histogram, with same interface as GSL one
+* New class Statsample::Histogram, with same interface as GSL one
* New class Statsample::Test::UMannWhitney, to perform Mann-Whitney's U test. Gives z based and exact calculation of probability
* Improved support for ReportBuilder
* Statsample::Codification module reworked
* Fixed bugs on Dominance Analysis classes
-* Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
+* Fixed bugs on Statsample::Vector.kurtosis and Statsample::Vector.skew
=== 0.5.1 / 2009-10-06
@@ -354,15 +392,15 @@
* One Way Anova on Statsample::Anova::OneWay
* Dominance Analysis!!!! The one and only reason to develop a Multiple Regression on pure ruby.
-* Multiple Regression on Multiple Regression module. Pairwise (pure ruby) or MultipleRegressionPairwise and Listwise (optimized) on MultipleRegressionAlglib and
+* Multiple Regression on Multiple Regression module. Pairwise (pure ruby) or MultipleRegressionPairwise and Listwise (optimized) on MultipleRegressionAlglib and
* New Dataset#to_gsl_matrix, #from_to,#[..],#bootstrap,#vector_missing_values, #vector_count_characters, #each_with_index, #collect_with_index
* New Vector#box_cox_transformation
* Module Correlation renamed to Bivariate
* Some fancy methods and classes to create Summaries
* Some documentation about Algorithm used on doc_latex
* Deleted 'distributions' extension. Ruby/GSL has all the pdf and cdf you ever need.
-* Tests work without any dependency. Only nags about missing deps.
-* Test for MultipleRegression, Anova, Excel, Bivariate.correlation_matrix and many others
+* Tests work without any dependency. Only nags about missing deps.
+* Test for MultipleRegression, Anova, Excel, Bivariate.correlation_matrix and many others
=== 0.1.9 / 2009-05-22
@@ -372,8 +410,8 @@
* Module SRS: New methods estimation_n0 and estimation_n
* Module Reliability: new ItemCharacteristicCurve class
* New HtmlReport class
-* New experimental SPSS Class.
-* Converters: Module CSV with new options. Added write() method for GGobi module
+* New experimental SPSS Class.
+* Converters: Module CSV with new options. Added write() method for GGobi module
* New Mx exporter (http://www.vcu.edu/mx/)
* Class SimpleRegression: new methods standard error
@@ -404,7 +442,7 @@
=== 0.1.4 / 2008-08-27
* New extension, with cdf functions for
- chi-square, t, gamma and normal distributions.
+ chi-square, t, gamma and normal distributions.
Based on dcdflib (http://www.netlib.org/random/)
Also, has a function to calculate the tail for a noncentral T distribution
diff --git a/LICENSE.txt b/LICENSE.txt
index 9d0b178..6886323 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2009-2014, Claudio Bustos
+Copyright (c) 2009-2015, Claudio Bustos
All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/Manifest.txt b/Manifest.txt
deleted file mode 100644
index f465a24..0000000
--- a/Manifest.txt
+++ /dev/null
@@ -1,157 +0,0 @@
-.travis.yml
-Gemfile
-Gemfile.lock
-History.txt
-LICENSE.txt
-Manifest.txt
-README.md
-Rakefile
-benchmarks/correlation_matrix_15_variables.rb
-benchmarks/correlation_matrix_5_variables.rb
-benchmarks/correlation_matrix_methods/correlation_matrix.ds
-benchmarks/correlation_matrix_methods/correlation_matrix.html
-benchmarks/correlation_matrix_methods/correlation_matrix.rb
-benchmarks/correlation_matrix_methods/correlation_matrix.xls
-benchmarks/correlation_matrix_methods/correlation_matrix_gsl_ruby.ods
-benchmarks/correlation_matrix_methods/correlation_matrix_with_graphics.ods
-benchmarks/correlation_matrix_methods/results.ds
-benchmarks/factor_map.rb
-benchmarks/helpers_benchmark.rb
-data/locale/es/LC_MESSAGES/statsample.mo
-doc_latex/manual/equations.tex
-examples/boxplot.rb
-examples/correlation_matrix.rb
-examples/dataset.rb
-examples/dominance_analysis.rb
-examples/dominance_analysis_bootstrap.rb
-examples/histogram.rb
-examples/icc.rb
-examples/levene.rb
-examples/multiple_regression.rb
-examples/multivariate_correlation.rb
-examples/parallel_analysis.rb
-examples/polychoric.rb
-examples/principal_axis.rb
-examples/reliability.rb
-examples/scatterplot.rb
-examples/t_test.rb
-examples/tetrachoric.rb
-examples/u_test.rb
-examples/vector.rb
-examples/velicer_map_test.rb
-grab_references.rb
-lib/spss.rb
-lib/statsample.rb
-lib/statsample/analysis.rb
-lib/statsample/analysis/suite.rb
-lib/statsample/analysis/suitereportbuilder.rb
-lib/statsample/anova.rb
-lib/statsample/anova/contrast.rb
-lib/statsample/anova/oneway.rb
-lib/statsample/anova/twoway.rb
-lib/statsample/bivariate.rb
-lib/statsample/bivariate/pearson.rb
-lib/statsample/codification.rb
-lib/statsample/converter/csv.rb
-lib/statsample/converter/spss.rb
-lib/statsample/converters.rb
-lib/statsample/crosstab.rb
-lib/statsample/dataset.rb
-lib/statsample/dominanceanalysis.rb
-lib/statsample/dominanceanalysis/bootstrap.rb
-lib/statsample/factor.rb
-lib/statsample/factor/map.rb
-lib/statsample/factor/parallelanalysis.rb
-lib/statsample/factor/pca.rb
-lib/statsample/factor/principalaxis.rb
-lib/statsample/factor/rotation.rb
-lib/statsample/graph.rb
-lib/statsample/graph/boxplot.rb
-lib/statsample/graph/histogram.rb
-lib/statsample/graph/scatterplot.rb
-lib/statsample/histogram.rb
-lib/statsample/matrix.rb
-lib/statsample/multiset.rb
-lib/statsample/regression.rb
-lib/statsample/regression/multiple.rb
-lib/statsample/regression/multiple/alglibengine.rb
-lib/statsample/regression/multiple/baseengine.rb
-lib/statsample/regression/multiple/gslengine.rb
-lib/statsample/regression/multiple/matrixengine.rb
-lib/statsample/regression/multiple/rubyengine.rb
-lib/statsample/regression/simple.rb
-lib/statsample/reliability.rb
-lib/statsample/reliability/icc.rb
-lib/statsample/reliability/multiscaleanalysis.rb
-lib/statsample/reliability/scaleanalysis.rb
-lib/statsample/reliability/skillscaleanalysis.rb
-lib/statsample/resample.rb
-lib/statsample/rserve_extension.rb
-lib/statsample/shorthand.rb
-lib/statsample/srs.rb
-lib/statsample/test.rb
-lib/statsample/test/bartlettsphericity.rb
-lib/statsample/test/chisquare.rb
-lib/statsample/test/f.rb
-lib/statsample/test/kolmogorovsmirnov.rb
-lib/statsample/test/levene.rb
-lib/statsample/test/t.rb
-lib/statsample/test/umannwhitney.rb
-lib/statsample/test/wilcoxonsignedrank.rb
-lib/statsample/vector.rb
-lib/statsample/vector/gsl.rb
-lib/statsample/version.rb
-po/es/statsample.mo
-po/es/statsample.po
-po/statsample.pot
-references.txt
-setup.rb
-test/fixtures/bank2.dat
-test/fixtures/correlation_matrix.rb
-test/fixtures/hartman_23.matrix
-test/fixtures/repeated_fields.csv
-test/fixtures/stock_data.csv
-test/fixtures/test_csv.csv
-test/fixtures/test_xls.xls
-test/fixtures/tetmat_matrix.txt
-test/fixtures/tetmat_test.txt
-test/helpers_tests.rb
-test/test_analysis.rb
-test/test_anova_contrast.rb
-test/test_anovaoneway.rb
-test/test_anovatwoway.rb
-test/test_anovatwowaywithdataset.rb
-test/test_anovawithvectors.rb
-test/test_bartlettsphericity.rb
-test/test_bivariate.rb
-test/test_codification.rb
-test/test_crosstab.rb
-test/test_csv.rb
-test/test_dataset.rb
-test/test_dominance_analysis.rb
-test/test_factor.rb
-test/test_factor_map.rb
-test/test_factor_pa.rb
-test/test_ggobi.rb
-test/test_gsl.rb
-test/test_histogram.rb
-test/test_matrix.rb
-test/test_multiset.rb
-test/test_regression.rb
-test/test_reliability.rb
-test/test_reliability_icc.rb
-test/test_reliability_skillscale.rb
-test/test_resample.rb
-test/test_rserve_extension.rb
-test/test_srs.rb
-test/test_statistics.rb
-test/test_stest.rb
-test/test_stratified.rb
-test/test_test_f.rb
-test/test_test_kolmogorovsmirnov.rb
-test/test_test_t.rb
-test/test_umannwhitney.rb
-test/test_vector.rb
-test/test_wilcoxonsignedrank.rb
-test/test_xls.rb
-web/Rakefile
diff --git a/README.md b/README.md
index 8c8151d..cadaeae 100644
--- a/README.md
+++ b/README.md
@@ -1,192 +1,174 @@
# Statsample
-Homepage :: https://github.com/clbustos/statsample
-
-[](https://travis-ci.org/clbustos/statsample)
+[](https://travis-ci.org/SciRuby/statsample)
+[](https://codeclimate.com/github/SciRuby/statsample)
[](http://badge.fury.io/rb/statsample)
-## DESCRIPTION
-A suite for basic and advanced statistics on Ruby. Tested on Ruby 2.1.1p76 (June 2014), 1.8.7, 1.9.1, 1.9.2 (April, 2010), ruby-head(June, 2011) and JRuby 1.4 (Ruby 1.8.7 compatible).
+Homepage :: https://github.com/sciruby/statsample
-Include:
-* Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others).
-* Imports and exports datasets from and to Excel, CSV and plain text files.
-* Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlation provides by +statsample-bivariate-extension+ gem.
-* Intra-class correlation
-* Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
-* Tests: F, T, Levene, U-Mannwhitney.
-* Regression: Simple, Multiple (OLS), Probit and Logit
-* Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors.
-* Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
-* Basic time series support
-* Dominance Analysis, with multivariate dependent and bootstrap (Azen & Budescu)
-* Sample calculation related formulas
-* Structural Equation Modeling (SEM), using R libraries +sem+ and +OpenMx+
-* Creates reports on text, html and rtf, using ReportBuilder gem
-* Graphics: Histogram, Boxplot and Scatterplot
+# Installation
-## Principles
+You should have a recent version of GSL and R (with the `irr` and `Rserve` libraries) installed. In Ubuntu:
-* Software Design:
- * One module/class for each type of analysis
- * Options can be set as hash on initialize() or as setters methods
- * Clean API for interactive sessions
- * summary() returns all necessary informacion for interactive sessions
- * All statistical data available though methods on objects
- * All (important) methods should be tested. Better with random data.
-* Statistical Design
- * Results are tested against text results, SPSS and R outputs.
- * Go beyond Null Hiphotesis Testing, using confidence intervals and effect sizes when possible
- * (When possible) All references for methods are documented, providing sensible information on documentation
-
-## Features
-
-* Classes for manipulation and storage of data:
- * Statsample::Vector: An extension of an array, with statistical methods like sum, mean and standard deviation
- * Statsample::Dataset: a group of Statsample::Vector, analog to a excel spreadsheet or a dataframe on R. The base of almost all operations on statsample.
- * Statsample::Multiset: multiple datasets with same fields and type of vectors
-* Anova module provides generic Statsample::Anova::OneWay and vector based Statsample::Anova::OneWayWithVectors. Also you can create contrast using Statsample::Anova::Contrast
-* Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
-* Multiple types of regression.
- * Simple Regression : Statsample::Regression::Simple
- * Multiple Regression: Statsample::Regression::Multiple
- * Logit Regression: Statsample::Regression::Binomial::Logit
- * Probit Regression: Statsample::Regression::Binomial::Probit
-* Factorial Analysis algorithms on Statsample::Factor module.
- * Classes for Extraction of factors:
- * Statsample::Factor::PCA
- * Statsample::Factor::PrincipalAxis
- * Classes for Rotation of factors:
- * Statsample::Factor::Varimax
- * Statsample::Factor::Equimax
- * Statsample::Factor::Quartimax
- * Classes for calculation of factors to retain
- * Statsample::Factor::ParallelAnalysis performs Horn's 'parallel analysis' to a principal components analysis to adjust for sample bias in the retention of components.
- * Statsample::Factor::MAP performs Velicer's Minimum Average Partial (MAP) test, which retain components as long as the variance in the correlation matrix represents systematic variance.
-* Dominance Analysis. Based on Budescu and Azen papers, dominance analysis is a method to analyze the relative importance of one predictor relative to another on multiple regression
- * Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables
- * Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recomended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
-* Module Statsample::Codification, to help to codify open questions
-* Converters to import and export data:
- * Statsample::Database : Can create sql to create tables, read and insert data
- * Statsample::CSV : Read and write CSV files
- * Statsample::Excel : Read and write Excel files
- * Statsample::Mx : Write Mx Files
- * Statsample::GGobi : Write Ggobi files
-* Module Statsample::Crosstab provides function to create crosstab for categorical data
-* Module Statsample::Reliability provides functions to analyze scales with psychometric methods.
- * Class Statsample::Reliability::ScaleAnalysis provides statistics like mean, standard deviation for a scale, Cronbach's alpha and standarized Cronbach's alpha, and for each item: mean, correlation with total scale, mean if deleted, Cronbach's alpha is deleted.
- * Class Statsample::Reliability::MultiScaleAnalysis provides a DSL to easily analyze reliability of multiple scales and retrieve correlation matrix and factor analysis of them.
- * Class Statsample::Reliability::ICC provides intra-class correlation, using Shrout & Fleiss(1979) and McGraw & Wong (1996) formulations.
-* Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several type of samples
-* Module Statsample::Test provides several methods and classes to perform inferencial statistics
- * Statsample::Test::BartlettSphericity
- * Statsample::Test::ChiSquare
- * Statsample::Test::F
- * Statsample::Test::KolmogorovSmirnov (only D value)
- * Statsample::Test::Levene
- * Statsample::Test::UMannWhitney
- * Statsample::Test::T
- * Statsample::Test::WilcoxonSignedRank
-* Module Graph provides several classes to create beautiful graphs using rubyvis
- * Statsample::Graph::Boxplot
- * Statsample::Graph::Histogram
- * Statsample::Graph::Scatterplot
-* Gem bio-statsample-timeseries provides module Statsample::TimeSeries with support for time series, including ARIMA estimation using Kalman-Filter.
-* Gem statsample-sem provides a DSL to R libraries +sem+ and +OpenMx+
-* Gem statsample-glm provides you with GML method, to work with Logistic, Poisson and Gaussian regression ,using ML or IRWLS.
-* Close integration with gem reportbuilder, to easily create reports on text, html and rtf formats.
-
-# Examples of use:
-
-See the [examples folder](https://github.com/clbustos/statsample/tree/master/examples/) too.
-
-## Boxplot
-
-```ruby
-require 'statsample'
-
-ss_analysis(Statsample::Graph::Boxplot) do
- n=30
- a=rnorm(n-1,50,10)
- b=rnorm(n, 30,5)
- c=rnorm(n,5,1)
- a.push(2)
- boxplot(:vectors=>[a,b,c], :width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0)
-end
-Statsample::Analysis.run # Open svg file on *nix application defined
+```bash
+$ sudo apt-get install libgsl0-dev r-base r-base-dev
+$ sudo Rscript -e "install.packages(c('Rserve', 'irr'))"
```
-## Correlation matrix
-
-```ruby
-require 'statsample'
-# Note R like generation of random gaussian variable
-# and correlation matrix
-
-ss_analysis("Statsample::Bivariate.correlation_matrix") do
- samples=1000
- ds=data_frame(
- 'a'=>rnorm(samples),
- 'b'=>rnorm(samples),
- 'c'=>rnorm(samples),
- 'd'=>rnorm(samples))
- cm=cor(ds)
- summary(cm)
-end
-
-Statsample::Analysis.run_batch # Echo output to console
+With these libraries in place, just install from rubygems:
+
+```bash
+$ [sudo] gem install statsample
```
-# Requirements
+On *nix, you should install statsample-optimization to retrieve gems gsl, statistics2 and a C extension to speed some methods.
-Optional:
+```bash
+$ [sudo] gem install statsample-optimization
+```
-* Plotting: gnuplot and rbgnuplot, SVG::Graph
-* Factorial analysis and polychorical correlation(joint estimate and polychoric series): gsl library and rb-gsl (https://rubygems.org/gems/rb-gsl/). You should install it using gem install rb-gsl.
+If you need to work on Structural Equation Modeling, you could see +statsample-sem+. You need R with +sem+ or +OpenMx+ [http://openmx.psyc.virginia.edu/] libraries installed
-*Note*: Use gsl 1.12.109 or later.
+```bash
+$ [sudo] gem install statsample-sem
+```
+# Testing
-# Resources
+See CONTRIBUTING for information on testing and contributing to statsample.
-* Source code on github :: http://github.com/clbustos/statsample
-* Docs :: http://statsample.apsique.cl/
-* Bug report and feature request :: http://github.com/clbustos/statsample/issues
-* E-mailing list :: http://groups.google.com/group/statsample
+# Documentation
-# Installation
+You can see the latest documentation in [rubydoc.info](http://www.rubydoc.info/github/sciruby/statsample/master).
-```bash
-$ sudo gem install statsample
-```
+# Usage
-On *nix, you should install statsample-optimization to retrieve gems gsl, statistics2 and a C extension to speed some methods.
+## Notebooks
-There are available precompiled version for Ruby 1.9 on x86, x86_64 and mingw32 archs.
+You can see some iruby notebooks here:
-```bash
-$ sudo gem install statsample-optimization
-```
+### Statistics
-If you use Ruby 1.8, you should compile statsample-optimization, usign parameter --platform ruby
+* [Correlation Matrix with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Correlation%20Matrix%20with%20daru%20and%20statsample.ipynb)
+* [Dominance Analysis with statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Dominance%20Analysis%20with%20statsample.ipynb)
+* [Reliability ICC](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Reliability%20ICC%20with%20statsample.ipynb)
+* [Levene Test](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Levene%20Test.ipynb)
+* [Multiple Regression](http://nbviewer.ipython.org/github/v0dro/sciruby-notebooks/blob/master/Statistics/Multiple%20Regression.ipynb)
+* [Parallel Analysis on PCA](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Parallel%20Analysis%20on%20PCA.ipynb)
+* [Polychoric Analysis](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Polychoric%20Correlation.ipynb)
+* [Reliability Scale and Multiscale Analysis](https://github.com/SciRuby/sciruby-notebooks/blob/master/Statistics/Reliability%20Scale%20Analysis.ipynb)
+* [Velicer MAP Test](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Statistics/Velicer%20MAP%20test.ipynb)
-```bash
-$ sudo gem install statsample-optimization --platform ruby
-```
+### Visualizations
-If you need to work on Structural Equation Modeling, you could see +statsample-sem+. You need R with +sem+ or +OpenMx+ [http://openmx.psyc.virginia.edu/] libraries installed
+* [Creating Boxplots with daru and statsample](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Boxplot%20with%20daru%20and%20statsample.ipynb)
+* [Creating A Histogram](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Creating%20a%20Histogram.ipynb)
+* [Creating a Scatterplot](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Scatterplot%20with%20statsample.ipynb)
-```bash
-$ sudo gem install statsample-sem
-```
+### Working with DataFrame and Vector
-Available setup.rb file
+* [Creating Vectors and DataFrames with daru](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Creation%20of%20Vector%20and%20DataFrame.ipynb)
+* [Detailed Usage of Daru::Vector](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20Vector.ipynb)
+* [Detailed Usage of Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20DataFrame.ipynb)
+* [Visualizing Data with Daru::DataFrame](http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Visualization/Visualizing%20data%20with%20daru%20DataFrame.ipynb)
-```bash
-sudo gem ruby setup.rb
-```
+## Examples
+
+See the /examples directory for some use cases. The notebooks listed above have mostly
+the same examples, and they look better so you might want to see that first.
+
+# Description
+
+A suite for basic and advanced statistics on Ruby. Tested on CRuby 2.0.0, 2.1.1, 2.2 and 2.3.0 See `.travis.yml` for more information.
+
+Include:
+- Descriptive statistics: frequencies, median, mean, standard error, skew, kurtosis (and many others).
+- Correlations: Pearson's r, Spearman's rank correlation (rho), point biserial, tau a, tau b and gamma. Tetrachoric and Polychoric correlation provides by +statsample-bivariate-extension+ gem.
+- Intra-class correlation
+- Anova: generic and vector-based One-way ANOVA and Two-way ANOVA, with contrasts for One-way ANOVA.
+- Tests: F, T, Levene, U-Mannwhitney.
+- Regression: Simple, Multiple (OLS)
+- Factorial Analysis: Extraction (PCA and Principal Axis), Rotation (Varimax, Equimax, Quartimax) and Parallel Analysis and Velicer's MAP test, for estimation of number of factors.
+- Reliability analysis for simple scale and a DSL to easily analyze multiple scales using factor analysis and correlations, if you want it.
+- Basic time series support
+- Dominance Analysis, with multivariate dependent and bootstrap (Azen & Budescu)
+- Sample calculation related formulas
+- Structural Equation Modeling (SEM), using R libraries +sem+ and +OpenMx+
+- Creates reports on text, html and rtf, using ReportBuilder gem
+- Graphics: Histogram, Boxplot and Scatterplot
+
+## Principles
+
+- Software Design:
+ - One module/class for each type of analysis
+ - Options can be set as hash on initialize() or as setters methods
+ - Clean API for interactive sessions
+ - summary() returns all necessary informacion for interactive sessions
+ - All statistical data available though methods on objects
+ - All (important) methods should be tested. Better with random data.
+- Statistical Design
+ - Results are tested against text results, SPSS and R outputs.
+ - Go beyond Null Hiphotesis Testing, using confidence intervals and effect sizes when possible
+ - (When possible) All references for methods are documented, providing sensible information on documentation
+
+# Features
+
+- Classes for manipulation and storage of data:
+ - Uses [daru](https://github.com/v0dro/daru) for storing data and basic statistics.
+ - Statsample::Multiset: multiple datasets with same fields and type of vectors
+- Anova module provides generic Statsample::Anova::OneWay and vector based Statsample::Anova::OneWayWithVectors. Also you can create contrast using Statsample::Anova::Contrast
+- Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
+- Multiple types of regression.
+ - Simple Regression : Statsample::Regression::Simple
+ - Multiple Regression: Statsample::Regression::Multiple
+- Factorial Analysis algorithms on Statsample::Factor module.
+ - Classes for Extraction of factors:
+ - Statsample::Factor::PCA
+ - Statsample::Factor::PrincipalAxis
+ - Classes for Rotation of factors:
+ - Statsample::Factor::Varimax
+ - Statsample::Factor::Equimax
+ - Statsample::Factor::Quartimax
+ - Classes for calculation of factors to retain
+ - Statsample::Factor::ParallelAnalysis performs Horn's 'parallel analysis' to a principal components analysis to adjust for sample bias in the retention of components.
+ - Statsample::Factor::MAP performs Velicer's Minimum Average Partial (MAP) test, which retain components as long as the variance in the correlation matrix represents systematic variance.
+- Dominance Analysis. Based on Budescu and Azen papers, dominance analysis is a method to analyze the relative importance of one predictor relative to another on multiple regression
+ - Statsample::DominanceAnalysis class can report dominance analysis for a sample, using uni or multivariate dependent variables
+ - Statsample::DominanceAnalysis::Bootstrap can execute bootstrap analysis to determine dominance stability, as recomended by Azen & Budescu (2003) link[http://psycnet.apa.org/journals/met/8/2/129/].
+- Module Statsample::Codification, to help to codify open questions
+- Converters to export data:
+ - Statsample::Mx : Write Mx Files
+ - Statsample::GGobi : Write Ggobi files
+- Module Statsample::Crosstab provides function to create crosstab for categorical data
+- Module Statsample::Reliability provides functions to analyze scales with psychometric methods.
+ - Class Statsample::Reliability::ScaleAnalysis provides statistics like mean, standard deviation for a scale, Cronbach's alpha and standarized Cronbach's alpha, and for each item: mean, correlation with total scale, mean if deleted, Cronbach's alpha is deleted.
+ - Class Statsample::Reliability::MultiScaleAnalysis provides a DSL to easily analyze reliability of multiple scales and retrieve correlation matrix and factor analysis of them.
+ - Class Statsample::Reliability::ICC provides intra-class correlation, using Shrout & Fleiss(1979) and McGraw & Wong (1996) formulations.
+- Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several type of samples
+- Module Statsample::Test provides several methods and classes to perform inferencial statistics
+ - Statsample::Test::BartlettSphericity
+ - Statsample::Test::ChiSquare
+ - Statsample::Test::F
+ - Statsample::Test::KolmogorovSmirnov (only D value)
+ - Statsample::Test::Levene
+ - Statsample::Test::UMannWhitney
+ - Statsample::Test::T
+ - Statsample::Test::WilcoxonSignedRank
+- Module Graph provides several classes to create beautiful graphs using rubyvis
+ - Statsample::Graph::Boxplot
+ - Statsample::Graph::Histogram
+ - Statsample::Graph::Scatterplot
+- Gem bio-statsample-timeseries provides module Statsample::TimeSeries with support for time series, including ARIMA estimation using Kalman-Filter.
+- Gem statsample-sem provides a DSL to R libraries +sem+ and +OpenMx+
+- Gem statsample-glm provides you with GML method, to work with Logistic, Poisson and Gaussian regression ,using ML or IRWLS.
+- Close integration with gem reportbuilder, to easily create reports on text, html and rtf formats.
+
+# Resources
+
+- Source code on github :: http://github.com/sciruby/statsample
+- Bug report and feature request :: http://github.com/sciruby/statsample/issues
+- E-mailing list :: https://groups.google.com/forum/#!forum/sciruby-dev
-## License
+# License
BSD-3 (See LICENSE.txt)
diff --git a/Rakefile b/Rakefile
index d4e23b9..4d78a8f 100644
--- a/Rakefile
+++ b/Rakefile
@@ -1,32 +1,31 @@
-#!/usr/bin/ruby
-# -*- ruby -*-
-# -*- coding: utf-8 -*-
-$:.unshift(File.dirname(__FILE__)+'/lib/')
+$:.unshift File.expand_path("../lib/", __FILE__)
+lib_folder = File.expand_path("../lib", __FILE__)
-require 'rubygems'
-require 'statsample'
-require 'hoe'
-require 'rdoc'
+require 'statsample/version'
+require 'rake'
+require 'rake/testtask'
+require 'rdoc/task'
+require 'bundler/gem_tasks'
-Hoe.plugin :git
-Hoe.plugin :doofus
-desc "Ruby Lint"
-task :lint do
- executable=Config::CONFIG['RUBY_INSTALL_NAME']
- Dir.glob("lib/**/*.rb") {|f|
- if !system %{#{executable} -w -c "#{f}"}
- puts "Error on: #{f}"
- end
- }
+# Setup the necessary gems, specified in the gemspec.
+require 'bundler'
+begin
+ Bundler.setup(:default, :development)
+rescue Bundler::BundlerError => e
+ $stderr.puts e.message
+ $stderr.puts "Run `bundle install` to install missing gems"
+ exit e.status_code
end
-task :release do
-system %{git push origin master}
+Rake::TestTask.new do |t|
+ t.pattern = "test/test_*.rb"
end
-task "clobber_docs" do
- # Only to omit warnings
+RDoc::Task.new do |rdoc|
+ rdoc.main = "README.md"
+ rdoc.rdoc_files.include("README.md", "lib", "History.txt", "LICENSE.txt", "references.txt")
end
+
desc "Update pot/po files."
task "gettext:updatepo" do
require 'gettext/tools'
@@ -37,83 +36,9 @@ desc "Create mo-files"
task "gettext:makemo" do
require 'gettext/tools'
GetText.create_mofiles()
- # GetText.create_mofiles(true, "po", "locale") # This is for "Ruby on Rails".
-end
-
-h=Hoe.spec('statsample') do
- self.version=Statsample::VERSION
- self.urls=["https://github.com/clbustos/statsample"]
- #self.testlib=:minitest
- self.readme_file = 'README.md'
- self.urls = ['https://github.com/clbustos/statsample']
- self.developer('Claudio Bustos', 'clbustos@gmail.com')
- self.extra_deps << ["spreadsheet","~>0.6"] << ["reportbuilder", "~>1.4"] << ["minimization", "~>0.2.0"] << ["fastercsv", ">0"] << ["dirty-memoize", "~>0.0"] << ["extendmatrix","~>0.3.1"] << ["statsample-bivariate-extension", ">0"] << ["rserve-client"] << ["rubyvis"] << ["distribution"]
-
- self.extra_dev_deps << ["hoe","~>0"] << ["shoulda","~>3"] << ["minitest", "~>2"] << ["gettext", "~>0"] << ["mocha", "~>0"] << ["hoe-git", "~>0"]
-
- self.clean_globs << "test/images/*" << "demo/item_analysis/*" << "demo/Regression"
- self.post_install_message = <<-EOF
-***************************************************
-Thanks for installing statsample.
-
-On *nix, you could install statsample-optimization
-to retrieve gems gsl, statistics2 and a C extension
-to speed some methods.
-
- $ sudo gem install statsample-optimization
-
-On Ubuntu, install build-essential and libgsl0-dev
-using apt-get. Compile ruby 1.8 or 1.9 from
-source code first.
-
- $ sudo apt-get install build-essential libgsl0-dev
-
-
-*****************************************************
- EOF
- self.need_rdoc=false
-end
-
-if Rake.const_defined?(:RDocTask)
-Rake::RDocTask.new(:docs) do |rd|
- rd.main = h.readme_file
- rd.options << '-d' if (`which dot` =~ /\/dot/) unless
- ENV['NODOT'] || Hoe::WINDOZE
- rd.rdoc_dir = 'doc'
-
- rd.rdoc_files.include("lib/**/*.rb")
- rd.rdoc_files += h.spec.extra_rdoc_files
- rd.rdoc_files.reject! {|f| f=="Manifest.txt"}
- title = h.spec.rdoc_options.grep(/^(-t|--title)=?$/).first
- if title then
- rd.options << title
-
- unless title =~ /\=/ then # for ['-t', 'title here']
- title_index = spec.rdoc_options.index(title)
- rd.options << spec.rdoc_options[title_index + 1]
- end
- else
- title = "#{h.name}-#{h.version} Documentation"
- title = "#{h.rubyforge_name}'s " + title if h.rubyforge_name != h.name
- rd.options << '--title' << title
- end
end
+desc 'Run pry'
+task :pry do |task|
+ sh "pry -r #{lib_folder}/statsample.rb"
end
-
-desc 'Publish rdocs with analytics support'
-task :publicar_docs => [:clean] do
-# ruby %{agregar_adsense_a_doc.rb}
- path = File.expand_path("./doc.yaml")
- config = YAML.load(File.read(path))
- host = "#{config["user"]}@#{config["host"]}"
-
- remote_dir = config["dir"]
- local_dir = h.local_rdoc_dir
- Dir.glob(local_dir+"/**/*") {|file|
- sh %{chmod 755 #{file}}
- }
- sh %{rsync #{h.rsync_args} #{local_dir}/ #{host}:#{remote_dir}}
-end
-
-# vim: syntax=Ruby
diff --git a/benchmarks/correlation_matrix_15_variables.rb b/benchmarks/correlation_matrix_15_variables.rb
index 82f56eb..5e6a725 100644
--- a/benchmarks/correlation_matrix_15_variables.rb
+++ b/benchmarks/correlation_matrix_15_variables.rb
@@ -4,7 +4,6 @@
cases=250
vars=20
-
name "gsl matrix based vs. manual ruby correlation matrix (#{vars} vars, #{cases} cases)"
author 'Clbustos'
date '2011-01-18'
@@ -17,10 +16,12 @@
reps 200 #number of repetitions
-ds=vars.times.inject({}) {|ac,v|
-ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()}
-ac
-}.to_dataset
+ds = Daru::DataFrame.new(
+ vars.times.inject({}) do |ac,v|
+ ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()}
+ ac
+ end
+)
measure "Statsample::Bivariate.correlation_matrix_optimized" do
Statsample::Bivariate.correlation_matrix_optimized(ds)
diff --git a/benchmarks/correlation_matrix_5_variables.rb b/benchmarks/correlation_matrix_5_variables.rb
index e84f25c..418ebe4 100644
--- a/benchmarks/correlation_matrix_5_variables.rb
+++ b/benchmarks/correlation_matrix_5_variables.rb
@@ -17,11 +17,12 @@
reps 200 #number of repetitions
-
-ds=vars.times.inject({}) {|ac,v|
-ac["x#{v}"]=Statsample::Vector.new_scale(cases) {rand()}
-ac
-}.to_dataset
+ds = Daru::DataFrame.new(
+ vars.times.inject({}) do |ac,v|
+ ac["x#{v}".to_sym]=Daru::Vector.new_with_size(cases) {rand()}
+ ac
+ end
+)
measure "Statsample::Bivariate.correlation_matrix_optimized" do
Statsample::Bivariate.correlation_matrix_optimized(ds)
diff --git a/benchmarks/correlation_matrix_methods/correlation_matrix.rb b/benchmarks/correlation_matrix_methods/correlation_matrix.rb
index 4f5f842..dfb6add 100644
--- a/benchmarks/correlation_matrix_methods/correlation_matrix.rb
+++ b/benchmarks/correlation_matrix_methods/correlation_matrix.rb
@@ -5,11 +5,13 @@
require 'benchmark'
def create_dataset(vars,cases)
- ran=Distribution::Normal.rng
- ds=vars.times.inject({}) {|ac,v|
- ac["x#{v}"]=Statsample::Vector.new_scale(cases) {ran.call}
- ac
- }.to_dataset
+ ran = Distribution::Normal.rng
+ ds = Daru::DataFrame.new(
+ vars.times.inject({}) do |ac,v|
+ ac["x#{v}".to_sym] = Daru::Vector.new_with_size(cases) {ran.call}
+ ac
+ end
+ )
end
def prediction_pairwise(vars,cases)
@@ -19,19 +21,17 @@ def prediction_optimized(vars,cases)
Statsample::Bivariate.prediction_optimized(vars,cases) / 10
end
-
-
if !File.exists?("correlation_matrix.ds") or File.mtime(__FILE__) > File.mtime("correlation_matrix.ds")
reps=100 #number of repetitions
ds_sizes=[5,10,30,50,100,150,200,500,1000]
ds_vars=[3,4,5,10,20,30,40]
#ds_sizes=[5,10]
#ds_vars=[3,5,20]
-rs=Statsample::Dataset.new(%w{cases vars time_optimized time_pairwise})
+rs = Daru::DataFrame.new({}, order: [:cases, :vars, :time_optimized, :time_pairwise])
ds_sizes.each do |cases|
ds_vars.each do |vars|
- ds=create_dataset(vars,cases)
+ ds = create_dataset(vars,cases)
time_optimized= Benchmark.realtime do
reps.times {
Statsample::Bivariate.correlation_matrix_optimized(ds)
@@ -40,36 +40,32 @@ def prediction_optimized(vars,cases)
end
time_pairwise= Benchmark.realtime do
- reps.times {
- Statsample::Bivariate.correlation_matrix_pairwise(ds)
- }
+ reps.times { Statsample::Bivariate.correlation_matrix_pairwise(ds) }
end
puts "Cases:#{cases}, vars:#{vars} -> opt:%0.3f (%0.3f) | pair: %0.3f (%0.3f)" % [time_optimized, prediction_optimized(vars,cases), time_pairwise, prediction_pairwise(vars,cases)]
- rs.add_case({'cases'=>cases,'vars'=>vars,'time_optimized'=>Math.sqrt(time_optimized*1000),'time_pairwise'=>Math.sqrt(time_pairwise*1000)})
+ rs.add_row(Daru::Vector.new({
+ :cases => cases,
+ :vars => vars,
+ :time_optimized => Math.sqrt(time_optimized*1000),
+ :time_pairwise =>Math.sqrt(time_pairwise*1000)
+ })
+ )
end
- end
-
+ end
else
rs=Statsample.load("correlation_matrix.ds")
end
+rs[:c_v] = rs.collect {|row| row[:cases]*row[:vars]}
-rs.fields.each {|f| rs[f].type=:scale}
-
-rs['c_v']=rs.collect {|row| row['cases']*row['vars']}
-
-rs.update_valid_data
rs.save("correlation_matrix.ds")
Statsample::Excel.write(rs,"correlation_matrix.xls")
+rb = ReportBuilder.new(:name=>"Correlation matrix analysis")
-
-rb=ReportBuilder.new(:name=>"Correlation matrix analysis")
-
-rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_optimized','c_v']],'time_optimized', :digits=>6))
-rb.add(Statsample::Regression.multiple(rs[['cases','vars','time_pairwise','c_v']],'time_pairwise', :digits=>6))
-
+rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_optimized,:c_v],:time_optimized, :digits=>6))
+rb.add(Statsample::Regression.multiple(rs[:cases,:vars,:time_pairwise,:c_v],:time_pairwise, :digits=>6))
rb.save_html("correlation_matrix.html")
diff --git a/examples/boxplot.rb b/examples/boxplot.rb
index 49feeab..ab91a0a 100644
--- a/examples/boxplot.rb
+++ b/examples/boxplot.rb
@@ -1,14 +1,26 @@
#!/usr/bin/ruby
+# == Description
+#
+# This example illustrates how daru, combined with Statsample::Graph::Boxplot
+# can be used for generating box plots of a normally distributed set of data.
+#
+# The 'rnorm' function, defined in statsample/shorthands generates a Daru::Vector
+# object which contains the specified number of random variables in a normal distribution.
+# It uses the 'distribution' gem for this purpose.
+#
+# Create a boxplot of the data by specifying the vectors a, b and c and providing
+# necessary options to Statsample::Graph::Boxplot. The 'boxplot' function is shorthand
+# for calling Statsample::Graph::Boxplot.
$:.unshift(File.dirname(__FILE__)+'/../lib/')
require 'statsample'
Statsample::Analysis.store(Statsample::Graph::Boxplot) do
- n=30
- a=rnorm(n-1,50,10)
- b=rnorm(n, 30,5)
- c=rnorm(n,5,1)
+ n = 30
+ a = rnorm(n-1,50,10)
+ b = rnorm(n, 30,5)
+ c = rnorm(n,5,1)
a.push(2)
+
boxplot(:vectors=>[a,b,c],:width=>300, :height=>300, :groups=>%w{first first second}, :minimum=>0)
-
end
if __FILE__==$0
diff --git a/examples/chisquare_test.rb b/examples/chisquare_test.rb
new file mode 100644
index 0000000..650753c
--- /dev/null
+++ b/examples/chisquare_test.rb
@@ -0,0 +1,23 @@
+#!/usr/bin/ruby
+$:.unshift(File.dirname(__FILE__)+'/../lib')
+require 'statsample'
+
+Statsample::Analysis.store(Statsample::Test::ChiSquare) do
+ # Collect the two vectors with the categorical data (raw number of occurences) into one matrix. Here
+ #--------------------------------------------
+ #| category | observation 1 | observation 2 |
+ #|------------------------------------------|
+ #| A | 100 | 20 |
+ #| B | 50 | 70 |
+ #| C | 30 | 100 |
+ #|------------------------------------------|
+ #
+ m=Matrix[[100, 50, 30],[20, 70, 100]]
+ x_2=Statsample::Test.chi_square(m)
+ # after the test is done, look at the p-value.
+ puts x_2.probability
+end
+
+if __FILE__==$0
+ Statsample::Analysis.run_batch
+end
diff --git a/examples/correlation_matrix.rb b/examples/correlation_matrix.rb
index 844e859..1a8a77e 100644
--- a/examples/correlation_matrix.rb
+++ b/examples/correlation_matrix.rb
@@ -1,15 +1,28 @@
#!/usr/bin/ruby
+
+# == Description
+#
+# Creating and summarizing a correlation matrix with daru and statsample
$:.unshift(File.dirname(__FILE__)+'/../lib/')
-require 'statsample'
+require 'statsample'
Statsample::Analysis.store("Statsample::Bivariate.correlation_matrix") do
+ # Create a Daru::DataFrame containing 4 vectors a, b, c and d.
+ #
+ # Notice that the `clone` option has been set to *false*. This tells Daru
+ # to not clone the Daru::Vectors being supplied by `rnorm`, since it would
+ # be unnecessarily counter productive to clone the vectors once they have
+ # been assigned to the dataframe.
samples=1000
- ds=data_frame(
- 'a'=>rnorm(samples),
- 'b'=>rnorm(samples),
- 'c'=>rnorm(samples),
- 'd'=>rnorm(samples))
- cm=cor(ds)
+ ds = Daru::DataFrame.new({
+ :a => rnorm(samples),
+ :b => rnorm(samples),
+ :c => rnorm(samples),
+ :d => rnorm(samples)
+ }, clone: false)
+
+ # Calculate correlation matrix by calling the `cor` shorthand.
+ cm = cor(ds)
summary(cm)
end
diff --git a/examples/dataset.rb b/examples/dataset.rb
index b993ddc..b7622de 100644
--- a/examples/dataset.rb
+++ b/examples/dataset.rb
@@ -1,13 +1,26 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
+# == Description
+#
+# This example demonstrates creation of basic Vectors and DataFrames.
require 'statsample'
-Statsample::Analysis.store(Statsample::Dataset) do
- samples=1000
- a=Statsample::Vector.new_scale(samples) {r=rand(5); r==4 ? nil: r}
- b=Statsample::Vector.new_scale(samples) {r=rand(5); r==4 ? nil: r}
+Statsample::Analysis.store(Daru::DataFrame) do
+ samples = 1000
- ds={'a'=>a,'b'=>b}.to_dataset
+ # The 'new_with_size' function lets you specify the size of the
+ # vector as the argument and the block specifies how each element
+ # of the vector will be created.
+ a = Daru::Vector.new_with_size(samples) {r=rand(5); r==4 ? nil: r}
+ b = Daru::Vector.new_with_size(samples) {r=rand(5); r==4 ? nil: r}
+
+ # Pass the Daru::Vector objects in a Hash to the DataFrame constructor
+ # to make a DataFrame.
+ #
+ # The *order* option lets you specify the way the vectors in the Hash
+ # will be ordered. Not specifyin this will order vectors in alphabetical
+ # order by default.
+ ds = Daru::DataFrame.new({:a=>a,:b=>b}, order: [:b, :a])
summary(ds)
end
diff --git a/examples/dominance_analysis.rb b/examples/dominance_analysis.rb
index a832a8e..1208343 100644
--- a/examples/dominance_analysis.rb
+++ b/examples/dominance_analysis.rb
@@ -1,9 +1,10 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
+# == Description
+#
+# Dominance Analysis with statsample
require 'statsample'
-
-
Statsample::Analysis.store(Statsample::DominanceAnalysis) do
sample=300
a=rnorm(sample)
@@ -11,17 +12,17 @@
c=rnorm(sample)
d=rnorm(sample)
- ds={'a'=>a,'b'=>b,'cc'=>c,'d'=>d}.to_dataset
+ ds = Daru::DataFrame.new({:a => a,:b => b,:cc => c,:d => d}, clone: false)
attach(ds)
- ds['y']=a*5+b*3+cc*2+d+rnorm(300)
+ ds[:y]=a*5 + b*3 + cc*2 + d + rnorm(300)
cm=cor(ds)
summary(cm)
- lr=lr(ds,'y')
+ lr=lr(ds,:y)
summary(lr)
- da=dominance_analysis(ds,'y')
+ da=dominance_analysis(ds,:y)
summary(da)
- da=dominance_analysis(ds,'y',:name=>"Dominance Analysis using group of predictors", :predictors=>['a', 'b', %w{cc d}])
+ da = dominance_analysis(ds,:y,:name=>"Dominance Analysis using group of predictors", :predictors=>[:a, :b, [:cc, :d]])
summary(da)
end
diff --git a/examples/dominance_analysis_bootstrap.rb b/examples/dominance_analysis_bootstrap.rb
index 6735e9f..c15efdc 100644
--- a/examples/dominance_analysis_bootstrap.rb
+++ b/examples/dominance_analysis_bootstrap.rb
@@ -3,27 +3,26 @@
require 'statsample'
Statsample::Analysis.store(Statsample::DominanceAnalysis::Bootstrap) do
-
sample=300
a=rnorm(sample)
b=rnorm(sample)
c=rnorm(sample)
d=rnorm(sample)
- a.name="a"
- b.name="b"
- c.name="c"
- d.name="d"
+ a.rename :a
+ b.rename :b
+ c.rename :c
+ d.rename :d
- ds={'a'=>a,'b'=>b,'cc'=>c,'d'=>d}.to_dataset
+ ds = Daru::DataFrame.new({:a => a,:b => b,:cc => c,:d => d})
attach(ds)
- ds['y1']=a*5+b*2+cc*2+d*2+rnorm(sample,0,10)
- ds['y2']=a*10+rnorm(sample)
+ ds[:y1] = a*5 + b*2 + cc*2 + d*2 + rnorm(sample,0,10)
+ ds[:y2] = a*10 + rnorm(sample)
- dab=dominance_analysis_bootstrap(ds, ['y1','y2'], :debug=>true)
+ dab=dominance_analysis_bootstrap(ds, [:y1,:y2], :debug=>true)
dab.bootstrap(100,nil)
summary(dab)
- ds2=ds['a'..'y1']
- dab2=dominance_analysis_bootstrap(ds2, 'y1', :debug=>true)
+ ds2=ds[:a..:y1]
+ dab2=dominance_analysis_bootstrap(ds2, :y1, :debug=>true)
dab2.bootstrap(100,nil)
summary(dab2)
end
diff --git a/examples/histogram.rb b/examples/histogram.rb
index ec36e1f..772c69f 100644
--- a/examples/histogram.rb
+++ b/examples/histogram.rb
@@ -1,12 +1,26 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
+
+# == Description
+#
+# This example demonstrates how a histogram can be created
+# with statsample.
+#
+# The 'histogram' function creates a histogram by using the
+# Statsample::Graph::Histogram class. This class accepts data
+# in a Daru::Vector (as created by `rnorm`).
+#
+# A line showing normal distribution can be drawn by setting
+# the `:line_normal_distribution` option to *true*.
+#
+# See this notebook for an illustration:
+# http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/statistics/Creating%20a%20Histogram.ipynb
require 'statsample'
Statsample::Analysis.store(Statsample::Graph::Histogram) do
- histogram(rnorm(3000,0,20))
+ histogram(rnorm(3000,0,20), :line_normal_distribution => true)
end
-
if __FILE__==$0
Statsample::Analysis.run
end
diff --git a/examples/icc.rb b/examples/icc.rb
index b563ae4..1ef3b38 100644
--- a/examples/icc.rb
+++ b/examples/icc.rb
@@ -6,18 +6,17 @@
Statsample::Analysis.store(Statsample::Reliability::ICC) do
size=1000
- a=Statsample::Vector.new_scale(size) {rand(10)}
- b=a.recode{|i|i+rand(4)-2}
- c=a.recode{|i|i+rand(4)-2}
- d=a.recode{|i|i+rand(4)-2}
- @ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
+ a = Daru::Vector.new_with_size(size) {rand(10)}
+ b = a.recode{|i|i+rand(4)-2}
+ c = a.recode{|i|i+rand(4)-2}
+ d = a.recode{|i|i+rand(4)-2}
+ @ds = Daru::DataFrame.new({:a => a,:b => b,:c => c,:d => d})
@icc=Statsample::Reliability::ICC.new(@ds)
summary(@icc)
@icc.type=:icc_3_1
summary(@icc)
@icc.type=:icc_a_k
summary(@icc)
-
end
if __FILE__==$0
diff --git a/examples/levene.rb b/examples/levene.rb
index 8529ee2..fe75e78 100644
--- a/examples/levene.rb
+++ b/examples/levene.rb
@@ -1,15 +1,29 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
+# == Description
+#
+# This example demonstrates how a levene test can be performed by
+# using Daru::Vector and the Statsample::Test::Levene class.
+#
+# Levene's test is an inferential statistic used to assess the
+# equality of variances for a variable calculated for two or more groups.
+#
+# == References
+#
+# http://en.wikipedia.org/wiki/Levene%27s_test
require 'statsample'
Statsample::Analysis.store(Statsample::Test::Levene) do
- a=[1,2,3,4,5,6,7,8,100,10].to_scale
- b=[30,40,50,60,70,80,90,100,110,120].to_scale
+ a = Daru::Vector.new([1,2,3,4,5,6,7,8,100,10])
+ b = Daru::Vector.new([30,40,50,60,70,80,90,100,110,120])
+
+ # The 'levene' function is used as a shorthand
+ # for creating a Statsample::Test::Levene object.
summary(levene([a,b]))
end
if __FILE__==$0
- Statsample::Analysis.run_batch
+ Statsample::Analysis.run_batch
end
diff --git a/examples/multiple_regression.rb b/examples/multiple_regression.rb
index 371be4b..4ae1277 100644
--- a/examples/multiple_regression.rb
+++ b/examples/multiple_regression.rb
@@ -1,15 +1,18 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
+# == Description
+#
+# This example shows how multiple regression can be performed using statsample and daru.
require 'statsample'
Statsample::Analysis.store(Statsample::Regression::Multiple) do
samples=2000
- ds=dataset('a'=>rnorm(samples),'b'=>rnorm(samples),'cc'=>rnorm(samples),'d'=>rnorm(samples))
+ ds=dataset(:a => rnorm(samples),:b => rnorm(samples),:cc => rnorm(samples),:d => rnorm(samples))
attach(ds)
- ds['y']=a*5+b*3+cc*2+d+rnorm(samples)
- summary lr(ds,'y')
+ ds[:y] = a*5+b*3+cc*2+d+rnorm(samples)
+ summary lr(ds,:y)
end
if __FILE__==$0
diff --git a/examples/parallel_analysis.rb b/examples/parallel_analysis.rb
index 0684bda..1020ff0 100644
--- a/examples/parallel_analysis.rb
+++ b/examples/parallel_analysis.rb
@@ -1,6 +1,11 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
+# == Description
+#
+# This example will explain how a parallel analysis can be performed on a PCA.
+# Parallel Analysis helps in determining how many components are to be retained
+# from the PCA.
require 'statsample'
samples=150
variables=30
@@ -8,18 +13,18 @@
Statsample::Analysis.store(Statsample::Factor::ParallelAnalysis) do
rng = Distribution::Normal.rng()
-f1=rnorm(samples)
-f2=rnorm(samples)
-f3=rnorm(samples)
+f1 = rnorm(samples)
+f2 = rnorm(samples)
+f3 = rnorm(samples)
vectors={}
variables.times do |i|
- vectors["v#{i}"]=samples.times.collect {|nv| f1[nv]*i+(f2[nv]*(15-i))+((f3[nv]*(30-i))*1.5)*rng.call}.to_scale
- vectors["v#{i}"].name="Vector #{i}"
+ vectors["v#{i}".to_sym] = Daru::Vector.new(samples.times.collect {|nv| f1[nv]*i+(f2[nv]*(15-i))+((f3[nv]*(30-i))*1.5)*rng.call})
+ vectors["v#{i}".to_sym].rename "Vector #{i}"
end
- ds=vectors.to_dataset
+ ds = Daru::DataFrame.new(vectors)
pa=Statsample::Factor::ParallelAnalysis.new(ds, :iterations=>iterations, :debug=>true)
pca=pca(cor(ds))
diff --git a/examples/polychoric.rb b/examples/polychoric.rb
index ca99e7c..dec2c3f 100644
--- a/examples/polychoric.rb
+++ b/examples/polychoric.rb
@@ -1,26 +1,39 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
-$:.unshift("/home/cdx/usr/lib/statsample-bivariate-extension/lib/")
+# == Description
+# Polychoric Correlation using two-step and joint method
+#
+# Polychoric correlation in statsample requires installation of
+# the [statsample-bivariate-extension](https://rubygems.org/gems/statsample-bivariate-extension)
+# gem. This gem extends the Statsample::Bivariate class with useful
+# algorithms for polychoric and tetrachoric correlation.
+#
+# Statsample will automatically detect presence of polychoric/tetrachoric
+# algorithms so there is no need to explicitly require the gem.
+#
+# In this example we'll see how polychoric correlation can be
+# performed using statsample.
require 'statsample'
Statsample::Analysis.store(Statsample::Bivariate::Polychoric) do
-ct=Matrix[[rand(10)+50, rand(10)+50, rand(10)+1],
- [rand(20)+5, rand(50)+4, rand(10)+1],
- [rand(8)+1, rand(12)+1, rand(10)+1]]
+ ct=Matrix[[rand(10)+50, rand(10)+50, rand(10)+1],
+ [rand(20)+5, rand(50)+4, rand(10)+1],
+ [rand(8)+1, rand(12)+1, rand(10)+1]]
-# Estimation of polychoric correlation using two-step (default)
-poly=polychoric(ct, :name=>"Polychoric with two-step", :debug=>false)
-summary poly
+ # Estimation of polychoric correlation using two-step (default)
+ poly=polychoric(ct, :name=>"Polychoric with two-step", :debug=>false)
+ summary poly
-# Estimation of polychoric correlation using joint method (slow)
-poly=polychoric(ct, :method=>:joint, :name=>"Polychoric with joint")
-summary poly
+ # Estimation of polychoric correlation using joint method (slow)
+ poly=polychoric(ct, :method=>:joint, :name=>"Polychoric with joint")
+ summary poly
-# Uses polychoric series (not recomended)
+ # Uses polychoric series (not recomended)
-poly=polychoric(ct, :method=>:polychoric_series, :name=>"Polychoric with polychoric series")
-summary poly
+ poly=polychoric(ct, :method=>:polychoric_series, :name=>"Polychoric with polychoric series")
+ summary poly
end
+
if __FILE__==$0
Statsample::Analysis.run_batch
end
diff --git a/examples/principal_axis.rb b/examples/principal_axis.rb
index 75ae6a0..0e25b04 100644
--- a/examples/principal_axis.rb
+++ b/examples/principal_axis.rb
@@ -1,16 +1,20 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
-
+# Principal Axis Analysis¶
+#
+# Here we use the Statsample::Factor::PrincipalAnalysis class
+# for principal axis analysis for a correlation or covariance matrix.
require 'statsample'
Statsample::Analysis.store(Statsample::Factor::PrincipalAxis) do
matrix=Matrix[
- [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]]
+ [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807],
+ [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844],
+ [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167],
+ [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]]
matrix.extend Statsample::CovariateMatrix
-
- #matrix.fields=%w{a b c d}
fa=principal_axis(matrix,:m=>1,:smc=>false)
summary fa
diff --git a/examples/reliability.rb b/examples/reliability.rb
index 27d7e25..3667c16 100644
--- a/examples/reliability.rb
+++ b/examples/reliability.rb
@@ -1,27 +1,26 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib')
-require 'statsample'
-Statsample::Analysis.store(Statsample::Reliability) do
-
+# == Description
+#
+# Reliability Scale Analysis with statsample
+require 'statsample'
+Statsample::Analysis.store(Statsample::Reliability) do
samples=100
a=rnorm(samples)
- ds=Statsample::Dataset.new
+ ds = Daru::DataFrame.new({})
20.times do |i|
- ds["v#{i}"]=a+rnorm(samples,0,0.2)
+ ds["v#{i}".to_sym]= a + rnorm(samples,0,0.2)
end
- ds.update_valid_data
-
rel=Statsample::Reliability::ScaleAnalysis.new(ds)
summary rel
-
ms=Statsample::Reliability::MultiScaleAnalysis.new(:name=>"Multi Scale analyss") do |m|
- m.scale "Scale 1", ds.clone(%w{v1 v2 v3 v4 v5 v6 v7 v8 v9 v10})
- m.scale "Scale 2", ds.clone(%w{v11 v12 v13 v14 v15 v16 v17 v18 v19})
+ m.scale "Scale 1", ds.clone([:v1, :v2, :v3, :v4, :v5, :v6, :v7, :v8, :v9, :v10])
+ m.scale "Scale 2", ds.clone([:v11, :v12, :v13, :v14, :v15, :v16, :v17, :v18, :v19])
end
summary ms
@@ -30,4 +29,3 @@
if __FILE__==$0
Statsample::Analysis.run_batch
end
-
diff --git a/examples/scatterplot.rb b/examples/scatterplot.rb
index f238c5f..72a0c5f 100644
--- a/examples/scatterplot.rb
+++ b/examples/scatterplot.rb
@@ -2,6 +2,14 @@
$:.unshift(File.dirname(__FILE__)+'/../lib/')
$:.unshift('/home/cdx/dev/reportbuilder/lib/')
+# == Description
+#
+# Creating a scatterplot with statsample's Statsample::Graph::Scatterplot class.
+#
+# In this example we'll demonstrate how a normally distributed Daru::Vector can
+# be created using the daru and distribution gems, and how the values generated
+# can be plotted very easily using the 'scatterplot' shorthand and supplying X
+# and Y co-ordinates.
require 'benchmark'
require 'statsample'
n=100
diff --git a/examples/t_test.rb b/examples/t_test.rb
index ab1abf0..0a44cd9 100644
--- a/examples/t_test.rb
+++ b/examples/t_test.rb
@@ -1,5 +1,12 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib')
+# == Description
+#
+# This example illustrates how a T test can be done and summarized with statsample
+#
+# == References
+#
+# http://en.wikipedia.org/wiki/Student%27s_t-test
require 'statsample'
Statsample::Analysis.store(Statsample::Test::T) do
diff --git a/examples/u_test.rb b/examples/u_test.rb
index d5ae14f..00d345d 100644
--- a/examples/u_test.rb
+++ b/examples/u_test.rb
@@ -1,11 +1,19 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib')
+
+# == Description
+#
+# Example illustrating Mann-Whitney U test with statsample.
+#
+# == References
+#
+# http://en.wikipedia.org/wiki/Mann%E2%80%93Whitney_U_test
require 'statsample'
Statsample::Analysis.store(Statsample::Test::UMannWhitney) do
- a=10.times.map {rand(100)}.to_scale
- b=20.times.map {(rand(20))**2+50}.to_scale
+ a = Daru::Vector.new(10.times.map {rand(100)})
+ b = Daru::Vector.new(20.times.map {(rand(20))**2+50})
u=Statsample::Test::UMannWhitney.new(a,b)
summary u
diff --git a/examples/vector.rb b/examples/vector.rb
index f64e62b..964f870 100644
--- a/examples/vector.rb
+++ b/examples/vector.rb
@@ -1,15 +1,18 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
-
+# == Description
+#
+# This example provides a small sneak-peak into creating a Daru::Vector.
+# For details on using Daru::Vector (with example on math, statistics and plotting)
+# see the notebook at this link:
+# http://nbviewer.ipython.org/github/SciRuby/sciruby-notebooks/blob/master/Data%20Analysis/Usage%20of%20Vector.ipynb
require 'statsample'
-Statsample::Analysis.store(Statsample::Vector) do
-
- a=Statsample::Vector.new_scale(1000) {r=rand(5); r==4 ? nil: r;}
+Statsample::Analysis.store(Daru::Vector) do
+ a = Daru::Vector.new_with_size(1000) {r=rand(5); r==4 ? nil: r;}
summary a
- b=c(1,2,3,4,6..10)
+ b = Daru::Vector[1,2,3,4,6..10]
summary b
-
end
if __FILE__==$0
diff --git a/examples/velicer_map_test.rb b/examples/velicer_map_test.rb
index 8ec3ed4..5a114d7 100644
--- a/examples/velicer_map_test.rb
+++ b/examples/velicer_map_test.rb
@@ -1,5 +1,8 @@
#!/usr/bin/ruby
$:.unshift(File.dirname(__FILE__)+'/../lib/')
+# == Description
+#
+# Velicer MAP test.
require 'statsample'
@@ -15,17 +18,18 @@
vectors={}
variables.times do |i|
- vectors["v#{i}"]=samples.times.collect {|nv|
- if i<5
- f1[nv]*5 + f2[nv] *2 +rng.call
- else
- f1[nv]*2 + f2[nv] *3 +rng.call
- end
- }.to_scale
+ vectors["v#{i}".to_sym]= Daru::Vector.new(
+ samples.times.collect do |nv|
+ if i<5
+ f1[nv]*5 + f2[nv] *2 +rng.call
+ else
+ f1[nv]*2 + f2[nv] *3 +rng.call
+ end
+ end)
end
- ds=vectors.to_dataset
+ ds = Daru::DataFrame.new(vectors)
cor=cor(ds)
pca=pca(cor)
diff --git a/lib/spss.rb b/lib/spss.rb
index 3c60dd3..50a2dca 100644
--- a/lib/spss.rb
+++ b/lib/spss.rb
@@ -1,4 +1,4 @@
-# = spss.rb -
+# = spss.rb -
#
# Provides utilites for working with spss files
#
@@ -12,40 +12,43 @@ class Element
def add(a)
@elements.push(a)
end
- def parse_elements(func=:to_s)
- @elements.collect{|e| " "+e.send(func)}.join("\n")
+
+ def parse_elements(func = :to_s)
+ @elements.collect{ |e| " "+e.send(func) }.join("\n")
end
+
def init_with config
- config.each {|key,value|
- self.send(key.to_s+"=",value) if methods.include? key.to_s
- }
+ config.each do |key, value|
+ self.send(key.to_s + "=", value) if methods.include? key.to_s
+ end
end
- def initialize(config={})
- @config=config
- @elements=[]
+
+ def initialize(config = {})
+ @config = config
+ @elements = []
end
end
class Dictionary < Element
attr_accessor :locale, :date_time, :row_count
- def initialize(config={})
+ def initialize(config = {})
super
init_with ({
- :locale=>"en_US",
+ :locale=>"en_US",
:date_time=>Time.new().strftime("%Y-%m-%dT%H:%M:%S"),
:row_count=>1
})
init_with config
end
-
+
def to_xml
"\n"+parse_elements(:to_xml)+"\n"
-
+
end
def to_spss
parse_elements(:to_spss)
end
end
-
+
class MissingValue < Element
attr_accessor :data, :type, :from, :to
def initialize(data,type=nil)
diff --git a/lib/statsample.rb b/lib/statsample.rb
index 30b4608..1352a54 100644
--- a/lib/statsample.rb
+++ b/lib/statsample.rb
@@ -1,4 +1,4 @@
-# = statsample.rb -
+# = statsample.rb -
# Statsample - Statistic package for Ruby
# Copyright (C) 2008-2014 Claudio Bustos
#
@@ -17,17 +17,18 @@
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
-
-#$:.unshift(File.dirname(__FILE__))
require 'matrix'
require 'extendmatrix'
require 'distribution'
require 'dirty-memoize'
require 'reportbuilder'
-
+require 'daru'
+require 'statsample/daru'
class Numeric
- def square ; self * self ; end
+ def square
+ self * self
+ end
end
class String
@@ -41,10 +42,10 @@ def is_number?
end
class Module
- def include_aliasing(m, suffix="ruby")
+ def include_aliasing(m, suffix = 'ruby')
m.instance_methods.each do |f|
if instance_methods.include? f
- alias_method("#{f}_#{suffix}",f)
+ alias_method("#{f}_#{suffix}", f)
remove_method f
end
end
@@ -53,78 +54,83 @@ def include_aliasing(m, suffix="ruby")
end
class Array
- # Recode repeated values on an array, adding the number of repetition
- # at the end
- # Example:
- # a=%w{a b c c d d d e}
- # a.recode_repeated
- # => ["a","b","c_1","c_2","d_1","d_2","d_3","e"]
- def recode_repeated
- if self.size!=self.uniq.size
- # Find repeated
- repeated=self.inject({}) {|a,v|
- (a[v].nil? ? a[v]=1 : a[v]+=1); a }.find_all{|k,v| v>1}.collect{|k,v| k}
- ns=repeated.inject({}) {|a,v| a[v]=0;a}
- self.collect do |f|
- if repeated.include? f
- ns[f]+=1
- sprintf("%s_%d",f,ns[f])
- else
- f
- end
- end
- else
- self
+ unless method_defined?(:sum)
+ def sum
+ inject(:+)
end
end
+
+ def mean
+ sum.fdiv(size)
+ end
+
+ # Calcualte sum of squares
+ def sum_of_squares(m=nil)
+ m ||= mean
+ inject(0) {|a,x| a + (x-m).square }
+ end
+
+ # Calculate sample variance
+ def variance_sample(m=nil)
+ m ||= mean
+ sum_of_squares(m).quo(size - 1)
+ end
+
+ # Calculate sample standard deviation
+ def sd
+ m ||= mean
+ Math::sqrt(variance_sample(m))
+ end
end
-def create_test(*args,&proc)
- description=args.shift
- fields=args
+def create_test(*args, &_proc)
+ description = args.shift
+ fields = args
[description, fields, Proc.new]
end
+
#--
# Test extensions
begin
require 'gettext'
rescue LoadError
def bindtextdomain(d) #:nodoc:
- d
+ d
end
-
+
# Bored module
module GetText #:nodoc:
- def _(t)
- t
+ def _(t)
+ t
end
end
end
+
# Library for statistical analysis on Ruby
#
# * Classes for manipulation and storage of data:
# * Module Statsample::Bivariate provides covariance and pearson, spearman, point biserial, tau a, tau b, gamma, tetrachoric (see Bivariate::Tetrachoric) and polychoric (see Bivariate::Polychoric) correlations. Include methods to create correlation and covariance matrices
# * Multiple types of regression on Statsample::Regression
# * Factorial Analysis algorithms on Statsample::Factor module.
-# * Dominance Analysis. Based on Budescu and Azen papers.link[http://psycnet.apa.org/journals/met/8/2/129/].
+# * Dominance Analysis. Based on Budescu and Azen papers.link[http://psycnet.apa.org/journals/met/8/2/129/].
# * Module Statsample::Codification, to help to codify open questions
# * Converters to import and export data from databases, csv and excel files.
# * Module Statsample::Crosstab provides function to create crosstab for categorical data
# * Reliability analysis provides functions to analyze scales.
# * Module Statsample::SRS (Simple Random Sampling) provides a lot of functions to estimate standard error for several type of samples
-# * Interfaces to gdchart, gnuplot and SVG::Graph
+# * Interfaces to gdchart, gnuplot and SVG::Graph
#
module Statsample
-
def self.create_has_library(library)
define_singleton_method("has_#{library}?") do
- cv="@@#{library}"
- if !class_variable_defined? cv
- begin
+ cv = "@@#{library}"
+ unless class_variable_defined? cv
+ begin
+ gem library.to_s # activate gem
require library.to_s
- class_variable_set(cv,true)
+ class_variable_set(cv, true)
rescue LoadError
- class_variable_set(cv,false)
+ class_variable_set(cv, false)
end
end
class_variable_get(cv)
@@ -132,8 +138,8 @@ def self.create_has_library(library)
end
create_has_library :gsl
-
- SPLIT_TOKEN = ","
+
+ SPLIT_TOKEN = ','
autoload(:Analysis, 'statsample/analysis')
autoload(:Database, 'statsample/converters')
autoload(:Anova, 'statsample/anova')
@@ -154,133 +160,123 @@ def self.create_has_library(library)
autoload(:Multivariate, 'statsample/multivariate')
autoload(:Multiset, 'statsample/multiset')
autoload(:StratifiedSample, 'statsample/multiset')
- autoload(:MLE, 'statsample/mle')
+ autoload(:MLE, 'statsample/mle')
autoload(:Regression, 'statsample/regression')
+ autoload(:FitModel, 'statsample/formula/fit_model')
autoload(:Test, 'statsample/test')
autoload(:Factor, 'statsample/factor')
autoload(:Graph, 'statsample/graph')
-
-
+
class << self
# Load a object saved on a file.
def load(filename)
if File.exist? filename
- o=false
- File.open(filename,"r") {|fp| o=Marshal.load(fp) }
+ o = false
+ File.open(filename, 'r') { |fp| o = Marshal.load(fp) }
o
else
false
end
end
-
-
-
+
# Create a matrix using vectors as columns.
# Use:
#
# matrix=Statsample.vector_cols_matrix(v1,v2)
def vector_cols_matrix(*vs)
# test
- size=vs[0].size
- vs.each{|v|
- raise ArgumentError,"Arguments should be Vector" unless v.instance_of? Statsample::Vector
- raise ArgumentError,"Vectors size should be the same" if v.size!=size
- }
- Matrix.rows((0...size).to_a.collect() {|i|
- vs.collect{|v| v[i]}
- })
+ size = vs[0].size
+
+ vs.each do |v|
+ fail ArgumentError, 'Arguments should be Vector' unless v.instance_of? Daru::Vector
+ fail ArgumentError, 'Vectors size should be the same' if v.size != size
+ end
+
+ Matrix.rows((0...size).to_a.collect { |i| vs.collect { |v| v[i] } })
end
+
# Returns a duplicate of the input vectors, without missing data
# for any of the vectors.
- #
- # a=[1,2,3,6,7,nil,3,5].to_scale
- # b=[nil,nil,5,6,4,5,10,2].to_scale
- # c=[2,4,6,7,4,5,6,7].to_scale
+ #
+ # a = Daru::Vector.new([1,2,3,6,7,nil,3,5])
+ # b = Daru::Vector.new([nil,nil,5,6,4,5,10,2])
+ # c = Daru::Vector.new([2,4,6,7,4,5,6,7])
# a2,b2,c2=Statsample.only_valid(a,b,c)
- # => [#,
- # #,
- # #]
+ # => [#,
+ # #,
+ # #]
#
def only_valid(*vs)
- i=1
- h=vs.inject({}) {|a,v| a["v#{i}"]=v;i+=1;a}
- ds=Statsample::Dataset.new(h).dup_only_valid
- ds.vectors.values
+ i = 1
+ h = vs.inject({}) { |acc, v| acc["v#{i}".to_sym] = v; i += 1; acc }
+ df = Daru::DataFrame.new(h).reject_values(*Daru::MISSING_VALUES)
+ df.map { |v| v }
end
-
- # Cheap version of #only_valid.
+
+ # Cheap version of #only_valid.
# If any vectors have missing_values, return only valid.
# If not, return the vectors itself
def only_valid_clone(*vs)
- if vs.any? {|v| v.flawed?}
+ if vs.any? { |v| v.include_values?(*Daru::MISSING_VALUES) }
only_valid(*vs)
else
vs
end
end
- end
-
-
-
-
+ end
+
module Util
# Reference: http://www.itl.nist.gov/div898/handbook/eda/section3/normprpl.htm
- def normal_order_statistic_medians(i,n)
- if i==1
- u= 1.0 - normal_order_statistic_medians(n,n)
- elsif i==n
- u=0.5**(1 / n.to_f)
+ def normal_order_statistic_medians(i, n)
+ if i == 1
+ u = 1.0 - normal_order_statistic_medians(n, n)
+ elsif i == n
+ u = 0.5**(1 / n.to_f)
else
- u= (i - 0.3175) / (n + 0.365)
+ u = (i - 0.3175) / (n + 0.365)
end
u
end
-
- def self.nice(s,e) # :nodoc:
- reverse = etrue).add(self).send(method)
+ bindtextdomain('statsample')
+ def summary(method = :to_text)
+ ReportBuilder.new(no_title: true).add(self).send(method)
end
end
module STATSAMPLE__ #:nodoc:
end
end
-
-
#--
-begin
+begin
require 'statsamplert'
rescue LoadError
module Statsample
- OPTIMIZED=false
+ OPTIMIZED = false
end
end
diff --git a/lib/statsample/analysis/suite.rb b/lib/statsample/analysis/suite.rb
index f4d97c4..49b4677 100644
--- a/lib/statsample/analysis/suite.rb
+++ b/lib/statsample/analysis/suite.rb
@@ -80,7 +80,7 @@ def scatterplot(*args)
def method_missing(name, *args,&block)
@attached.reverse.each do |ds|
- return ds[name.to_s] if ds.fields.include? (name.to_s)
+ return ds[name] if ds.vectors.to_a.include? (name)
end
raise "Method #{name} doesn't exists"
end
diff --git a/lib/statsample/anova/oneway.rb b/lib/statsample/anova/oneway.rb
index e0c20a5..a2d5bca 100644
--- a/lib/statsample/anova/oneway.rb
+++ b/lib/statsample/anova/oneway.rb
@@ -67,9 +67,9 @@ def report_building_table(builder) #:nodoc:
# One Way Anova with vectors
# Example:
- # v1=[2,3,4,5,6].to_scale
- # v2=[3,3,4,5,6].to_scale
- # v3=[5,3,1,5,6].to_scale
+ # v1 = Daru::Vector.new([2,3,4,5,6])
+ # v2 = Daru::Vector.new([3,3,4,5,6])
+ # v3 = Daru::Vector.new([5,3,1,5,6])
# anova=Statsample::Anova::OneWayWithVectors.new([v1,v2,v3])
# anova.f
# => 0.0243902439024391
@@ -90,10 +90,10 @@ class OneWayWithVectors < OneWay
def initialize(*args)
if args[0].is_a? Array
- @vectors=args.shift
+ @vectors = args.shift
else
- @vectors=args.find_all {|v| v.is_a? Statsample::Vector}
- opts=args.find {|v| v.is_a? Hash}
+ @vectors = args.find_all {|v| v.is_a? Daru::Vector}
+ opts = args.find {|v| v.is_a? Hash}
end
opts||=Hash.new
opts_default={:name=>_("Anova One-Way"),
@@ -164,7 +164,7 @@ def report_building(builder) # :nodoc:
if summary_descriptives
s.table(:name=>_("Descriptives"),:header=>%w{Name N Mean SD Min Max}.map {|v| _(v)}) do |t|
@vectors.each do |v|
- t.row [v.name, v.n_valid, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max]
+ t.row [v.name, v.reject_values(*Daru::MISSING_VALUES).size, "%0.4f" % v.mean, "%0.4f" % v.sd, "%0.4f" % v.min, "%0.4f" % v.max]
end
end
end
diff --git a/lib/statsample/anova/twoway.rb b/lib/statsample/anova/twoway.rb
index f623e6c..49dae07 100644
--- a/lib/statsample/anova/twoway.rb
+++ b/lib/statsample/anova/twoway.rb
@@ -107,9 +107,9 @@ def report_building_table(builder) #:nodoc:
# Two Way Anova with vectors
# Example:
- # v1=[1,1,2,2].to_scale
- # v2=[1,2,1,2].to_scale
- # v3=[5,3,1,5].to_scale
+ # v1 = Daru::Vector.new([1,1,2,2])
+ # v2 = Daru::Vector.new([1,2,1,2])
+ # v3 = Daru::Vector.new([5,3,1,5])
# anova=Statsample::Anova::TwoWayWithVectors.new(:a=>v1,:b=>v2, :dependent=>v3)
#
class TwoWayWithVectors < TwoWay
@@ -121,25 +121,26 @@ class TwoWayWithVectors < TwoWay
# For now, only equal sample cells allowed
def initialize(opts=Hash.new)
raise "You should insert at least :a, :b and :dependent" unless [:a, :b, :dependent].all? {|v| opts.has_key? v}
- @a_var='a'
- @b_var='b'
- @dep_var='dependent'
- @a_vector, @b_vector, @dep_vector=Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]
+ @a_var = :a
+ @b_var = :b
+ @dep_var = :dependent
+ @a_vector, @b_vector, @dep_vector =
+ Statsample.only_valid_clone opts[:a], opts[:b], opts[:dependent]
- ds={@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector}.to_dataset
- @ds=ds.clone_only_valid
- _p=@a_vector.factors.size
- _q=@b_vector.factors.size
- @x_general=@dep_vector.mean
- @axb_means={}
- @axb_sd={}
- @vectors=[]
+ ds = Daru::DataFrame.new({@a_var=>@a_vector, @b_var=>@b_vector, @dep_var=>@dep_vector})
+ @ds = ds.clone_only_valid
+ _p = @a_vector.factors.size
+ _q = @b_vector.factors.size
+ @x_general = @dep_vector.mean
+ @axb_means = {}
+ @axb_sd = {}
+ @vectors = []
n=nil
@ds.to_multiset_by_split(a_var,b_var).each_vector(dep_var) {|k,v|
- @axb_means[k]=v.mean
- @axb_sd[k]=v.sd
+ @axb_means[k] = v.mean
+ @axb_sd[k] = v.sd
@vectors << v
- n||=v.size
+ n ||= v.size
raise "All cell sizes should be equal" if n!=v.size
}
@@ -151,20 +152,21 @@ def initialize(opts=Hash.new)
@ds.to_multiset_by_split(b_var).each_vector(dep_var) {|k,v|
@b_means[k]=v.mean
}
- ss_a=n*_q*@ds[a_var].factors.inject(0) {|ac,v|
- ac+(@a_means[v]-@x_general)**2
+ ss_a = n*_q*@ds[a_var].factors.inject(0) {|ac,v|
+ ac + (@a_means[v]-@x_general)**2
}
ss_b=n*_p*@ds[b_var].factors.inject(0) {|ac,v|
ac+(@b_means[v]-@x_general)**2
}
- ss_within=@ds.collect {|row|
+ ss_within = @ds.collect(:row) { |row|
(row[dep_var]-@axb_means[[row[a_var],row[b_var]]])**2
}.sum
- ss_axb=n*@axb_means.inject(0) {|ac,v|
+ ss_axb = n*@axb_means.inject(0) {|ac,v|
j,k=v[0]
xjk=v[1]
ac+(xjk-@a_means[j]-@b_means[k]+@x_general)**2
}
+
df_a=_p-1
df_b=_q-1
df_within=(_p*_q)*(n-1)
@@ -186,9 +188,9 @@ def levene
def report_building(builder) #:nodoc:#
builder.section(:name=>@name) do |s|
if summary_descriptives
- s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].labeling(a)}+[_("%s Mean") % @name_b]) do |t|
+ s.table(:header =>['']+@ds[a_var].factors.map {|a| @ds[a_var].index_of(a)}+[_("%s Mean") % @name_b]) do |t|
@ds[b_var].factors.each do |b|
- t.row([@ds[b_var].labeling(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
+ t.row([@ds[b_var].index_of(b)]+@ds[a_var].factors.map {|a| "%0.3f" % @axb_means[[a,b]] } + ["%0.3f" % @b_means[b]])
end
t.row([_("%s Mean") % @name_a]+@ds[a_var].factors.map {|a| "%0.3f" % @a_means[a]}+ ["%0.3f" % @x_general])
end
diff --git a/lib/statsample/bivariate.rb b/lib/statsample/bivariate.rb
index d24e5ff..3ba1150 100644
--- a/lib/statsample/bivariate.rb
+++ b/lib/statsample/bivariate.rb
@@ -12,9 +12,10 @@ class << self
# Covariance between two vectors
def covariance(v1,v2)
v1a,v2a=Statsample.only_valid_clone(v1,v2)
+
return nil if v1a.size==0
if Statsample.has_gsl?
- GSL::Stats::covariance(v1a.gsl, v2a.gsl)
+ GSL::Stats::covariance(v1a.to_gsl, v2a.to_gsl)
else
covariance_slow(v1a,v2a)
end
@@ -34,7 +35,9 @@ def covariance_slow(v1,v2) # :nodoc:
sum_of_squares(v1a,v2a) / (v1a.size-1)
end
def sum_of_squares(v1,v2)
- v1a,v2a=Statsample.only_valid_clone(v1,v2)
+ v1a,v2a=Statsample.only_valid_clone(v1,v2)
+ v1a.reset_index!
+ v2a.reset_index!
m1=v1a.mean
m2=v2a.mean
(v1a.size).times.inject(0) {|ac,i| ac+(v1a[i]-m1)*(v2a[i]-m2)}
@@ -44,13 +47,14 @@ def pearson(v1,v2)
v1a,v2a=Statsample.only_valid_clone(v1,v2)
return nil if v1a.size ==0
if Statsample.has_gsl?
- GSL::Stats::correlation(v1a.gsl, v2a.gsl)
+ GSL::Stats::correlation(v1a.to_gsl, v2a.to_gsl)
else
pearson_slow(v1a,v2a)
end
end
def pearson_slow(v1,v2) # :nodoc:
v1a,v2a=Statsample.only_valid_clone(v1,v2)
+
# Calculate sum of squares
ss=sum_of_squares(v1a,v2a)
ss.quo(Math::sqrt(v1a.sum_of_squares) * Math::sqrt(v2a.sum_of_squares))
@@ -118,14 +122,16 @@ def residuals(from,del)
r=Statsample::Bivariate.pearson(from,del)
froms, dels = from.vector_standarized, del.vector_standarized
nv=[]
- froms.data_with_nils.each_index do |i|
+ froms.reset_index!
+ dels.reset_index!
+ froms.each_index do |i|
if froms[i].nil? or dels[i].nil?
nv.push(nil)
else
nv.push(froms[i]-r*dels[i])
end
end
- nv.to_vector(:scale)
+ Daru::Vector.new(nv)
end
# Correlation between v1 and v2, controling the effect of
# control on both.
@@ -135,7 +141,6 @@ def partial_correlation(v1,v2,control)
rv1con=pearson(v1a,cona)
rv2con=pearson(v2a,cona)
(rv1v2-(rv1con*rv2con)).quo(Math::sqrt(1-rv1con**2) * Math::sqrt(1-rv2con**2))
-
end
def covariance_matrix_optimized(ds)
@@ -153,50 +158,53 @@ def covariance_matrix_optimized(ds)
# Order of rows and columns depends on Dataset#fields order
def covariance_matrix(ds)
- vars,cases=ds.fields.size,ds.cases
- if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
+ vars,cases = ds.ncols, ds.nrows
+ if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
cm=covariance_matrix_optimized(ds)
else
cm=covariance_matrix_pairwise(ds)
-
end
cm.extend(Statsample::CovariateMatrix)
- cm.fields=ds.fields
+ cm.fields = ds.vectors.to_a
cm
end
def covariance_matrix_pairwise(ds)
cache={}
- matrix=ds.collect_matrix do |row,col|
- if (ds[row].type!=:scale or ds[col].type!=:scale)
- nil
- elsif row==col
- ds[row].variance
- else
- if cache[[col,row]].nil?
- cov=covariance(ds[row],ds[col])
- cache[[row,col]]=cov
- cov
+ vectors = ds.vectors.to_a
+ mat_rows = vectors.collect do |row|
+ vectors.collect do |col|
+ if (ds[row].type!=:numeric or ds[col].type!=:numeric)
+ nil
+ elsif row==col
+ ds[row].variance
else
- cache[[col,row]]
+ if cache[[col,row]].nil?
+ cov=covariance(ds[row],ds[col])
+ cache[[row,col]]=cov
+ cov
+ else
+ cache[[col,row]]
+ end
end
end
end
- matrix
+
+ Matrix.rows mat_rows
end
# Correlation matrix.
# Order of rows and columns depends on Dataset#fields order
def correlation_matrix(ds)
- vars,cases=ds.fields.size,ds.cases
- if !ds.has_missing_data? and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
+ vars, cases = ds.ncols, ds.nrows
+ if !ds.include_values?(*Daru::MISSING_VALUES) and Statsample.has_gsl? and prediction_optimized(vars,cases) < prediction_pairwise(vars,cases)
cm=correlation_matrix_optimized(ds)
else
cm=correlation_matrix_pairwise(ds)
end
cm.extend(Statsample::CovariateMatrix)
- cm.fields=ds.fields
+ cm.fields = ds.vectors.to_a
cm
end
@@ -212,33 +220,43 @@ def correlation_matrix_optimized(ds)
end
def correlation_matrix_pairwise(ds)
cache={}
- cm=ds.collect_matrix do |row,col|
- if row==col
- 1.0
- elsif (ds[row].type!=:scale or ds[col].type!=:scale)
- nil
- else
- if cache[[col,row]].nil?
- r=pearson(ds[row],ds[col])
- cache[[row,col]]=r
- r
+ vectors = ds.vectors.to_a
+ cm = vectors.collect do |row|
+ vectors.collect do |col|
+ if row==col
+ 1.0
+ elsif (ds[row].type!=:numeric or ds[col].type!=:numeric)
+ nil
else
- cache[[col,row]]
- end
+ if cache[[col,row]].nil?
+ r=pearson(ds[row],ds[col])
+ cache[[row,col]]=r
+ r
+ else
+ cache[[col,row]]
+ end
+ end
end
end
+
+ Matrix.rows cm
end
# Retrieves the n valid pairwise.
def n_valid_matrix(ds)
- ds.collect_matrix do |row,col|
- if row==col
- ds[row].valid_data.size
- else
- rowa,rowb=Statsample.only_valid_clone(ds[row],ds[col])
- rowa.size
+ vectors = ds.vectors.to_a
+ m = vectors.collect do |row|
+ vectors.collect do |col|
+ if row==col
+ ds[row].reject_values(*Daru::MISSING_VALUES).size
+ else
+ rowa,rowb = Statsample.only_valid_clone(ds[row],ds[col])
+ rowa.size
+ end
end
end
+
+ Matrix.rows m
end
# Matrix of correlation probabilities.
@@ -248,7 +266,7 @@ def correlation_probability_matrix(ds, tails=:both)
rows=ds.fields.collect do |row|
ds.fields.collect do |col|
v1a,v2a=Statsample.only_valid_clone(ds[row],ds[col])
- (row==col or ds[row].type!=:scale or ds[col].type!=:scale) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
+ (row==col or ds[row].type!=:numeric or ds[col].type!=:numeric) ? nil : prop_pearson(t_pearson(ds[row],ds[col]), v1a.size, tails)
end
end
Matrix.rows(rows)
@@ -256,27 +274,27 @@ def correlation_probability_matrix(ds, tails=:both)
# Spearman ranked correlation coefficient (rho) between 2 vectors
def spearman(v1,v2)
- v1a,v2a=Statsample.only_valid_clone(v1,v2)
- v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+ v1a,v2a = Statsample.only_valid_clone(v1,v2)
+ v1r,v2r = v1a.ranked, v2a.ranked
pearson(v1r,v2r)
end
# Calculate Point biserial correlation. Equal to Pearson correlation, with
# one dichotomous value replaced by "0" and the other by "1"
def point_biserial(dichotomous,continous)
- ds={'d'=>dichotomous,'c'=>continous}.to_dataset.dup_only_valid
- raise(TypeError, "First vector should be dichotomous") if ds['d'].factors.size!=2
- raise(TypeError, "Second vector should be continous") if ds['c'].type!=:scale
- f0=ds['d'].factors.sort[0]
- m0=ds.filter_field('c') {|c| c['d']==f0}
- m1=ds.filter_field('c') {|c| c['d']!=f0}
- ((m1.mean-m0.mean).to_f / ds['c'].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.cases**2)
+ ds = Daru::DataFrame.new({:d=>dichotomous,:c=>continous}).reject_values(*Daru::MISSING_VALUES)
+ raise(TypeError, "First vector should be dichotomous") if ds[:d].factors.size != 2
+ raise(TypeError, "Second vector should be continous") if ds[:c].type != :numeric
+ f0=ds[:d].factors.sort.to_a[0]
+ m0=ds.filter_vector(:c) {|c| c[:d] == f0}
+ m1=ds.filter_vector(:c) {|c| c[:d] != f0}
+ ((m1.mean-m0.mean).to_f / ds[:c].sdp) * Math::sqrt(m0.size*m1.size.to_f / ds.nrows**2)
end
# Kendall Rank Correlation Coefficient (Tau a)
# Based on Hervé Adbi article
def tau_a(v1,v2)
v1a,v2a=Statsample.only_valid_clone(v1,v2)
n=v1.size
- v1r,v2r=v1a.ranked(:scale),v2a.ranked(:scale)
+ v1r,v2r=v1a.ranked,v2a.ranked
o1=ordered_pairs(v1r)
o2=ordered_pairs(v2r)
delta= o1.size*2-(o2 & o1).size*2
@@ -348,14 +366,15 @@ def pairs(matrix)
}
{'P'=>conc,'Q'=>disc,'Y'=>ties_y,'X'=>ties_x}
end
+
def ordered_pairs(vector)
- d=vector.data
- a=[]
- (0...(d.size-1)).each{|i|
- ((i+1)...(d.size)).each {|j|
+ d = vector.to_a
+ a = []
+ (0...(d.size-1)).each do |i|
+ ((i+1)...(d.size)).each do |j|
a.push([d[i],d[j]])
- }
- }
+ end
+ end
a
end
=begin
@@ -371,8 +390,8 @@ def sum_of_codeviated(v1,v2)
# Report the minimum number of cases valid of a covariate matrix
# based on a dataset
def min_n_valid(ds)
- min=ds.cases
- m=n_valid_matrix(ds)
+ min = ds.nrows
+ m = n_valid_matrix(ds)
for x in 0...m.row_size
for y in 0...m.column_size
min=m[x,y] if m[x,y] < min
@@ -380,8 +399,6 @@ def min_n_valid(ds)
end
min
end
-
-
end
end
end
diff --git a/lib/statsample/bivariate/pearson.rb b/lib/statsample/bivariate/pearson.rb
index 8dd6dea..4060ad4 100644
--- a/lib/statsample/bivariate/pearson.rb
+++ b/lib/statsample/bivariate/pearson.rb
@@ -7,8 +7,8 @@ module Bivariate
# variables.
#
# == Usage
- # a = [1,2,3,4,5,6].to_scale
- # b = [2,3,4,5,6,7].to_scale
+ # a = Daru::Vector.new([1,2,3,4,5,6])
+ # b = Daru::Vector.new([2,3,4,5,6,7])
# pearson = Statsample::Bivariate::Pearson.new(a,b)
# puts pearson.r
# puts pearson.t
diff --git a/lib/statsample/codification.rb b/lib/statsample/codification.rb
index bf76ef0..96d089f 100644
--- a/lib/statsample/codification.rb
+++ b/lib/statsample/codification.rb
@@ -34,24 +34,33 @@ class << self
# will be hashes, with keys = values, for recodification
def create_hash(dataset, vectors, sep=Statsample::SPLIT_TOKEN)
raise ArgumentError,"Array should't be empty" if vectors.size==0
- pro_hash=vectors.inject({}){|h,v_name|
- raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
- v=dataset[v_name]
- split_data=v.splitted(sep).flatten.collect {|c| c.to_s}.find_all {|c| !c.nil?}
+ pro_hash = vectors.inject({}) do |h,v_name|
+ v_name = v_name.is_a?(Numeric) ? v_name : v_name.to_sym
+ raise Exception, "Vector #{v_name} doesn't exists on Dataset" if
+ !dataset.vectors.include?(v_name)
+ v = dataset[v_name]
+ split_data = v.splitted(sep)
+ .flatten
+ .collect { |c| c.to_s }
+ .find_all{ |c| !c.nil? }
- factors=split_data.uniq.compact.sort.inject({}) {|ac,val| ac[val]=val;ac }
- h[v_name]=factors
+ factors = split_data.uniq
+ .compact
+ .sort
+ .inject({}) { |ac,val| ac[val] = val; ac }
+ h[v_name] = factors
h
- }
+ end
+
pro_hash
end
# Create a yaml to create a dictionary, based on vectors
# The keys will be vectors name on dataset and the values
# will be hashes, with keys = values, for recodification
#
- # v1=%w{a,b b,c d}.to_vector
- # ds={"v1"=>v1}.to_dataset
- # Statsample::Codification.create_yaml(ds,['v1'])
+ # v1 = Daru::Vector.new(%w{a,b b,c d})
+ # ds = Daru::DataFrame.new({:v1 => v1})
+ # Statsample::Codification.create_yaml(ds,[:v1])
# => "--- \nv1: \n a: a\n b: b\n c: c\n d: d\n"
def create_yaml(dataset, vectors, io=nil, sep=Statsample::SPLIT_TOKEN)
pro_hash=create_hash(dataset, vectors, sep)
@@ -69,16 +78,17 @@ def create_excel(dataset, vectors, filename, sep=Statsample::SPLIT_TOKEN)
if File.exist?(filename)
raise "Exists a file named #{filename}. Delete ir before overwrite."
end
- book = Spreadsheet::Workbook.new
+ book = Spreadsheet::Workbook.new
sheet = book.create_worksheet
- sheet.row(0).concat(%w{field original recoded})
- i=1
+ sheet.row(0).concat(%w(field original recoded))
+ i = 1
create_hash(dataset, vectors, sep).sort.each do |field, inner_hash|
inner_hash.sort.each do |k,v|
- sheet.row(i).concat([field.dup,k.dup,v.dup])
- i+=1
+ sheet.row(i).concat([field.to_s,k.to_s,v.to_s])
+ i += 1
end
end
+
book.write(filename)
end
# From a excel generates a dictionary hash
@@ -91,10 +101,11 @@ def excel_to_recoded_hash(filename)
sheet= book.worksheet 0
row_i=0
sheet.each do |row|
- row_i+=1
- next if row_i==1 or row[0].nil? or row[1].nil? or row[2].nil?
- h[row[0]]={} if h[row[0]].nil?
- h[row[0]][row[1]]=row[2]
+ row_i += 1
+ next if row_i == 1 or row[0].nil? or row[1].nil? or row[2].nil?
+ key = row[0].to_sym
+ h[key] ||= {}
+ h[key][row[1]] = row[2]
end
h
end
@@ -110,12 +121,12 @@ def inverse_hash(h, sep=Statsample::SPLIT_TOKEN)
end
def dictionary(h, sep=Statsample::SPLIT_TOKEN)
- h.inject({}) {|a,v| a[v[0]]=v[1].split(sep); a }
+ h.inject({}) { |a,v| a[v[0]]=v[1].split(sep); a }
end
def recode_vector(v,h,sep=Statsample::SPLIT_TOKEN)
- dict=dictionary(h,sep)
- new_data=v.splitted(sep)
+ dict = dictionary(h,sep)
+ new_data = v.splitted(sep)
new_data.collect do |c|
if c.nil?
nil
@@ -134,20 +145,22 @@ def recode_dataset_split!(dataset, dictionary_hash, sep=Statsample::SPLIT_TOKEN)
def _recode_dataset(dataset, h , sep=Statsample::SPLIT_TOKEN, split=false)
v_names||=h.keys
v_names.each do |v_name|
- raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.fields.include? v_name
- recoded=recode_vector(dataset[v_name], h[v_name],sep).collect { |c|
- if c.nil?
- nil
- else
- c.join(sep)
+ raise Exception, "Vector #{v_name} doesn't exists on Dataset" if !dataset.vectors.include? v_name
+ recoded = Daru::Vector.new(
+ recode_vector(dataset[v_name], h[v_name],sep).collect do |c|
+ if c.nil?
+ nil
+ else
+ c.join(sep)
+ end
end
- }.to_vector
- if(split)
+ )
+ if split
recoded.split_by_separator(sep).each {|k,v|
- dataset[v_name+"_"+k]=v
+ dataset[(v_name.to_s + "_" + k).to_sym] = v
}
else
- dataset[v_name+"_recoded"]=recoded
+ dataset[(v_name.to_s + "_recoded").to_sym] = recoded
end
end
end
diff --git a/lib/statsample/converter/csv.rb b/lib/statsample/converter/csv.rb
index e84442d..9834fac 100644
--- a/lib/statsample/converter/csv.rb
+++ b/lib/statsample/converter/csv.rb
@@ -1,78 +1,27 @@
+# This module will be removed in the next release.
+# Please shift to using Daru::DataFrame.from_csv and #write_csv for CSV
+# related operations.
module Statsample
- class CSV < SpreadsheetBase
- if RUBY_VERSION<"1.9"
- require 'fastercsv'
- CSV_klass=::FasterCSV
- else
- require 'csv'
- CSV_klass=::CSV
- end
+ class CSV
class << self
-
- def read19(filename,ignore_lines=0,csv_opts=Hash.new)
- #default first line is header
- csv_opts.merge!(:headers=>true, :header_converters => :symbol)
- csv = CSV_klass::Table.new(CSV_klass::read(filename,'r',csv_opts))
- csv_headers = if csv_opts[:headers]
- csv.headers
- else
- #as in R, if no header we name the headers as V1,V2,V3,V4,..
- 1.upto(csv.first.length).collect { |i| "V#{i}" }
- end
- #we invert row -> column. It means csv[0] is the first column and not row. Similar to R
- csv.by_col!
- thash = {}
- csv_headers.each_with_index do |header,idx|
- thash[header] = Statsample::Vector.new(csv[idx].drop(ignore_lines))
- end
- Statsample::Dataset.new(thash)
- end
- # Returns a Dataset based on a csv file
+ # Return a DataFrom created from a csv file.
#
- # USE:
- # ds=Statsample::CSV.read("test_csv.csv")
- def read(filename, empty=[''],ignore_lines=0,csv_opts=Hash.new)
- first_row=true
- fields=[]
- #fields_data={}
- ds=nil
- line_number=0
- csv=CSV_klass.open(filename,'rb', csv_opts)
- csv.each do |row|
- line_number+=1
- if(line_number<=ignore_lines)
- #puts "Skip line"
- next
- end
- row.collect!{|c| c.to_s }
- if first_row
- fields=extract_fields(row)
- ds=Statsample::Dataset.new(fields)
- first_row=false
- else
- rowa=process_row(row,empty)
- ds.add_case(rowa,false)
- end
- end
- convert_to_scale_and_date(ds,fields)
- ds.update_valid_data
- ds
+ # == NOTE
+ #
+ # This method has been DEPRECATED in favour of Daru::DataFrame.from_csv.
+ # Please switch to using that.
+ def read(filename, empty = [''], ignore_lines = 0, opts = {})
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_csv instead."
end
- # Save a Dataset on a csv file
+
+ # Save a Dataset on a csv file.
#
- # USE:
- # Statsample::CSV.write(ds,"test_csv.csv")
- def write(dataset,filename, convert_comma=false,*opts)
-
- writer=CSV_klass.open(filename,'w',*opts)
- writer << dataset.fields
- dataset.each_array do|row|
- if(convert_comma)
- row.collect!{|v| v.to_s.gsub(".",",")}
- end
- writer << row
- end
- writer.close
+ # == NOTE
+ #
+ # This method has BEEN DEPRECATED in favor of Daru::DataFrame#write_csv.
+ # Please use that instead.
+ def write(dataset, filename, convert_comma = false, opts = {})
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_csv instead."
end
end
end
diff --git a/lib/statsample/converter/spss.rb b/lib/statsample/converter/spss.rb
index 2599c82..8be7d1e 100644
--- a/lib/statsample/converter/spss.rb
+++ b/lib/statsample/converter/spss.rb
@@ -4,26 +4,27 @@ class << self
# Export a SPSS Matrix with tetrachoric correlations .
#
# Use:
- # ds=Statsample::Excel.read("my_data.xls")
+ # ds=Daru::DataFrame.from_excel("my_data.xls")
# puts Statsample::SPSS.tetrachoric_correlation_matrix(ds)
def tetrachoric_correlation_matrix(ds)
- dsv=ds.dup_only_valid
+ dsv=ds.reject_values(*Daru::MISSING_VALUES)
# Delete all vectors doesn't have variation
- dsv.fields.each{|f|
+ dsv.vectors.each { |f|
if dsv[f].factors.size==1
dsv.delete_vector(f)
else
dsv[f]=dsv[f].dichotomize
end
}
+
tcm=Statsample::Bivariate.tetrachoric_correlation_matrix(dsv)
- n=dsv.fields.collect {|f|
+ n=dsv.vectors.to_a.collect {|f|
sprintf("%d",dsv[f].size)
}
- meanlist=dsv.fields.collect{|f|
+ meanlist=dsv.vectors.to_a.collect{|f|
sprintf("%0.3f", dsv[f].mean)
}
- stddevlist=dsv.fields.collect{|f|
+ stddevlist=dsv.vectors.to_a.collect{|f|
sprintf("%0.3f", dsv[f].sd)
}
out=<<-HEREDOC
diff --git a/lib/statsample/converters.rb b/lib/statsample/converters.rb
index f5201ee..fbb1342 100644
--- a/lib/statsample/converters.rb
+++ b/lib/statsample/converters.rb
@@ -1,63 +1,36 @@
require 'statsample/converter/spss'
module Statsample
- # Create and dumps Datasets on a database
+ # Create and dumps Datasets on a database
+ #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame.from_sql and Daru::DataFrame#write_sql
module Database
class << self
# Read a database query and returns a Dataset
#
- # USE:
- #
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
- # Statsample.read(dbh, "SELECT * FROM test")
- #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame.from_sql instead.
def read(dbh,query)
- require 'dbi'
- sth=dbh.execute(query)
- vectors={}
- fields=[]
- sth.column_info.each {|c|
- vectors[c['name']]=Statsample::Vector.new([])
- vectors[c['name']].name=c['name']
- vectors[c['name']].type= (c['type_name']=='INTEGER' or c['type_name']=='DOUBLE') ? :scale : :nominal
- fields.push(c['name'])
- }
- ds=Statsample::Dataset.new(vectors,fields)
- sth.fetch do |row|
- ds.add_case(row.to_a, false )
- end
- ds.update_valid_data
- ds
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_sql instead."
end
+
# Insert each case of the Dataset on the selected table
#
- # USE:
- #
- # ds={'id'=>[1,2,3].to_vector, 'name'=>["a","b","c"].to_vector}.to_dataset
- # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
- # Statsample::Database.insert(ds,dbh,"test")
- #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame#write_sql instead
def insert(ds, dbh, table)
- require 'dbi'
- query="INSERT INTO #{table} ("+ds.fields.join(",")+") VALUES ("+((["?"]*ds.fields.size).join(","))+")"
- sth=dbh.prepare(query)
- ds.each_array{|c| sth.execute(*c) }
- return true
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_sql instead."
end
# Create a sql, basen on a given Dataset
#
- # USE:
- #
- # ds={'id'=>[1,2,3,4,5].to_vector,'name'=>%w{Alex Peter Susan Mary John}.to_vector}.to_dataset
- # Statsample::Database.create_sql(ds,'names')
- # ==>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
+ # == NOTE
#
+ # Deprecated. Use Daru::DataFrame#create_sql instead.
def create_sql(ds,table,charset="UTF8")
- sql="CREATE TABLE #{table} ("
- fields=ds.fields.collect{|f|
- v=ds[f]
- f+" "+v.db_type
- }
- sql+fields.join(",\n ")+") CHARACTER SET=#{charset};"
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#create_sql instead."
end
end
end
@@ -65,183 +38,49 @@ module Mondrian
class << self
def write(dataset,filename)
File.open(filename,"wb") do |fp|
- fp.puts dataset.fields.join("\t")
- dataset.each_array_with_nils do |row|
- row2=row.collect{|v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") }
+ fp.puts dataset.vectors.to_a.join("\t")
+ dataset.each_row do |row|
+ row2 = row.map { |v| v.nil? ? "NA" : v.to_s.gsub(/\s+/,"_") }
fp.puts row2.join("\t")
end
end
end
end
end
- class SpreadsheetBase
+
+ class PlainText
class << self
- def extract_fields(row)
- i=0;
- fields=row.to_a.collect{|c|
- if c.nil?
- i+=1
- "var%05d" % i
- else
- c.to_s.downcase
- end
- }
- fields.recode_repeated
+ def read(filename, fields)
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_plaintext instead."
end
-
- def process_row(row,empty)
- row.to_a.map do |c|
- if empty.include?(c)
- nil
- else
- if c.is_a? String and c.is_number?
- if c=~/^\d+$/
- c.to_i
- else
- c.gsub(",",".").to_f
- end
- else
- c
- end
- end
- end
- end
- def convert_to_scale_and_date(ds,fields)
- fields.each do |f|
- if ds[f].can_be_scale?
- ds[f].type=:scale
- elsif ds[f].can_be_date?
- ds[f].type=:date
- end
- end
- end
-
end
end
- class PlainText < SpreadsheetBase
- class << self
- def read(filename, fields)
- ds=Statsample::Dataset.new(fields)
- fp=File.open(filename,"r")
- fp.each_line do |line|
- row=process_row(line.strip.split(/\s+/),[""])
- next if row==["\x1A"]
- ds.add_case_array(row)
- end
- convert_to_scale_and_date(ds,fields)
- ds.update_valid_data
- fields.each {|f|
- ds[f].name=f
- }
- ds
- end
- end
- end
- class Excel < SpreadsheetBase
+
+ # This class has been DEPRECATED. Use Daru::DataFrame::from_excel
+ # Daru::DataFrame#write_excel for XLS file operations.
+ class Excel
class << self
# Write a Excel spreadsheet based on a dataset
# * TODO: Format nicely date values
+ #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame#write_csv.
def write(dataset,filename)
- require 'spreadsheet'
- book = Spreadsheet::Workbook.new
- sheet = book.create_worksheet
- format = Spreadsheet::Format.new :color => :blue,
- :weight => :bold
- sheet.row(0).concat(dataset.fields.map {|i| i.dup}) # Unfreeze strings
- sheet.row(0).default_format = format
- i=1
- dataset.each_array{|row|
- sheet.row(i).concat(row)
- i+=1
- }
- book.write(filename)
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame#write_excel instead."
end
- # This should be fixed.
- # If we have a Formula, should be resolver first
- def preprocess_row(row, dates)
- i=-1
- row.collect!{|c|
- i+=1
- if c.is_a? Spreadsheet::Formula
- if(c.value.is_a? Spreadsheet::Excel::Error)
- nil
- else
- c.value
- end
- elsif dates.include? i and !c.nil? and c.is_a? Numeric
- row.date(i)
- else
- c
- end
- }
- end
- private :process_row, :preprocess_row
-
# Returns a dataset based on a xls file
- # USE:
- # ds = Statsample::Excel.read("test.xls")
- #
+ #
+ # == NOTE
+ #
+ # Deprecated. Use Daru::DataFrame.from_excel instead.
def read(filename, opts=Hash.new)
- require 'spreadsheet'
- raise "options should be Hash" unless opts.is_a? Hash
- opts_default={
- :worksheet_id=>0,
- :ignore_lines=>0,
- :empty=>['']
- }
-
- opts=opts_default.merge opts
-
- worksheet_id=opts[:worksheet_id]
- ignore_lines=opts[:ignore_lines]
- empty=opts[:empty]
-
- first_row=true
- fields=[]
- fields_data={}
- ds=nil
- line_number=0
- book = Spreadsheet.open filename
- sheet= book.worksheet worksheet_id
- sheet.each do |row|
- begin
- dates=[]
- row.formats.each_index{|i|
- if !row.formats[i].nil? and row.formats[i].number_format=="DD/MM/YYYY"
- dates.push(i)
- end
- }
- line_number+=1
- next if(line_number<=ignore_lines)
-
- preprocess_row(row,dates)
- if first_row
- fields=extract_fields(row)
- ds=Statsample::Dataset.new(fields)
- first_row=false
- else
- rowa=process_row(row,empty)
- (fields.size - rowa.size).times {
- rowa << nil
- }
- ds.add_case(rowa,false)
- end
- rescue => e
- error="#{e.to_s}\nError on Line # #{line_number}:#{row.join(",")}"
- raise
- end
- end
- convert_to_scale_and_date(ds, fields)
- ds.update_valid_data
- fields.each {|f|
- ds[f].name=f
- }
- ds.name=filename
- ds
+ raise NoMethodError, "Deprecated. Use Daru::DataFrame.from_excel instead."
end
end
end
+
module Mx
class << self
def write(dataset,filename,type=:covariance)
@@ -250,12 +89,12 @@ def write(dataset,filename,type=:covariance)
fp.puts "! #{filename}"
fp.puts "! Output generated by Statsample"
fp.puts "Data Ninput=#{dataset.fields.size} Nobservations=#{dataset.cases}"
- fp.puts "Labels "+dataset.fields.join(" ")
+ fp.puts "Labels " + dataset.vectors.to_a.join(" ")
case type
when :raw
fp.puts "Rectangular"
dataset.each do |row|
- out=dataset.fields.collect do |f|
+ out=dataset.vectors.to_a.collect do |f|
if dataset[f].is_valid? row[f]
row[f]
else
@@ -293,22 +132,22 @@ def out(dataset,opt={})
carrier=OpenStruct.new
carrier.categorials=[]
carrier.conversions={}
- variables_def=dataset.fields.collect{|k|
+ variables_def=dataset.vectors.to_a.collect{|k|
variable_definition(carrier,dataset[k],k)
}.join("\n")
-
+
indexes=carrier.categorials.inject({}) {|s,c|
- s[dataset.fields.index(c)]=c
+ s[dataset.vectors.to_a.index(c)]=c
s
}
records=""
- dataset.each_array {|c|
- indexes.each{|ik,iv|
- c[ik]=carrier.conversions[iv][c[ik]]
+ dataset.each_row {|c|
+ indexes.each { |ik,iv|
+ c[ik] = carrier.conversions[iv][c[ik]]
}
records << "#{values_definition(c, default_opt[:missing])}\n"
}
-
+
out=<
@@ -346,7 +185,7 @@ def values_definition(c,missing)
# nickname = nickname
def variable_definition(carrier,v,name,nickname=nil)
nickname = (nickname.nil? ? "" : "nickname=\"#{nickname}\"" )
- if v.type==:nominal or v.data.find {|d| d.is_a? String }
+ if v.type==:object or v.to_a.find {|d| d.is_a? String }
carrier.categorials.push(name)
carrier.conversions[name]={}
factors=v.factors
@@ -354,17 +193,16 @@ def variable_definition(carrier,v,name,nickname=nil)
out << "\n"
out << (1..factors.size).to_a.collect{|i|
carrier.conversions[name][factors[i-1]]=i
- "#{v.labeling(factors[i-1])}"
+ "#{(v.labels[factors[i-1]] || factors[i-1])}"
}.join("\n")
out << "\n\n"
out
- elsif v.data.find {|d| d.is_a? Float}
+ elsif v.to_a.find {|d| d.is_a? Float}
""
else
""
end
end
-
end
end
end
diff --git a/lib/statsample/crosstab.rb b/lib/statsample/crosstab.rb
index 75cf075..6dc4710 100644
--- a/lib/statsample/crosstab.rb
+++ b/lib/statsample/crosstab.rb
@@ -8,45 +8,46 @@ class Crosstab
attr_reader :v_rows, :v_cols
attr_accessor :row_label, :column_label, :name, :percentage_row, :percentage_column, :percentage_total
def initialize(v1, v2, opts=Hash.new)
- #raise ArgumentError, "Both arguments should be Vectors" unless v1.is_a? Statsample::Vector and v2.is_a? Statsample::Vector
raise ArgumentError, "Vectors should be the same size" unless v1.size==v2.size
- @v_rows, @v_cols=Statsample.only_valid_clone(v1.to_vector,v2.to_vector)
- @cases=@v_rows.size
- @row_label=v1.name
- @column_label=v2.name
- @name=nil
+ @v_rows, @v_cols = Statsample.only_valid_clone(
+ Daru::Vector.new(v1),
+ Daru::Vector.new(v2))
+ @cases = @v_rows.size
+ @row_label = v1.name
+ @column_label = v2.name
+ @name = nil
@percentage_row = @percentage_column = @percentage_total=false
- opts.each{|k,v|
+ opts.each do |k,v|
self.send("#{k}=",v) if self.respond_to? k
- }
- @name||=_("Crosstab %s - %s") % [@row_label, @column_label]
+ end
+ @name ||= _("Crosstab %s - %s") % [@row_label, @column_label]
end
def rows_names
- @v_rows.factors.sort
+ @v_rows.factors.sort.reset_index!
end
def cols_names
- @v_cols.factors.sort
+ @v_cols.factors.sort.reset_index!
end
def rows_total
- @v_rows.frequencies
+ @v_rows.frequencies.to_h
end
def cols_total
- @v_cols.frequencies
+ @v_cols.frequencies.to_h
end
def frequencies
- base=rows_names.inject([]){|s,row|
- s+=cols_names.collect{|col| [row,col]}
- }.inject({}) {|s,par|
+ base = rows_names.inject([]) do |s,row|
+ s += cols_names.collect { |col| [row,col] }
+ end.inject({}) do |s,par|
s[par]=0
s
- }
- base.update(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a.to_vector.frequencies)
+ end
+ base.update(Daru::Vector.new(Statsample::vector_cols_matrix(@v_rows,@v_cols).to_a).frequencies.to_h)
end
def to_matrix
- f=frequencies
- rn=rows_names
- cn=cols_names
+ f = frequencies
+ rn = rows_names
+ cn = cols_names
Matrix.rows(rn.collect{|row|
cn.collect{|col| f[[row,col]]}
})
@@ -67,8 +68,8 @@ def frequencies_by_col
end
# Chi square, based on expected and real matrix
def chi_square
- require 'statsample/test'
- Statsample::Test.chi_square(self.to_matrix, matrix_expected)
+ require 'statsample/test'
+ Statsample::Test.chi_square(self.to_matrix, matrix_expected)
end
# Useful to obtain chi square
def matrix_expected
@@ -98,10 +99,10 @@ def report_building(builder)
generator.text(_("Rows: %s") % @row_label) unless @row_label.nil?
generator.text(_("Columns: %s") % @column_label) unless @column_label.nil?
- t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c)}+[_("Total")])
+ t=ReportBuilder::Table.new(:name=>@name+" - "+_("Raw"), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c)}+[_("Total")])
rn.each do |row|
total_row=0
- t_row=[@v_rows.labeling(row)]
+ t_row=[@v_rows.index_of(row)]
cn.each do |col|
data=fq[[row,col]]
total_row+=fq[[row,col]]
@@ -148,9 +149,9 @@ def table_percentage(generator,type)
when :total then _("% Total")
end
- t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.labeling(c) } + [_("Total")])
+ t=ReportBuilder::Table.new(:name=>@name+" - "+_(type_name), :header=>[""]+cols_names.collect {|c| @v_cols.index_of(c) } + [_("Total")])
rn.each do |row|
- t_row=[@v_rows.labeling(row)]
+ t_row=[@v_rows.index_of(row)]
cn.each do |col|
total=case type
when :row then rt[row]
diff --git a/lib/statsample/daru.rb b/lib/statsample/daru.rb
new file mode 100644
index 0000000..21f111a
--- /dev/null
+++ b/lib/statsample/daru.rb
@@ -0,0 +1,115 @@
+# Opening the Daru::DataFrame class for adding methods to convert from
+# data structures to specialized statsample data structues like Multiset.
+module Daru
+ class Vector
+ def histogram(bins=10)
+ type == :numeric or raise TypeError, "Only numeric Vectors can do this operation."
+
+ if bins.is_a? Array
+ h = Statsample::Histogram.alloc(bins)
+ else
+ # ugly patch. The upper limit for a bin has the form
+ # x < range
+ #h=Statsample::Histogram.new(self, bins)
+ valid = reject_values(*Daru::MISSING_VALUES)
+ min,max=Statsample::Util.nice(valid.min,valid.max)
+ # fix last data
+ if max == valid.max
+ max += 1e-10
+ end
+ h = Statsample::Histogram.alloc(bins,[min,max])
+ # Fix last bin
+ end
+
+ h.increment(valid)
+ h
+ end
+
+ # Variance of p, according to poblation size
+ def variance_proportion(n_poblation, v=1)
+ Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
+ end
+
+ # Variance of p, according to poblation size
+ def variance_total(n_poblation, v=1)
+ Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
+ end
+
+ def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
+ Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
+ end
+
+ def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
+ Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
+ end
+ end
+
+ class DataFrame
+ def crosstab(v1,v2,opts={})
+ Statsample::Crosstab.new(self[v1], self[v2],opts)
+ end
+
+ # Functions for converting to Statsample::Multiset
+ def to_multiset_by_split(*vecs)
+ require 'statsample/multiset'
+
+ if vecs.size == 1
+ to_multiset_by_split_one_field(vecs[0])
+ else
+ to_multiset_by_split_multiple_fields(*vecs)
+ end
+ end
+
+ # Creates a Statsample::Multiset, using one field
+ def to_multiset_by_split_one_field(field)
+ raise ArgumentError,"Should use a correct field name" if
+ !@vectors.include? field
+
+ factors = self[field].factors
+ ms = Statsample::Multiset.new_empty_vectors(@vectors.to_a, factors)
+ each_row do |row|
+ ms[row[field]].add_row(row)
+ end
+ #puts "Ingreso a los dataset"
+ ms.datasets.each do |k,ds|
+ ds.rename self[field].index_of(k)
+ end
+
+ ms
+ end
+
+ def to_multiset_by_split_multiple_fields(*fields)
+ fields.map!(&:to_sym)
+ factors_total=nil
+ fields.each do |f|
+ if factors_total.nil?
+ factors_total = self[f].factors.collect { |c| [c] }
+ else
+ suma = []
+ factors = self[f].factors
+ factors_total.each do |f1|
+ factors.each do |f2|
+ suma.push(f1+[f2])
+ end
+ end
+ factors_total = suma
+ end
+ end
+ ms = Statsample::Multiset.new_empty_vectors(vectors.to_a, factors_total)
+
+ p1 = eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}'.to_sym]"}.join(",")+"]].add_row(c) }"
+ each_row { |r| p1.call(r) }
+
+ ms.datasets.each do |k,ds|
+ ds.rename(
+ fields.size.times.map do |i|
+ f = fields[i]
+ sk = k[i]
+ self[f].index_of(sk)
+ end.join("-")
+ )
+ end
+ ms
+ end
+ end
+end
\ No newline at end of file
diff --git a/lib/statsample/dataset.rb b/lib/statsample/dataset.rb
index fbeea85..5243d12 100644
--- a/lib/statsample/dataset.rb
+++ b/lib/statsample/dataset.rb
@@ -1,1005 +1,10 @@
require 'statsample/vector'
class Hash
- # Creates a Statsample::Dataset based on a Hash
- def to_dataset(*args)
- Statsample::Dataset.new(self, *args)
+ # Creates a Statsample::Dataset based on a Hash
+ def to_dataframe(*args)
+ Daru::DataFrame.new(self, *args)
end
-end
-
-class Array
- def prefix(s) # :nodoc:
- self.collect{|c| s+c.to_s }
- end
- def suffix(s) # :nodoc:
- self.collect{|c| c.to_s+s }
- end
-end
-
-module Statsample
- class DatasetException < RuntimeError # :nodoc:
- attr_reader :ds,:exp
- def initialize(ds,e)
- @ds=ds
- @exp=e
- end
- def to_s
- m="Error on iteration: "+@exp.message+"\n"+@exp.backtrace.join("\n")
- m+="\nRow ##{@ds.i}:#{@ds.case_as_hash(@ds.i)}" unless @ds.i.nil?
- m
- end
- end
- # Set of cases with values for one or more variables,
- # analog to a dataframe on R or a standard data file of SPSS.
- # Every vector has #field name, which represent it. By default,
- # the vectors are ordered by it field name, but you can change it
- # the fields order manually.
- # The Dataset work as a Hash, with keys are field names
- # and values are Statsample::Vector
- #
- #
- # ==Usage
- # Create a empty dataset:
- # Dataset.new()
- # Create a dataset with three empty vectors, called v1, v2 and v3:
- # Dataset.new(%w{v1 v2 v3})
- # Create a dataset with two vectors, called v1
- # and v2:
- # Dataset.new({'v1'=>%w{1 2 3}.to_vector, 'v2'=>%w{4 5 6}.to_vector})
- # Create a dataset with two given vectors (v1 and v2),
- # with vectors on inverted order:
- # Dataset.new({'v2'=>v2,'v1'=>v1},['v2','v1'])
- #
- # The fast way to create a dataset uses Hash#to_dataset, with
- # field order as arguments
- # v1 = [1,2,3].to_scale
- # v2 = [1,2,3].to_scale
- # ds = {'v1'=>v2, 'v2'=>v2}.to_dataset(%w{v2 v1})
-
- class Dataset
- include Writable
- include Summarizable
- # Hash of Statsample::Vector
- attr_reader :vectors
- # Ordered ids of vectors
- attr_reader :fields
- # Name of dataset
- attr_accessor :name
- # Number of cases
- attr_reader :cases
- # Location of pointer on enumerations methods (like #each)
- attr_reader :i
-
- # Generates a new dataset, using three vectors
- # - Rows
- # - Columns
- # - Values
- #
- # For example, you have these values
- #
- # x y v
- # a a 0
- # a b 1
- # b a 1
- # b b 0
- #
- # You obtain
- # id a b
- # a 0 1
- # b 1 0
- #
- # Useful to process outputs from databases
- def self.crosstab_by_asignation(rows,columns,values)
- raise "Three vectors should be equal size" if rows.size!=columns.size or rows.size!=values.size
- cols_values=columns.factors
- cols_n=cols_values.size
- h_rows=rows.factors.inject({}){|a,v| a[v]=cols_values.inject({}){
- |a1,v1| a1[v1]=nil; a1
- }
- ;a}
- values.each_index{|i|
- h_rows[rows[i]][columns[i]]=values[i]
- }
- ds=Dataset.new(["_id"]+cols_values)
- cols_values.each{|c|
- ds[c].type=values.type
- }
- rows.factors.each {|row|
- n_row=Array.new(cols_n+1)
- n_row[0]=row
- cols_values.each_index {|i|
- n_row[i+1]=h_rows[row][cols_values[i]]
- }
- ds.add_case_array(n_row)
- }
- ds.update_valid_data
- ds
- end
- # Return true if any vector has missing data
- def has_missing_data?
- @vectors.any? {|k,v| v.has_missing_data?}
- end
- # Return a nested hash using fields as keys and
- # an array constructed of hashes with other values.
- # If block provided, is used to provide the
- # values, with parameters +row+ of dataset,
- # +current+ last hash on hierarchy and
- # +name+ of the key to include
- def nest(*tree_keys,&block)
- tree_keys=tree_keys[0] if tree_keys[0].is_a? Array
- out=Hash.new
- each do |row|
- current=out
- # Create tree
- tree_keys[0,tree_keys.size-1].each do |f|
- root=row[f]
- current[root]||=Hash.new
- current=current[root]
- end
- name=row[tree_keys.last]
- if !block
- current[name]||=Array.new
- current[name].push(row.delete_if{|key,value| tree_keys.include? key})
- else
- current[name]=block.call(row, current,name)
- end
- end
- out
- end
- # Creates a new dataset. A dataset is a set of ordered named vectors
- # of the same size.
- #
- # [vectors] With an array, creates a set of empty vectors named as
- # values on the array. With a hash, each Vector is assigned as
- # a variable of the Dataset named as its key
- # [fields] Array of names for vectors. Is only used for set the
- # order of variables. If empty, vectors keys on alfabethic order as
- # used as fields.
- def initialize(vectors={}, fields=[])
- @@n_dataset||=0
- @@n_dataset+=1
- @name=_("Dataset %d") % @@n_dataset
- @cases=0
- @gsl=nil
- @i=nil
-
- if vectors.instance_of? Array
- @fields=vectors.dup
- @vectors=vectors.inject({}){|a,x| a[x]=Statsample::Vector.new(); a}
- else
- # Check vectors
- @vectors=vectors
- @fields=fields
- check_order
- check_length
- end
- end
- #
- # Creates a copy of the given dataset, deleting all the cases with
- # missing data on one of the vectors.
- #
- # @param array of fields to include. No value include all fields
- #
- def dup_only_valid(*fields_to_include)
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
- fields_to_include=fields_to_include[0]
- end
- fields_to_include=@fields if fields_to_include.size==0
- if fields_to_include.any? {|f| @vectors[f].has_missing_data?}
- ds=Dataset.new(fields_to_include)
- fields_to_include.each {|f| ds[f].type=@vectors[f].type}
- each {|row|
- unless fields_to_include.any? {|f| @vectors[f].has_missing_data? and !@vectors[f].is_valid? row[f]}
- row_2=fields_to_include.inject({}) {|ac,v| ac[v]=row[v]; ac}
- ds.add_case(row_2)
- end
- }
- else
- ds=dup fields_to_include
- end
- ds.name= self.name
- ds
- end
- #
- # Returns a duplicate of the Dataset.
- # All vectors are copied, so any modification on new
- # dataset doesn't affect original dataset's vectors.
- # If fields given as parameter, only include those vectors.
- #
- # @param array of fields to include. No value include all fields
- # @return {Statsample::Dataset}
- def dup(*fields_to_include)
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
- fields_to_include=fields_to_include[0]
- end
- fields_to_include=@fields if fields_to_include.size==0
- vectors={}
- fields=[]
- fields_to_include.each{|f|
- raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
- vectors[f]=@vectors[f].dup
- fields.push(f)
- }
- ds=Dataset.new(vectors,fields)
- ds.name= self.name
- ds
- end
-
-
- # Returns an array with the fields from first argumen to last argument
- def from_to(from,to)
- raise ArgumentError, "Field #{from} should be on dataset" if !@fields.include? from
- raise ArgumentError, "Field #{to} should be on dataset" if !@fields.include? to
- @fields.slice(@fields.index(from)..@fields.index(to))
- end
-
- # Returns (when possible) a cheap copy of dataset.
- # If no vector have missing values, returns original vectors.
- # If missing values presents, uses Dataset.dup_only_valid.
- #
- # @param array of fields to include. No value include all fields
- # @return {Statsample::Dataset}
- def clone_only_valid(*fields_to_include)
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
- fields_to_include=fields_to_include[0]
- end
- fields_to_include=@fields.dup if fields_to_include.size==0
- if fields_to_include.any? {|v| @vectors[v].has_missing_data?}
- dup_only_valid(fields_to_include)
- else
- clone(fields_to_include)
- end
- end
- # Returns a shallow copy of Dataset.
- # Object id will be distinct, but @vectors will be the same.
- # @param array of fields to include. No value include all fields
- # @return {Statsample::Dataset}
- def clone(*fields_to_include)
- if fields_to_include.size==1 and fields_to_include[0].is_a? Array
- fields_to_include=fields_to_include[0]
- end
- fields_to_include=@fields.dup if fields_to_include.size==0
- ds=Dataset.new
- fields_to_include.each{|f|
- raise "Vector #{f} doesn't exists" unless @vectors.has_key? f
- ds[f]=@vectors[f]
- }
- ds.fields=fields_to_include
- ds.name=@name
- ds.update_valid_data
- ds
- end
- # Creates a copy of the given dataset, without data on vectors
- #
- # @return {Statsample::Dataset}
- def dup_empty
- vectors=@vectors.inject({}) {|a,v|
- a[v[0]]=v[1].dup_empty
- a
- }
- Dataset.new(vectors,@fields.dup)
- end
- # Merge vectors from two datasets
- # In case of name collition, the vectors names are changed to
- # x_1, x_2 ....
- #
- # @return {Statsample::Dataset}
- def merge(other_ds)
- raise "Cases should be equal (this:#{@cases}; other:#{other_ds.cases}" unless @cases==other_ds.cases
- types = @fields.collect{|f| @vectors[f].type} + other_ds.fields.collect{|f| other_ds[f].type}
- new_fields = (@fields+other_ds.fields).recode_repeated
- ds_new=Statsample::Dataset.new(new_fields)
- new_fields.each_index{|i|
- field=new_fields[i]
- ds_new[field].type=types[i]
- }
- @cases.times {|i|
- row=case_as_array(i)+other_ds.case_as_array(i)
- ds_new.add_case_array(row)
- }
- ds_new.update_valid_data
- ds_new
- end
-
- # Join 2 Datasets by given fields
- # type is one of :left and :inner, default is :left
- #
- # @return {Statsample::Dataset}
- def join(other_ds,fields_1=[],fields_2=[],type=:left)
- fields_new = other_ds.fields - fields_2
- fields = self.fields + fields_new
- other_ds_hash = {}
- other_ds.each do |row|
- key = row.select{|k,v| fields_2.include?(k)}.values
- value = row.select{|k,v| fields_new.include?(k)}
- if other_ds_hash[key].nil?
- other_ds_hash[key] = [value]
- else
- other_ds_hash[key] << value
- end
- end
-
- new_ds = Dataset.new(fields)
-
- self.each do |row|
- key = row.select{|k,v| fields_1.include?(k)}.values
-
- new_case = row.dup
-
- if other_ds_hash[key].nil?
- if type == :left
- fields_new.each{|field| new_case[field] = nil}
- new_ds.add_case(new_case)
- end
- else
- other_ds_hash[key].each do |new_values|
- new_ds.add_case new_case.merge(new_values)
- end
- end
-
- end
- new_ds
- end
- # Returns a dataset with standarized data.
- #
- # @return {Statsample::Dataset}
- def standarize
- ds=dup()
- ds.fields.each do |f|
- ds[f]=ds[f].vector_standarized
- end
- ds
- end
- # Generate a matrix, based on fields of dataset
- #
- # @return {::Matrix}
-
- def collect_matrix
- rows=@fields.collect{|row|
- @fields.collect{|col|
- yield row,col
- }
- }
- Matrix.rows(rows)
- end
-
- # We have the same datasets if +vectors+ and +fields+ are the same
- #
- # @return {Boolean}
- def ==(d2)
- @vectors==d2.vectors and @fields==d2.fields
- end
- # Returns vector c
- #
- # @return {Statsample::Vector}
- def col(c)
- @vectors[c]
- end
- alias_method :vector, :col
- # Equal to Dataset[name]=vector
- #
- # @return self
- def add_vector(name, vector)
- raise ArgumentError, "Vector have different size" if vector.size!=@cases
- @vectors[name]=vector
- check_order
- self
- end
- # Returns true if dataset have vector v.
- #
- # @return {Boolean}
- def has_vector? (v)
- return @vectors.has_key?(v)
- end
- # Creates a dataset with the random data, of a n size
- # If n not given, uses original number of cases.
- #
- # @return {Statsample::Dataset}
- def bootstrap(n=nil)
- n||=@cases
- ds_boot=dup_empty
- n.times do
- ds_boot.add_case_array(case_as_array(rand(n)))
- end
- ds_boot.update_valid_data
- ds_boot
- end
- # Fast version of #add_case.
- # Can only add one case and no error check if performed
- # You SHOULD use #update_valid_data at the end of insertion cycle
- #
- #
- def add_case_array(v)
- v.each_index {|i| d=@vectors[@fields[i]].data; d.push(v[i])}
- end
- # Insert a case, using:
- # * Array: size equal to number of vectors and values in the same order as fields
- # * Hash: keys equal to fields
- # If uvd is false, #update_valid_data is not executed after
- # inserting a case. This is very useful if you want to increase the
- # performance on inserting many cases, because #update_valid_data
- # performs check on vectors and on the dataset
-
- def add_case(v,uvd=true)
- case v
- when Array
- if (v[0].is_a? Array)
- v.each{|subv| add_case(subv,false)}
- else
- raise ArgumentError, "Input array size (#{v.size}) should be equal to fields number (#{@fields.size})" if @fields.size!=v.size
- v.each_index {|i| @vectors[@fields[i]].add(v[i],false)}
- end
- when Hash
- raise ArgumentError, "Hash keys should be equal to fields #{(v.keys - @fields).join(",")}" if @fields.sort!=v.keys.sort
- @fields.each{|f| @vectors[f].add(v[f],false)}
- else
- raise TypeError, 'Value must be a Array or a Hash'
- end
- if uvd
- update_valid_data
- end
- end
- # Check vectors and fields after inserting data. Use only
- # after #add_case_array or #add_case with second parameter to false
- def update_valid_data
- @gsl=nil
- @fields.each{|f| @vectors[f].set_valid_data}
- check_length
- end
- # Delete vector named +name+. Multiple fields accepted.
- def delete_vector(*args)
- if args.size==1 and args[0].is_a? Array
- names=args[0]
- else
- names=args
- end
- names.each do |name|
- @fields.delete(name)
- @vectors.delete(name)
- end
- end
-
- def add_vectors_by_split_recode(name_,join='-',sep=Statsample::SPLIT_TOKEN)
- split=@vectors[name_].split_by_separator(sep)
- i=1
- split.each{|k,v|
- new_field=name_+join+i.to_s
- v.name=name_+":"+k
- add_vector(new_field,v)
- i+=1
- }
- end
- def add_vectors_by_split(name,join='-',sep=Statsample::SPLIT_TOKEN)
- split=@vectors[name].split_by_separator(sep)
- split.each{|k,v|
- add_vector(name+join+k,v)
- }
- end
-
- def vector_by_calculation(type=:scale)
- a=[]
- each do |row|
- a.push(yield(row))
- end
- a.to_vector(type)
- end
- # Returns a vector with sumatory of fields
- # if fields parameter is empty, sum all fields
- def vector_sum(fields=nil)
- fields||=@fields
- vector=collect_with_index do |row, i|
- if(fields.find{|f| !@vectors[f].data_with_nils[i]})
- nil
- else
- fields.inject(0) {|ac,v| ac + row[v].to_f}
- end
- end
- vector.name=_("Sum from %s") % @name
- vector
- end
- # Check if #fields attribute is correct, after inserting or deleting vectors
- def check_fields(fields)
- fields||=@fields
- raise "Fields #{(fields-@fields).join(", ")} doesn't exists on dataset" if (fields-@fields).size>0
- fields
- end
-
- # Returns a vector with the numbers of missing values for a case
- def vector_missing_values(fields=nil)
- fields=check_fields(fields)
- collect_with_index do |row, i|
- fields.inject(0) {|a,v|
- a+ ((@vectors[v].data_with_nils[i].nil?) ? 1: 0)
- }
- end
- end
- def vector_count_characters(fields=nil)
- fields=check_fields(fields)
- collect_with_index do |row, i|
- fields.inject(0){|a,v|
- a+((@vectors[v].data_with_nils[i].nil?) ? 0: row[v].to_s.size)
- }
- end
- end
- # Returns a vector with the mean for a set of fields
- # if fields parameter is empty, return the mean for all fields
- # if max invalid parameter > 0, returns the mean for all tuples
- # with 0 to max_invalid invalid fields
- def vector_mean(fields=nil, max_invalid=0)
- a=[]
- fields=check_fields(fields)
- size=fields.size
- each_with_index do |row, i |
- # numero de invalidos
- sum=0
- invalids=0
- fields.each{|f|
- if !@vectors[f].data_with_nils[i].nil?
- sum+=row[f].to_f
- else
- invalids+=1
- end
- }
- if(invalids>max_invalid)
- a.push(nil)
- else
- a.push(sum.quo(size-invalids))
- end
- end
- a=a.to_vector(:scale)
- a.name=_("Means from %s") % @name
- a
- end
- # Check vectors for type and size.
- def check_length # :nodoc:
- size=nil
- @vectors.each do |k,v|
- raise Exception, "Data #{v.class} is not a vector on key #{k}" if !v.is_a? Statsample::Vector
- if size.nil?
- size=v.size
- else
- if v.size!=size
- raise Exception, "Vector #{k} have size #{v.size} and dataset have size #{size}"
- end
- end
- end
- @cases=size
- end
- # Retrieves each vector as [key, vector]
- def each_vector # :yield: |key, vector|
- @fields.each{|k| yield k, @vectors[k]}
- end
-
- if Statsample::STATSAMPLE__.respond_to?(:case_as_hash)
- def case_as_hash(c) # :nodoc:
- Statsample::STATSAMPLE__.case_as_hash(self,c)
- end
- else
- # Retrieves case i as a hash
- def case_as_hash(i)
- _case_as_hash(i)
- end
- end
-
- if Statsample::STATSAMPLE__.respond_to?(:case_as_array)
- def case_as_array(c) # :nodoc:
- Statsample::STATSAMPLE__.case_as_array(self,c)
- end
- else
- # Retrieves case i as a array, ordered on #fields order
- def case_as_array(i)
- _case_as_array(i)
- end
- end
- def _case_as_hash(c) # :nodoc:
- @fields.inject({}) {|a,x| a[x]=@vectors[x][c];a }
- end
- def _case_as_array(c) # :nodoc:
- @fields.collect {|x| @vectors[x][c]}
- end
-
- # Returns each case as a hash
- def each
- begin
- @i=0
- @cases.times {|i|
- @i=i
- row=case_as_hash(i)
- yield row
- }
- @i=nil
- rescue =>e
- raise DatasetException.new(self, e)
- end
- end
-
- # Returns each case as hash and index
- def each_with_index # :yield: |case, i|
- begin
- @i=0
- @cases.times{|i|
- @i=i
- row=case_as_hash(i)
- yield row, i
- }
- @i=nil
- rescue =>e
- raise DatasetException.new(self, e)
- end
- end
-
- # Returns each case as an array, coding missing values as nils
- def each_array_with_nils
- m=fields.size
- @cases.times {|i|
- @i=i
- row=Array.new(m)
- fields.each_index{|j|
- f=fields[j]
- row[j]=@vectors[f].data_with_nils[i]
- }
- yield row
- }
- @i=nil
- end
- # Returns each case as an array
- def each_array
- @cases.times {|i|
- @i=i
- row=case_as_array(i)
- yield row
- }
- @i=nil
- end
- # Set fields order. If you omit one or more vectors, they are
- # ordered by alphabetic order.
- def fields=(f)
- @fields=f
- check_order
- end
- # Check congruence between +fields+ attribute
- # and keys on +vectors
- def check_order #:nodoc:
- if(@vectors.keys.sort!=@fields.sort)
- @fields=@fields&@vectors.keys
- @fields+=@vectors.keys.sort-@fields
- end
- end
- # Returns the vector named i
- def[](i)
- if i.is_a? Range
- fields=from_to(i.begin,i.end)
- clone(*fields)
- elsif i.is_a? Array
- clone(i)
- else
- raise Exception,"Vector '#{i}' doesn't exists on dataset" unless @vectors.has_key?(i)
- @vectors[i]
- end
- end
- # Retrieves a Statsample::Vector, based on the result
- # of calculation performed on each case.
- def collect(type=:scale)
- data=[]
- each {|row|
- data.push yield(row)
- }
- Statsample::Vector.new(data,type)
- end
- # Same as Statsample::Vector.collect, but giving case index as second parameter on yield.
- def collect_with_index(type=:scale)
- data=[]
- each_with_index {|row, i|
- data.push(yield(row, i))
- }
- Statsample::Vector.new(data,type)
- end
- # Recode a vector based on a block
- def recode!(vector_name)
- 0.upto(@cases-1) {|i|
- @vectors[vector_name].data[i]=yield case_as_hash(i)
- }
- @vectors[vector_name].set_valid_data
- end
-
- def crosstab(v1,v2,opts={})
- Statsample::Crosstab.new(@vectors[v1], @vectors[v2],opts)
- end
- def[]=(i,v)
- if v.instance_of? Statsample::Vector
- @vectors[i]=v
- check_order
- else
- raise ArgumentError,"Should pass a Statsample::Vector"
- end
- end
- # Return data as a matrix. Column are ordered by #fields and
- # rows by orden of insertion
- def to_matrix
- rows=[]
- self.each_array{|c|
- rows.push(c)
- }
- Matrix.rows(rows)
- end
-
- if Statsample.has_gsl?
- def clear_gsl
- @gsl=nil
- end
-
- def to_gsl
- if @gsl.nil?
- if cases.nil?
- update_valid_data
- end
- @gsl=GSL::Matrix.alloc(cases,fields.size)
- self.each_array{|c|
- @gsl.set_row(@i,c)
- }
- end
- @gsl
- end
-
- end
-
- # Return a correlation matrix for fields included as parameters.
- # By default, uses all fields of dataset
- def correlation_matrix(fields=nil)
- if fields
- ds=clone(fields)
- else
- ds=self
- end
- Statsample::Bivariate.correlation_matrix(ds)
- end
- # Return a correlation matrix for fields included as parameters.
- # By default, uses all fields of dataset
- def covariance_matrix(fields=nil)
- if fields
- ds=clone(fields)
- else
- ds=self
- end
- Statsample::Bivariate.covariance_matrix(ds)
- end
-
- # Create a new dataset with all cases which the block returns true
- def filter
- ds=self.dup_empty
- each {|c|
- ds.add_case(c, false) if yield c
- }
- ds.update_valid_data
- ds.name=_("%s(filtered)") % @name
- ds
- end
-
- # creates a new vector with the data of a given field which the block returns true
- def filter_field(field)
- a=[]
- each do |c|
- a.push(c[field]) if yield c
- end
- a.to_vector(@vectors[field].type)
- end
-
- # Creates a Stastample::Multiset, using one or more fields
- # to split the dataset.
-
-
- def to_multiset_by_split(*fields)
- require 'statsample/multiset'
- if fields.size==1
- to_multiset_by_split_one_field(fields[0])
- else
- to_multiset_by_split_multiple_fields(*fields)
- end
- end
- # Creates a Statsample::Multiset, using one field
-
- def to_multiset_by_split_one_field(field)
- raise ArgumentError,"Should use a correct field name" if !@fields.include? field
- factors=@vectors[field].factors
- ms=Multiset.new_empty_vectors(@fields, factors)
- each {|c|
- ms[c[field]].add_case(c,false)
- }
- #puts "Ingreso a los dataset"
- ms.datasets.each {|k,ds|
- ds.update_valid_data
- ds.name=@vectors[field].labeling(k)
- ds.vectors.each{|k1,v1|
- # puts "Vector #{k1}:"+v1.to_s
- v1.type=@vectors[k1].type
- v1.name=@vectors[k1].name
- v1.labels=@vectors[k1].labels
-
- }
- }
- ms
- end
- def to_multiset_by_split_multiple_fields(*fields)
- factors_total=nil
- fields.each do |f|
- if factors_total.nil?
- factors_total=@vectors[f].factors.collect{|c|
- [c]
- }
- else
- suma=[]
- factors=@vectors[f].factors
- factors_total.each{|f1| factors.each{|f2| suma.push(f1+[f2]) } }
- factors_total=suma
- end
- end
- ms=Multiset.new_empty_vectors(@fields,factors_total)
-
- p1=eval "Proc.new {|c| ms[["+fields.collect{|f| "c['#{f}']"}.join(",")+"]].add_case(c,false) }"
- each{|c| p1.call(c)}
-
- ms.datasets.each do |k,ds|
- ds.update_valid_data
- ds.name=fields.size.times.map {|i|
- f=fields[i]
- sk=k[i]
- @vectors[f].labeling(sk)
- }.join("-")
- ds.vectors.each{|k1,v1|
- v1.type=@vectors[k1].type
- v1.name=@vectors[k1].name
- v1.labels=@vectors[k1].labels
-
- }
- end
- ms
-
- end
- # Returns a vector, based on a string with a calculation based
- # on vector
- # The calculation will be eval'ed, so you can put any variable
- # or expression valid on ruby
- # For example:
- # a=[1,2].to_vector(scale)
- # b=[3,4].to_vector(scale)
- # ds={'a'=>a,'b'=>b}.to_dataset
- # ds.compute("a+b")
- # => Vector [4,6]
- def compute(text)
- @fields.each{|f|
- if @vectors[f].type=:scale
- text.gsub!(f,"row['#{f}'].to_f")
- else
- text.gsub!(f,"row['#{f}']")
- end
- }
- collect_with_index {|row, i|
- invalid=false
- @fields.each{|f|
- if @vectors[f].data_with_nils[i].nil?
- invalid=true
- end
- }
- if invalid
- nil
- else
- eval(text)
- end
- }
- end
- # Test each row with one or more tests
- # each test is a Proc with the form
- # Proc.new {|row| row['age']>0}
- # The function returns an array with all errors
- def verify(*tests)
- if(tests[0].is_a? String)
- id=tests[0]
- tests.shift
- else
- id=@fields[0]
- end
- vr=[]
- i=0
- each do |row|
- i+=1
- tests.each{|test|
- if ! test[2].call(row)
- values=""
- if test[1].size>0
- values=" ("+test[1].collect{|k| "#{k}=#{row[k]}"}.join(", ")+")"
- end
- vr.push("#{i} [#{row[id]}]: #{test[0]}#{values}")
- end
- }
- end
- vr
- end
- def to_s
- "#<"+self.class.to_s+":"+self.object_id.to_s+" @name=#{@name} @fields=["+@fields.join(",")+"] cases="+@vectors[@fields[0]].size.to_s
- end
- def inspect
- self.to_s
- end
- # Creates a new dataset for one to many relations
- # on a dataset, based on pattern of field names.
- #
- # for example, you have a survey for number of children
- # with this structure:
- # id, name, child_name_1, child_age_1, child_name_2, child_age_2
- # with
- # ds.one_to_many(%w{id}, "child_%v_%n"
- # the field of first parameters will be copied verbatim
- # to new dataset, and fields which responds to second
- # pattern will be added one case for each different %n.
- # For example
- # cases=[
- # ['1','george','red',10,'blue',20,nil,nil],
- # ['2','fred','green',15,'orange',30,'white',20],
- # ['3','alfred',nil,nil,nil,nil,nil,nil]
- # ]
- # ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
- # cases.each {|c| ds.add_case_array c }
- # ds.one_to_many(['id'],'car_%v%n').to_matrix
- # => Matrix[
- # ["red", "1", 10],
- # ["blue", "1", 20],
- # ["green", "2", 15],
- # ["orange", "2", 30],
- # ["white", "2", 20]
- # ]
- #
- def one_to_many(parent_fields, pattern)
- #base_pattern=pattern.gsub(/%v|%n/,"")
- re=Regexp.new pattern.gsub("%v","(.+?)").gsub("%n","(\\d+?)")
- ds_vars=parent_fields
- vars=[]
- max_n=0
- h=parent_fields.inject({}) {|a,v| a[v]=Statsample::Vector.new([], @vectors[v].type);a }
- # Adding _row_id
- h['_col_id']=[].to_scale
- ds_vars.push("_col_id")
- @fields.each do |f|
- if f=~re
- if !vars.include? $1
- vars.push($1)
- h[$1]=Statsample::Vector.new([], @vectors[f].type)
- end
- max_n=$2.to_i if max_n < $2.to_i
- end
- end
- ds=Dataset.new(h,ds_vars+vars)
- each do |row|
- row_out={}
- parent_fields.each do |f|
- row_out[f]=row[f]
- end
- max_n.times do |n1|
- n=n1+1
- any_data=false
- vars.each do |v|
- data=row[pattern.gsub("%v",v.to_s).gsub("%n",n.to_s)]
- row_out[v]=data
- any_data=true if !data.nil?
- end
- if any_data
- row_out["_col_id"]=n
- ds.add_case(row_out,false)
- end
-
- end
- end
- ds.update_valid_data
- ds
- end
- def report_building(b)
- b.section(:name=>@name) do |g|
- g.text _"Cases: %d" % cases
- @fields.each do |f|
- g.text "Element:[#{f}]"
- g.parse_element(@vectors[f])
- end
- end
- end
- end
+ alias :to_dataset :to_dataframe
end
diff --git a/lib/statsample/dominanceanalysis.rb b/lib/statsample/dominanceanalysis.rb
index 6b4da5a..fed0a91 100644
--- a/lib/statsample/dominanceanalysis.rb
+++ b/lib/statsample/dominanceanalysis.rb
@@ -7,13 +7,13 @@ module Statsample
#
# == Use
#
- # a=1000.times.collect {rand}.to_scale
- # b=1000.times.collect {rand}.to_scale
- # c=1000.times.collect {rand}.to_scale
- # ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
- # ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()}
- # da=Statsample::DominanceAnalysis.new(ds,'y')
- # puts da.summary
+ # a = Daru::Vector.new(1000.times.collect {rand})
+ # b = Daru::Vector.new(1000.times.collect {rand})
+ # c = Daru::Vector.new(1000.times.collect {rand})
+ # ds= Daru::DataFrame.new({:a => a,:b => b,:c => c})
+ # ds[:y] = ds.collect_rows {|row| row[:a]*5 + row[:b]*3 + row[:c]*2 + rand()}
+ # da=Statsample::DominanceAnalysis.new(ds, :y)
+ # puts da.summary
#
# === Output:
#
@@ -115,21 +115,21 @@ def initialize(input, dependent, opts=Hash.new)
}
@dependent=dependent
@dependent=[@dependent] unless @dependent.is_a? Array
-
- @predictors ||= input.fields-@dependent
-
- @name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
-
- if input.is_a? Statsample::Dataset
+
+ if input.kind_of? Daru::DataFrame
+ @predictors ||= input.vectors.to_a - @dependent
@ds=input
@matrix=Statsample::Bivariate.correlation_matrix(input)
@cases=Statsample::Bivariate.min_n_valid(input)
elsif input.is_a? ::Matrix
+ @predictors ||= input.fields-@dependent
@ds=nil
@matrix=input
else
raise ArgumentError.new("You should use a Matrix or a Dataset")
end
+
+ @name=_("Dominance Analysis: %s over %s") % [ @predictors.flatten.join(",") , @dependent.join(",")] if @name.nil?
@models=nil
@models_data=nil
@general_averages=nil
@@ -264,22 +264,21 @@ def general_dominance
end
def md(m)
- models_data[m.sort {|a,b| a.to_s<=>b.to_s}]
+ models_data[m.sort {|a,b| a.to_s <=> b.to_s}]
end
# Get all model of size k
def md_k(k)
out=[]
- @models.each{|m| out.push(md(m)) if m.size==k }
+ @models.each{ |m| out.push(md(m)) if m.size==k }
out
end
# For a hash with arrays of numbers as values
# Returns a hash with same keys and
# value as the mean of values of original hash
-
def get_averages(averages)
out={}
- averages.each{|key,val| out[key]=val.to_vector(:scale).mean }
+ averages.each{ |key,val| out[key] = Daru::Vector.new(val).mean }
out
end
# Hash with average for each k size model.
diff --git a/lib/statsample/dominanceanalysis/bootstrap.rb b/lib/statsample/dominanceanalysis/bootstrap.rb
index 32d1588..d81a6fd 100644
--- a/lib/statsample/dominanceanalysis/bootstrap.rb
+++ b/lib/statsample/dominanceanalysis/bootstrap.rb
@@ -5,16 +5,16 @@ class DominanceAnalysis
#
# == Usage
#
- # require 'statsample'
- # a=100.times.collect {rand}.to_scale
- # b=100.times.collect {rand}.to_scale
- # c=100.times.collect {rand}.to_scale
- # d=100.times.collect {rand}.to_scale
- # ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
- # ds['y']=ds.collect{|row| row['a']*5+row['b']*2+row['c']*2+row['d']*2+10*rand()}
- # dab=Statsample::DominanceAnalysis::Bootstrap.new(ds2, 'y', :debug=>true)
- # dab.bootstrap(100,nil)
- # puts dab.summary
+ # require 'statsample'
+ # a = Daru::Vector.new(100.times.collect {rand})
+ # b = Daru::Vector.new(100.times.collect {rand})
+ # c = Daru::Vector.new(100.times.collect {rand})
+ # d = Daru::Vector.new(100.times.collect {rand})
+ # ds = Daru::DataFrame.new({:a => a,:b => b,:c => c,:d => d})
+ # ds[:y] = ds.collect_rows { |row| row[:a]*5+row[:b]*2+row[:c]*2+row[:d]*2+10*rand() }
+ # dab=Statsample::DominanceAnalysis::Bootstrap.new(ds, :y, :debug=>true)
+ # dab.bootstrap(100,nil)
+ # puts dab.summary
# Output
# Sample size: 100
# t: 1.98421693632958
@@ -91,28 +91,28 @@ class Bootstrap
ALPHA=0.95
# Create a new Dominance Analysis Bootstrap Object
#
- # * ds: A Dataset object
+ # * ds: A Daru::DataFrame object
# * y_var: Name of dependent variable
# * opts: Any other attribute of the class
def initialize(ds,y_var, opts=Hash.new)
- @ds=ds
- @y_var=y_var
- @n=ds.cases
+ @ds = ds
+ @y_var = y_var.respond_to?(:to_sym) ? y_var.to_sym : y_var
+ @n = ds.nrows
@n_samples=0
@alpha=ALPHA
@debug=false
if y_var.is_a? Array
- @fields=ds.fields-y_var
+ @fields=ds.vectors.to_a - y_var
@regression_class=Regression::Multiple::MultipleDependent
else
- @fields=ds.fields-[y_var]
+ @fields=ds.vectors.to_a - [y_var]
@regression_class=Regression::Multiple::MatrixEngine
end
- @samples_ga=@fields.inject({}){|a,v| a[v]=[];a}
+ @samples_ga=@fields.inject({}) { |a,v| a[v]=[]; a }
- @name=_("Bootstrap dominance Analysis: %s over %s") % [ ds.fields.join(",") , @y_var]
+ @name=_("Bootstrap dominance Analysis: %s over %s") % [ ds.vectors.to_a.join(",") , @y_var]
opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
@@ -130,15 +130,14 @@ def da
# each sample on @samples_td, @samples_cd, @samples_gd, @samples_ga
#
# * number_samples: Number of new samples to add
- # * n: size of each new sample. If nil, equal to original sample size
-
+ # * n: size of each new sample. If nil, equal to original sample size
def bootstrap(number_samples,n=nil)
number_samples.times{ |t|
@n_samples+=1
puts _("Bootstrap %d of %d") % [t+1, number_samples] if @debug
- ds_boot=@ds.bootstrap(n)
+ ds_boot=@ds.bootstrap(n)
da_1=DominanceAnalysis.new(ds_boot, @y_var, :regression_class => @regression_class)
-
+
da_1.total_dominance.each{|k,v|
@samples_td[k].push(v)
}
@@ -182,7 +181,7 @@ def report_building(builder) # :nodoc:
table.row([_("Complete dominance"),"","","","","","",""])
table.hr
@pairs.each{|pair|
- std=@samples_td[pair].to_vector(:scale)
+ std=Daru::Vector.new(@samples_td[pair])
ttd=da.total_dominance_pairwise(pair[0],pair[1])
table.row(summary_pairs(pair,std,ttd))
}
@@ -190,7 +189,7 @@ def report_building(builder) # :nodoc:
table.row([_("Conditional dominance"),"","","","","","",""])
table.hr
@pairs.each{|pair|
- std=@samples_cd[pair].to_vector(:scale)
+ std=Daru::Vector.new(@samples_cd[pair])
ttd=da.conditional_dominance_pairwise(pair[0],pair[1])
table.row(summary_pairs(pair,std,ttd))
@@ -199,7 +198,7 @@ def report_building(builder) # :nodoc:
table.row([_("General Dominance"),"","","","","","",""])
table.hr
@pairs.each{|pair|
- std=@samples_gd[pair].to_vector(:scale)
+ std=Daru::Vector.new(@samples_gd[pair])
ttd=da.general_dominance_pairwise(pair[0],pair[1])
table.row(summary_pairs(pair,std,ttd))
}
@@ -208,10 +207,9 @@ def report_building(builder) # :nodoc:
table=ReportBuilder::Table.new(:name=>_("General averages"), :header=>[_("var"), _("mean"), _("se"), _("p.5"), _("p.95")])
@fields.each{|f|
- v=@samples_ga[f].to_vector(:scale)
+ v=Daru::Vector.new(@samples_ga[f])
row=[@ds[f].name, sprintf("%0.3f",v.mean), sprintf("%0.3f",v.sd), sprintf("%0.3f",v.percentil(5)),sprintf("%0.3f",v.percentil(95))]
- table.row(row)
-
+ table.row(row)
}
generator.parse_element(table)
diff --git a/lib/statsample/factor.rb b/lib/statsample/factor.rb
index 686b8b2..ac99348 100644
--- a/lib/statsample/factor.rb
+++ b/lib/statsample/factor.rb
@@ -34,7 +34,7 @@ module Factor
# matrix is not appropriate for factor analysis."
#
def self.anti_image_covariance_matrix(matrix)
- s2=Matrix.diag(*(matrix.inverse.diagonal)).inverse
+ s2=Matrix.diagonal(*(matrix.inverse.diagonal)).inverse
aicm=(s2)*matrix.inverse*(s2)
aicm.extend(Statsample::CovariateMatrix)
aicm.fields=matrix.fields if matrix.respond_to? :fields
@@ -42,13 +42,12 @@ def self.anti_image_covariance_matrix(matrix)
end
def self.anti_image_correlation_matrix(matrix)
matrix=matrix.to_matrix
- s=Matrix.diag(*(matrix.inverse.diagonal)).sqrt.inverse
+ s=Matrix.diagonal(*(matrix.inverse.diagonal)).sqrt.inverse
aicm=s*matrix.inverse*s
aicm.extend(Statsample::CovariateMatrix)
aicm.fields=matrix.fields if matrix.respond_to? :fields
aicm
-
end
# Kaiser-Meyer-Olkin measure of sampling adequacy for correlation matrix.
@@ -101,6 +100,5 @@ def self.kmo_univariate(matrix, var)
end
sum_r.quo(sum_r+sum_q)
end
-
end
end
diff --git a/lib/statsample/factor/map.rb b/lib/statsample/factor/map.rb
index 963763a..26ac880 100644
--- a/lib/statsample/factor/map.rb
+++ b/lib/statsample/factor/map.rb
@@ -75,7 +75,8 @@ def compute
(ncol-1).times do |m|
puts "MAP:Eigenvalue #{m+1}" if $DEBUG
- a=loadings[0..(loadings.row_size-1),0..m]
+ a=use_gsl ? loadings[0..(loadings.row_size-1),0..m] :
+ loadings.minor(0..(loadings.row_size-1),0..m)
partcov= gsl_m - (a*a.transpose)
d=klass_m.diagonal(*(partcov.diagonal.collect {|v| Math::sqrt(1/v)}))
diff --git a/lib/statsample/factor/parallelanalysis.rb b/lib/statsample/factor/parallelanalysis.rb
index 5a7ff28..4f9cb48 100644
--- a/lib/statsample/factor/parallelanalysis.rb
+++ b/lib/statsample/factor/parallelanalysis.rb
@@ -22,13 +22,13 @@ module Factor
class ParallelAnalysis
def self.with_random_data(cases,vars,opts=Hash.new)
- require 'ostruct'
- ds=OpenStruct.new
- ds.fields=vars.times.map {|i| "v#{i+1}"}
- ds.cases=cases
+ ds= Daru::DataFrame.new({},
+ order: vars.times.map {|i| "v#{i+1}".to_sym},
+ index: cases )
opts=opts.merge({:bootstrap_method=> :random, :no_data=>true})
new(ds, opts)
end
+
include DirtyMemoize
include Summarizable
# Number of random sets to produce. 50 by default
@@ -61,9 +61,9 @@ def self.with_random_data(cases,vars,opts=Hash.new)
attr_accessor :use_gsl
def initialize(ds, opts=Hash.new)
@ds=ds
- @fields=@ds.fields
+ @fields=@ds.vectors.to_a
@n_variables=@fields.size
- @n_cases=ds.cases
+ @n_cases=ds.nrows
opts_default={
:name=>_("Parallel Analysis"),
:iterations=>50, # See Liu and Rijmen (2008)
@@ -82,7 +82,7 @@ def initialize(ds, opts=Hash.new)
# Number of factor to retent
def number_of_factors
total=0
- ds_eigenvalues.fields.each_with_index do |f,i|
+ ds_eigenvalues.vectors.to_a.each_with_index do |f,i|
if (@original[i]>0 and @original[i]>ds_eigenvalues[f].percentil(percentil))
total+=1
else
@@ -101,7 +101,7 @@ def report_building(g) #:nodoc:
s.text _("Number of iterations: %d") % @iterations
if @no_data
s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("generated eigenvalue"), "p.#{percentil}"]) do |t|
- ds_eigenvalues.fields.each_with_index do |f,i|
+ ds_eigenvalues.vectors.to_a.each_with_index do |f,i|
v=ds_eigenvalues[f]
t.row [i+1, "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), ]
end
@@ -109,7 +109,7 @@ def report_building(g) #:nodoc:
else
s.text _("Number or factors to preserve: %d") % number_of_factors
s.table(:name=>_("Eigenvalues"), :header=>[_("n"), _("data eigenvalue"), _("generated eigenvalue"),"p.#{percentil}",_("preserve?")]) do |t|
- ds_eigenvalues.fields.each_with_index do |f,i|
+ ds_eigenvalues.vectors.to_a.each_with_index do |f,i|
v=ds_eigenvalues[f]
t.row [i+1, "%0.4f" % @original[i], "%0.4f" % v.mean, "%0.4f" % v.percentil(percentil), (v.percentil(percentil)>0 and @original[i] > v.percentil(percentil)) ? "Yes":""]
end
@@ -120,11 +120,9 @@ def report_building(g) #:nodoc:
end
# Perform calculation. Shouldn't be called directly for the user
def compute
+ @original=Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data
+ @ds_eigenvalues=Daru::DataFrame.new({}, order: (1..@n_variables).map{|v| ("ev_%05d" % v).to_sym})
-
- @original=Statsample::Bivariate.send(matrix_method, @ds).eigenvalues unless no_data
- @ds_eigenvalues=Statsample::Dataset.new((1..@n_variables).map{|v| "ev_%05d" % v})
- @ds_eigenvalues.fields.each {|f| @ds_eigenvalues[f].type=:scale}
if bootstrap_method==:parameter or bootstrap_method==:random
rng = Distribution::Normal.rng
end
@@ -133,19 +131,18 @@ def compute
begin
puts "#{@name}: Iteration #{i}" if $DEBUG or debug
# Create a dataset of dummy values
- ds_bootstrap=Statsample::Dataset.new(@ds.fields)
+ ds_bootstrap = Daru::DataFrame.new({}, order: @ds.vectors, index: @n_cases)
@fields.each do |f|
if bootstrap_method==:random
- ds_bootstrap[f]=@n_cases.times.map {|c| rng.call}.to_scale
+ ds_bootstrap[f] = Daru::Vector.new(@n_cases.times.map {|c| rng.call})
elsif bootstrap_method==:data
- ds_bootstrap[f]=ds[f].sample_with_replacement(@n_cases)
+ ds_bootstrap[f] = ds[f].sample_with_replacement(@n_cases)
else
raise "bootstrap_method doesn't recogniced"
end
end
- ds_bootstrap.update_valid_data
-
+
matrix=Statsample::Bivariate.send(matrix_method, ds_bootstrap)
matrix=matrix.to_gsl if @use_gsl
if smc
@@ -155,13 +152,12 @@ def compute
end
end
ev=matrix.eigenvalues
- @ds_eigenvalues.add_case_array(ev)
+ @ds_eigenvalues.add_row(ev)
rescue Statsample::Bivariate::Tetrachoric::RequerimentNotMeet => e
puts "Error: #{e}" if $DEBUG
redo
end
end
- @ds_eigenvalues.update_valid_data
end
dirty_memoize :number_of_factors, :ds_eigenvalues
dirty_writer :iterations, :bootstrap_method, :percentil, :smc
diff --git a/lib/statsample/factor/pca.rb b/lib/statsample/factor/pca.rb
index fa5fb37..799c185 100644
--- a/lib/statsample/factor/pca.rb
+++ b/lib/statsample/factor/pca.rb
@@ -13,11 +13,11 @@ module Factor
#
# == Usage:
# require 'statsample'
- # a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
- # b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
- # ds={'a'=>a,'b'=>b}.to_dataset
- # cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
- # pca=Statsample::Factor::PCA.new(cor_matrix)
+ # a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1])
+ # b = Daru::Vector.new([2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9])
+ # ds = Daru::DataFrame.new({:a => a,:b => b})
+ # cor_matrix = Statsample::Bivariate.correlation_matrix(ds)
+ # pca= Statsample::Factor::PCA.new(cor_matrix)
# pca.m
# => 1
# pca.eigenvalues
@@ -52,11 +52,13 @@ class PCA
attr_accessor :rotation_type
attr_accessor :matrix_type
def initialize(matrix, opts=Hash.new)
- @use_gsl=nil
+ @use_gsl = opts[:use_gsl]
+ opts.delete :use_gsl
+
@name=_("Principal Component Analysis")
@matrix=matrix
@n_variables=@matrix.column_size
- @variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| _("VAR_%d") % (i+1)}
+ @variables_names=(@matrix.respond_to? :fields) ? @matrix.fields : @n_variables.times.map {|i| "VAR_#{i+1}".to_sym }
@matrix_type = @matrix.respond_to?(:_type) ? @matrix._type : :correlation
@@ -67,13 +69,14 @@ def initialize(matrix, opts=Hash.new)
opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
+
if @use_gsl.nil?
@use_gsl=Statsample.has_gsl?
end
if @matrix.respond_to? :fields
@variables_names=@matrix.fields
else
- @variables_names=@n_variables.times.map {|i| "V#{i+1}"}
+ @variables_names=@n_variables.times.map {|i| "V#{i+1}".to_sym}
end
calculate_eigenpairs
@@ -81,7 +84,6 @@ def initialize(matrix, opts=Hash.new)
# Set number of factors with eigenvalues > 1
@m=@eigenpairs.find_all {|ev,ec| ev>=1.0}.size
end
-
end
def rotation
@rotation_type.new(component_matrix)
@@ -92,10 +94,10 @@ def total_eigenvalues
def create_centered_ds
h={}
@original_ds.factors.each {|f|
- mean=@original_ds[f].mean
- h[f]=@original_ds[f].recode {|c| c-mean}
+ mean = @original_ds[f].mean
+ h[f] = @original_ds[f].recode {|c| c-mean}
}
- @ds=h.to_dataset
+ @ds = Daru::DataFrame.new(h)
end
# Feature matrix for +m+ factors
@@ -137,8 +139,8 @@ def principal_components(input, m=nil)
pcs=(fv.transpose*data_matrix.transpose).transpose
pcs.extend Statsample::NamedMatrix
- pcs.fields_y=m.times.map {|i| "PC_%d" % (i+1)}
- pcs.to_dataset
+ pcs.fields_y = m.times.map { |i| "PC_#{i+1}".to_sym }
+ pcs.to_dataframe
end
def component_matrix(m=nil)
var="component_matrix_#{matrix_type}"
@@ -159,7 +161,7 @@ def component_matrix_covariance(m=nil)
cm.extend NamedMatrix
cm.name=_("Component matrix (from covariance)")
cm.fields_x = @variables_names
- cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
+ cm.fields_y = m.times.map {|i| "PC_#{i+1}".to_sym }
cm
end
@@ -180,17 +182,16 @@ def component_matrix_correlation(m=nil)
cm.extend CovariateMatrix
cm.name=_("Component matrix")
cm.fields_x = @variables_names
- cm.fields_y = m.times.map {|i| "PC_%d" % (i+1)}
+ cm.fields_y = m.times.map { |i| "PC_#{i+1}".to_sym }
cm
end
def communalities(m=nil)
-
m||=@m
h=[]
@n_variables.times do |i|
sum=0
m.times do |j|
- sum+=(@eigenpairs[j][0].abs*@eigenpairs[j][1][i]**2)
+ sum += (@eigenpairs[j][0].abs*@eigenpairs[j][1][i]**2)
end
h.push(sum)
end
@@ -202,11 +203,11 @@ def eigenvalues
end
def eigenvectors
@eigenpairs.collect {|c|
- @use_gsl ? c[1].to_gsl : c[1].to_vector
+ @use_gsl ? c[1].to_gsl : Daru::Vector.new(c[1])
}
end
def calculate_eigenpairs
- @eigenpairs= @use_gsl ? @matrix.to_gsl.eigenpairs : @matrix.to_matrix.eigenpairs_ruby
+ @eigenpairs= @use_gsl ? @matrix.to_gsl.eigenpairs : @matrix.to_matrix.eigenpairs_ruby
end
diff --git a/lib/statsample/factor/principalaxis.rb b/lib/statsample/factor/principalaxis.rb
index 4420bf3..1df7aa7 100644
--- a/lib/statsample/factor/principalaxis.rb
+++ b/lib/statsample/factor/principalaxis.rb
@@ -6,9 +6,9 @@ module Factor
#
# == Usage:
# require 'statsample'
- # a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
- # b=[2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9].to_scale
- # ds={'a'=>a,'b'=>b}.to_dataset
+ # a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1])
+ # b = Daru::Vector.new([2.4,0.7,2.9,2.2,3.0,2.7,1.6,1.1,1.6,0.9])
+ # ds= Daru::DataFrame.new({:a => a,:b => b})
# cor_matrix=Statsample::Bivariate.correlation_matrix(ds)
# pa=Statsample::Factor::PrincipalAxis.new(cor_matrix)
# pa.iterate(1)
diff --git a/lib/statsample/formula/fit_model.rb b/lib/statsample/formula/fit_model.rb
new file mode 100644
index 0000000..5ed76b3
--- /dev/null
+++ b/lib/statsample/formula/fit_model.rb
@@ -0,0 +1,46 @@
+require 'statsample/formula/formula'
+
+module Statsample
+ # Class for performing regression
+ class FitModel
+ def initialize(formula, df, opts = {})
+ @formula = FormulaWrapper.new formula, df
+ @df = df
+ @opts = opts
+ end
+
+ def model
+ @model || fit_model
+ end
+
+ def predict(new_data)
+ model.predict(df_for_prediction(new_data))
+ end
+
+ def df_for_prediction df
+ canonicalize_df(df)
+ end
+
+ def df_for_regression
+ df = canonicalize_df(@df)
+ df[@formula.y.value] = @df[@formula.y.value]
+ df
+ end
+
+ def canonicalize_df(orig_df)
+ tokens = @formula.canonical_tokens
+ tokens.shift if tokens.first.value == '1'
+ df = tokens.map { |t| t.to_df orig_df }.reduce(&:merge)
+ df
+ end
+
+ def fit_model
+ # TODO: Add support for inclusion/exclusion of intercept
+ @model = Statsample::Regression.multiple(
+ df_for_regression,
+ @formula.y.value,
+ @opts
+ )
+ end
+ end
+end
diff --git a/lib/statsample/formula/formula.rb b/lib/statsample/formula/formula.rb
new file mode 100644
index 0000000..47d5943
--- /dev/null
+++ b/lib/statsample/formula/formula.rb
@@ -0,0 +1,306 @@
+module Statsample
+ # This class recognizes what terms are numeric
+ # and accordingly forms groups which are fed to Formula
+ # Once they are parsed with Formula, they are combined back
+ class FormulaWrapper
+ attr_reader :tokens, :y, :canonical_tokens
+
+ # Initializes formula wrapper object to parse a given formula into
+ # some tokens which do not overlap one another.
+ # @note Specify 0 as a term in the formula if you do not want constant
+ # to be included in the parsed formula
+ # @param [string] formula to parse
+ # @param [Daru::DataFrame] df dataframe requried to know what vectors
+ # are numerical
+ # @example
+ # df = Daru::DataFrame.from_csv 'spec/data/df.csv'
+ # df.to_category 'c', 'd', 'e'
+ # formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
+ # formula.canonical_to_s
+ # #=> "1+c(-)+d(-):c+a"
+ def initialize(formula, df)
+ @df = df
+ # @y store the LHS term that is name of vector to be predicted
+ # @tokens store the RHS terms of the formula
+ @y, *@tokens = split_to_tokens(formula)
+ @tokens = @tokens.uniq.sort
+ manage_constant_term
+ @canonical_tokens = non_redundant_tokens
+ end
+
+ # Returns canonical tokens in a readable form.
+ # @return [String] canonical tokens in a readable form.
+ # @note 'y~a+b(-)' means 'a' exist in full rank expansion
+ # and 'b(-)' exist in reduced rank expansion
+ # @example
+ # df = Daru::DataFrame.from_csv 'spec/data/df.csv'
+ # df.to_category 'c', 'd', 'e'
+ # formula = Statsample::GLM::FormulaWrapper.new 'y~a+d:c', df
+ # formula.canonical_to_s
+ # #=> "1+c(-)+d(-):c+a"
+ def canonical_to_s
+ canonical_tokens.join '+'
+ end
+
+ # Returns tokens to produce non-redundant design matrix
+ # @return [Array] array of tokens that do not produce redundant matrix
+ def non_redundant_tokens
+ groups = split_to_groups
+ # TODO: An enhancement
+ # Right now x:c appears as c:x
+ groups.each { |k, v| groups[k] = strip_numeric v, k }
+ groups.each { |k, v| groups[k] = Formula.new(v).canonical_tokens }
+ groups.flat_map { |k, v| add_numeric v, k }
+ end
+
+ private
+
+ # Removes intercept token if term '0' is found in the formula.
+ # Intercept token remains if term '1' is found.
+ # If neither term '0' nor term '1' is found then, intercept token is added.
+ def manage_constant_term
+ @tokens.unshift Token.new('1') unless
+ @tokens.include?(Token.new('1')) ||
+ @tokens.include?(Token.new('0'))
+ @tokens.delete Token.new('0')
+ end
+
+ # Groups the tokens to gropus based on the numerical terms
+ # they are interacting with.
+ def split_to_groups
+ @tokens.group_by { |t| extract_numeric t }
+ end
+
+ # Add numeric interaction term which was removed earlier
+ # @param [Array] tokens tokens on which to add numerical terms
+ # @param [Array] numeric array of numeric terms to add
+ def add_numeric(tokens, numeric)
+ tokens.map do |t|
+ terms = t.interact_terms + numeric
+ if terms == ['1']
+ Token.new('1')
+ else
+ terms = terms.reject { |i| i == '1' }
+ Token.new terms.join(':'), t.full
+ end
+ end
+ end
+
+ # Strip numerical interacting terms
+ # @param [Array] tokens tokens from which to strip numeric
+ # @param [Array] numeric array of numeric terms to strip from tokens
+ # @return [Array] array of tokens with striped numerical terms
+ def strip_numeric(tokens, numeric)
+ tokens.map do |t|
+ terms = t.interact_terms - numeric
+ terms = ['1'] if terms.empty?
+ Token.new terms.join(':')
+ end
+ end
+
+ # Extract numeric interacting terms
+ # @param [Statsample::GLM::Token] token form which to extract numeric terms
+ # @return [Array] array of numericl terms
+ def extract_numeric(token)
+ terms = token.interact_terms
+ return [] if terms == ['1']
+ terms.reject { |t| @df[t].category? }
+ end
+
+ def split_to_tokens(formula)
+ formula = formula.gsub(/\s+/, '')
+ lhs_term, rhs = formula.split '~'
+ rhs_terms = rhs.split '+'
+ ([lhs_term] + rhs_terms).map { |t| Token.new t }
+ end
+ end
+
+ # To process formula language
+ class Formula
+ attr_reader :tokens, :canonical_tokens
+
+ def initialize(tokens)
+ @tokens = tokens
+ @canonical_tokens = parse_formula
+ end
+
+ def canonical_to_s
+ canonical_tokens.join '+'
+ end
+
+ private
+
+ def parse_formula
+ @tokens.inject([]) do |acc, token|
+ acc + add_non_redundant_elements(token, acc)
+ end
+ end
+
+ def add_non_redundant_elements(token, result_so_far)
+ return [token] if token.value == '1'
+ tokens = token.expand
+ result_so_far = result_so_far.flat_map(&:expand)
+ tokens -= result_so_far
+ contract_if_possible tokens
+ end
+
+ def contract_if_possible(tokens)
+ tokens.combination(2).each do |a, b|
+ result = a.add b
+ next unless result
+ tokens.delete a
+ tokens.delete b
+ tokens << result
+ return contract_if_possible tokens
+ end
+ tokens.sort
+ end
+ end
+
+ # To encapsulate interaction as well as non-interaction terms
+ class Token
+ attr_reader :value, :full, :interact_terms
+
+ def initialize(value, full = true)
+ @interact_terms = value.include?(':') ? value.split(':') : [value]
+ @full = coerce_full full
+ end
+
+ def value
+ interact_terms.join(':')
+ end
+
+ def size
+ # TODO: Return size 1 for value '1' also
+ # CAn't do this at the moment because have to make
+ # changes in sorting first
+ value == '1' ? 0 : interact_terms.size
+ end
+
+ def add(other)
+ # ANYTHING + FACTOR- : ANYTHING = FACTOR : ANYTHING
+ # ANYTHING + ANYTHING : FACTOR- = ANYTHING : FACTOR
+ if size > other.size
+ other.add self
+
+ elsif other.size == 2 &&
+ size == 1 &&
+ other.interact_terms.last == value &&
+ other.full.last == full.first &&
+ other.full.first == false
+ Token.new(
+ "#{other.interact_terms.first}:#{value}",
+ [true, other.full.last]
+ )
+
+ elsif other.size == 2 &&
+ size == 1 &&
+ other.interact_terms.first == value &&
+ other.full.first == full.first &&
+ other.full.last == false
+ Token.new(
+ "#{value}:#{other.interact_terms.last}",
+ [other.full.first, true]
+ )
+
+ elsif value == '1' &&
+ other.size == 1
+ Token.new(other.value, true)
+ end
+ end
+
+ def ==(other)
+ value == other.value &&
+ full == other.full
+ end
+
+ alias eql? ==
+
+ def hash
+ value.hash ^ full.hash
+ end
+
+ def <=>(other)
+ size <=> other.size
+ end
+
+ def to_s
+ interact_terms
+ .zip(full)
+ .map { |t, f| f ? t : t + '(-)' }
+ .join ':'
+ end
+
+ def expand
+ case size
+ when 0
+ [self]
+ when 1
+ [Token.new('1'), Token.new(value, false)]
+ when 2
+ a, b = interact_terms
+ [Token.new('1'), Token.new(a, false), Token.new(b, false),
+ Token.new(a + ':' + b, [false, false])]
+ end
+ end
+
+ def to_df(df)
+ case size
+ when 1
+ if df[value].category?
+ df[value].contrast_code full: full.first
+ else
+ Daru::DataFrame.new value => df[value].to_a
+ end
+ when 2
+ to_df_when_interaction(df)
+ end
+ end
+
+ private
+
+ def coerce_full(value)
+ if value.is_a? Array
+ value + Array.new((@interact_terms.size - value.size), true)
+ else
+ [value] * @interact_terms.size
+ end
+ end
+
+ def to_df_when_interaction(df)
+ case interact_terms.map { |t| df[t].category? }
+ when [true, true]
+ df.interact_code(interact_terms, full)
+ when [false, false]
+ to_df_numeric_interact_with_numeric df
+ when [true, false]
+ to_df_category_interact_with_numeric df
+ when [false, true]
+ to_df_numeric_interact_with_category df
+ end
+ end
+
+ def to_df_numeric_interact_with_numeric(df)
+ Daru::DataFrame.new value => (df[interact_terms.first] *
+ df[interact_terms.last]).to_a
+ end
+
+ def to_df_category_interact_with_numeric(df)
+ a, b = interact_terms
+ Daru::DataFrame.new(
+ df[a].contrast_code(full: full.first)
+ .map { |dv| ["#{dv.name}:#{b}", (dv * df[b]).to_a] }
+ .to_h
+ )
+ end
+
+ def to_df_numeric_interact_with_category(df)
+ a, b = interact_terms
+ Daru::DataFrame.new(
+ df[b].contrast_code(full: full.last)
+ .map { |dv| ["#{a}:#{dv.name}", (dv * df[a]).to_a] }
+ .to_h
+ )
+ end
+ end
+end
diff --git a/lib/statsample/graph/boxplot.rb b/lib/statsample/graph/boxplot.rb
index da1cd7d..f07b7d2 100644
--- a/lib/statsample/graph/boxplot.rb
+++ b/lib/statsample/graph/boxplot.rb
@@ -8,12 +8,12 @@ module Graph
#
# == Usage
# === Svg output
- # a=[1,2,3,4].to_scale
- # b=[3,4,5,6].to_scale
- # puts Statsample::Graph::Boxplot.new(:vectors=>[a,b]).to_svg
+ # a = Daru::Vector.new([1,2,3,4])
+ # b = Daru::Vector.new([3,4,5,6])
+ # puts Statsample::Graph::Boxplot.new(:vectors=>[a,b]).to_svg
# === Using ReportBuilder
- # a=[1,2,3,4].to_scale
- # b=[3,4,5,6].to_scale
+ # a = Daru::Vector.new([1,2,3,4])
+ # b = Daru::Vector.new([3,4,5,6])
# rb=ReportBuilder.new
# rb.add(Statsample::Graph::Boxplot.new(:vectors=>[a,b]))
# rb.save_html('boxplot.html')
@@ -85,8 +85,6 @@ def rubyvis_panel # :nodoc:
min||=@vectors.map {|v| v.min}.min
max||=@vectors.map {|v| v.max}.max
-
-
margin_hor=margin_left + margin_right
margin_vert=margin_top + margin_bottom
x_scale = pv.Scale.ordinal(@vectors.size.times.map.to_a).split_banded(0, width-margin_hor, 4.0/5)
@@ -115,12 +113,10 @@ def rubyvis_panel # :nodoc:
out[:low_whisker]=min
out[:high_whisker]=max
# And now, data outside whiskers
- out[:outliers]=v.data_with_nils.find_all {|d| d < min or d > max }
+ out[:outliers]=v.to_a.find_all {|d| d < min or d > max }
out
}
-
-
-
+
vis=Rubyvis::Panel.new do |pan|
pan.width width - margin_hor
pan.height height - margin_vert
@@ -157,7 +153,6 @@ def rubyvis_panel # :nodoc:
bp.left {|v| x_scale[index]}
bp.width x_scale.range_band
-
# Bar
bp.bar do |b|
b.bottom {|v| y_scale[v[:percentil_25]]}
@@ -168,9 +163,7 @@ def rubyvis_panel # :nodoc:
colors.scale(that.groups[parent.index]).darker
else
colors.scale(index).darker
- end
-
-
+ end
}
b.fill_style {|v|
if that.groups
@@ -237,7 +230,6 @@ def report_building(builder) # :nodoc:
builder.section(:name=>name) do |b|
b.image(to_svg, :type=>'svg', :width=>width, :height=>height)
end
-
end
end
end
diff --git a/lib/statsample/graph/histogram.rb b/lib/statsample/graph/histogram.rb
index 3fd21d7..696cfa5 100644
--- a/lib/statsample/graph/histogram.rb
+++ b/lib/statsample/graph/histogram.rb
@@ -6,10 +6,10 @@ module Graph
#
# == Usage
# === Svg output
- # a=[1,2,3,4].to_scale
- # puts Statsample::Graph::Histogram.new(a).to_svg
+ # a = Daru::Vector.new([1,2,3,4])
+ # puts Statsample::Graph::Histogram.new(a).to_svg
# === Using ReportBuilder
- # a=[1,2,3,4].to_scale
+ # a = Daru::Vector.new([1,2,3,4])
# rb=ReportBuilder.new
# rb.add(Statsample::Graph::Histogram.new(a))
# rb.save_html('histogram.html')
@@ -70,7 +70,7 @@ def pre_vis # :nodoc:
@hist=@data
@mean=@hist.estimated_mean
@sd=@hist.estimated_standard_deviation
- elsif @data.is_a? Statsample::Vector
+ elsif @data.is_a? Daru::Vector
@mean=@data.mean
@sd=@data.sd
@bins||=Math::sqrt(@data.size).floor
diff --git a/lib/statsample/graph/scatterplot.rb b/lib/statsample/graph/scatterplot.rb
index d6f2ee8..6bc29bb 100644
--- a/lib/statsample/graph/scatterplot.rb
+++ b/lib/statsample/graph/scatterplot.rb
@@ -10,12 +10,12 @@ module Graph
# The data is displayed as a collection of points, each having the value of one variable determining the position on the horizontal axis and the value of the other variable determining the position on the vertical axis.[2] This kind of plot is also called a scatter chart, scatter diagram and scatter graph.
# == Usage
# === Svg output
- # a=[1,2,3,4].to_scale
- # b=[3,4,5,6].to_scale
+ # a = Daru::Vector.new([1,2,3,4])
+ # b = Daru::Vector.new([3,4,5,6])
# puts Statsample::Graph::Scatterplot.new(a,b).to_svg
# === Using ReportBuilder
- # a=[1,2,3,4].to_scale
- # b=[3,4,5,6].to_scale
+ # a = Daru::Vector.new([1,2,3,4])
+ # b = Daru::Vector.new([3,4,5,6])
# rb=ReportBuilder.new
# rb.add(Statsample::Graph::Scatterplot.new(a,b))
# rb.save_html('scatter.html')
@@ -195,17 +195,18 @@ def rubyvis_panel # :nodoc:
end
vis
end
+
# Returns SVG with scatterplot
def to_svg
- rp=rubyvis_panel
+ rp = rubyvis_panel
rp.render
rp.to_svg
end
+
def report_building(builder) # :nodoc:
builder.section(:name=>name) do |b|
b.image(to_svg, :type=>'svg', :width=>width, :height=>height)
- end
-
+ end
end
end
end
diff --git a/lib/statsample/histogram.rb b/lib/statsample/histogram.rb
index be6564e..4825890 100644
--- a/lib/statsample/histogram.rb
+++ b/lib/statsample/histogram.rb
@@ -37,135 +37,144 @@ module Statsample
# == Reference:
# * http://www.gnu.org/software/gsl/manual/html_node/The-histogram-struct.html
- class Histogram
- include Enumerable
- class << self
- # Alloc +n_bins+, using +range+ as ranges of bins
- def alloc(n_bins, range=nil, opts=Hash.new)
- Histogram.new(n_bins, range, opts)
-
- end
- # Alloc +n_bins+ bins, using +p1+ as minimum and +p2+
- # as maximum
- def alloc_uniform(n_bins, p1=nil,p2=nil)
- if p1.is_a? Array
- min,max=p1
- else
- min,max=p1,p2
- end
- range=max - min
- step=range / n_bins.to_f
- range=(n_bins+1).times.map {|i| min + (step*i)}
- Histogram.new(range)
- end
- end
- attr_accessor :name
- attr_reader :bin
- attr_reader :range
- include GetText
- bindtextdomain("statsample")
- def initialize(p1, min_max=false, opts=Hash.new)
+ class Histogram
+ include Enumerable
+
+ class << self
+ # Alloc +n_bins+, using +range+ as ranges of bins
+ def alloc(n_bins, range=nil, opts=Hash.new)
+ Histogram.new(n_bins, range, opts)
+ end
+ # Alloc +n_bins+ bins, using +p1+ as minimum and +p2+
+ # as maximum
+ def alloc_uniform(n_bins, p1=nil,p2=nil)
if p1.is_a? Array
- range=p1
- @n_bins=p1.size-1
- elsif p1.is_a? Integer
- @n_bins=p1
+ min,max=p1
+ else
+ min,max=p1,p2
end
-
- @bin=[0.0]*(@n_bins)
- if(min_max)
- min, max=min_max[0], min_max[1]
- range=Array.new(@n_bins+1)
- (@n_bins+1).times {|i| range[i]=min+(i*(max-min).quo(@n_bins)) }
- end
- range||=[0.0]*(@n_bins+1)
- set_ranges(range)
- @name=""
- opts.each{|k,v|
- self.send("#{k}=",v) if self.respond_to? k
- }
+ range=max - min
+ step=range / n_bins.to_f
+ range=(n_bins+1).times.map {|i| min + (step*i)}
+ Histogram.new(range)
end
- # Number of bins
- def bins
- @n_bins
+ end
+
+ attr_accessor :name
+ attr_reader :bin
+ attr_reader :range
+
+ include GetText
+ bindtextdomain("statsample")
+
+ def initialize(p1, min_max=false, opts=Hash.new)
+
+ if p1.is_a? Array
+ range=p1
+ @n_bins=p1.size-1
+ elsif p1.is_a? Integer
+ @n_bins=p1
end
- #
- def increment(x, w=1)
- if x.respond_to? :each
- x.each{|y| increment(y,w) }
- elsif x.is_a? Numeric
- (range.size-1).times do |i|
- if x>=range[i] and x= range[i] and x < range[i+1]
+ @bin[i] += w
+ break
end
end
end
- def set_ranges(range)
- raise "Range size should be bin+1" if range.size!=@bin.size+1
- @range=range
- end
- def get_range(i)
- [@range[i],@range[i+1]]
- end
- def max
- @range.last
- end
- def min
- @range.first
- end
- def max_val
- @bin.max
- end
- def min_val
- @bin.min
- end
- def each
- bins.times.each do |i|
- r=get_range(i)
- arg={:i=>i, :low=>r[0],:high=>r[1], :middle=>(r[0]+r[1]) / 2.0, :value=>@bin[i]}
- yield arg
- end
- end
- def estimated_variance
- sum,n=0,0
- mean=estimated_mean
- each do |v|
- sum+=v[:value]*(v[:middle]-mean)**2
- n+=v[:value]
- end
- sum / (n-1)
- end
- def estimated_standard_deviation
- Math::sqrt(estimated_variance)
- end
- def estimated_mean
- sum,n=0,0
- each do |v|
- sum+= v[:value]* v[:middle]
- n+=v[:value]
- end
- sum / n
- end
- alias :mean :estimated_mean
- alias :sigma :estimated_standard_deviation
-
- def sum(start=nil,_end=nil)
- start||=0
- _end||=@n_bins-1
- (start.._end).inject(0) {|ac,i| ac+@bin[i]}
+ end
+
+ def set_ranges(range)
+ raise "Range size should be bin+1" if range.size!=@bin.size+1
+ @range=range
+ end
+
+ def get_range(i)
+ [@range[i],@range[i+1]]
+ end
+
+ def max
+ @range.last
+ end
+
+ def min
+ @range.first
+ end
+ def max_val
+ @bin.max
+ end
+ def min_val
+ @bin.min
+ end
+ def each
+ bins.times.each do |i|
+ r=get_range(i)
+ arg={:i=>i, :low=>r[0],:high=>r[1], :middle=>(r[0]+r[1]) / 2.0, :value=>@bin[i]}
+ yield arg
end
- def report_building(generator)
- hg=Statsample::Graph::Histogram.new(self)
- generator.parse_element(hg)
+ end
+ def estimated_variance
+ sum,n=0,0
+ mean=estimated_mean
+ each do |v|
+ sum+=v[:value]*(v[:middle]-mean)**2
+ n+=v[:value]
+ end
+ sum / (n-1)
+ end
+ def estimated_standard_deviation
+ Math::sqrt(estimated_variance)
+ end
+ def estimated_mean
+ sum,n=0,0
+ each do |v|
+ sum+= v[:value]* v[:middle]
+ n+=v[:value]
end
- def report_building_text(generator)
- @range.each_with_index do |r,i|
- next if i==@bin.size
- generator.text(sprintf("%5.2f : %d", r, @bin[i]))
- end
+ sum / n
+ end
+ alias :mean :estimated_mean
+ alias :sigma :estimated_standard_deviation
+
+ def sum(start=nil,_end=nil)
+ start||=0
+ _end||=@n_bins-1
+ (start.._end).inject(0) {|ac,i| ac+@bin[i]}
+ end
+ def report_building(generator)
+ hg=Statsample::Graph::Histogram.new(self)
+ generator.parse_element(hg)
+ end
+ def report_building_text(generator)
+ @range.each_with_index do |r,i|
+ next if i==@bin.size
+ generator.text(sprintf("%5.2f : %d", r, @bin[i]))
end
end
+ end
end
diff --git a/lib/statsample/matrix.rb b/lib/statsample/matrix.rb
index 662bd0a..a7102b0 100644
--- a/lib/statsample/matrix.rb
+++ b/lib/statsample/matrix.rb
@@ -10,45 +10,46 @@ class ::Matrix
def to_matrix
self
end
- def to_dataset
- f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) }
- ds=Statsample::Dataset.new(f)
+
+ def to_dataframe
+ f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| "VAR_#{i+1}".to_sym }
+ f = [f] unless f.is_a?(Array)
+ ds = Daru::DataFrame.new({}, order: f)
f.each do |ff|
- ds[ff].type=:scale
- ds[ff].name=ff
+ ds[ff].rename ff
end
row_size.times {|i|
- ds.add_case_array(self.row(i).to_a)
+ ds.add_row(self.row(i).to_a)
}
- ds.update_valid_data
- ds.name=self.name if self.respond_to? :name
+ ds.rename(self.name) if self.respond_to? :name
ds
end
+
+ alias :to_dataset :to_dataframe
+
if defined? :eigenpairs
alias_method :eigenpairs_ruby, :eigenpairs
end
-
+
if Statsample.has_gsl?
# Optimize eigenpairs of extendmatrix module using gsl
def eigenpairs
to_gsl.eigenpairs
end
end
-
+
def eigenvalues
eigenpairs.collect {|v| v[0]}
end
+
def eigenvectors
eigenpairs.collect {|v| v[1]}
end
+
def eigenvectors_matrix
Matrix.columns(eigenvectors)
end
-
-
-
-
def to_gsl
out=[]
self.row_size.times{|i|
@@ -56,6 +57,10 @@ def to_gsl
}
GSL::Matrix[*out]
end
+
+ def []=(i, j, x)
+ @rows[i][j] = x
+ end
end
module GSL
@@ -64,9 +69,11 @@ class Col
def to_matrix
::Matrix.columns([self.size.times.map {|i| self[i]}])
end
+
def to_ary
to_a
end
+
def to_gsl
self
end
@@ -76,53 +83,60 @@ class Matrix
def to_gsl
self
end
-
- def to_dataset
- f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map {|i| _("VAR_%d") % (i+1) }
- ds=Statsample::Dataset.new(f)
+
+ def to_dataframe
+ f = (self.respond_to? :fields_y) ? fields_y : column_size.times.map { |i| "VAR_#{i+1}".to_sym }
+ ds=Daru::DataFrame.new({}, order: f)
f.each do |ff|
- ds[ff].type=:scale
- ds[ff].name=ff
+ ds[ff].rename ff
end
+
row_size.times {|i|
- ds.add_case_array(self.row(i).to_a)
+ ds.add_row(self.row(i).to_a)
}
- ds.update_valid_data
- ds.name=self.name if self.respond_to? :name
+ ds.rename(self.name) if self.respond_to? :name
ds
end
-
+
+ alias :to_dataset :to_dataframe
+
def row_size
size1
end
+
def column_size
size2
end
+
def determinant
det
end
+
def inverse
GSL::Linalg::LU.invert(self)
end
+
def eigenvalues
eigenpairs.collect {|v| v[0]}
end
+
def eigenvectors
eigenpairs.collect {|v| v[1]}
end
-
+
# Matrix sum of squares
def mssq
sum=0
to_v.each {|i| sum+=i**2}
sum
end
-
+
def eigenvectors_matrix
eigval, eigvec= GSL::Eigen.symmv(self)
GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC)
- eigvec
+ eigvec
end
+
def eigenpairs
eigval, eigvec= GSL::Eigen.symmv(self)
GSL::Eigen::symmv_sort(eigval, eigvec, GSL::Eigen::SORT_VAL_DESC)
@@ -130,19 +144,21 @@ def eigenpairs
[eigval[i],eigvec.get_col(i)]
}
end
-
+
#def eigenpairs_ruby
# self.to_matrix.eigenpairs_ruby
#end
def square?
size1==size2
end
+
def to_matrix
rows=self.size1
cols=self.size2
out=(0...rows).collect{|i| (0...cols).collect {|j| self[i,j]} }
::Matrix.rows(out)
end
+
def total_sum
sum=0
size1.times {|i|
@@ -158,7 +174,7 @@ def total_sum
module Statsample
# Module to add names to X and Y fields
module NamedMatrix
- include Summarizable
+ include Summarizable
def fields
raise "Should be square" if !square?
@@ -178,10 +194,10 @@ def fields_y=(v)
@fields_y=v
end
def fields_x
- @fields_x||=row_size.times.collect {|i| _("X%d") % i}
+ @fields_x||=row_size.times.collect {|i| _("X%d") % i}
end
def fields_y
- @fields_y||=column_size.times.collect {|i| _("Y%d") % i}
+ @fields_y||=column_size.times.collect {|i| _("Y%d") % i}
end
def name
@@ -195,13 +211,13 @@ def get_new_name
@@named_matrix+=1
_("Matrix %d") % @@named_matrix
end
-
+
end
# Module to add method for variance/covariance and correlation matrices
# == Usage
# matrix=Matrix[[1,2],[2,3]]
# matrix.extend CovariateMatrix
- #
+ #
module CovariateMatrix
include NamedMatrix
@@covariatematrix=0
@@ -217,7 +233,7 @@ def _type
else
@type
end
-
+
end
def _type=(t)
@type=t
@@ -233,7 +249,7 @@ def correlation
end
}
})
- matrix.extend CovariateMatrix
+ matrix.extend CovariateMatrix
matrix.fields_x=fields_x
matrix.fields_y=fields_y
matrix._type=:correlation
@@ -242,19 +258,19 @@ def correlation
self
end
end
-
-
+
+
# Get variance for field k
- #
+ #
def variance(k)
submatrix([k])[0,0]
end
-
+
def get_new_name
@@covariatematrix+=1
_("Covariate matrix %d") % @@covariatematrix
end
-
+
# Select a submatrix of factors. If you have a correlation matrix
# with a, b and c, you could obtain a submatrix of correlations of
# a and b, b and c or a and b
@@ -264,7 +280,7 @@ def get_new_name
#
# Example:
# a=Matrix[[1.0, 0.3, 0.2],
- # [0.3, 1.0, 0.5],
+ # [0.3, 1.0, 0.5],
# [0.2, 0.5, 1.0]]
# a.extend CovariateMatrix
# a.fields=%w{a b c}
@@ -272,31 +288,31 @@ def get_new_name
# => Matrix[[0.5],[0.3]]
# a.submatrix(%w{c a})
# => Matrix[[1.0, 0.2] , [0.2, 1.0]]
- def submatrix(rows,columns=nil)
- raise ArgumentError, "rows shouldn't be empty" if rows.respond_to? :size and rows.size==0
- columns||=rows
+ def submatrix(rows,columns = nil)
+ raise ArgumentError, "rows shouldn't be empty" if rows.respond_to? :size and rows.size == 0
+ columns ||= rows
# Convert all fields on index
- row_index=rows.collect {|v|
- r=v.is_a?(Numeric) ? v : fields_x.index(v)
+ row_index = rows.collect do |v|
+ r = v.is_a?(Numeric) ? v : fields_x.index(v)
raise "Index #{v} doesn't exists on matrix" if r.nil?
r
- }
- column_index=columns.collect {|v|
- r=v.is_a?(Numeric) ? v : fields_y.index(v)
+ end
+
+ column_index = columns.collect do |v|
+ r = v.is_a?(Numeric) ? v : fields_y.index(v)
raise "Index #{v} doesn't exists on matrix" if r.nil?
r
- }
-
-
+ end
+
+
fx=row_index.collect {|v| fields_x[v]}
fy=column_index.collect {|v| fields_y[v]}
-
- matrix= Matrix.rows(row_index.collect {|i|
- row=column_index.collect {|j| self[i,j]}})
- matrix.extend CovariateMatrix
- matrix.fields_x=fx
- matrix.fields_y=fy
- matrix._type=_type
+
+ matrix = Matrix.rows(row_index.collect { |i| column_index.collect { |j| self[i, j] }})
+ matrix.extend CovariateMatrix
+ matrix.fields_x = fx
+ matrix.fields_y = fy
+ matrix._type = _type
matrix
end
def report_building(generator)
diff --git a/lib/statsample/multiset.rb b/lib/statsample/multiset.rb
index e7cbe4f..9f50762 100644
--- a/lib/statsample/multiset.rb
+++ b/lib/statsample/multiset.rb
@@ -5,20 +5,21 @@ module Statsample
class Multiset
# Name of fields
attr_reader :fields
- # Array with Statsample::Dataset
+ # Array with Daru::DataFrame
attr_reader :datasets
# To create a multiset
# * Multiset.new(%w{f1 f2 f3}) # define only fields
def initialize(fields)
- @fields=fields
- @datasets={}
+ @fields=fields
+ @datasets={}
end
def self.new_empty_vectors(fields,ds_names)
- ms=Multiset.new(fields)
- ds_names.each{|d|
- ms.add_dataset(d,Dataset.new(fields))
- }
- ms
+ ms = Multiset.new(fields)
+ ds_names.each do |d|
+ ms.add_dataset(d, Daru::DataFrame.new({}, order: fields))
+ end
+
+ ms
end
# Generate a new dataset as a union of partial dataset
# If block given, this is applied to each dataset before union
@@ -29,65 +30,64 @@ def union(&block)
labels={}
each do |k,ds|
if block
- ds=ds.dup
+ ds = ds.dup
yield k,ds
end
@fields.each do |f|
- union_field[f]||=Array.new
- union_field[f].concat(ds[f].data)
- types[f]||=ds[f].type
- names[f]||=ds[f].name
- labels[f]||=ds[f].labels
+ union_field[f] ||= Array.new
+ union_field[f].concat(ds[f].to_a)
+ types[f] ||= ds[f].type
+ names[f] ||= ds[f].name
+ labels[f] ||= ds[f].index.to_a
end
end
@fields.each do |f|
- union_field[f]=union_field[f].to_vector(types[f])
- union_field[f].name=names[f]
- union_field[f].labels=labels[f]
+ union_field[f] = Daru::Vector.new(union_field[f], name: names[f])
end
- ds_union=union_field.to_dataset
- ds_union.fields=@fields
+
+ ds_union = Daru::DataFrame.new(union_field, order: @fields)
ds_union
end
+
def datasets_names
- @datasets.keys.sort
+ @datasets.keys.sort
end
+
def n_datasets
- @datasets.size
+ @datasets.size
end
+
def add_dataset(key,ds)
- if(ds.fields!=@fields)
- raise ArgumentError, "Dataset(#{ds.fields.to_s})must have the same fields of the Multiset(#{@fields})"
+ if ds.vectors.to_a != @fields
+ raise ArgumentError, "Dataset(#{ds.vectors.to_a.to_s})must have the same fields of the Multiset(#{@fields})"
else
- @datasets[key]=ds
+ @datasets[key] = ds
end
end
def sum_field(field)
@datasets.inject(0) {|a,da|
- stratum_name=da[0]
- vector=da[1][field]
- val=yield stratum_name,vector
- a+val
+ stratum_name = da[0]
+ vector = da[1][field]
+ val = yield stratum_name,vector
+ a + val
}
end
def collect_vector(field)
- @datasets.collect {|k,v|
- yield k, v[field]
- }
+ @datasets.collect { |k,v| yield k, v[field] }
end
def each_vector(field)
- @datasets.each {|k,v|
- yield k, v[field]
- }
+ @datasets.each { |k,v| yield k, v[field] }
end
- def[](i)
+
+ def [](i)
@datasets[i]
end
+
def each(&block)
@datasets.each {|k,ds|
- next if ds.cases==0
+ next if ds.nrows == 0
block.call(k,ds)
}
end
@@ -204,9 +204,9 @@ def initialize(ms,strata_sizes)
@ms=ms
raise ArgumentError,"You should put a strata size for each dataset" if strata_sizes.keys.sort!=ms.datasets_names
@strata_sizes=strata_sizes
- @population_size=@strata_sizes.inject(0) {|a,x| a+x[1]}
+ @population_size=@strata_sizes.inject(0) { |a,x| a+x[1] }
@strata_number=@ms.n_datasets
- @sample_size=@ms.datasets.inject(0) {|a,x| a+x[1].cases}
+ @sample_size=@ms.datasets.inject(0) { |a,x| a+x[1].nrows }
end
# Number of strata
def strata_number
diff --git a/lib/statsample/regression.rb b/lib/statsample/regression.rb
index 0016e8a..b2ae630 100644
--- a/lib/statsample/regression.rb
+++ b/lib/statsample/regression.rb
@@ -15,8 +15,6 @@ module Statsample
#
# * Simple Regression : Statsample::Regression::Simple
# * Multiple Regression: Statsample::Regression::Multiple
- # * Logit Regression: Statsample::Regression::Binomial::Logit
- # * Probit Regression: Statsample::Regression::Binomial::Probit
module Regression
LinearDependency=Class.new(Exception)
@@ -25,8 +23,8 @@ module Regression
# * x: independent Vector
# * y: dependent Vector
# Usage:
- # x=100.times.collect {|i| rand(100)}.to_scale
- # y=100.times.collect {|i| 2+x[i]*2+rand()}.to_scale
+ # x = Daru::Vector.new(100.times.collect {|i| rand(100)})
+ # y = Daru::Vector.new(100.times.collect {|i| 2+x[i]*2+rand()})
# sr=Statsample::Regression.simple(x,y)
# sr.a
# => 2.51763295177808
@@ -49,7 +47,7 @@ def self.simple(x,y)
# * :pairwise: uses correlation matrix. Use with caution.
#
# Usage:
- # lr=Statsample::Regression::multiple(ds,'y')
+ # lr=Statsample::Regression::multiple(ds,:y)
def self.multiple(ds,y_var, opts=Hash.new)
missing_data= (opts[:missing_data].nil? ) ? :listwise : opts.delete(:missing_data)
if missing_data==:pairwise
@@ -58,7 +56,7 @@ def self.multiple(ds,y_var, opts=Hash.new)
if Statsample.has_gsl? and false
Statsample::Regression::Multiple::GslEngine.new(ds, y_var, opts)
else
- ds2=ds.dup_only_valid
+ ds2=ds.reject_values(*Daru::MISSING_VALUES)
Statsample::Regression::Multiple::RubyEngine.new(ds2,y_var, opts)
end
end
diff --git a/lib/statsample/regression/multiple.rb b/lib/statsample/regression/multiple.rb
index 317efbc..a641363 100644
--- a/lib/statsample/regression/multiple.rb
+++ b/lib/statsample/regression/multiple.rb
@@ -6,12 +6,12 @@ module Regression
# Use:.
#
# require 'statsample'
- # a=1000.times.collect {rand}.to_scale
- # b=1000.times.collect {rand}.to_scale
- # c=1000.times.collect {rand}.to_scale
- # ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
- # ds['y']=ds.collect{|row| row['a']*5+row['b']*3+row['c']*2+rand()}
- # lr=Statsample::Regression.multiple(ds,'y')
+ # a = Daru::Vector.new(1000.times.collect {rand})
+ # b = Daru::Vector.new(1000.times.collect {rand})
+ # c = Daru::Vector.new(1000.times.collect {rand})
+ # ds= Daru::DataFrame.new({:a => a,:b => b,:c => c})
+ # ds[:y]=ds.collect{|row| row[:a]*5 + row[:b]*3 + row[:c]*2 + rand()}
+ # lr=Statsample::Regression.multiple(ds, :y)
# puts lr.summary
# Summary for regression of a,b,c over y
# *************************************************************
@@ -53,8 +53,8 @@ def significance
def initialize(matrix,y_var, opts=Hash.new)
matrix.extend Statsample::CovariateMatrix
@matrix=matrix
- @fields=matrix.fields-y_var
- @y_var=y_var
+ @fields=matrix.fields - y_var
+ @y_var = y_var
@q=@y_var.size
@matrix_cor=matrix.correlation
@matrix_cor_xx = @matrix_cor.submatrix(@fields)
@@ -84,8 +84,6 @@ def p2yx
vxy.quo(@q)
end
end
-
-
end
end
end
diff --git a/lib/statsample/regression/multiple/alglibengine.rb b/lib/statsample/regression/multiple/alglibengine.rb
index d6ab942..05964ee 100644
--- a/lib/statsample/regression/multiple/alglibengine.rb
+++ b/lib/statsample/regression/multiple/alglibengine.rb
@@ -9,108 +9,115 @@ module Multiple
# If you need pairwise, use RubyEngine
# Example:
#
-# @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
-# @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
-# @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
-# @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
-# ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
-# lr=Statsample::Regression::Multiple::AlglibEngine.new(ds,'y')
+# @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7])
+# @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4])
+# @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100])
+# @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30])
+# ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y})
+# lr=Statsample::Regression::Multiple::AlglibEngine.new(ds, :y)
#
class AlglibEngine < BaseEngine
def initialize(ds,y_var, opts=Hash.new)
super
- @ds=ds.dup_only_valid
- @ds_valid=@ds
- @dy=@ds[@y_var]
- @ds_indep=ds.dup(ds.fields-[y_var])
+ @ds = ds.reject_values(*Daru::MISSING_VALUES)
+ @ds_valid = @ds
+ @dy = @ds[@y_var]
+ @ds_indep = ds.dup(ds.vectors.to_a - [y_var])
# Create a custom matrix
- columns=[]
- @fields=[]
- @ds.fields.each{|f|
- if f!=@y_var
- columns.push(@ds[f].to_a)
- @fields.push(f)
- end
- }
- @dep_columns=columns.dup
+ columns = []
+ @fields = []
+ @ds.vectors.each do |f|
+ if f != @y_var
+ columns.push(@ds[f].to_a)
+ @fields.push(f)
+ end
+ end
+ @dep_columns = columns.dup
columns.push(@ds[@y_var])
matrix=Matrix.columns(columns)
@lr_s=nil
@lr=::Alglib::LinearRegression.build_from_matrix(matrix)
@coeffs=assign_names(@lr.coeffs)
-
end
- def _dump(i)
- Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
- end
- def self._load(data)
- h=Marshal.load(data)
- self.new(h['ds'], h['y_var'])
- end
-
- def coeffs
- @coeffs
- end
- # Coefficients using a constant
- # Based on http://www.xycoon.com/ols1.htm
- def matrix_resolution
- mse_p=mse
- columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
- columns.unshift([1.0]*@ds.cases)
- y=Matrix.columns([@dy.data.map {|i| i.to_f}])
- x=Matrix.columns(columns)
- xt=x.t
- matrix=((xt*x)).inverse*xt
- matrix*y
- end
- def r2
- r**2
- end
- def r
- Bivariate::pearson(@dy,predicted)
- end
- def sst
- @dy.ss
- end
- def constant
- @lr.constant
- end
- def standarized_coeffs
- l=lr_s
- assign_names(l.coeffs)
- end
- def lr_s
- if @lr_s.nil?
- build_standarized
- end
- @lr_s
- end
- def build_standarized
- @ds_s=@ds.standarize
- columns=[]
- @ds_s.fields.each{|f|
- columns.push(@ds_s[f].to_a) unless f==@y_var
- }
- @dep_columns_s=columns.dup
- columns.push(@ds_s[@y_var])
- matrix=Matrix.columns(columns)
- @lr_s=Alglib::LinearRegression.build_from_matrix(matrix)
- end
- def process(v)
- @lr.process(v)
- end
- def process_s(v)
- lr_s.process(v)
- end
- # ???? Not equal to SPSS output
- def standarized_residuals
- res=residuals
- red_sd=residuals.sds
- res.collect {|v|
- v.quo(red_sd)
- }.to_vector(:scale)
+ def _dump(i)
+ Marshal.dump({'ds'=>@ds,'y_var'=>@y_var})
+ end
+
+ def self._load(data)
+ h=Marshal.load(data)
+ self.new(h['ds'], h['y_var'])
+ end
+
+ def coeffs
+ @coeffs
+ end
+ # Coefficients using a constant
+ # Based on http://www.xycoon.com/ols1.htm
+ def matrix_resolution
+ mse_p=mse
+ columns=@dep_columns.dup.map {|xi| xi.map{|i| i.to_f}}
+ columns.unshift([1.0]*@ds.cases)
+ y=Matrix.columns([@dy.data.map {|i| i.to_f}])
+ x=Matrix.columns(columns)
+ xt=x.t
+ matrix=((xt*x)).inverse*xt
+ matrix*y
+ end
+
+ def r2
+ r**2
+ end
+
+ def r
+ Bivariate::pearson(@dy,predicted)
+ end
+
+ def sst
+ @dy.ss
+ end
+
+ def constant
+ @lr.constant
+ end
+
+ def standarized_coeffs
+ l=lr_s
+ assign_names(l.coeffs)
+ end
+
+ def lr_s
+ if @lr_s.nil?
+ build_standarized
end
+ @lr_s
+ end
+
+ def build_standarized
+ @ds_s=@ds.standardize
+ columns=[]
+ @ds_s.vectors.each{|f|
+ columns.push(@ds_s[f].to_a) unless f == @y_var
+ }
+ @dep_columns_s=columns.dup
+ columns.push(@ds_s[@y_var])
+ matrix=Matrix.columns(columns)
+ @lr_s=Alglib::LinearRegression.build_from_matrix(matrix)
+ end
+
+ def process(v)
+ @lr.process(v)
+ end
+
+ def process_s(v)
+ lr_s.process(v)
+ end
+ # ???? Not equal to SPSS output
+ def standarized_residuals
+ res = residuals
+ red_sd = residuals.sds
+ Daru::Vector.new(res.collect {|v| v.quo(red_sd) })
+ end
end
end
end
diff --git a/lib/statsample/regression/multiple/baseengine.rb b/lib/statsample/regression/multiple/baseengine.rb
index d5e08ae..f2fdf82 100644
--- a/lib/statsample/regression/multiple/baseengine.rb
+++ b/lib/statsample/regression/multiple/baseengine.rb
@@ -19,13 +19,12 @@ def self.univariate?
end
def initialize(ds, y_var, opts = Hash.new)
@ds=ds
- @predictors_n=@ds.fields.size-1
- @total_cases=@ds.cases
- @cases=@ds.cases
+ @predictors_n=@ds.vectors.size-1
+ @total_cases=@ds.nrows
+ @cases=@ds.nrows
@y_var=y_var
@r2=nil
- @name=_("Multiple Regression: %s over %s") % [ ds.fields.join(",") , @y_var]
-
+ @name=_("Multiple Regression: %s over %s") % [ ds.vectors.to_a.join(",") , @y_var]
opts_default={:digits=>3}
@opts=opts_default.merge opts
@@ -33,7 +32,6 @@ def initialize(ds, y_var, opts = Hash.new)
@opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
-
end
# Calculate F Test
def anova
@@ -45,15 +43,17 @@ def se_estimate
end
# Retrieves a vector with predicted values for y
def predicted
- @total_cases.times.collect { |i|
- invalid=false
- vect=@dep_columns.collect {|v| invalid=true if v[i].nil?; v[i]}
- if invalid
- nil
- else
- process(vect)
+ Daru::Vector.new(
+ @total_cases.times.collect do |i|
+ invalid = false
+ vect = @dep_columns.collect {|v| invalid = true if v[i].nil?; v[i]}
+ if invalid
+ nil
+ else
+ process(vect)
+ end
end
- }.to_vector(:scale)
+ )
end
# Retrieves a vector with standarized values for y
def standarized_predicted
@@ -61,15 +61,17 @@ def standarized_predicted
end
# Retrieves a vector with residuals values for y
def residuals
- (0...@total_cases).collect{|i|
- invalid=false
- vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
- if invalid or @ds[@y_var][i].nil?
- nil
- else
- @ds[@y_var][i] - process(vect)
+ Daru::Vector.new(
+ (0...@total_cases).collect do |i|
+ invalid=false
+ vect=@dep_columns.collect{|v| invalid=true if v[i].nil?; v[i]}
+ if invalid or @ds[@y_var][i].nil?
+ nil
+ else
+ @ds[@y_var][i] - process(vect)
+ end
end
- }.to_vector(:scale)
+ )
end
# R Multiple
def r
@@ -131,12 +133,10 @@ def probability
# Tolerance for a given variable
# http://talkstats.com/showthread.php?t=5056
def tolerance(var)
- ds=assign_names(@dep_columns)
- ds.each{|k,v|
- ds[k]=v.to_vector(:scale)
- }
- lr=self.class.new(ds.to_dataset,var)
- 1-lr.r2
+ ds = assign_names(@dep_columns)
+ ds.each { |k,v| ds[k] = Daru::Vector.new(v) }
+ lr = self.class.new(Daru::DataFrame.new(ds),var)
+ 1 - lr.r2
end
# Tolerances for each coefficient
def coeffs_tolerances
@@ -165,12 +165,12 @@ def se_r2
def estimated_variance_covariance_matrix
#mse_p=mse
columns=[]
- @ds_valid.fields.each{|k|
- v=@ds_valid[k]
- columns.push(v.data) unless k==@y_var
+ @ds_valid.vectors.each{|k|
+ v = @ds_valid[k]
+ columns.push(v.to_a) unless k == @y_var
}
columns.unshift([1.0]*@valid_cases)
- x=Matrix.columns(columns)
+ x=::Matrix.columns(columns)
matrix=((x.t*x)).inverse * mse
matrix.collect {|i| Math::sqrt(i) if i>=0 }
end
diff --git a/lib/statsample/regression/multiple/gslengine.rb b/lib/statsample/regression/multiple/gslengine.rb
index 5f3ef32..2462900 100644
--- a/lib/statsample/regression/multiple/gslengine.rb
+++ b/lib/statsample/regression/multiple/gslengine.rb
@@ -9,43 +9,44 @@ module Multiple
# If you need pairwise, use RubyEngine
# Example:
#
- # @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
- # @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
- # @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
- # @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
- # ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
- # lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y')
+ # @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7])
+ # @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4])
+ # @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100])
+ # @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30])
+ # ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y})
+ # lr=Statsample::Regression::Multiple::GslEngine.new(ds,:y)
#
class GslEngine < BaseEngine
def initialize(ds,y_var, opts=Hash.new)
super
- @ds=ds.dup_only_valid
- @ds_valid=@ds
- @valid_cases=@ds_valid.cases
- @dy=@ds[@y_var]
- @ds_indep=ds.dup(ds.fields-[y_var])
+ @ds = ds.reject_values(*Daru::MISSING_VALUES)
+ @ds_valid = @ds
+ @valid_cases = @ds_valid.nrows
+ @dy = @ds[@y_var]
+ @ds_indep = ds.dup(ds.vectors.to_a - [y_var])
# Create a custom matrix
columns=[]
@fields=[]
- max_deps = GSL::Matrix.alloc(@ds.cases, @ds.fields.size)
- constant_col=@ds.fields.size-1
- for i in 0...@ds.cases
+ max_deps = GSL::Matrix.alloc(@ds.nrows, @ds.vectors.size)
+ constant_col=@ds.vectors.size-1
+ for i in 0...@ds.nrows
max_deps.set(i,constant_col,1)
end
- j=0
- @ds.fields.each{|f|
- if f!=@y_var
- @ds[f].each_index{|i1|
+ j = 0
+ @ds.vectors.each do |f|
+ if f != @y_var
+ @ds[f].each_index do |i1|
max_deps.set(i1,j,@ds[f][i1])
- }
+ end
+
columns.push(@ds[f].to_a)
@fields.push(f)
- j+=1
+ j += 1
end
- }
- @dep_columns=columns.dup
- @lr_s=nil
- c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.gsl)
+ end
+ @dep_columns = columns.dup
+ @lr_s = nil
+ c, @cov, @chisq, @status = GSL::MultiFit.linear(max_deps, @dy.to_gsl)
@constant=c[constant_col]
@coeffs_a=c.to_a.slice(0...constant_col)
@coeffs=assign_names(@coeffs_a)
@@ -97,7 +98,7 @@ def lr_s
@lr_s
end
def build_standarized
- @ds_s=@ds.standarize
+ @ds_s=@ds.standardize
@lr_s=GslEngine.new(@ds_s,@y_var)
end
def process_s(v)
@@ -107,24 +108,20 @@ def process_s(v)
def standarized_residuals
res=residuals
red_sd=residuals.sds
- res.collect {|v|
- v.quo(red_sd)
- }.to_vector(:scale)
+ Daru::Vector.new(res.collect {|v| v.quo(red_sd) })
end
# Standard error for coeffs
def coeffs_se
- out={}
- evcm=estimated_variance_covariance_matrix
- @ds_valid.fields.each_with_index do |f,i|
-
- mi=i+1
- next if f==@y_var
- out[f]=evcm[mi,mi]
+ out = {}
+ evcm = estimated_variance_covariance_matrix
+ @ds_valid.vectors.to_a.each_with_index do |f,i|
+ mi = i+1
+ next if f == @y_var
+ out[f] = evcm[mi,mi]
end
out
end
-
end
end
end
diff --git a/lib/statsample/regression/multiple/matrixengine.rb b/lib/statsample/regression/multiple/matrixengine.rb
index 86ddc52..9c780f3 100644
--- a/lib/statsample/regression/multiple/matrixengine.rb
+++ b/lib/statsample/regression/multiple/matrixengine.rb
@@ -59,8 +59,6 @@ def initialize(matrix,y_var, opts=Hash.new)
@matrix_y = @matrix_cor.submatrix(@fields, [y_var])
@matrix_y_cov = @matrix_cov.submatrix(@fields, [y_var])
-
-
@y_sd=Math::sqrt(@matrix_cov.submatrix([y_var])[0,0])
@x_sd=@n_predictors.times.inject({}) {|ac,i|
@@ -77,14 +75,14 @@ def initialize(matrix,y_var, opts=Hash.new)
@y_mean=0.0
@name=_("Multiple reggresion of %s on %s") % [@fields.join(","), @y_var]
- opts_default={:digits=>3}
- opts=opts_default.merge opts
+ opts_default = {:digits=>3}
+ opts = opts_default.merge opts
opts.each{|k,v|
self.send("#{k}=",v) if self.respond_to? k
}
result_matrix=@matrix_x_cov.inverse * @matrix_y_cov
- if matrix._type==:covariance
+ if matrix._type == :covariance
@coeffs=result_matrix.column(0).to_a
@coeffs_stan=coeffs.collect {|k,v|
coeffs[k]*@x_sd[k].quo(@y_sd)
@@ -116,12 +114,12 @@ def r
end
# Value of constant
def constant
- c=coeffs
- @y_mean - @fields.inject(0){|a,k| a + (c[k] * @x_mean[k])}
+ c = coeffs
+ @y_mean - @fields.inject(0) { |a,k| a + (c[k] * @x_mean[k])}
end
# Hash of b or raw coefficients
def coeffs
- assign_names(@coeffs)
+ assign_names(@coeffs)
end
# Hash of beta or standarized coefficients
@@ -185,7 +183,7 @@ def constant_se
sd[:constant]=0
fields=[:constant]+@matrix_cov.fields-[@y_var]
# Recreate X'X using the variance-covariance matrix
- xt_x=Matrix.rows(fields.collect {|i|
+ xt_x=::Matrix.rows(fields.collect {|i|
fields.collect {|j|
if i==:constant or j==:constant
cov=0
diff --git a/lib/statsample/regression/multiple/rubyengine.rb b/lib/statsample/regression/multiple/rubyengine.rb
index fcee05f..6b36804 100644
--- a/lib/statsample/regression/multiple/rubyengine.rb
+++ b/lib/statsample/regression/multiple/rubyengine.rb
@@ -8,76 +8,72 @@ module Multiple
#
# Example:
#
-# @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
-# @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
-# @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
-# @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
-# ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
-# lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
+# @a = Daru::Vector.new([1,3,2,4,3,5,4,6,5,7])
+# @b = Daru::Vector.new([3,3,4,4,5,5,6,6,4,4])
+# @c = Daru::Vector.new([11,22,30,40,50,65,78,79,99,100])
+# @y = Daru::Vector.new([3,4,5,6,7,8,9,10,20,30])
+# ds = Daru::DataFrame.new({:a => @a,:b => @b,:c => @c,:y => @y})
+# lr=Statsample::Regression::Multiple::RubyEngine.new(ds,:y)
class RubyEngine < MatrixEngine
def initialize(ds,y_var, opts=Hash.new)
- matrix=ds.correlation_matrix
- fields_indep=ds.fields-[y_var]
- default={
- :y_mean=>ds[y_var].mean,
- :x_mean=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].mean; ac},
- :y_sd=>ds[y_var].sd,
- :x_sd=>fields_indep.inject({}) {|ac,f| ac[f]=ds[f].sd; ac},
- :cases=>Statsample::Bivariate.min_n_valid(ds)
+ matrix = Statsample::Bivariate.correlation_matrix ds
+ fields_indep=ds.vectors.to_a - [y_var]
+ default= {
+ :y_mean => ds[y_var].mean,
+ :x_mean => fields_indep.inject({}) {|ac,f| ac[f]=ds[f].mean; ac},
+ :y_sd => ds[y_var].sd,
+ :x_sd => fields_indep.inject({}) {|ac,f| ac[f]=ds[f].sd; ac},
+ :cases => Statsample::Bivariate.min_n_valid(ds)
}
- opts=opts.merge(default)
+ opts = opts.merge(default)
super(matrix, y_var, opts)
- @ds=ds
- @dy=ds[@y_var]
- @ds_valid=ds.dup_only_valid
- @total_cases=@ds.cases
- @valid_cases=@ds_valid.cases
- @ds_indep = ds.dup(ds.fields-[y_var])
+ @ds = ds
+ @dy = ds[@y_var]
+ @ds_valid = ds.reject_values(*Daru::MISSING_VALUES)
+ @total_cases = @ds.nrows
+ @valid_cases = @ds_valid.nrows
+ @ds_indep = ds.dup(ds.vectors.to_a - [y_var])
set_dep_columns
end
def set_dep_columns
- @dep_columns=[]
- @ds_indep.each_vector{|k,v|
- @dep_columns.push(v.data_with_nils)
- }
+ @dep_columns = []
+ @ds_indep.each_vector { |v| @dep_columns.push(v.to_a) }
end
def fix_with_mean
i=0
- @ds_indep.each do |row|
+ @ds_indep.each(:row) do |row|
empty=[]
row.each do |k,v|
empty.push(k) if v.nil?
end
+
if empty.size==1
@ds_indep[empty[0]][i]=@ds[empty[0]].mean
end
- i+=1
+ i += 1
end
- @ds_indep.update_valid_data
set_dep_columns
end
def fix_with_regression
- i=0
- @ds_indep.each{|row|
- empty=[]
- row.each{|k,v|
- empty.push(k) if v.nil?
- }
+ i = 0
+ @ds_indep.each(:row) do |row|
+ empty = []
+ row.each { |k,v| empty.push(k) if v.nil? }
if empty.size==1
- field=empty[0]
- lr=MultipleRegression.new(@ds_indep,field)
- fields=[]
- @ds_indep.fields.each{|f|
- fields.push(row[f]) unless f==field
+ field = empty[0]
+ lr = MultipleRegression.new(@ds_indep,field)
+ fields = []
+ @ds_indep.vectors.each { |f|
+ fields.push(row[f]) unless f == field
}
+
@ds_indep[field][i]=lr.process(fields)
end
i+=1
- }
- @ds_indep.update_valid_data
+ end
set_dep_columns
end
# Standard error for constant
diff --git a/lib/statsample/reliability.rb b/lib/statsample/reliability.rb
index e5fb50c..5e81fd3 100644
--- a/lib/statsample/reliability.rb
+++ b/lib/statsample/reliability.rb
@@ -4,31 +4,30 @@ class << self
# Calculate Chonbach's alpha for a given dataset.
# only uses tuples without missing data
def cronbach_alpha(ods)
- ds=ods.dup_only_valid
- return nil if ds.vectors.any? {|k,v| v.variance==0}
- n_items=ds.fields.size
- return nil if n_items<=1
- s2_items=ds.vectors.inject(0) {|ac,v|
- ac+v[1].variance }
- total=ds.vector_sum
+ ds = ods.reject_values(*Daru::MISSING_VALUES)
+ n_items = ds.ncols
+ return nil if n_items <= 1
+ s2_items = ds.to_h.values.inject(0) { |ac,v|
+ ac + v.variance }
+ total = ds.vector_sum
- (n_items.quo(n_items-1)) * (1-(s2_items.quo(total.variance)))
+ (n_items.quo(n_items - 1)) * (1 - (s2_items.quo(total.variance)))
end
# Calculate Chonbach's alpha for a given dataset
# using standarized values for every vector.
# Only uses tuples without missing data
# Return nil if one or more vectors has 0 variance
def cronbach_alpha_standarized(ods)
+ ds = ods.reject_values(*Daru::MISSING_VALUES)
+ return nil if ds.any? { |v| v.variance==0}
- ds=ods.dup_only_valid
-
- return nil if ds.vectors.any? {|k,v| v.variance==0}
-
- ds=ds.fields.inject({}){|a,f|
- a[f]=ods[f].standarized;
- a
- }.to_dataset
-
+ ds = Daru::DataFrame.new(
+ ds.vectors.to_a.inject({}) { |a,i|
+ a[i] = ods[i].standardize
+ a
+ }
+ )
+
cronbach_alpha(ds)
end
# Predicted reliability of a test by replicating
@@ -55,10 +54,10 @@ def cronbach_alpha_from_n_s2_cov(n,s2,cov)
end
# Get Cronbach's alpha from a covariance matrix
def cronbach_alpha_from_covariance_matrix(cov)
- n=cov.row_size
+ n = cov.row_size
raise "covariance matrix should have at least 2 variables" if n < 2
- s2=n.times.inject(0) {|ac,i| ac+cov[i,i]}
- (n.quo(n-1))*(1-(s2.quo(cov.total_sum)))
+ s2 = n.times.inject(0) { |ac,i| ac + cov[i,i] }
+ (n.quo(n - 1)) * (1 - (s2.quo(cov.total_sum)))
end
# Returns n necessary to obtain specific alpha
# given variance and covariance mean of items
@@ -83,8 +82,6 @@ def n_for_desired_alpha(alpha,s2,cov)
end
c_a=cronbach_alpha_from_n_s2_cov(n,s2,cov)
dif=c_a - alpha
- #puts "#{n} , #{c_a}"
-
end
n
end
@@ -111,20 +108,20 @@ class ItemCharacteristicCurve
attr_reader :totals, :counts, :vector_total
def initialize (ds, vector_total=nil)
vector_total||=ds.vector_sum
- raise ArgumentError, "Total size != Dataset size" if vector_total.size!=ds.cases
+ raise ArgumentError, "Total size != Dataset size" if vector_total.size != ds.nrows
@vector_total=vector_total
@ds=ds
@totals={}
- @counts=@ds.fields.inject({}) {|a,v| a[v]={};a}
+ @counts=@ds.vectors.to_a.inject({}) {|a,v| a[v]={};a}
process
end
def process
i=0
- @ds.each do |row|
+ @ds.each_row do |row|
tot=@vector_total[i]
@totals[tot]||=0
@totals[tot]+=1
- @ds.fields.each do |f|
+ @ds.vectors.each do |f|
item=row[f].to_s
@counts[f][tot]||={}
@counts[f][tot][item]||=0
@@ -150,4 +147,4 @@ def curve_field(field, item)
require 'statsample/reliability/icc.rb'
require 'statsample/reliability/scaleanalysis.rb'
require 'statsample/reliability/skillscaleanalysis.rb'
-require 'statsample/reliability/multiscaleanalysis.rb'
\ No newline at end of file
+require 'statsample/reliability/multiscaleanalysis.rb'
diff --git a/lib/statsample/reliability/icc.rb b/lib/statsample/reliability/icc.rb
index 1277acc..8780d95 100644
--- a/lib/statsample/reliability/icc.rb
+++ b/lib/statsample/reliability/icc.rb
@@ -6,12 +6,12 @@ module Reliability
# several ratings) on a target and another measurement obtained on that target"
# == Usage
# require 'statsample'
- # size=1000
- # a = size.times.map {rand(10)}.to_scale
+ # size = 1000
+ # a = Daru::Vector.new(size.times.map {rand(10)})
# b = a.recode{|i|i+rand(4)-2}
- # c =a.recode{|i|i+rand(4)-2}
+ # c = a.recode{|i|i+rand(4)-2}
# d = a.recode{|i|i+rand(4)-2}
- # ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
+ # ds = Daru::DataFrame.new({:a => a,:b => b,:c => c,:d => d})
# # Use :type attribute to set type to summarize
# icc=Statsample::Reliability::ICC.new(ds, :type=>:icc_1_k)
# puts icc.summary
@@ -96,10 +96,10 @@ class ICC
attr_accessor :alpha
attr_accessor :name
def initialize(ds, opts=Hash.new)
- @ds=ds.dup_only_valid
- @vectors=@ds.vectors.values
- @n=@ds.cases
- @k=@ds.fields.size
+ @ds=ds.reject_values(*Daru::MISSING_VALUES)
+ @vectors=@ds.map { |e| e }
+ @n=@ds.nrows
+ @k=@ds.ncols
compute
@g_rho=0
@alpha=0.05
diff --git a/lib/statsample/reliability/multiscaleanalysis.rb b/lib/statsample/reliability/multiscaleanalysis.rb
index 3222593..ae74cb8 100644
--- a/lib/statsample/reliability/multiscaleanalysis.rb
+++ b/lib/statsample/reliability/multiscaleanalysis.rb
@@ -6,17 +6,17 @@ module Reliability
# PCA and Factor Analysis.
#
# == Usage
- # @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale)
- # @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale)
- # @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale)
- # @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale)
- # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset
+ # @x1 = Daru::Vector.new([1,1,1,1,2,2,2,2,3,3,3,30])
+ # @x2 = Daru::Vector.new([1,1,1,2,2,3,3,3,3,4,4,50])
+ # @x3 = Daru::Vector.new([2,2,1,1,1,2,2,2,3,4,5,40])
+ # @x4 = Daru::Vector.new([1,2,3,4,4,4,4,3,4,4,5,30])
+ # ds = Daru::DataFrame.new({:x1 => @x1,:x2 => @x2,:x3 => @x3,:x4 => @x4})
# opts={:name=>"Scales", # Name of analysis
# :summary_correlation_matrix=>true, # Add correlation matrix
# :summary_pca } # Add PCA between scales
# msa=Statsample::Reliability::MultiScaleAnalysis.new(opts) do |m|
- # m.scale :s1, ds.clone(%w{x1 x2})
- # m.scale :s2, ds.clone(%w{x3 x4}), {:name=>"Scale 2"}
+ # m.scale :s1, ds.clone([:x1, :x2])
+ # m.scale :s2, ds.clone([:x3, :x4]), {:name=>"Scale 2"}
# end
# # Retrieve summary
# puts msa.summary
@@ -107,7 +107,7 @@ def delete_scale(code)
# Retrieves a Principal Component Analysis (Factor::PCA)
# using all scales, using opts a options.
def pca(opts=nil)
- opts||=pca_options
+ opts ||= pca_options
Statsample::Factor::PCA.new(correlation_matrix, opts)
end
# Retrieve Velicer's MAP
@@ -123,14 +123,14 @@ def principal_axis_analysis(opts=nil)
Statsample::Factor::PrincipalAxis.new(correlation_matrix, opts)
end
def dataset_from_scales
- ds=Dataset.new(@scales_keys)
+ ds = Daru::DataFrame.new({}, order: @scales_keys.map(&:to_sym))
@scales.each_pair do |code,scale|
- ds[code.to_s]=scale.ds.vector_sum
- ds[code.to_s].name=scale.name
+ ds[code.to_sym] = scale.ds.vector_sum
end
- ds.update_valid_data
+
ds
end
+
def parallel_analysis(opts=nil)
opts||=parallel_analysis_options
Statsample::Factor::ParallelAnalysis.new(dataset_from_scales, opts)
@@ -140,6 +140,7 @@ def parallel_analysis(opts=nil)
def correlation_matrix
Statsample::Bivariate.correlation_matrix(dataset_from_scales)
end
+
def report_building(b) # :nodoc:
b.section(:name=>name) do |s|
s.section(:name=>_("Reliability analysis of scales")) do |s2|
diff --git a/lib/statsample/reliability/scaleanalysis.rb b/lib/statsample/reliability/scaleanalysis.rb
index 9a48d0e..9a52230 100644
--- a/lib/statsample/reliability/scaleanalysis.rb
+++ b/lib/statsample/reliability/scaleanalysis.rb
@@ -3,12 +3,12 @@ module Reliability
# Analysis of a Scale. Analoge of Scale Reliability analysis on SPSS.
# Returns several statistics for complete scale and each item
# == Usage
- # @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_vector(:scale)
- # @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_vector(:scale)
- # @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_vector(:scale)
- # @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_vector(:scale)
- # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset
- # ia=Statsample::Reliability::ScaleAnalysis.new(ds)
+ # @x1 = Daru::Vector.new([1,1,1,1,2,2,2,2,3,3,3,30])
+ # @x2 = Daru::Vector.new([1,1,1,2,2,3,3,3,3,4,4,50])
+ # @x3 = Daru::Vector.new([2,2,1,1,1,2,2,2,3,4,5,40])
+ # @x4 = Daru::Vector.new([1,2,3,4,4,4,4,3,4,4,5,30])
+ # ds = Daru::DataFrame.new({:x1 => @x1,:x2 => @x2,:x3 => @x3,:x4 => @x4})
+ # ia = Statsample::Reliability::ScaleAnalysis.new(ds)
# puts ia.summary
class ScaleAnalysis
include Summarizable
@@ -16,40 +16,40 @@ class ScaleAnalysis
attr_accessor :name
attr_accessor :summary_histogram
def initialize(ds, opts=Hash.new)
- @dumped=ds.fields.find_all {|f|
- ds[f].variance==0
+ @dumped=ds.vectors.to_a.find_all {|f|
+ ds[f].variance == 0
}
- @ods=ds
- @ds=ds.dup_only_valid(ds.fields - @dumped)
- @ds.name=ds.name
+ @ods = ds
+ @ds = ds.reject_values(*Daru::MISSING_VALUES).dup(ds.vectors.to_a - @dumped)
+ @ds.rename ds.name
- @k=@ds.fields.size
- @total=@ds.vector_sum
+ @k = @ds.ncols
+ @total = @ds.vector_sum
@o_total=@dumped.size > 0 ? @ods.vector_sum : nil
- @vector_mean=@ds.vector_mean
- @item_mean=@vector_mean.mean
- @item_sd=@vector_mean.sd
+ @vector_mean = @ds.vector_mean
+ @item_mean = @vector_mean.mean
+ @item_sd = @vector_mean.sd
- @mean=@total.mean
- @median=@total.median
-
- @skew=@total.skew
- @kurtosis=@total.kurtosis
- @sd = @total.sd
- @variance=@total.variance
- @valid_n = @total.size
- opts_default={
- :name=>_("Reliability Analysis"),
- :summary_histogram=>true
+ @mean = @total.mean
+ @median = @total.median
+ @skew = @total.skew
+ @kurtosis = @total.kurtosis
+ @sd = @total.sd
+ @variance = @total.variance
+ @valid_n = @total.size
+
+ opts_default = {
+ :name => _("Reliability Analysis"),
+ :summary_histogram => true
}
- @opts=opts_default.merge(opts)
- @opts.each{|k,v| self.send("#{k}=",v) if self.respond_to? k }
+ @opts = opts_default.merge(opts)
+ @opts.each{ |k,v| self.send("#{k}=",v) if self.respond_to? k }
@cov_m=Statsample::Bivariate.covariance_matrix(@ds)
# Mean for covariances and variances
- @variances=@k.times.map {|i| @cov_m[i,i]}.to_scale
+ @variances = Daru::Vector.new(@k.times.map { |i| @cov_m[i,i] })
@variances_mean=@variances.mean
@covariances_mean=(@variance-@variances.sum).quo(@k**2-@k)
#begin
@@ -66,7 +66,7 @@ def item_characteristic_curve
total={}
@ds.each do |row|
tot=@total[i]
- @ds.fields.each do |f|
+ @ds.vectors.each do |f|
out[f]||= {}
total[f]||={}
out[f][tot]||= 0
@@ -87,43 +87,40 @@ def item_characteristic_curve
# Adjusted RPB(Point biserial-correlation) for each item
#
def item_total_correlation
- @itc||=@ds.fields.inject({}) do |a,v|
- vector=@ds[v].clone
- ds2=@ds.clone
- ds2.delete_vector(v)
- total=ds2.vector_sum
- a[v]=Statsample::Bivariate.pearson(vector,total)
+ vecs = @ds.vectors.to_a
+ @itc ||= vecs.inject({}) do |a,v|
+ total=@ds.vector_sum(vecs - [v])
+ a[v]=Statsample::Bivariate.pearson(@ds[v],total)
a
end
end
def mean_rpb
- item_total_correlation.values.to_scale.mean
+ Daru::Vector.new(item_total_correlation.values).mean
end
def item_statistics
- @is||=@ds.fields.inject({}) do |a,v|
- a[v]={:mean=>@ds[v].mean, :sds=>Math::sqrt(@cov_m.variance(v))}
- a
- end
+ @is||=@ds.vectors.to_a.inject({}) do |a,v|
+ a[v]={:mean=>@ds[v].mean, :sds=>Math::sqrt(@cov_m.variance(v))}
+ a
+ end
end
# Returns a dataset with cases ordered by score
# and variables ordered by difficulty
def item_difficulty_analysis
dif={}
- @ds.fields.each{|f| dif[f]=@ds[f].mean }
- dif_sort=dif.sort{|a,b| -(a[1]<=>b[1])}
+ @ds.vectors.each{|f| dif[f]=@ds[f].mean }
+ dif_sort = dif.sort { |a,b| -(a[1]<=>b[1]) }
scores_sort={}
scores=@ds.vector_mean
- scores.each_index{|i| scores_sort[i]=scores[i] }
+ scores.each_index{ |i| scores_sort[i]=scores[i] }
scores_sort=scores_sort.sort{|a,b| a[1]<=>b[1]}
- ds_new=Statsample::Dataset.new(['case','score'] + dif_sort.collect{|a,b| a})
+ ds_new = Daru::DataFrame.new({}, order: ([:case,:score] + dif_sort.collect{|a,b| a.to_sym}))
scores_sort.each do |i,score|
- row=[i, score]
- case_row=@ds.case_as_hash(i)
- dif_sort.each{|variable,dif_value| row.push(case_row[variable]) }
- ds_new.add_case_array(row)
+ row = [i, score]
+ case_row = @ds.row[i].to_h
+ dif_sort.each{ |variable,dif_value| row.push(case_row[variable]) }
+ ds_new.add_row(row)
end
- ds_new.update_valid_data
ds_new
end
@@ -132,9 +129,10 @@ def stats_if_deleted
end
def stats_if_deleted_intern # :nodoc:
- return Hash.new if @ds.fields.size==1
- @ds.fields.inject({}) do |a,v|
- cov_2=@cov_m.submatrix(@ds.fields-[v])
+ return Hash.new if @ds.ncols == 1
+ vecs = @ds.vectors.to_a
+ vecs.inject({}) do |a,v|
+ cov_2=@cov_m.submatrix(vecs - [v])
#ds2=@ds.clone
#ds2.delete_vector(v)
#total=ds2.vector_sum
@@ -151,11 +149,10 @@ def stats_if_deleted_intern # :nodoc:
def report_building(builder) #:nodoc:
builder.section(:name=>@name) do |s|
-
if @dumped.size>0
s.section(:name=>"Items with variance=0") do |s1|
s.table(:name=>_("Summary for %s with all items") % @name) do |t|
- t.row [_("Items"), @ods.fields.size]
+ t.row [_("Items"), @ods.ncols]
t.row [_("Sum mean"), "%0.4f" % @o_total.mean]
t.row [_("S.d. mean"), "%0.4f" % @o_total.sd]
end
@@ -170,7 +167,7 @@ def report_building(builder) #:nodoc:
s.table(:name=>_("Summary for %s") % @name) do |t|
- t.row [_("Valid Items"), @ds.fields.size]
+ t.row [_("Valid Items"), @ds.ncols]
t.row [_("Valid cases"), @valid_n]
t.row [_("Sum mean"), "%0.4f" % @mean]
@@ -193,8 +190,8 @@ def report_building(builder) #:nodoc:
end
if (@alpha)
- s.text _("Items for obtain alpha(0.8) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.8, @ds.fields.size))
- s.text _("Items for obtain alpha(0.9) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.9, @ds.fields.size))
+ s.text _("Items for obtain alpha(0.8) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.8, @ds.ncols))
+ s.text _("Items for obtain alpha(0.9) : %d" % Statsample::Reliability::n_for_desired_reliability(@alpha, 0.9, @ds.ncols))
end
@@ -203,7 +200,7 @@ def report_building(builder) #:nodoc:
itc=item_total_correlation
s.table(:name=>_("Items report for %s") % @name, :header=>["item","mean","sd", "mean if deleted", "var if deleted", "sd if deleted"," item-total correl.", "alpha if deleted"]) do |t|
- @ds.fields.each do |f|
+ @ds.vectors.each do |f|
row=["#{@ds[f].name}(#{f})"]
if is[f]
row+=[sprintf("%0.5f",is[f][:mean]), sprintf("%0.5f", is[f][:sds])]
diff --git a/lib/statsample/reliability/skillscaleanalysis.rb b/lib/statsample/reliability/skillscaleanalysis.rb
index 5ce410b..aff272b 100644
--- a/lib/statsample/reliability/skillscaleanalysis.rb
+++ b/lib/statsample/reliability/skillscaleanalysis.rb
@@ -4,11 +4,11 @@ module Reliability
# Given a dataset with results and a correct answers hash,
# generates a ScaleAnalysis
# == Usage
- # x1=%{a b b c}.to_vector
- # x2=%{b a b c}.to_vector
- # x3=%{a c b a}.to_vector
- # ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3}.to_dataset
- # key={'x1'=>'a','x2'=>'b','x3'=>'a'}
+ # x1 = Daru::Vector.new(%{a b b c})
+ # x2 = Daru::Vector.new(%{b a b c})
+ # x3 = Daru::Vector.new(%{a c b a})
+ # ds = Daru::DataFrame.new({:x1 => @x1, :x2 => @x2, :x3 => @x3})
+ # key={ :x1 => 'a',:x2 => 'b', :x3 => 'a'}
# ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds,key)
# puts ssa.summary
class SkillScaleAnalysis
@@ -30,53 +30,59 @@ def initialize(ds,key,opts=Hash.new)
end
# Dataset only corrected vectors
def corrected_dataset_minimal
- cds=corrected_dataset
- dsm=@key.keys.inject({}) {|ac,v| ac[v]=cds[v];ac}.to_dataset
- @key.keys.each do |k|
- dsm[k].name=_("%s(corrected)") % @ds[k].name
- dsm[k].labels=@ds[k].labels
- end
+ cds = corrected_dataset
+ dsm = Daru::DataFrame.new(
+ @key.keys.inject({}) do |ac,v|
+ ac[v] = cds[v]
+ ac
+ end
+ )
- dsm.name=_("Corrected dataset from %s") % @ds.name
+ dsm.rename _("Corrected dataset from %s") % @ds.name
dsm
end
+
def vector_sum
corrected_dataset_minimal.vector_sum
end
+
def vector_mean
corrected_dataset_minimal.vector_mean
end
+
def scale_analysis
- sa=ScaleAnalysis.new(corrected_dataset_minimal)
+ sa = ScaleAnalysis.new(corrected_dataset_minimal)
sa.name=_("%s (Scale Analysis)") % @name
sa
end
+
def corrected_dataset
if @cds.nil?
- @cds=@ds.dup_empty
- @key.keys.each {|k| @cds[k].type=:scale; @cds[k].name=@ds[k].name}
- @ds.each do |row|
- out={}
- row.each do |k,v|
- if @key.keys.include? k
- if @ds[k].is_valid? v
- out[k]= @key[k]==v ? 1 : 0
+ @cds = Daru::DataFrame.new({}, order: @ds.vectors, name: @ds.name)
+ @ds.each_row do |row|
+ out = {}
+ row.each_with_index do |v, k|
+ if @key.has_key? k
+ if @ds[k].reject_values(*Daru::MISSING_VALUES).include_values? v
+ out[k]= @key[k] == v ? 1 : 0
else
- out[k]=nil
+ out[k] = nil
end
else
- out[k]=v
+ out[k] = v
end
end
- @cds.add_case(out,false)
+
+ @cds.add_row(Daru::Vector.new(out))
end
- @cds.update_valid_data
+ @cds.update
end
@cds
end
+
def report_building(builder)
builder.section(:name=>@name) do |s|
- sa=scale_analysis
+ sa = scale_analysis
s.parse_element(sa)
if summary_show_problematic_items
s.section(:name=>_("Problematic Items")) do |spi|
@@ -91,17 +97,16 @@ def report_building(builder)
spi.table(:name=>"Proportions",:header=>[_("Value"), _("%")]) do |table|
props.each do |k1,v|
- table.row [ @ds[k].labeling(k1), "%0.3f" % v]
+ table.row [ @ds[k].index_of(k1), "%0.3f" % v]
end
end
-
end
end
end
+
spi.text _("No problematic items") if count==0
end
end
-
end
end
end
diff --git a/lib/statsample/resample.rb b/lib/statsample/resample.rb
index 8a1795d..d6ca1b7 100644
--- a/lib/statsample/resample.rb
+++ b/lib/statsample/resample.rb
@@ -7,7 +7,7 @@ def repeat_and_save(times,&action)
def generate (size,low,upper)
range=upper-low+1
- Vector.new((0...size).collect {|x| rand(range)+low },:scale)
+ Daru::Vector.new((0...size).collect {|x| rand(range)+low })
end
end
diff --git a/lib/statsample/rserve_extension.rb b/lib/statsample/rserve_extension.rb
deleted file mode 100644
index d439c91..0000000
--- a/lib/statsample/rserve_extension.rb
+++ /dev/null
@@ -1,20 +0,0 @@
-# Several additions to Statsample objects, to support
-# rserve-client
-
-module Statsample
- class Vector
- def to_REXP
- Rserve::REXP::Wrapper.wrap(data_with_nils)
- end
- end
- class Dataset
- def to_REXP
- names=@fields
- data=@fields.map {|f|
- Rserve::REXP::Wrapper.wrap(@vectors[f].data_with_nils)
- }
- l=Rserve::Rlist.new(data,names)
- Rserve::REXP.create_data_frame(l)
- end
- end
-end
\ No newline at end of file
diff --git a/lib/statsample/shorthand.rb b/lib/statsample/shorthand.rb
index d4956f3..6f2c5c4 100644
--- a/lib/statsample/shorthand.rb
+++ b/lib/statsample/shorthand.rb
@@ -11,30 +11,20 @@ module Shorthand
###
# :section: R like methods
###
- def read_with_cache(klass, filename,opts=Hash.new, cache=true)
- file_ds=filename+".ds"
- if cache and (File.exists? file_ds and File.mtime(file_ds)>File.mtime(filename))
- ds=Statsample.load(file_ds)
- else
- ds=klass.read(filename)
- ds.save(file_ds) if cache
- end
- ds
- end
- # Import an Excel file. Cache result by default
- def read_excel(filename, opts=Hash.new, cache=true)
- read_with_cache(Statsample::Excel, filename, opts, cache)
+ # Import an Excel file. Cache result by default
+ def read_excel(filename, opts=Hash.new)
+ Daru::DataFrame.from_excel filename, opts
end
- # Import an CSV file. Cache result by default
- def read_csv
- read_with_cache(Statsample::CSV, filename, opts, cache)
+ # Import an CSV file. Cache result by default
+ def read_csv(filename, opts=Hash.new)
+ Daru::DataFrame.from_csv filename, opts
end
# Retrieve names (fields) from dataset
def names(ds)
- ds.fields
+ ds.vectors.to_a
end
# Create a correlation matrix from a dataset
def cor(ds)
@@ -44,21 +34,25 @@ def cor(ds)
def cov(ds)
Statsample::Bivariate.covariate_matrix(ds)
end
- # Create a Statsample::Vector
+ # Create a Daru::Vector
# Analog to R's c
def vector(*args)
- Statsample::Vector[*args]
+ Daru::Vector[*args]
end
# Random generation for the normal distribution
def rnorm(n,mean=0,sd=1)
rng=Distribution::Normal.rng(mean,sd)
- Statsample::Vector.new_scale(n) { rng.call}
+ Daru::Vector.new_with_size(n) { rng.call}
end
- # Creates a new Statsample::Dataset
- # Each key is transformed into string
+ # Creates a new Daru::DataFrame
+ # Each key is transformed into a Symbol wherever possible.
def dataset(vectors=Hash.new)
- vectors=vectors.inject({}) {|ac,v| ac[v[0].to_s]=v[1];ac}
- Statsample::Dataset.new(vectors)
+ vectors = vectors.inject({}) do |ac,v|
+ n = v[0].respond_to?(:to_sym) ? v[0].to_sym : v[0]
+ ac[n] = v[1]
+ ac
+ end
+ Daru::DataFrame.new(vectors)
end
alias :data_frame :dataset
# Returns a Statsample::Graph::Boxplot
@@ -78,13 +72,15 @@ def scatterplot(*args)
def levene(*args)
Statsample::Test::Levene.new(*args)
end
+
def principal_axis(*args)
Statsample::Factor::PrincipalAxis.new(*args)
-
end
+
def polychoric(*args)
Statsample::Bivariate::Polychoric.new(*args)
end
+
def tetrachoric(*args)
Statsample::Bivariate::Tetrachoric.new(*args)
end
@@ -95,27 +91,35 @@ def tetrachoric(*args)
def lr(*args)
Statsample::Regression.multiple(*args)
end
+
def pca(ds,opts=Hash.new)
Statsample::Factor::PCA.new(ds,opts)
end
+
def dominance_analysis(*args)
Statsample::DominanceAnalysis.new(*args)
end
+
def dominance_analysis_bootstrap(*args)
Statsample::DominanceAnalysis::Bootstrap.new(*args)
end
+
def scale_analysis(*args)
Statsample::Reliability::ScaleAnalysis.new(*args)
end
+
def skill_scale_analysis(*args)
Statsample::Reliability::SkillScaleAnalysis.new(*args)
end
+
def multiscale_analysis(*args,&block)
Statsample::Reliability::MultiScaleAnalysis.new(*args,&block)
end
+
def test_u(*args)
Statsample::Test::UMannWhitney.new(*args)
end
+
module_function :test_u, :rnorm
end
end
diff --git a/lib/statsample/test/bartlettsphericity.rb b/lib/statsample/test/bartlettsphericity.rb
index 98b6676..b05ed02 100644
--- a/lib/statsample/test/bartlettsphericity.rb
+++ b/lib/statsample/test/bartlettsphericity.rb
@@ -31,7 +31,7 @@ def initialize(matrix,ncases)
#
def compute
@value=-((@ncases-1)-(2*@nvars+5).quo(6))*Math::log(@matrix.determinant)
- @df=(@nvars*(@nvars-1)).quo(2)
+ @df=(@nvars*(@nvars-1)) / 2
end
def probability
1-Distribution::ChiSquare.cdf(@value,@df)
diff --git a/lib/statsample/test/chisquare.rb b/lib/statsample/test/chisquare.rb
index 2180ea0..28acb04 100644
--- a/lib/statsample/test/chisquare.rb
+++ b/lib/statsample/test/chisquare.rb
@@ -1,9 +1,26 @@
module Statsample
module Test
module ChiSquare
- class WithMatrix
+ module Shared
attr_reader :df
attr_reader :value
+
+ def to_f
+ @value
+ end
+
+ def chi_square
+ @value
+ end
+
+ def probability
+ 1-Distribution::ChiSquare.cdf(@value.to_f,@df)
+ end
+ end
+
+ class WithMatrix
+ include Statsample::Test::ChiSquare::Shared
+
def initialize(observed, expected=nil)
@observed=observed
@expected=expected or calculate_expected
@@ -11,33 +28,46 @@ def initialize(observed, expected=nil)
@df=(@observed.row_size-1)*(@observed.column_size-1)
@value=compute_chi
end
+
def calculate_expected
sum=@observed.total_sum
@expected=Matrix.rows( @observed.row_size.times.map {|i|
@observed.column_size.times.map {|j|
(@observed.row_sum[i].quo(sum) * @observed.column_sum[j].quo(sum))*sum
}
- })
- end
- def to_f
- @value
- end
- def chi_square
- @value
- end
- def probability
- 1-Distribution::ChiSquare.cdf(@value.to_f,@df)
+ })
end
+
def compute_chi
- sum=0
- (0...@observed.row_size).each {|i|
- (0...@observed.column_size).each {|j|
+ sum=0
+ (0...@observed.row_size).each {|i|
+ (0...@observed.column_size).each {|j|
sum+=((@observed[i, j] - @expected[i,j])**2).quo(@expected[i,j])
- }
}
- sum
+ }
+ sum
+ end
+ end
+
+ class WithVector
+ include Statsample::Test::ChiSquare::Shared
+
+ def initialize(observed, expected)
+ @observed = observed
+ @expected = expected
+ raise "Observed size!=expected size" if @observed.size!=@expected.size
+ @df = @observed.size - 1
+ @value = compute_chi
+ end
+
+ def compute_chi
+ sum=0
+ (0...@observed.size).each {|i|
+ sum+=((@observed[i] - @expected[i])**2).quo(@expected[i])
+ }
+ sum
end
end
end
end
-end
\ No newline at end of file
+end
diff --git a/lib/statsample/test/kolmogorovsmirnov.rb b/lib/statsample/test/kolmogorovsmirnov.rb
index 31c60f9..f6e7436 100644
--- a/lib/statsample/test/kolmogorovsmirnov.rb
+++ b/lib/statsample/test/kolmogorovsmirnov.rb
@@ -22,6 +22,7 @@ def initialize(d1,d2)
end
calculate
end
+
def calculate
d=0
@d1.each {|x|
@@ -31,12 +32,13 @@ def calculate
}
@d=d
end
+
# Make a wrapper EmpiricDistribution to any method which implements
- # each
- # On Statsample::Vector, only uses #valid_data
+ # each on Statsample::Vector, only uses non-missing data.
def make_cdf(v)
- v.is_a?(Statsample::Vector) ? EmpiricDistribution.new(v.valid_data) : EmpiricDistribution.new(v)
+ v.is_a?(Daru::Vector) ? EmpiricDistribution.new(v.only_valid.to_a) : EmpiricDistribution.new(v)
end
+
class EmpiricDistribution
def initialize(data)
@min=data.min
diff --git a/lib/statsample/test/levene.rb b/lib/statsample/test/levene.rb
index 4727ceb..4293bdc 100644
--- a/lib/statsample/test/levene.rb
+++ b/lib/statsample/test/levene.rb
@@ -5,8 +5,8 @@ module Test
# Levene's test ( Levene, 1960) is used to test if k samples have equal variances. Equal variances across samples is called homogeneity of variance. Some statistical tests, for example the analysis of variance, assume that variances are equal across groups or samples. The Levene test can be used to verify that assumption.
# Use:
# require 'statsample'
- # a=[1,2,3,4,5,6,7,8,100,10].to_scale
- # b=[30,40,50,60,70,80,90,100,110,120].to_scale
+ # a = Daru::Vector.new([1,2,3,4,5,6,7,8,100,10])
+ # b = Daru::Vector.new([30,40,50,60,70,80,90,100,110,120])
#
# levene=Statsample::Test::Levene.new([a,b])
# puts levene.summary
@@ -29,10 +29,10 @@ class Levene
attr_accessor :name
# Input could be an array of vectors or a dataset
def initialize(input, opts=Hash.new())
- if input.is_a? Statsample::Dataset
- @vectors=input.vectors.values
+ if input.is_a? Daru::DataFrame
+ @vectors = input.to_h.values
else
- @vectors=input
+ @vectors = input
end
@name=_("Levene Test")
opts.each{|k,v|
@@ -48,32 +48,34 @@ def report_building(builder) # :nodoc:
builder.text "%s : F(%d, %d) = %0.4f , p = %0.4f" % [@name, @d1, @d2, f, probability]
end
def compute
- n=@vectors.inject(0) {|ac,v| ac+v.n_valid}
+ n=@vectors.inject(0) { |ac,v| ac + v.reject_values(*Daru::MISSING_VALUES).size }
- zi=@vectors.collect {|vector|
+ zi=@vectors.collect do |vector|
mean=vector.mean
- vector.collect {|v| (v-mean).abs }.to_scale
- }
+ Daru::Vector.new(vector.collect { |v| (v - mean).abs })
+ end
- total_mean=zi.inject([]) {|ac,vector|
- ac+vector.valid_data
- }.to_scale.mean
+ total_mean = Daru::Vector.new(
+ zi.inject([]) do |ac,vector|
+ ac + vector.reject_values(*Daru::MISSING_VALUES).to_a
+ end
+ ).mean
- k=@vectors.size
-
- sum_num=zi.inject(0) {|ac,vector|
- ac+(vector.size*(vector.mean-total_mean)**2)
- }
+ k = @vectors.size
+ sum_num = zi.inject(0) do |ac,vector|
+ ac + (vector.size * (vector.mean - total_mean)**2)
+ end
- sum_den=zi.inject(0) {|ac,vector|
- z_mean=vector.mean
- ac+vector.valid_data.inject(0) {|acp,zij|
- acp+(zij-z_mean)**2
- }
- }
- @w=((n-k)*sum_num).quo((k-1)*sum_den)
- @d1=k-1
- @d2=n-k
+ sum_den = zi.inject(0) do |ac,vector|
+ z_mean = vector.mean
+ ac + vector.reject_values(*Daru::MISSING_VALUES).to_a.inject(0) do |acp,zij|
+ acp + (zij - z_mean)**2
+ end
+ end
+
+ @w = ((n - k) * sum_num).quo((k - 1) * sum_den)
+ @d1 = k - 1
+ @d2 = n - k
end
private :compute
# Probability.
@@ -81,7 +83,6 @@ def compute
def probability
p_using_cdf(Distribution::F.cdf(f, @d1, @d2), :right)
end
-
end
end
end
diff --git a/lib/statsample/test/t.rb b/lib/statsample/test/t.rb
index d0306a9..b132be8 100644
--- a/lib/statsample/test/t.rb
+++ b/lib/statsample/test/t.rb
@@ -1,10 +1,8 @@
module Statsample
module Test
-
-
-
-
- # A t-test is any statistical hypothesis test in which the test statistic follows a Student's t distribution, if the null hypothesis is supported
+ # A t-test is any statistical hypothesis test in which the test
+ # statistic follows a Student's t distribution, if the null
+ # hypothesis is supported
class T
class << self
@@ -125,7 +123,7 @@ def report_building_t(s)
# One Sample t-test
# == Usage
- # a=1000.times.map {rand(100)}.to_scale
+ # a = Daru::Vector.new(1000.times.map {rand(100)})
# t_1=Statsample::Test::T::OneSample.new(a, {:u=>50})
# t_1.summary
#
@@ -165,11 +163,11 @@ def initialize(vector, opts=Hash.new)
@u=@opts[:u]
@tails=@opts[:tails]
@confidence_level=@opts[:confidence_level] || 0.95
- @df= @vector.n_valid-1
+ @df= @vector.reject_values(*Daru::MISSING_VALUES).size-1
@t=nil
end
def t_object
- T.new(@vector.mean-u, @vector.se, @vector.n_valid-1, opts)
+ T.new(@vector.mean-u, @vector.se, @vector.reject_values(*Daru::MISSING_VALUES).size-1, opts)
end
def t
t_object.t
@@ -196,8 +194,8 @@ def report_building(b) # :nodoc:
# Two Sample t-test.
#
# == Usage
- # a=1000.times.map {rand(100)}.to_scale
- # b=1000.times.map {rand(100)}.to_scale
+ # a = Daru::Vector.new(1000.times.map {rand(100)})
+ # b = Daru::Vector.new(1000.times.map {rand(100)})
# t_2=Statsample::Test::T::TwoSamplesIndependent.new(a,b)
# t_2.summary
# === Output
@@ -266,12 +264,12 @@ def initialize(v1, v2, opts=Hash.new)
# Set t and probability for given u
def compute
- @t_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.n_valid, @v2.n_valid,true)
+ @t_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size,true)
- @t_not_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.n_valid, @v2.n_valid, false)
+ @t_not_equal_variance= T.two_sample_independent(@v1.mean, @v2.mean, @v1.sd, @v2.sd, @v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size, false)
- @df_equal_variance=T.df_equal_variance(@v1.n_valid, @v2.n_valid)
- @df_not_equal_variance=T.df_not_equal_variance(@v1.sd, @v2.sd, @v1.n_valid, @v2.n_valid)
+ @df_equal_variance=T.df_equal_variance(@v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size)
+ @df_not_equal_variance=T.df_not_equal_variance(@v1.sd, @v2.sd, @v1.reject_values(*Daru::MISSING_VALUES).size, @v2.reject_values(*Daru::MISSING_VALUES).size)
@probability_equal_variance = p_using_cdf(Distribution::T.cdf(@t_equal_variance, @df_equal_variance), tails)
@@ -280,8 +278,8 @@ def compute
end
# Cohen's d is a measure of effect size. Its defined as the difference between two means divided by a standard deviation for the data
def d
- n1=@v1.n_valid
- n2=@v2.n_valid
+ n1=@v1.reject_values(*Daru::MISSING_VALUES).size
+ n2=@v2.reject_values(*Daru::MISSING_VALUES).size
num=@v1.mean-@v2.mean
den=Math::sqrt( ((n1-1)*@v1.sd+(n2-1)*@v2.sd).quo(n1+n2))
num.quo(den)
@@ -290,8 +288,8 @@ def d
def report_building(b) # :nodoc:
b.section(:name=>@name) {|g|
g.table(:name=>_("Mean and standard deviation"), :header=>[_("Variable"), _("mean"), _("sd"),_("n")]) {|t|
- t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd,@v1.n_valid])
- t.row([@v2.name,"%0.4f" % @v2.mean,"%0.4f" % @v2.sd, @v2.n_valid])
+ t.row([@v1.name,"%0.4f" % @v1.mean,"%0.4f" % @v1.sd, @v1.reject_values(*Daru::MISSING_VALUES).size])
+ t.row([@v2.name,"%0.4f" % @v2.mean,"%0.4f" % @v2.sd, @v2.reject_values(*Daru::MISSING_VALUES).size])
}
g.parse_element(Statsample::Test.levene([@v1,@v2],:name=>_("Levene test for equality of variances")))
diff --git a/lib/statsample/test/umannwhitney.rb b/lib/statsample/test/umannwhitney.rb
index e41d93d..43195cf 100644
--- a/lib/statsample/test/umannwhitney.rb
+++ b/lib/statsample/test/umannwhitney.rb
@@ -113,36 +113,36 @@ def self.distribution_permutations(n1,n2)
include Summarizable
#
# Create a new U Mann-Whitney test
- # Params: Two Statsample::Vectors
+ # Params: Two Daru::Vectors
#
def initialize(v1,v2, opts=Hash.new)
- @v1=v1
- @v2=v2
- @n1=v1.valid_data.size
- @n2=v2.valid_data.size
- data=(v1.valid_data+v2.valid_data).to_scale
- groups=(([0]*@n1)+([1]*@n2)).to_vector
- ds={'g'=>groups, 'data'=>data}.to_dataset
- @t=nil
- @ties=data.data.size!=data.data.uniq.size
- if(@ties)
- adjust_for_ties(ds['data'])
+ @v1 = v1
+ @v2 = v2
+ v1_valid = v1.reject_values(*Daru::MISSING_VALUES).reset_index!
+ v2_valid = v2.reject_values(*Daru::MISSING_VALUES).reset_index!
+ @n1 = v1_valid.size
+ @n2 = v2_valid.size
+ data = Daru::Vector.new(v1_valid.to_a + v2_valid.to_a)
+ groups = Daru::Vector.new(([0] * @n1) + ([1] * @n2))
+ ds = Daru::DataFrame.new({:g => groups, :data => data})
+ @t = nil
+ @ties = data.to_a.size != data.to_a.uniq.size
+ if @ties
+ adjust_for_ties(ds[:data])
end
- ds['ranked']=ds['data'].ranked(:scale)
-
- @n=ds.cases
+ ds[:ranked] = ds[:data].ranked
+ @n = ds.nrows
- @r1=ds.filter{|r| r['g']==0}['ranked'].sum
- @r2=((ds.cases*(ds.cases+1)).quo(2))-r1
- @u1=r1-((@n1*(@n1+1)).quo(2))
- @u2=r2-((@n2*(@n2+1)).quo(2))
- @u=(u1_("Mann-Whitney's U")}
- @opts=opts_default.merge(opts)
+ @r1 = ds.filter_rows { |r| r[:g] == 0}[:ranked].sum
+ @r2 = ((ds.nrows * (ds.nrows + 1)).quo(2)) - r1
+ @u1 = r1 - ((@n1 * (@n1 + 1)).quo(2))
+ @u2 = r2 - ((@n2 * (@n2 + 1)).quo(2))
+ @u = (u1 < u2) ? u1 : u2
+ opts_default = { :name=>_("Mann-Whitney's U") }
+ @opts = opts_default.merge(opts)
opts_default.keys.each {|k|
send("#{k}=", @opts[k])
- }
-
+ }
end
def report_building(generator) # :nodoc:
generator.section(:name=>@name) do |s|
@@ -160,8 +160,8 @@ def report_building(generator) # :nodoc:
# Exact probability of finding values of U lower or equal to sample on U distribution. Use with caution with m*n>100000.
# Uses u_sampling_distribution_as62
def probability_exact
- dist=UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
- sum=0
+ dist = UMannWhitney.u_sampling_distribution_as62(@n1,@n2)
+ sum = 0
(0..@u.to_i).each {|i|
sum+=dist[i]
}
@@ -172,8 +172,8 @@ def probability_exact
# == Reference:
# * http://europe.isixsigma.com/library/content/c080806a.asp
def adjust_for_ties(data)
- @t=data.frequencies.find_all{|k,v| v>1}.inject(0) {|a,v|
- a+(v[1]**3-v[1]).quo(12)
+ @t = data.frequencies.to_h.find_all { |k,v| v > 1 }.inject(0) { |a,v|
+ a + (v[1]**3 - v[1]).quo(12)
}
end
diff --git a/lib/statsample/test/wilcoxonsignedrank.rb b/lib/statsample/test/wilcoxonsignedrank.rb
index be8b223..5661904 100644
--- a/lib/statsample/test/wilcoxonsignedrank.rb
+++ b/lib/statsample/test/wilcoxonsignedrank.rb
@@ -8,13 +8,13 @@ class WilcoxonSignedRank
# Name of F analysis
attr_accessor :name
- attr_reader :w
- attr_reader :nr
- attr_writer :tails
+ attr_reader :w
+ attr_reader :nr
+ attr_writer :tails
# Parameters:
def initialize(v1,v2, opts=Hash.new)
- @v1=v1
- @v2=v2
+ @v1 = v1
+ @v2 = v2
opts_default={:name=>_("Wilcoxon Signed Rank Test"),:tails=>:both}
@opts=opts_default.merge(opts)
opts_default.keys.each {|k|
@@ -22,66 +22,68 @@ def initialize(v1,v2, opts=Hash.new)
}
calculate
end
+
def calculate
- df=Statsample::Dataset.new({'v1'=>@v1,'v2'=>@v2})
- df["abs"]=df.collect {|row|
- r=(row["v2"]-row["v1"]).abs
- }
- df["sgn"]=df.collect {|row|
- r=row["v2"]-row["v1"]
- r==0 ? 0 : r/r.abs
- }
- df=df.filter {|row| row["sgn"]!=0}
- df["rank"]=df["abs"].ranked
- @nr=df.cases
- @w=df.collect {|row|
- row["sgn"]*row["rank"]
- #p row["sgn"]*row["rank"]
- }.sum
+ df = Daru::DataFrame.new({:v1 => @v1,:v2 => @v2})
+ # df[:abs]=df.collect(:row) { |row| (row[:v2] - row[:v1]).abs }
+ df[:abs] = (df[:v2] - df[:v1]).abs
+ df[:sgn] = df.collect(:row) { |row|
+ r = row[:v2] - row[:v1]
+ r == 0 ? 0 : r/r.abs
+ }
+ df = df.filter_rows { |row| row[:sgn] != 0}
+ df[:rank] = df[:abs].ranked
+ @nr = df.nrows
+
+ @w = df.collect(:row) { |row|
+ row[:sgn] * row[:rank]
+ }.sum
end
+
def report_building(generator) # :nodoc:
generator.section(:name=>@name) do |s|
s.table(:name=>_("%s results") % @name) do |t|
t.row([_("W Value"), "%0.3f" % @w])
t.row([_("Z"), "%0.3f (p: %0.3f)" % [z, probability_z]])
if(nr<=10)
- t.row([_("Exact probability"), "p-exact: %0.3f" % [probability_exact]])
+ t.row([_("Exact probability"), "p-exact: %0.3f" % [probability_exact]])
end
end
end
end
def z
- sigma=Math.sqrt((nr*(nr+1)*(2*nr+1))/6)
- (w-0.5)/sigma
+ sigma=Math.sqrt((nr*(nr+1)*(2*nr+1))/6)
+ (w-0.5)/sigma
end
# Assuming normal distribution of W, this calculate
# the probability of samples with Z equal or higher than
# obtained on sample
def probability_z
- (1-Distribution::Normal.cdf(z))*(@tails==:both ? 2:1)
+ (1-Distribution::Normal.cdf(z))*(@tails==:both ? 2:1)
end
# Calculate exact probability.
# Don't calculate for large Nr, please!
def probability_exact
- str_format="%0#{nr}b"
- combinations=2**nr
- #p str_format
- total_w=combinations.times.map {|i|
- comb=sprintf(str_format,i)
- w_local=comb.length.times.inject(0) {|ac,j|
- sgn=comb[j]=="0" ? -1 : 1
- ac+(j+1)*sgn
- }
- }.sort
- total_w.find_all {|v|
- if @tails==:both
- v<=-w.abs or v>=w.abs
- elsif @tails==:left
- v<=w
- elsif @tails==:right
- v>=w
- end
- }.count/(combinations.to_f)
+ str_format="%0#{nr}b"
+ combinations=2**nr
+ #p str_format
+ total_w=combinations.times.map do |i|
+ comb=sprintf(str_format,i)
+ w_local=comb.length.times.inject(0) do |ac,j|
+ sgn=comb[j]=="0" ? -1 : 1
+ ac+(j+1)*sgn
+ end
+ end.sort
+
+ total_w.find_all do |v|
+ if @tails==:both
+ v<=-w.abs or v>=w.abs
+ elsif @tails==:left
+ v<=w
+ elsif @tails==:right
+ v>=w
+ end
+ end.count/(combinations.to_f)
end
end
end
diff --git a/lib/statsample/vector.rb b/lib/statsample/vector.rb
index 64f5111..caf7ac2 100644
--- a/lib/statsample/vector.rb
+++ b/lib/statsample/vector.rb
@@ -1,15 +1,8 @@
-require 'date'
-require 'statsample/vector/gsl'
-
module Statsample::VectorShorthands
# Creates a new Statsample::Vector object
# Argument should be equal to Vector.new
def to_vector(*args)
- Statsample::Vector.new(self,*args)
- end
- # Creates a new Statsample::Vector object of type :scale
- def to_scale(*args)
- Statsample::Vector.new(self, :scale, *args)
+ Daru::Vector.new(self)
end
end
@@ -24,1047 +17,3 @@ class Vector
end
end
end
-module Statsample
-
-
- # Collection of values on one dimension. Works as a column on a Spreadsheet.
- #
- # == Usage
- # The fast way to create a vector uses Array.to_vector or Array.to_scale.
- #
- # v=[1,2,3,4].to_vector(:scale)
- # v=[1,2,3,4].to_scale
- #
- class Vector
- include Enumerable
- include Writable
- include Summarizable
- include Statsample::VectorShorthands
-
- # Level of measurement. Could be :nominal, :ordinal or :scale
- attr_reader :type
- # Original data.
- attr_reader :data
- # Valid data. Equal to data, minus values assigned as missing values
- attr_reader :valid_data
- # Array of values considered as missing. Nil is a missing value, by default
- attr_reader :missing_values
- # Array of values considered as "Today", with date type. "NOW", "TODAY", :NOW and :TODAY are 'today' values, by default
- attr_reader :today_values
- # Missing values array
- attr_reader :missing_data
- # Original data, with all missing values replaced by nils
- attr_reader :data_with_nils
- # Date date, with all missing values replaced by nils
- attr_reader :date_data_with_nils
- # Change label for specific values
- attr_accessor :labels
- # Name of vector. Should be used for output by many classes
- attr_accessor :name
-
- # Creates a new Vector object.
- # * data Any data which can be converted on Array
- # * type Level of meausurement. See Vector#type
- # * opts Hash of options
- # * :missing_values Array of missing values. See Vector#missing_values
- # * :today_values Array of 'today' values. See Vector#today_values
- # * :labels Labels for data values
- # * :name Name of vector
- def initialize(data=[], type=:nominal, opts=Hash.new)
- @data=data.is_a?(Array) ? data : data.to_a
- @type=type
- opts_default={
- :missing_values=>[],
- :today_values=>['NOW','TODAY', :NOW, :TODAY],
- :labels=>{},
- :name=>nil
- }
- @opts=opts_default.merge(opts)
- if @opts[:name].nil?
- @@n_table||=0
- @@n_table+=1
- @opts[:name]="Vector #{@@n_table}"
- end
- @missing_values=@opts[:missing_values]
- @labels=@opts[:labels]
- @today_values=@opts[:today_values]
- @name=@opts[:name]
- @valid_data=[]
- @data_with_nils=[]
- @date_data_with_nils=[]
- @missing_data=[]
- @has_missing_data=nil
- @scale_data=nil
- set_valid_data
- self.type=type
- end
- # Create a vector using (almost) any object
- # * Array: flattened
- # * Range: transformed using to_a
- # * Statsample::Vector
- # * Numeric and string values
- def self.[](*args)
- values=[]
- args.each do |a|
- case a
- when Array
- values.concat a.flatten
- when Statsample::Vector
- values.concat a.to_a
- when Range
- values.concat a.to_a
- else
- values << a
- end
- end
- vector=new(values)
- vector.type=:scale if vector.can_be_scale?
- vector
- end
- # Create a new scale type vector
- # Parameters
- # [n] Size
- # [val] Value of each value
- # [&block] If block provided, is used to set the values of vector
- def self.new_scale(n,val=nil, &block)
- if block
- vector=n.times.map {|i| block.call(i)}.to_scale
- else
- vector=n.times.map { val}.to_scale
- end
- vector.type=:scale
- vector
- end
- # Creates a duplicate of the Vector.
- # Note: data, missing_values and labels are duplicated, so
- # changes on original vector doesn't propages to copies.
- def dup
- Vector.new(@data.dup,@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=>@name)
- end
- # Returns an empty duplicate of the vector. Maintains the type,
- # missing values and labels.
- def dup_empty
- Vector.new([],@type, :missing_values => @missing_values.dup, :labels => @labels.dup, :name=> @name)
- end
-
- if Statsample::STATSAMPLE__.respond_to?(:check_type)
- # Raises an exception if type of vector is inferior to t type
- def check_type(t)
- Statsample::STATSAMPLE__.check_type(self,t)
- end
- else
- def check_type(t) #:nodoc:
- _check_type(t)
- end
- end
-
-
- def _check_type(t) #:nodoc:
- raise NoMethodError if (t==:scale and @type!=:scale) or (t==:ordinal and @type==:nominal) or (t==:date) or (:date==@type)
- end
-
- def vector_standarized_compute(m,sd) # :nodoc:
- @data_with_nils.collect{|x| x.nil? ? nil : (x.to_f - m).quo(sd) }.to_vector(:scale)
- end
- # Return a vector usign the standarized values for data
- # with sd with denominator n-1. With variance=0 or mean nil,
- # returns a vector of equal size full of nils
- #
- def vector_standarized(use_population=false)
- check_type :scale
- m=mean
- sd=use_population ? sdp : sds
- return ([nil]*size).to_scale if mean.nil? or sd==0.0
- vector=vector_standarized_compute(m,sd)
- vector.name=_("%s(standarized)") % @name
- vector
- end
- def vector_centered_compute(m) #:nodoc:
- @data_with_nils.collect {|x| x.nil? ? nil : x.to_f-m }.to_scale
- end
- # Return a centered vector
- def vector_centered
- check_type :scale
- m=mean
- return ([nil]*size).to_scale if mean.nil?
- vector=vector_centered_compute(m)
- vector.name=_("%s(centered)") % @name
- vector
- end
-
- alias_method :standarized, :vector_standarized
- alias_method :centered, :vector_centered
- # Return a vector with values replaced with the percentiles
- # of each values
- def vector_percentil
- check_type :ordinal
- c=@valid_data.size
- vector=ranked.map {|i| i.nil? ? nil : (i.quo(c)*100).to_f }.to_vector(@type)
- vector.name=_("%s(percentil)") % @name
- vector
- end
- def box_cox_transformation(lambda) # :nodoc:
- raise "Should be a scale" unless @type==:scale
- @data_with_nils.collect{|x|
- if !x.nil?
- if(lambda==0)
- Math.log(x)
- else
- (x**lambda-1).quo(lambda)
- end
- else
- nil
- end
- }.to_vector(:scale)
- end
-
- # Vector equality.
- # Two vector will be the same if their data, missing values, type, labels are equals
- def ==(v2)
- return false unless v2.instance_of? Statsample::Vector
- @data==v2.data and @missing_values==v2.missing_values and @type==v2.type and @labels==v2.labels
- end
-
- def _dump(i) # :nodoc:
- Marshal.dump({'data'=>@data,'missing_values'=>@missing_values, 'labels'=>@labels, 'type'=>@type,'name'=>@name})
- end
-
- def self._load(data) # :nodoc:
- h=Marshal.load(data)
- Vector.new(h['data'], h['type'], :missing_values=> h['missing_values'], :labels=>h['labels'], :name=>h['name'])
- end
- # Returns a new vector, with data modified by block.
- # Equivalent to create a Vector after #collect on data
- def recode(type=nil)
- type||=@type
- @data.collect{|x|
- yield x
- }.to_vector(type)
- end
- # Modifies current vector, with data modified by block.
- # Equivalent to #collect! on @data
- def recode!
- @data.collect!{|x|
- yield x
- }
- set_valid_data
- end
- def push(v)
- @data.push(v)
- set_valid_data
- end
- # Dicotomize the vector with 0 and 1, based on lowest value
- # If parameter if defined, this value and lower
- # will be 0 and higher, 1
- def dichotomize(low=nil)
- fs=factors
- low||=factors.min
- @data_with_nils.collect{|x|
- if x.nil?
- nil
- elsif x>low
- 1
- else
- 0
- end
- }.to_scale
- end
- # Iterate on each item.
- # Equivalent to
- # @data.each{|x| yield x}
- def each
- @data.each{|x| yield(x) }
- end
-
- # Iterate on each item, retrieving index
- def each_index
- (0...@data.size).each {|i|
- yield(i)
- }
- end
- # Add a value at the end of the vector.
- # If second argument set to false, you should update the Vector usign
- # Vector.set_valid_data at the end of your insertion cycle
- #
- def add(v,update_valid=true)
- @data.push(v)
- set_valid_data if update_valid
- end
- # Update valid_data, missing_data, data_with_nils and gsl
- # at the end of an insertion.
- #
- # Use after Vector.add(v,false)
- # Usage:
- # v=Statsample::Vector.new
- # v.add(2,false)
- # v.add(4,false)
- # v.data
- # => [2,3]
- # v.valid_data
- # => []
- # v.set_valid_data
- # v.valid_data
- # => [2,3]
- def set_valid_data
- @valid_data.clear
- @missing_data.clear
- @data_with_nils.clear
- @date_data_with_nils.clear
- set_valid_data_intern
- set_scale_data if(@type==:scale)
- set_date_data if(@type==:date)
- end
- if Statsample::STATSAMPLE__.respond_to?(:set_valid_data_intern)
- def set_valid_data_intern #:nodoc:
- Statsample::STATSAMPLE__.set_valid_data_intern(self)
- end
- else
- def set_valid_data_intern #:nodoc:
- _set_valid_data_intern
- end
- end
- def _set_valid_data_intern #:nodoc:
- @data.each do |n|
- if is_valid? n
- @valid_data.push(n)
- @data_with_nils.push(n)
- else
- @data_with_nils.push(nil)
- @missing_data.push(n)
- end
- end
- @has_missing_data=@missing_data.size>0
- end
-
- # Retrieves true if data has one o more missing values
- def has_missing_data?
- @has_missing_data
- end
- alias :flawed? :has_missing_data?
-
- # Retrieves label for value x. Retrieves x if
- # no label defined.
- def labeling(x)
- @labels.has_key?(x) ? @labels[x].to_s : x.to_s
- end
- alias :label :labeling
- # Returns a Vector with data with labels replaced by the label.
- def vector_labeled
- d=@data.collect{|x|
- if @labels.has_key? x
- @labels[x]
- else
- x
- end
- }
- Vector.new(d,@type)
- end
- # Size of total data
- def size
- @data.size
- end
- alias_method :n, :size
-
- # Retrieves i element of data
- def [](i)
- @data[i]
- end
- # Set i element of data.
- # Note: Use set_valid_data if you include missing values
- def []=(i,v)
- @data[i]=v
- end
- # Return true if a value is valid (not nil and not included on missing values)
- def is_valid?(x)
- !(x.nil? or @missing_values.include? x)
- end
- # Set missing_values.
- # set_valid_data is called after changes
- def missing_values=(vals)
- @missing_values = vals
- set_valid_data
- end
- # Set data considered as "today" on data vectors
- def today_values=(vals)
- @today_values = vals
- set_valid_data
- end
- # Set level of measurement.
- def type=(t)
- @type=t
- set_scale_data if(t==:scale)
- set_date_data if (t==:date)
- end
- def to_a
- if @data.is_a? Array
- @data.dup
- else
- @data.to_a
- end
- end
- alias_method :to_ary, :to_a
-
- # Vector sum.
- # - If v is a scalar, add this value to all elements
- # - If v is a Array or a Vector, should be of the same size of this vector
- # every item of this vector will be added to the value of the
- # item at the same position on the other vector
- def +(v)
- _vector_ari("+",v)
- end
- # Vector rest.
- # - If v is a scalar, rest this value to all elements
- # - If v is a Array or a Vector, should be of the same
- # size of this vector
- # every item of this vector will be rested to the value of the
- # item at the same position on the other vector
-
- def -(v)
- _vector_ari("-",v)
- end
-
- def *(v)
- _vector_ari("*",v)
- end
- # Reports all values that doesn't comply with a condition.
- # Returns a hash with the index of data and the invalid data.
- def verify
- h={}
- (0...@data.size).to_a.each{|i|
- if !(yield @data[i])
- h[i]=@data[i]
- end
- }
- h
- end
- def _vector_ari(method,v) # :nodoc:
- if(v.is_a? Vector or v.is_a? Array)
- raise ArgumentError, "The array/vector parameter (#{v.size}) should be of the same size of the original vector (#{@data.size})" unless v.size==@data.size
- sum=[]
- v.size.times {|i|
- if((v.is_a? Vector and v.is_valid?(v[i]) and is_valid?(@data[i])) or (v.is_a? Array and !v[i].nil? and !data[i].nil?))
- sum.push(@data[i].send(method,v[i]))
- else
- sum.push(nil)
- end
- }
- Statsample::Vector.new(sum, :scale)
- elsif(v.respond_to? method )
- Statsample::Vector.new(
- @data.collect {|x|
- if(!x.nil?)
- x.send(method,v)
- else
- nil
- end
- } , :scale)
- else
- raise TypeError,"You should pass a scalar or a array/vector"
- end
-
- end
- # Return an array with the data splitted by a separator.
- # a=Vector.new(["a,b","c,d","a,b","d"])
- # a.splitted
- # =>
- # [["a","b"],["c","d"],["a","b"],["d"]]
- def splitted(sep=Statsample::SPLIT_TOKEN)
- @data.collect{|x|
- if x.nil?
- nil
- elsif (x.respond_to? :split)
- x.split(sep)
- else
- [x]
- end
- }
- end
- # Returns a hash of Vectors, defined by the different values
- # defined on the fields
- # Example:
- #
- # a=Vector.new(["a,b","c,d","a,b"])
- # a.split_by_separator
- # => {"a"=>#,
- # "b"=>#,
- # "c"=>#}
- #
- def split_by_separator(sep=Statsample::SPLIT_TOKEN)
- split_data=splitted(sep)
- factors=split_data.flatten.uniq.compact
- out=factors.inject({}) {|a,x|
- a[x]=[]
- a
- }
- split_data.each do |r|
- if r.nil?
- factors.each do |f|
- out[f].push(nil)
- end
- else
- factors.each do |f|
- out[f].push(r.include?(f) ? 1:0)
- end
- end
- end
- out.inject({}){|s,v|
- s[v[0]]=Vector.new(v[1],:nominal)
- s
- }
- end
- def split_by_separator_freq(sep=Statsample::SPLIT_TOKEN)
- split_by_separator(sep).inject({}) {|a,v|
- a[v[0]]=v[1].inject {|s,x| s+x.to_i}
- a
- }
- end
-
- # == Bootstrap
- # Generate +nr+ resamples (with replacement) of size +s+
- # from vector, computing each estimate from +estimators+
- # over each resample.
- # +estimators+ could be
- # a) Hash with variable names as keys and lambdas as values
- # a.bootstrap(:log_s2=>lambda {|v| Math.log(v.variance)},1000)
- # b) Array with names of method to bootstrap
- # a.bootstrap([:mean, :sd],1000)
- # c) A single method to bootstrap
- # a.jacknife(:mean, 1000)
- # If s is nil, is set to vector size by default.
- #
- # Returns a dataset where each vector is an vector
- # of length +nr+ containing the computed resample estimates.
- def bootstrap(estimators, nr, s=nil)
- s||=n
-
- h_est, es, bss= prepare_bootstrap(estimators)
-
-
- nr.times do |i|
- bs=sample_with_replacement(s)
- es.each do |estimator|
- # Add bootstrap
- bss[estimator].push(h_est[estimator].call(bs))
- end
- end
-
- es.each do |est|
- bss[est]=bss[est].to_scale
- bss[est].type=:scale
- end
- bss.to_dataset
-
- end
-
- # == Jacknife
- # Returns a dataset with jacknife delete-+k+ +estimators+
- # +estimators+ could be:
- # a) Hash with variable names as keys and lambdas as values
- # a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance)})
- # b) Array with method names to jacknife
- # a.jacknife([:mean, :sd])
- # c) A single method to jacknife
- # a.jacknife(:mean)
- # +k+ represent the block size for block jacknife. By default
- # is set to 1, for classic delete-one jacknife.
- #
- # Returns a dataset where each vector is an vector
- # of length +cases+/+k+ containing the computed jacknife estimates.
- #
- # == Reference:
- # * Sawyer, S. (2005). Resampling Data: Using a Statistical Jacknife.
- def jacknife(estimators, k=1)
- raise "n should be divisible by k:#{k}" unless n%k==0
-
- nb=(n / k).to_i
-
-
- h_est, es, ps= prepare_bootstrap(estimators)
-
- est_n=es.inject({}) {|h,v|
- h[v]=h_est[v].call(self)
- h
- }
-
-
- nb.times do |i|
- other=@data_with_nils.dup
- other.slice!(i*k,k)
- other=other.to_scale
- es.each do |estimator|
- # Add pseudovalue
- ps[estimator].push( nb * est_n[estimator] - (nb-1) * h_est[estimator].call(other))
- end
- end
-
-
- es.each do |est|
- ps[est]=ps[est].to_scale
- ps[est].type=:scale
- end
- ps.to_dataset
- end
-
-
- # For an array or hash of estimators methods, returns
- # an array with three elements
- # 1.- A hash with estimators names as keys and lambdas as values
- # 2.- An array with estimators names
- # 3.- A Hash with estimators names as keys and empty arrays as values
- def prepare_bootstrap(estimators)
- h_est=estimators
-
- h_est=[h_est] unless h_est.is_a? Array or h_est.is_a? Hash
-
- if h_est.is_a? Array
- h_est=h_est.inject({}) {|h,est|
- h[est]=lambda {|v| v.send(est)}
- h
- }
- end
-
- bss=h_est.keys.inject({}) {|h,v| h[v]=[];h}
-
- [h_est,h_est.keys, bss]
-
- end
- private :prepare_bootstrap
-
- # Returns an random sample of size n, with replacement,
- # only with valid data.
- #
- # In all the trails, every item have the same probability
- # of been selected.
- def sample_with_replacement(sample=1)
- vds=@valid_data.size
- (0...sample).collect{ @valid_data[rand(vds)] }
- end
- # Returns an random sample of size n, without replacement,
- # only with valid data.
- #
- # Every element could only be selected once.
- #
- # A sample of the same size of the vector is the vector itself.
-
- def sample_without_replacement(sample=1)
- raise ArgumentError, "Sample size couldn't be greater than n" if sample>@valid_data.size
- out=[]
- size=@valid_data.size
- while out.sizedir could be :horizontal or :vertical
- def to_matrix(dir=:horizontal)
- case dir
- when :horizontal
- Matrix[@data]
- when :vertical
- Matrix.columns([@data])
- end
- end
- def inspect
- self.to_s
- end
- # Retrieves uniques values for data.
- def factors
- if @type==:scale
- @scale_data.uniq.sort
- elsif @type==:date
- @date_data_with_nils.uniq.sort
- else
- @valid_data.uniq.sort
- end
- end
- if Statsample::STATSAMPLE__.respond_to?(:frequencies)
- # Returns a hash with the distribution of frecuencies for
- # the sample
- def frequencies
- Statsample::STATSAMPLE__.frequencies(@valid_data)
- end
- else
- def frequencies #:nodoc:
- _frequencies
- end
- end
-
-
- def _frequencies #:nodoc:
- @valid_data.inject(Hash.new) {|a,x|
- a[x]||=0
- a[x]=a[x]+1
- a
- }
- end
-
- # Returns the most frequent item.
- def mode
- frequencies.max{|a,b| a[1]<=>b[1]}.first
- end
- # The numbers of item with valid data.
- def n_valid
- @valid_data.size
- end
- # Returns a hash with the distribution of proportions of
- # the sample.
- def proportions
- frequencies.inject({}){|a,v|
- a[v[0]] = v[1].quo(n_valid)
- a
- }
- end
- # Proportion of a given value.
- def proportion(v=1)
- frequencies[v].quo(@valid_data.size)
- end
- def report_building(b)
- b.section(:name=>name) do |s|
- s.text _("n :%d") % n
- s.text _("n valid:%d") % n_valid
- if @type==:nominal
- s.text _("factors:%s") % factors.join(",")
- s.text _("mode: %s") % mode
-
- s.table(:name=>_("Distribution")) do |t|
- frequencies.sort.each do |k,v|
- key=labels.has_key?(k) ? labels[k]:k
- t.row [key, v , ("%0.2f%%" % (v.quo(n_valid)*100))]
- end
- end
- end
-
- s.text _("median: %s") % median.to_s if(@type==:ordinal or @type==:scale)
- if(@type==:scale)
- s.text _("mean: %0.4f") % mean
- if sd
- s.text _("std.dev.: %0.4f") % sd
- s.text _("std.err.: %0.4f") % se
- s.text _("skew: %0.4f") % skew
- s.text _("kurtosis: %0.4f") % kurtosis
- end
- end
- end
- end
-
- # Variance of p, according to poblation size
- def variance_proportion(n_poblation, v=1)
- Statsample::proportion_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
- end
- # Variance of p, according to poblation size
- def variance_total(n_poblation, v=1)
- Statsample::total_variance_sample(self.proportion(v), @valid_data.size, n_poblation)
- end
- def proportion_confidence_interval_t(n_poblation,margin=0.95,v=1)
- Statsample::proportion_confidence_interval_t(proportion(v), @valid_data.size, n_poblation, margin)
- end
- def proportion_confidence_interval_z(n_poblation,margin=0.95,v=1)
- Statsample::proportion_confidence_interval_z(proportion(v), @valid_data.size, n_poblation, margin)
- end
-
- self.instance_methods.find_all{|met| met=~/_slow$/}.each do |met|
- met_or=met.gsub("_slow","")
- if !self.method_defined?(met_or)
- alias_method met_or, met
- end
- end
-
- ######
- ### Ordinal Methods
- ######
-
- # == Percentil
- # Returns the value of the percentile q
- #
- # Accepts an optional second argument specifying the strategy to interpolate
- # when the requested percentile lies between two data points a and b
- # Valid strategies are:
- # * :midpoint (Default): (a + b) / 2
- # * :linear : a + (b - a) * d where d is the decimal part of the index between a and b.
- # This is the NIST recommended method (http://en.wikipedia.org/wiki/Percentile#NIST_method)
- #
- def percentil(q, strategy = :midpoint)
- check_type :ordinal
- sorted=@valid_data.sort
-
- case strategy
- when :midpoint
- v = (n_valid * q).quo(100)
- if(v.to_i!=v)
- sorted[v.to_i]
- else
- (sorted[(v-0.5).to_i].to_f + sorted[(v+0.5).to_i]).quo(2)
- end
- when :linear
- index = (q / 100.0) * (n_valid + 1)
-
- k = index.truncate
- d = index % 1
-
- if k == 0
- sorted[0]
- elsif k >= sorted.size
- sorted[-1]
- else
- sorted[k - 1] + d * (sorted[k] - sorted[k - 1])
- end
- else
- raise NotImplementedError.new "Unknown strategy #{strategy.to_s}"
- end
- end
-
- # Returns a ranked vector.
- def ranked(type=:ordinal)
- check_type :ordinal
- i=0
- r=frequencies.sort.inject({}){|a,v|
- a[v[0]]=(i+1 + i+v[1]).quo(2)
- i+=v[1]
- a
- }
- @data.collect {|c| r[c] }.to_vector(type)
- end
- # Return the median (percentil 50)
- def median
- check_type :ordinal
- percentil(50)
- end
- # Minimun value
- def min
- check_type :ordinal
- @valid_data.min
- end
- # Maximum value
- def max
- check_type :ordinal
- @valid_data.max
- end
-
- def set_date_data
- @date_data_with_nils=@data.collect do|x|
- if x.is_a? Date
- x
- elsif x.is_a? Time
- Date.new(x.year, x.month, x.day)
- elsif x.is_a? String and x=~/(\d{4,4})[-\/](\d{1,2})[-\/](\d{1,2})/
- Date.new($1.to_i,$2.to_i,$3.to_i)
- elsif @today_values.include? x
- Date.today()
- elsif @missing_values.include? x or x.nil?
- nil
- end
- end
- end
-
- def set_scale_data
- @scale_data=@valid_data.collect do|x|
- if x.is_a? Numeric
- x
- elsif x.is_a? String and x.to_i==x.to_f
- x.to_i
- else
- x.to_f
- end
- end
- end
-
- private :set_date_data, :set_scale_data
-
- # The range of the data (max - min)
- def range;
- check_type :scale
- @scale_data.max - @scale_data.min
- end
- # The sum of values for the data
- def sum
- check_type :scale
- @scale_data.inject(0){|a,x|x+a} ;
- end
- # The arithmetical mean of data
- def mean
- check_type :scale
- sum.to_f.quo(n_valid)
- end
- # Sum of squares for the data around a value.
- # By default, this value is the mean
- # ss= sum{(xi-m)^2}
- #
- def sum_of_squares(m=nil)
- check_type :scale
- m||=mean
- @scale_data.inject(0){|a,x| a+(x-m).square}
- end
- # Sum of squared deviation
- def sum_of_squared_deviation
- check_type :scale
- @scale_data.inject(0) {|a,x| x.square+a} - (sum.square.quo(n_valid))
- end
-
- # Population variance (denominator N)
- def variance_population(m=nil)
- check_type :scale
- m||=mean
- squares=@scale_data.inject(0){|a,x| x.square+a}
- squares.quo(n_valid) - m.square
- end
-
-
- # Population Standard deviation (denominator N)
- def standard_deviation_population(m=nil)
- check_type :scale
- Math::sqrt( variance_population(m) )
- end
-
- # Population average deviation (denominator N)
- # author: Al Chou
-
- def average_deviation_population( m = nil )
- check_type :scale
- m ||= mean
- ( @scale_data.inject( 0 ) { |a, x| ( x - m ).abs + a } ).quo( n_valid )
- end
- def median_absolute_deviation
- med=median
- recode {|x| (x-med).abs}.median
- end
- alias :mad :median_absolute_deviation
- # Sample Variance (denominator n-1)
- def variance_sample(m=nil)
- check_type :scale
- m||=mean
- sum_of_squares(m).quo(n_valid - 1)
- end
-
- # Sample Standard deviation (denominator n-1)
- def standard_deviation_sample(m=nil)
- check_type :scale
- m||=mean
- Math::sqrt(variance_sample(m))
- end
- # Skewness of the sample
- def skew(m=nil)
- check_type :scale
- m||=mean
- th=@scale_data.inject(0){|a,x| a+((x-m)**3)}
- th.quo((@scale_data.size)*sd(m)**3)
- end
- # Kurtosis of the sample
- def kurtosis(m=nil)
- check_type :scale
- m||=mean
- fo=@scale_data.inject(0){|a,x| a+((x-m)**4)}
- fo.quo((@scale_data.size)*sd(m)**4)-3
-
- end
- # Product of all values on the sample
- #
- def product
- check_type :scale
- @scale_data.inject(1){|a,x| a*x }
- end
-
- # With a fixnum, creates X bins within the range of data
- # With an Array, each value will be a cut point
- def histogram(bins=10)
- check_type :scale
-
- if bins.is_a? Array
- #h=Statsample::Histogram.new(self, bins)
- h=Statsample::Histogram.alloc(bins)
- else
- # ugly patch. The upper limit for a bin has the form
- # x < range
- #h=Statsample::Histogram.new(self, bins)
- min,max=Statsample::Util.nice(@valid_data.min,@valid_data.max)
- # fix last data
- if max==@valid_data.max
- max+=1e-10
- end
- h=Statsample::Histogram.alloc(bins,[min,max])
- # Fix last bin
-
- end
- h.increment(@valid_data)
- h
- end
-
- # Coefficient of variation
- # Calculed with the sample standard deviation
- def coefficient_of_variation
- check_type :scale
- standard_deviation_sample.quo(mean)
- end
- # Standard error of the distribution mean
- # Calculated using sd/sqrt(n)
- def standard_error
- standard_deviation_sample.quo(Math.sqrt(valid_data.size))
- end
- alias :se :standard_error
-
- alias_method :sdp, :standard_deviation_population
- alias_method :sds, :standard_deviation_sample
- alias_method :adp, :average_deviation_population
- alias_method :cov, :coefficient_of_variation
- alias_method :variance, :variance_sample
- alias_method :sd, :standard_deviation_sample
- alias_method :ss, :sum_of_squares
- include_aliasing Statsample::Vector::GSL_ if Statsample.has_gsl?
- end
-end
diff --git a/lib/statsample/vector/gsl.rb b/lib/statsample/vector/gsl.rb
deleted file mode 100644
index 9b12418..0000000
--- a/lib/statsample/vector/gsl.rb
+++ /dev/null
@@ -1,106 +0,0 @@
-module Statsample
- class Vector
- module GSL_
- def clear_gsl
- @gsl=nil
- end
-
- def set_valid_data
- clear_gsl
- set_valid_data_ruby
- end
- def push(v)
- # If data is GSL::Vector, should be converted first to an Array
- if @data.is_a? GSL::Vector
- @data=@data.to_a
- end
- push_ruby(v)
- end
-
- def gsl
- @gsl||=GSL::Vector.alloc(@scale_data) if @scale_data.size>0
- end
-
- alias :to_gsl :gsl
- def vector_standarized_compute(m,sd)
- if flawed?
- vector_standarized_compute_ruby(m,sd)
- else
- gsl.collect {|x| (x.to_f - m).quo(sd)}.to_scale
- end
- end
-
- def vector_centered_compute(m)
- if flawed?
- vector_centered_compute_ruby(m)
- else
- gsl.collect {|x| (x.to_f - m)}.to_scale
- end
- end
- def sample_with_replacement(sample=1)
- if(@type!=:scale)
- sample_with_replacement_ruby(sample)
- else
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
- Statsample::Vector.new(r.sample(gsl, sample).to_a,:scale)
- end
- end
-
- def sample_without_replacement(sample=1)
- if(@type!=:scale)
- sample_without_replacement_ruby(sample)
- else
- r = GSL::Rng.alloc(GSL::Rng::MT19937,rand(10000))
- r.choose(gsl, sample).to_a
- end
- end
- def median
- if @type!=:scale
- median_ruby
- else
- sorted=GSL::Vector.alloc(@scale_data.sort)
- GSL::Stats::median_from_sorted_data(sorted)
- end
- end
-
- def sum
- check_type :scale
- gsl.nil? ? nil : gsl.sum
- end
- def mean
- check_type :scale
- gsl.nil? ? nil : gsl.mean
- end
- def variance_sample(m=nil)
- check_type :scale
- m||=mean
- gsl.nil? ? nil : gsl.variance_m
- end
-
- def standard_deviation_sample(m=nil)
- check_type :scale
- m||=mean
- gsl.nil? ? nil : gsl.sd(m)
- end
-
- def variance_population(m=nil) # :nodoc:
- check_type :scale
- m||=mean
- gsl.nil? ? nil : gsl.variance_with_fixed_mean(m)
- end
- def standard_deviation_population(m=nil) # :nodoc:
- check_type :scale
- m||=mean
- gsl.nil? ? nil : gsl.sd_with_fixed_mean(m)
- end
- def skew # :nodoc:
- check_type :scale
- gsl.nil? ? nil : gsl.skew
- end
- def kurtosis # :nodoc:
- check_type :scale
- gsl.nil? ? nil : gsl.kurtosis
- end
- end
- end
-end
diff --git a/lib/statsample/version.rb b/lib/statsample/version.rb
index 4da66f2..b14c020 100644
--- a/lib/statsample/version.rb
+++ b/lib/statsample/version.rb
@@ -1,3 +1,3 @@
module Statsample
- VERSION = '1.4.0'
+ VERSION = '2.1.0'
end
diff --git a/setup.rb b/setup.rb
deleted file mode 100644
index 424a5f3..0000000
--- a/setup.rb
+++ /dev/null
@@ -1,1585 +0,0 @@
-#
-# setup.rb
-#
-# Copyright (c) 2000-2005 Minero Aoki
-#
-# This program is free software.
-# You can distribute/modify this program under the terms of
-# the GNU LGPL, Lesser General Public License version 2.1.
-#
-
-unless Enumerable.method_defined?(:map) # Ruby 1.4.6
- module Enumerable
- alias map collect
- end
-end
-
-unless File.respond_to?(:read) # Ruby 1.6
- def File.read(fname)
- open(fname) {|f|
- return f.read
- }
- end
-end
-
-unless Errno.const_defined?(:ENOTEMPTY) # Windows?
- module Errno
- class ENOTEMPTY
- # We do not raise this exception, implementation is not needed.
- end
- end
-end
-
-def File.binread(fname)
- open(fname, 'rb') {|f|
- return f.read
- }
-end
-
-# for corrupted Windows' stat(2)
-def File.dir?(path)
- File.directory?((path[-1,1] == '/') ? path : path + '/')
-end
-
-
-class ConfigTable
-
- include Enumerable
-
- def initialize(rbconfig)
- @rbconfig = rbconfig
- @items = []
- @table = {}
- # options
- @install_prefix = nil
- @config_opt = nil
- @verbose = true
- @no_harm = false
- end
-
- attr_accessor :install_prefix
- attr_accessor :config_opt
-
- attr_writer :verbose
-
- def verbose?
- @verbose
- end
-
- attr_writer :no_harm
-
- def no_harm?
- @no_harm
- end
-
- def [](key)
- lookup(key).resolve(self)
- end
-
- def []=(key, val)
- lookup(key).set val
- end
-
- def names
- @items.map {|i| i.name }
- end
-
- def each(&block)
- @items.each(&block)
- end
-
- def key?(name)
- @table.key?(name)
- end
-
- def lookup(name)
- @table[name] or setup_rb_error "no such config item: #{name}"
- end
-
- def add(item)
- @items.push item
- @table[item.name] = item
- end
-
- def remove(name)
- item = lookup(name)
- @items.delete_if {|i| i.name == name }
- @table.delete_if {|name, i| i.name == name }
- item
- end
-
- def load_script(path, inst = nil)
- if File.file?(path)
- MetaConfigEnvironment.new(self, inst).instance_eval File.read(path), path
- end
- end
-
- def savefile
- '.config'
- end
-
- def load_savefile
- begin
- File.foreach(savefile()) do |line|
- k, v = *line.split(/=/, 2)
- self[k] = v.strip
- end
- rescue Errno::ENOENT
- setup_rb_error $!.message + "\n#{File.basename($0)} config first"
- end
- end
-
- def save
- @items.each {|i| i.value }
- File.open(savefile(), 'w') {|f|
- @items.each do |i|
- f.printf "%s=%s\n", i.name, i.value if i.value? and i.value
- end
- }
- end
-
- def load_standard_entries
- standard_entries(@rbconfig).each do |ent|
- add ent
- end
- end
-
- def standard_entries(rbconfig)
- c = rbconfig
-
- rubypath = File.join(c['bindir'], c['ruby_install_name'] + c['EXEEXT'])
-
- major = c['MAJOR'].to_i
- minor = c['MINOR'].to_i
- teeny = c['TEENY'].to_i
- version = "#{major}.#{minor}"
-
- # ruby ver. >= 1.4.4?
- newpath_p = ((major >= 2) or
- ((major == 1) and
- ((minor >= 5) or
- ((minor == 4) and (teeny >= 4)))))
-
- if c['rubylibdir']
- # V > 1.6.3
- libruby = "#{c['prefix']}/lib/ruby"
- librubyver = c['rubylibdir']
- librubyverarch = c['archdir']
- siteruby = c['sitedir']
- siterubyver = c['sitelibdir']
- siterubyverarch = c['sitearchdir']
- elsif newpath_p
- # 1.4.4 <= V <= 1.6.3
- libruby = "#{c['prefix']}/lib/ruby"
- librubyver = "#{c['prefix']}/lib/ruby/#{version}"
- librubyverarch = "#{c['prefix']}/lib/ruby/#{version}/#{c['arch']}"
- siteruby = c['sitedir']
- siterubyver = "$siteruby/#{version}"
- siterubyverarch = "$siterubyver/#{c['arch']}"
- else
- # V < 1.4.4
- libruby = "#{c['prefix']}/lib/ruby"
- librubyver = "#{c['prefix']}/lib/ruby/#{version}"
- librubyverarch = "#{c['prefix']}/lib/ruby/#{version}/#{c['arch']}"
- siteruby = "#{c['prefix']}/lib/ruby/#{version}/site_ruby"
- siterubyver = siteruby
- siterubyverarch = "$siterubyver/#{c['arch']}"
- end
- parameterize = lambda {|path|
- path.sub(/\A#{Regexp.quote(c['prefix'])}/, '$prefix')
- }
-
- if arg = c['configure_args'].split.detect {|arg| /--with-make-prog=/ =~ arg }
- makeprog = arg.sub(/'/, '').split(/=/, 2)[1]
- else
- makeprog = 'make'
- end
-
- [
- ExecItem.new('installdirs', 'std/site/home',
- 'std: install under libruby; site: install under site_ruby; home: install under $HOME')\
- {|val, table|
- case val
- when 'std'
- table['rbdir'] = '$librubyver'
- table['sodir'] = '$librubyverarch'
- when 'site'
- table['rbdir'] = '$siterubyver'
- table['sodir'] = '$siterubyverarch'
- when 'home'
- setup_rb_error '$HOME was not set' unless ENV['HOME']
- table['prefix'] = ENV['HOME']
- table['rbdir'] = '$libdir/ruby'
- table['sodir'] = '$libdir/ruby'
- end
- },
- PathItem.new('prefix', 'path', c['prefix'],
- 'path prefix of target environment'),
- PathItem.new('bindir', 'path', parameterize.call(c['bindir']),
- 'the directory for commands'),
- PathItem.new('libdir', 'path', parameterize.call(c['libdir']),
- 'the directory for libraries'),
- PathItem.new('datadir', 'path', parameterize.call(c['datadir']),
- 'the directory for shared data'),
- PathItem.new('mandir', 'path', parameterize.call(c['mandir']),
- 'the directory for man pages'),
- PathItem.new('sysconfdir', 'path', parameterize.call(c['sysconfdir']),
- 'the directory for system configuration files'),
- PathItem.new('localstatedir', 'path', parameterize.call(c['localstatedir']),
- 'the directory for local state data'),
- PathItem.new('libruby', 'path', libruby,
- 'the directory for ruby libraries'),
- PathItem.new('librubyver', 'path', librubyver,
- 'the directory for standard ruby libraries'),
- PathItem.new('librubyverarch', 'path', librubyverarch,
- 'the directory for standard ruby extensions'),
- PathItem.new('siteruby', 'path', siteruby,
- 'the directory for version-independent aux ruby libraries'),
- PathItem.new('siterubyver', 'path', siterubyver,
- 'the directory for aux ruby libraries'),
- PathItem.new('siterubyverarch', 'path', siterubyverarch,
- 'the directory for aux ruby binaries'),
- PathItem.new('rbdir', 'path', '$siterubyver',
- 'the directory for ruby scripts'),
- PathItem.new('sodir', 'path', '$siterubyverarch',
- 'the directory for ruby extentions'),
- PathItem.new('rubypath', 'path', rubypath,
- 'the path to set to #! line'),
- ProgramItem.new('rubyprog', 'name', rubypath,
- 'the ruby program using for installation'),
- ProgramItem.new('makeprog', 'name', makeprog,
- 'the make program to compile ruby extentions'),
- SelectItem.new('shebang', 'all/ruby/never', 'ruby',
- 'shebang line (#!) editing mode'),
- BoolItem.new('without-ext', 'yes/no', 'no',
- 'does not compile/install ruby extentions')
- ]
- end
- private :standard_entries
-
- def load_multipackage_entries
- multipackage_entries().each do |ent|
- add ent
- end
- end
-
- def multipackage_entries
- [
- PackageSelectionItem.new('with', 'name,name...', '', 'ALL',
- 'package names that you want to install'),
- PackageSelectionItem.new('without', 'name,name...', '', 'NONE',
- 'package names that you do not want to install')
- ]
- end
- private :multipackage_entries
-
- ALIASES = {
- 'std-ruby' => 'librubyver',
- 'stdruby' => 'librubyver',
- 'rubylibdir' => 'librubyver',
- 'archdir' => 'librubyverarch',
- 'site-ruby-common' => 'siteruby', # For backward compatibility
- 'site-ruby' => 'siterubyver', # For backward compatibility
- 'bin-dir' => 'bindir',
- 'bin-dir' => 'bindir',
- 'rb-dir' => 'rbdir',
- 'so-dir' => 'sodir',
- 'data-dir' => 'datadir',
- 'ruby-path' => 'rubypath',
- 'ruby-prog' => 'rubyprog',
- 'ruby' => 'rubyprog',
- 'make-prog' => 'makeprog',
- 'make' => 'makeprog'
- }
-
- def fixup
- ALIASES.each do |ali, name|
- @table[ali] = @table[name]
- end
- @items.freeze
- @table.freeze
- @options_re = /\A--(#{@table.keys.join('|')})(?:=(.*))?\z/
- end
-
- def parse_opt(opt)
- m = @options_re.match(opt) or setup_rb_error "config: unknown option #{opt}"
- m.to_a[1,2]
- end
-
- def dllext
- @rbconfig['DLEXT']
- end
-
- def value_config?(name)
- lookup(name).value?
- end
-
- class Item
- def initialize(name, template, default, desc)
- @name = name.freeze
- @template = template
- @value = default
- @default = default
- @description = desc
- end
-
- attr_reader :name
- attr_reader :description
-
- attr_accessor :default
- alias help_default default
-
- def help_opt
- "--#{@name}=#{@template}"
- end
-
- def value?
- true
- end
-
- def value
- @value
- end
-
- def resolve(table)
- @value.gsub(%r<\$([^/]+)>) { table[$1] }
- end
-
- def set(val)
- @value = check(val)
- end
-
- private
-
- def check(val)
- setup_rb_error "config: --#{name} requires argument" unless val
- val
- end
- end
-
- class BoolItem < Item
- def config_type
- 'bool'
- end
-
- def help_opt
- "--#{@name}"
- end
-
- private
-
- def check(val)
- return 'yes' unless val
- case val
- when /\Ay(es)?\z/i, /\At(rue)?\z/i then 'yes'
- when /\An(o)?\z/i, /\Af(alse)\z/i then 'no'
- else
- setup_rb_error "config: --#{@name} accepts only yes/no for argument"
- end
- end
- end
-
- class PathItem < Item
- def config_type
- 'path'
- end
-
- private
-
- def check(path)
- setup_rb_error "config: --#{@name} requires argument" unless path
- path[0,1] == '$' ? path : File.expand_path(path)
- end
- end
-
- class ProgramItem < Item
- def config_type
- 'program'
- end
- end
-
- class SelectItem < Item
- def initialize(name, selection, default, desc)
- super
- @ok = selection.split('/')
- end
-
- def config_type
- 'select'
- end
-
- private
-
- def check(val)
- unless @ok.include?(val.strip)
- setup_rb_error "config: use --#{@name}=#{@template} (#{val})"
- end
- val.strip
- end
- end
-
- class ExecItem < Item
- def initialize(name, selection, desc, &block)
- super name, selection, nil, desc
- @ok = selection.split('/')
- @action = block
- end
-
- def config_type
- 'exec'
- end
-
- def value?
- false
- end
-
- def resolve(table)
- setup_rb_error "$#{name()} wrongly used as option value"
- end
-
- undef set
-
- def evaluate(val, table)
- v = val.strip.downcase
- unless @ok.include?(v)
- setup_rb_error "invalid option --#{@name}=#{val} (use #{@template})"
- end
- @action.call v, table
- end
- end
-
- class PackageSelectionItem < Item
- def initialize(name, template, default, help_default, desc)
- super name, template, default, desc
- @help_default = help_default
- end
-
- attr_reader :help_default
-
- def config_type
- 'package'
- end
-
- private
-
- def check(val)
- unless File.dir?("packages/#{val}")
- setup_rb_error "config: no such package: #{val}"
- end
- val
- end
- end
-
- class MetaConfigEnvironment
- def initialize(config, installer)
- @config = config
- @installer = installer
- end
-
- def config_names
- @config.names
- end
-
- def config?(name)
- @config.key?(name)
- end
-
- def bool_config?(name)
- @config.lookup(name).config_type == 'bool'
- end
-
- def path_config?(name)
- @config.lookup(name).config_type == 'path'
- end
-
- def value_config?(name)
- @config.lookup(name).config_type != 'exec'
- end
-
- def add_config(item)
- @config.add item
- end
-
- def add_bool_config(name, default, desc)
- @config.add BoolItem.new(name, 'yes/no', default ? 'yes' : 'no', desc)
- end
-
- def add_path_config(name, default, desc)
- @config.add PathItem.new(name, 'path', default, desc)
- end
-
- def set_config_default(name, default)
- @config.lookup(name).default = default
- end
-
- def remove_config(name)
- @config.remove(name)
- end
-
- # For only multipackage
- def packages
- raise '[setup.rb fatal] multi-package metaconfig API packages() called for single-package; contact application package vendor' unless @installer
- @installer.packages
- end
-
- # For only multipackage
- def declare_packages(list)
- raise '[setup.rb fatal] multi-package metaconfig API declare_packages() called for single-package; contact application package vendor' unless @installer
- @installer.packages = list
- end
- end
-
-end # class ConfigTable
-
-
-# This module requires: #verbose?, #no_harm?
-module FileOperations
-
- def mkdir_p(dirname, prefix = nil)
- dirname = prefix + File.expand_path(dirname) if prefix
- $stderr.puts "mkdir -p #{dirname}" if verbose?
- return if no_harm?
-
- # Does not check '/', it's too abnormal.
- dirs = File.expand_path(dirname).split(%r<(?=/)>)
- if /\A[a-z]:\z/i =~ dirs[0]
- disk = dirs.shift
- dirs[0] = disk + dirs[0]
- end
- dirs.each_index do |idx|
- path = dirs[0..idx].join('')
- Dir.mkdir path unless File.dir?(path)
- end
- end
-
- def rm_f(path)
- $stderr.puts "rm -f #{path}" if verbose?
- return if no_harm?
- force_remove_file path
- end
-
- def rm_rf(path)
- $stderr.puts "rm -rf #{path}" if verbose?
- return if no_harm?
- remove_tree path
- end
-
- def remove_tree(path)
- if File.symlink?(path)
- remove_file path
- elsif File.dir?(path)
- remove_tree0 path
- else
- force_remove_file path
- end
- end
-
- def remove_tree0(path)
- Dir.foreach(path) do |ent|
- next if ent == '.'
- next if ent == '..'
- entpath = "#{path}/#{ent}"
- if File.symlink?(entpath)
- remove_file entpath
- elsif File.dir?(entpath)
- remove_tree0 entpath
- else
- force_remove_file entpath
- end
- end
- begin
- Dir.rmdir path
- rescue Errno::ENOTEMPTY
- # directory may not be empty
- end
- end
-
- def move_file(src, dest)
- force_remove_file dest
- begin
- File.rename src, dest
- rescue
- File.open(dest, 'wb') {|f|
- f.write File.binread(src)
- }
- File.chmod File.stat(src).mode, dest
- File.unlink src
- end
- end
-
- def force_remove_file(path)
- begin
- remove_file path
- rescue
- end
- end
-
- def remove_file(path)
- File.chmod 0777, path
- File.unlink path
- end
-
- def install(from, dest, mode, prefix = nil)
- $stderr.puts "install #{from} #{dest}" if verbose?
- return if no_harm?
-
- realdest = prefix ? prefix + File.expand_path(dest) : dest
- realdest = File.join(realdest, File.basename(from)) if File.dir?(realdest)
- str = File.binread(from)
- if diff?(str, realdest)
- verbose_off {
- rm_f realdest if File.exist?(realdest)
- }
- File.open(realdest, 'wb') {|f|
- f.write str
- }
- File.chmod mode, realdest
-
- File.open("#{objdir_root()}/InstalledFiles", 'a') {|f|
- if prefix
- f.puts realdest.sub(prefix, '')
- else
- f.puts realdest
- end
- }
- end
- end
-
- def diff?(new_content, path)
- return true unless File.exist?(path)
- new_content != File.binread(path)
- end
-
- def command(*args)
- $stderr.puts args.join(' ') if verbose?
- system(*args) or raise RuntimeError,
- "system(#{args.map{|a| a.inspect }.join(' ')}) failed"
- end
-
- def ruby(*args)
- command config('rubyprog'), *args
- end
-
- def make(task = nil)
- command(*[config('makeprog'), task].compact)
- end
-
- def extdir?(dir)
- File.exist?("#{dir}/MANIFEST") or File.exist?("#{dir}/extconf.rb")
- end
-
- def files_of(dir)
- Dir.open(dir) {|d|
- return d.select {|ent| File.file?("#{dir}/#{ent}") }
- }
- end
-
- DIR_REJECT = %w( . .. CVS SCCS RCS CVS.adm .svn )
-
- def directories_of(dir)
- Dir.open(dir) {|d|
- return d.select {|ent| File.dir?("#{dir}/#{ent}") } - DIR_REJECT
- }
- end
-
-end
-
-
-# This module requires: #srcdir_root, #objdir_root, #relpath
-module HookScriptAPI
-
- def get_config(key)
- @config[key]
- end
-
- alias config get_config
-
- # obsolete: use metaconfig to change configuration
- def set_config(key, val)
- @config[key] = val
- end
-
- #
- # srcdir/objdir (works only in the package directory)
- #
-
- def curr_srcdir
- "#{srcdir_root()}/#{relpath()}"
- end
-
- def curr_objdir
- "#{objdir_root()}/#{relpath()}"
- end
-
- def srcfile(path)
- "#{curr_srcdir()}/#{path}"
- end
-
- def srcexist?(path)
- File.exist?(srcfile(path))
- end
-
- def srcdirectory?(path)
- File.dir?(srcfile(path))
- end
-
- def srcfile?(path)
- File.file?(srcfile(path))
- end
-
- def srcentries(path = '.')
- Dir.open("#{curr_srcdir()}/#{path}") {|d|
- return d.to_a - %w(. ..)
- }
- end
-
- def srcfiles(path = '.')
- srcentries(path).select {|fname|
- File.file?(File.join(curr_srcdir(), path, fname))
- }
- end
-
- def srcdirectories(path = '.')
- srcentries(path).select {|fname|
- File.dir?(File.join(curr_srcdir(), path, fname))
- }
- end
-
-end
-
-
-class ToplevelInstaller
-
- Version = '3.4.1'
- Copyright = 'Copyright (c) 2000-2005 Minero Aoki'
-
- TASKS = [
- [ 'all', 'do config, setup, then install' ],
- [ 'config', 'saves your configurations' ],
- [ 'show', 'shows current configuration' ],
- [ 'setup', 'compiles ruby extentions and others' ],
- [ 'install', 'installs files' ],
- [ 'test', 'run all tests in test/' ],
- [ 'clean', "does `make clean' for each extention" ],
- [ 'distclean',"does `make distclean' for each extention" ]
- ]
-
- def ToplevelInstaller.invoke
- config = ConfigTable.new(load_rbconfig())
- config.load_standard_entries
- config.load_multipackage_entries if multipackage?
- config.fixup
- klass = (multipackage?() ? ToplevelInstallerMulti : ToplevelInstaller)
- klass.new(File.dirname($0), config).invoke
- end
-
- def ToplevelInstaller.multipackage?
- File.dir?(File.dirname($0) + '/packages')
- end
-
- def ToplevelInstaller.load_rbconfig
- if arg = ARGV.detect {|arg| /\A--rbconfig=/ =~ arg }
- ARGV.delete(arg)
- load File.expand_path(arg.split(/=/, 2)[1])
- $".push 'rbconfig.rb'
- else
- require 'rbconfig'
- end
- ::Config::CONFIG
- end
-
- def initialize(ardir_root, config)
- @ardir = File.expand_path(ardir_root)
- @config = config
- # cache
- @valid_task_re = nil
- end
-
- def config(key)
- @config[key]
- end
-
- def inspect
- "#<#{self.class} #{__id__()}>"
- end
-
- def invoke
- run_metaconfigs
- case task = parsearg_global()
- when nil, 'all'
- parsearg_config
- init_installers
- exec_config
- exec_setup
- exec_install
- else
- case task
- when 'config', 'test'
- ;
- when 'clean', 'distclean'
- @config.load_savefile if File.exist?(@config.savefile)
- else
- @config.load_savefile
- end
- __send__ "parsearg_#{task}"
- init_installers
- __send__ "exec_#{task}"
- end
- end
-
- def run_metaconfigs
- @config.load_script "#{@ardir}/metaconfig"
- end
-
- def init_installers
- @installer = Installer.new(@config, @ardir, File.expand_path('.'))
- end
-
- #
- # Hook Script API bases
- #
-
- def srcdir_root
- @ardir
- end
-
- def objdir_root
- '.'
- end
-
- def relpath
- '.'
- end
-
- #
- # Option Parsing
- #
-
- def parsearg_global
- while arg = ARGV.shift
- case arg
- when /\A\w+\z/
- setup_rb_error "invalid task: #{arg}" unless valid_task?(arg)
- return arg
- when '-q', '--quiet'
- @config.verbose = false
- when '--verbose'
- @config.verbose = true
- when '--help'
- print_usage $stdout
- exit 0
- when '--version'
- puts "#{File.basename($0)} version #{Version}"
- exit 0
- when '--copyright'
- puts Copyright
- exit 0
- else
- setup_rb_error "unknown global option '#{arg}'"
- end
- end
- nil
- end
-
- def valid_task?(t)
- valid_task_re() =~ t
- end
-
- def valid_task_re
- @valid_task_re ||= /\A(?:#{TASKS.map {|task,desc| task }.join('|')})\z/
- end
-
- def parsearg_no_options
- unless ARGV.empty?
- task = caller(0).first.slice(%r<`parsearg_(\w+)'>, 1)
- setup_rb_error "#{task}: unknown options: #{ARGV.join(' ')}"
- end
- end
-
- alias parsearg_show parsearg_no_options
- alias parsearg_setup parsearg_no_options
- alias parsearg_test parsearg_no_options
- alias parsearg_clean parsearg_no_options
- alias parsearg_distclean parsearg_no_options
-
- def parsearg_config
- evalopt = []
- set = []
- @config.config_opt = []
- while i = ARGV.shift
- if /\A--?\z/ =~ i
- @config.config_opt = ARGV.dup
- break
- end
- name, value = *@config.parse_opt(i)
- if @config.value_config?(name)
- @config[name] = value
- else
- evalopt.push [name, value]
- end
- set.push name
- end
- evalopt.each do |name, value|
- @config.lookup(name).evaluate value, @config
- end
- # Check if configuration is valid
- set.each do |n|
- @config[n] if @config.value_config?(n)
- end
- end
-
- def parsearg_install
- @config.no_harm = false
- @config.install_prefix = ''
- while a = ARGV.shift
- case a
- when '--no-harm'
- @config.no_harm = true
- when /\A--prefix=/
- path = a.split(/=/, 2)[1]
- path = File.expand_path(path) unless path[0,1] == '/'
- @config.install_prefix = path
- else
- setup_rb_error "install: unknown option #{a}"
- end
- end
- end
-
- def print_usage(out)
- out.puts 'Typical Installation Procedure:'
- out.puts " $ ruby #{File.basename $0} config"
- out.puts " $ ruby #{File.basename $0} setup"
- out.puts " # ruby #{File.basename $0} install (may require root privilege)"
- out.puts
- out.puts 'Detailed Usage:'
- out.puts " ruby #{File.basename $0} "
- out.puts " ruby #{File.basename $0} [] []"
-
- fmt = " %-24s %s\n"
- out.puts
- out.puts 'Global options:'
- out.printf fmt, '-q,--quiet', 'suppress message outputs'
- out.printf fmt, ' --verbose', 'output messages verbosely'
- out.printf fmt, ' --help', 'print this message'
- out.printf fmt, ' --version', 'print version and quit'
- out.printf fmt, ' --copyright', 'print copyright and quit'
- out.puts
- out.puts 'Tasks:'
- TASKS.each do |name, desc|
- out.printf fmt, name, desc
- end
-
- fmt = " %-24s %s [%s]\n"
- out.puts
- out.puts 'Options for CONFIG or ALL:'
- @config.each do |item|
- out.printf fmt, item.help_opt, item.description, item.help_default
- end
- out.printf fmt, '--rbconfig=path', 'rbconfig.rb to load',"running ruby's"
- out.puts
- out.puts 'Options for INSTALL:'
- out.printf fmt, '--no-harm', 'only display what to do if given', 'off'
- out.printf fmt, '--prefix=path', 'install path prefix', ''
- out.puts
- end
-
- #
- # Task Handlers
- #
-
- def exec_config
- @installer.exec_config
- @config.save # must be final
- end
-
- def exec_setup
- @installer.exec_setup
- end
-
- def exec_install
- @installer.exec_install
- end
-
- def exec_test
- @installer.exec_test
- end
-
- def exec_show
- @config.each do |i|
- printf "%-20s %s\n", i.name, i.value if i.value?
- end
- end
-
- def exec_clean
- @installer.exec_clean
- end
-
- def exec_distclean
- @installer.exec_distclean
- end
-
-end # class ToplevelInstaller
-
-
-class ToplevelInstallerMulti < ToplevelInstaller
-
- include FileOperations
-
- def initialize(ardir_root, config)
- super
- @packages = directories_of("#{@ardir}/packages")
- raise 'no package exists' if @packages.empty?
- @root_installer = Installer.new(@config, @ardir, File.expand_path('.'))
- end
-
- def run_metaconfigs
- @config.load_script "#{@ardir}/metaconfig", self
- @packages.each do |name|
- @config.load_script "#{@ardir}/packages/#{name}/metaconfig"
- end
- end
-
- attr_reader :packages
-
- def packages=(list)
- raise 'package list is empty' if list.empty?
- list.each do |name|
- raise "directory packages/#{name} does not exist"\
- unless File.dir?("#{@ardir}/packages/#{name}")
- end
- @packages = list
- end
-
- def init_installers
- @installers = {}
- @packages.each do |pack|
- @installers[pack] = Installer.new(@config,
- "#{@ardir}/packages/#{pack}",
- "packages/#{pack}")
- end
- with = extract_selection(config('with'))
- without = extract_selection(config('without'))
- @selected = @installers.keys.select {|name|
- (with.empty? or with.include?(name)) \
- and not without.include?(name)
- }
- end
-
- def extract_selection(list)
- a = list.split(/,/)
- a.each do |name|
- setup_rb_error "no such package: #{name}" unless @installers.key?(name)
- end
- a
- end
-
- def print_usage(f)
- super
- f.puts 'Inluded packages:'
- f.puts ' ' + @packages.sort.join(' ')
- f.puts
- end
-
- #
- # Task Handlers
- #
-
- def exec_config
- run_hook 'pre-config'
- each_selected_installers {|inst| inst.exec_config }
- run_hook 'post-config'
- @config.save # must be final
- end
-
- def exec_setup
- run_hook 'pre-setup'
- each_selected_installers {|inst| inst.exec_setup }
- run_hook 'post-setup'
- end
-
- def exec_install
- run_hook 'pre-install'
- each_selected_installers {|inst| inst.exec_install }
- run_hook 'post-install'
- end
-
- def exec_test
- run_hook 'pre-test'
- each_selected_installers {|inst| inst.exec_test }
- run_hook 'post-test'
- end
-
- def exec_clean
- rm_f @config.savefile
- run_hook 'pre-clean'
- each_selected_installers {|inst| inst.exec_clean }
- run_hook 'post-clean'
- end
-
- def exec_distclean
- rm_f @config.savefile
- run_hook 'pre-distclean'
- each_selected_installers {|inst| inst.exec_distclean }
- run_hook 'post-distclean'
- end
-
- #
- # lib
- #
-
- def each_selected_installers
- Dir.mkdir 'packages' unless File.dir?('packages')
- @selected.each do |pack|
- $stderr.puts "Processing the package `#{pack}' ..." if verbose?
- Dir.mkdir "packages/#{pack}" unless File.dir?("packages/#{pack}")
- Dir.chdir "packages/#{pack}"
- yield @installers[pack]
- Dir.chdir '../..'
- end
- end
-
- def run_hook(id)
- @root_installer.run_hook id
- end
-
- # module FileOperations requires this
- def verbose?
- @config.verbose?
- end
-
- # module FileOperations requires this
- def no_harm?
- @config.no_harm?
- end
-
-end # class ToplevelInstallerMulti
-
-
-class Installer
-
- FILETYPES = %w( bin lib ext data conf man )
-
- include FileOperations
- include HookScriptAPI
-
- def initialize(config, srcroot, objroot)
- @config = config
- @srcdir = File.expand_path(srcroot)
- @objdir = File.expand_path(objroot)
- @currdir = '.'
- end
-
- def inspect
- "#<#{self.class} #{File.basename(@srcdir)}>"
- end
-
- def noop(rel)
- end
-
- #
- # Hook Script API base methods
- #
-
- def srcdir_root
- @srcdir
- end
-
- def objdir_root
- @objdir
- end
-
- def relpath
- @currdir
- end
-
- #
- # Config Access
- #
-
- # module FileOperations requires this
- def verbose?
- @config.verbose?
- end
-
- # module FileOperations requires this
- def no_harm?
- @config.no_harm?
- end
-
- def verbose_off
- begin
- save, @config.verbose = @config.verbose?, false
- yield
- ensure
- @config.verbose = save
- end
- end
-
- #
- # TASK config
- #
-
- def exec_config
- exec_task_traverse 'config'
- end
-
- alias config_dir_bin noop
- alias config_dir_lib noop
-
- def config_dir_ext(rel)
- extconf if extdir?(curr_srcdir())
- end
-
- alias config_dir_data noop
- alias config_dir_conf noop
- alias config_dir_man noop
-
- def extconf
- ruby "#{curr_srcdir()}/extconf.rb", *@config.config_opt
- end
-
- #
- # TASK setup
- #
-
- def exec_setup
- exec_task_traverse 'setup'
- end
-
- def setup_dir_bin(rel)
- files_of(curr_srcdir()).each do |fname|
- update_shebang_line "#{curr_srcdir()}/#{fname}"
- end
- end
-
- alias setup_dir_lib noop
-
- def setup_dir_ext(rel)
- make if extdir?(curr_srcdir())
- end
-
- alias setup_dir_data noop
- alias setup_dir_conf noop
- alias setup_dir_man noop
-
- def update_shebang_line(path)
- return if no_harm?
- return if config('shebang') == 'never'
- old = Shebang.load(path)
- if old
- $stderr.puts "warning: #{path}: Shebang line includes too many args. It is not portable and your program may not work." if old.args.size > 1
- new = new_shebang(old)
- return if new.to_s == old.to_s
- else
- return unless config('shebang') == 'all'
- new = Shebang.new(config('rubypath'))
- end
- $stderr.puts "updating shebang: #{File.basename(path)}" if verbose?
- open_atomic_writer(path) {|output|
- File.open(path, 'rb') {|f|
- f.gets if old # discard
- output.puts new.to_s
- output.print f.read
- }
- }
- end
-
- def new_shebang(old)
- if /\Aruby/ =~ File.basename(old.cmd)
- Shebang.new(config('rubypath'), old.args)
- elsif File.basename(old.cmd) == 'env' and old.args.first == 'ruby'
- Shebang.new(config('rubypath'), old.args[1..-1])
- else
- return old unless config('shebang') == 'all'
- Shebang.new(config('rubypath'))
- end
- end
-
- def open_atomic_writer(path, &block)
- tmpfile = File.basename(path) + '.tmp'
- begin
- File.open(tmpfile, 'wb', &block)
- File.rename tmpfile, File.basename(path)
- ensure
- File.unlink tmpfile if File.exist?(tmpfile)
- end
- end
-
- class Shebang
- def Shebang.load(path)
- line = nil
- File.open(path) {|f|
- line = f.gets
- }
- return nil unless /\A#!/ =~ line
- parse(line)
- end
-
- def Shebang.parse(line)
- cmd, *args = *line.strip.sub(/\A\#!/, '').split(' ')
- new(cmd, args)
- end
-
- def initialize(cmd, args = [])
- @cmd = cmd
- @args = args
- end
-
- attr_reader :cmd
- attr_reader :args
-
- def to_s
- "#! #{@cmd}" + (@args.empty? ? '' : " #{@args.join(' ')}")
- end
- end
-
- #
- # TASK install
- #
-
- def exec_install
- rm_f 'InstalledFiles'
- exec_task_traverse 'install'
- end
-
- def install_dir_bin(rel)
- install_files targetfiles(), "#{config('bindir')}/#{rel}", 0755
- end
-
- def install_dir_lib(rel)
- install_files libfiles(), "#{config('rbdir')}/#{rel}", 0644
- end
-
- def install_dir_ext(rel)
- return unless extdir?(curr_srcdir())
- install_files rubyextentions('.'),
- "#{config('sodir')}/#{File.dirname(rel)}",
- 0555
- end
-
- def install_dir_data(rel)
- install_files targetfiles(), "#{config('datadir')}/#{rel}", 0644
- end
-
- def install_dir_conf(rel)
- # FIXME: should not remove current config files
- # (rename previous file to .old/.org)
- install_files targetfiles(), "#{config('sysconfdir')}/#{rel}", 0644
- end
-
- def install_dir_man(rel)
- install_files targetfiles(), "#{config('mandir')}/#{rel}", 0644
- end
-
- def install_files(list, dest, mode)
- mkdir_p dest, @config.install_prefix
- list.each do |fname|
- install fname, dest, mode, @config.install_prefix
- end
- end
-
- def libfiles
- glob_reject(%w(*.y *.output), targetfiles())
- end
-
- def rubyextentions(dir)
- ents = glob_select("*.#{@config.dllext}", targetfiles())
- if ents.empty?
- setup_rb_error "no ruby extention exists: 'ruby #{$0} setup' first"
- end
- ents
- end
-
- def targetfiles
- mapdir(existfiles() - hookfiles())
- end
-
- def mapdir(ents)
- ents.map {|ent|
- if File.exist?(ent)
- then ent # objdir
- else "#{curr_srcdir()}/#{ent}" # srcdir
- end
- }
- end
-
- # picked up many entries from cvs-1.11.1/src/ignore.c
- JUNK_FILES = %w(
- core RCSLOG tags TAGS .make.state
- .nse_depinfo #* .#* cvslog.* ,* .del-* *.olb
- *~ *.old *.bak *.BAK *.orig *.rej _$* *$
-
- *.org *.in .*
- )
-
- def existfiles
- glob_reject(JUNK_FILES, (files_of(curr_srcdir()) | files_of('.')))
- end
-
- def hookfiles
- %w( pre-%s post-%s pre-%s.rb post-%s.rb ).map {|fmt|
- %w( config setup install clean ).map {|t| sprintf(fmt, t) }
- }.flatten
- end
-
- def glob_select(pat, ents)
- re = globs2re([pat])
- ents.select {|ent| re =~ ent }
- end
-
- def glob_reject(pats, ents)
- re = globs2re(pats)
- ents.reject {|ent| re =~ ent }
- end
-
- GLOB2REGEX = {
- '.' => '\.',
- '$' => '\$',
- '#' => '\#',
- '*' => '.*'
- }
-
- def globs2re(pats)
- /\A(?:#{
- pats.map {|pat| pat.gsub(/[\.\$\#\*]/) {|ch| GLOB2REGEX[ch] } }.join('|')
- })\z/
- end
-
- #
- # TASK test
- #
-
- TESTDIR = 'test'
-
- def exec_test
- unless File.directory?('test')
- $stderr.puts 'no test in this package' if verbose?
- return
- end
- $stderr.puts 'Running tests...' if verbose?
- begin
- require 'test/unit'
- rescue LoadError
- setup_rb_error 'test/unit cannot loaded. You need Ruby 1.8 or later to invoke this task.'
- end
- runner = Test::Unit::AutoRunner.new(true)
- runner.to_run << TESTDIR
- runner.run
- end
-
- #
- # TASK clean
- #
-
- def exec_clean
- exec_task_traverse 'clean'
- rm_f @config.savefile
- rm_f 'InstalledFiles'
- end
-
- alias clean_dir_bin noop
- alias clean_dir_lib noop
- alias clean_dir_data noop
- alias clean_dir_conf noop
- alias clean_dir_man noop
-
- def clean_dir_ext(rel)
- return unless extdir?(curr_srcdir())
- make 'clean' if File.file?('Makefile')
- end
-
- #
- # TASK distclean
- #
-
- def exec_distclean
- exec_task_traverse 'distclean'
- rm_f @config.savefile
- rm_f 'InstalledFiles'
- end
-
- alias distclean_dir_bin noop
- alias distclean_dir_lib noop
-
- def distclean_dir_ext(rel)
- return unless extdir?(curr_srcdir())
- make 'distclean' if File.file?('Makefile')
- end
-
- alias distclean_dir_data noop
- alias distclean_dir_conf noop
- alias distclean_dir_man noop
-
- #
- # Traversing
- #
-
- def exec_task_traverse(task)
- run_hook "pre-#{task}"
- FILETYPES.each do |type|
- if type == 'ext' and config('without-ext') == 'yes'
- $stderr.puts 'skipping ext/* by user option' if verbose?
- next
- end
- traverse task, type, "#{task}_dir_#{type}"
- end
- run_hook "post-#{task}"
- end
-
- def traverse(task, rel, mid)
- dive_into(rel) {
- run_hook "pre-#{task}"
- __send__ mid, rel.sub(%r[\A.*?(?:/|\z)], '')
- directories_of(curr_srcdir()).each do |d|
- traverse task, "#{rel}/#{d}", mid
- end
- run_hook "post-#{task}"
- }
- end
-
- def dive_into(rel)
- return unless File.dir?("#{@srcdir}/#{rel}")
-
- dir = File.basename(rel)
- Dir.mkdir dir unless File.dir?(dir)
- prevdir = Dir.pwd
- Dir.chdir dir
- $stderr.puts '---> ' + rel if verbose?
- @currdir = rel
- yield
- Dir.chdir prevdir
- $stderr.puts '<--- ' + rel if verbose?
- @currdir = File.dirname(rel)
- end
-
- def run_hook(id)
- path = [ "#{curr_srcdir()}/#{id}",
- "#{curr_srcdir()}/#{id}.rb" ].detect {|cand| File.file?(cand) }
- return unless path
- begin
- instance_eval File.read(path), path, 1
- rescue
- raise if $DEBUG
- setup_rb_error "hook #{path} failed:\n" + $!.message
- end
- end
-
-end # class Installer
-
-
-class SetupError < StandardError; end
-
-def setup_rb_error(msg)
- raise SetupError, msg
-end
-
-if $0 == __FILE__
- begin
- ToplevelInstaller.invoke
- rescue SetupError
- raise if $DEBUG
- $stderr.puts $!.message
- $stderr.puts "Try 'ruby #{$0} --help' for detailed usage."
- exit 1
- end
-end
diff --git a/statsample.gemspec b/statsample.gemspec
new file mode 100644
index 0000000..4ff94bf
--- /dev/null
+++ b/statsample.gemspec
@@ -0,0 +1,87 @@
+$:.unshift File.expand_path("../lib/", __FILE__)
+
+require 'statsample/version'
+require 'date'
+
+Statsample::DESCRIPTION = < 0.1.6'
+ s.add_runtime_dependency 'spreadsheet', '~> 1.1'
+ s.add_runtime_dependency 'reportbuilder', '~> 1.4'
+ s.add_runtime_dependency 'minimization', '~> 0.2'
+ s.add_runtime_dependency 'dirty-memoize', '~> 0.0.4'
+ s.add_runtime_dependency 'extendmatrix', '~> 0.4'
+ s.add_runtime_dependency 'rserve-client', '~> 0.3'
+ s.add_runtime_dependency 'rubyvis', '~> 0.6.1'
+ s.add_runtime_dependency 'distribution', '~> 0.7'
+ s.add_runtime_dependency 'awesome_print', '~> 1.6'
+
+ s.add_development_dependency 'bundler', '~> 1.10'
+ s.add_development_dependency 'rake', '~> 10.4'
+ s.add_development_dependency 'rdoc', '~> 4.2'
+ s.add_development_dependency 'shoulda', '~> 3.5'
+ s.add_development_dependency 'shoulda-matchers', '~> 2.2'
+ s.add_development_dependency 'minitest', '~> 5.7'
+ s.add_development_dependency 'gettext', '~> 3.1'
+ s.add_development_dependency 'mocha', '~> 1.1'
+ s.add_development_dependency 'nmatrix', '~> 0.2.1'
+ s.add_development_dependency 'gsl', '~> 2.1'
+ s.add_development_dependency 'pry'
+ s.add_development_dependency 'rubocop'
+ s.add_development_dependency 'activesupport', '~> 4.2'
+end
diff --git a/test/fixtures/df.csv b/test/fixtures/df.csv
new file mode 100644
index 0000000..4398132
--- /dev/null
+++ b/test/fixtures/df.csv
@@ -0,0 +1,15 @@
+y,a,b,c,d,e
+0,6,62.1,no,female,A
+1,18,34.7,yes,male,B
+1,6,29.7,no,female,C
+0,4,71,no,male,C
+1,5,36.9,yes,male,B
+0,11,58.7,no,female,B
+0,8,63.3,no,male,B
+1,21,20.4,yes,male,A
+1,2,20.5,yes,male,C
+0,11,59.2,no,male,B
+0,1,76.4,yes,female,A
+0,8,71.7,no,female,B
+1,2,77.5,no,male,C
+1,3,31.1,no,male,B
\ No newline at end of file
diff --git a/test/fixtures/repeated_fields.csv b/test/fixtures/repeated_fields.csv
deleted file mode 100644
index 90010dd..0000000
--- a/test/fixtures/repeated_fields.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-"id","name","age","city","a1","name","age"
-1,"Alex",20,"New York","a,b","a",3
-2,"Claude",23,"London","b,c","b",4
-3,"Peter",25,"London","a","c",5
-4,"Franz",27,"Paris",,"d",6
-5,"George","5,5","Tome","a,b,c","f",
-6,"Fernand",20,"London","c,b","f",8
diff --git a/test/fixtures/test_csv.csv b/test/fixtures/test_csv.csv
deleted file mode 100644
index 667beaf..0000000
--- a/test/fixtures/test_csv.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-"id","name","age","city","a1"
-1,"Alex",20,"New York","a,b"
-2,"Claude",23,"London","b,c"
-3,"Peter",25,"London","a"
-4,"Franz",27,"Paris",
-5,"George","5,5","Tome","a,b,c"
-6,"Fernand",,,
diff --git a/test/fixtures/test_xls.xls b/test/fixtures/test_xls.xls
deleted file mode 100644
index 043890d..0000000
Binary files a/test/fixtures/test_xls.xls and /dev/null differ
diff --git a/test/helpers_tests.rb b/test/helpers_tests.rb
index 47495e7..99d2d28 100644
--- a/test/helpers_tests.rb
+++ b/test/helpers_tests.rb
@@ -1,5 +1,6 @@
-$:.unshift(File.expand_path(File.dirname(__FILE__)+'/../lib/'))
-$:.unshift(File.expand_path(File.dirname(__FILE__)+'/'))
+$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + '/../lib/'))
+$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__) + '/'))
+
require 'minitest'
require 'minitest/unit'
require 'mocha/setup'
@@ -11,67 +12,72 @@
require 'statsample'
-
-module MiniTest
+module Minitest
class Test
include Shoulda::Context::Assertions
include Shoulda::Context::InstanceMethods
extend Shoulda::Context::ClassMethods
- def self.should_with_gsl(name,&block)
- should(name) do
- if Statsample.has_gsl?
- instance_eval(&block)
- else
- skip("Requires GSL")
- end
-
+
+ def self.should_with_gsl(name, &block)
+ should(name) do
+ if Statsample.has_gsl?
+ instance_eval(&block)
+ else
+ skip('Requires GSL')
end
-
-
+ end
end
end
module Assertions
- def assert_similar_vector(exp, obs, delta=1e-10,msg=nil)
- msg||="Different vectors #{exp} - #{obs}"
+ def assert_vectors_from_formula(formula, names)
+ model = Statsample::FitModel.new formula, @df
+
+ model.df_for_regression.vectors.to_a.sort
+ .must_equal names.sort
+ end
+
+ def assert_similar_vector(exp, obs, delta = 1e-10, msg = nil)
+ msg ||= "Different vectors #{exp} - #{obs}"
assert_equal(exp.size, obs.size)
- exp.data_with_nils.each_with_index {|v,i|
- assert_in_delta(v,obs[i],delta)
+ exp.to_a.each_with_index {|v, i|
+ assert_in_delta(v, obs[i], delta)
}
end
- def assert_equal_vector(exp,obs,delta=1e-10,msg=nil)
+
+ def assert_equal_vector(exp, obs, delta = 1e-10, msg = nil)
assert_equal(exp.size, obs.size, "Different size.#{msg}")
exp.size.times {|i|
- assert_in_delta(exp[i],obs[i],delta, "Different element #{i}. \nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
+ assert_in_delta(exp[i], obs[i], delta, "Different element #{i}. \nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
}
end
- def assert_equal_matrix(exp,obs,delta=1e-10,msg=nil)
- assert_equal(exp.row_size, obs.row_size, "Different row size.#{msg}")
- assert_equal(exp.column_size, obs.column_size, "Different column size.#{msg}")
- exp.row_size.times {|i|
- exp.column_size.times {|j|
- assert_in_delta(exp[i,j],obs[i,j], delta, "Different element #{i},#{j}\nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
- }
- }
+
+ def assert_equal_matrix(exp, obs, delta = 1e-10, msg = nil)
+ assert_equal(exp.row_size, obs.row_size, "Different row size.#{msg}")
+ assert_equal(exp.column_size, obs.column_size, "Different column size.#{msg}")
+ exp.row_size.times {|i|
+ exp.column_size.times {|j|
+ assert_in_delta(exp[i, j], obs[i, j], delta, "Different element #{i},#{j}\nExpected:\n#{exp}\nObserved:\n#{obs}.#{msg}")
+ }
+ }
end
- alias :assert_raise :assert_raises unless method_defined? :assert_raise
- alias :assert_not_equal :refute_equal unless method_defined? :assert_not_equal
- alias :assert_not_same :refute_same unless method_defined? :assert_not_same
+ alias_method :assert_raise, :assert_raises unless method_defined? :assert_raise
+ alias_method :assert_not_equal, :refute_equal unless method_defined? :assert_not_equal
+ alias_method :assert_not_same, :refute_same unless method_defined? :assert_not_same
unless method_defined? :assert_nothing_raised
- def assert_nothing_raised(msg=nil)
- msg||="Nothing should be raised, but raised %s"
+ def assert_nothing_raised(msg = nil)
+ msg ||= 'Nothing should be raised, but raised %s'
begin
yield
- not_raised=true
+ not_raised = true
rescue Exception => e
- not_raised=false
- msg=sprintf(msg,e)
+ not_raised = false
+ msg = sprintf(msg, e)
end
- assert(not_raised,msg)
+ assert(not_raised, msg)
end
end
end
end
MiniTest.autorun
-
diff --git a/test/test_analysis.rb b/test/test_analysis.rb
index 9799f57..20d8985 100644
--- a/test/test_analysis.rb
+++ b/test/test_analysis.rb
@@ -1,77 +1,77 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-class StatsampleAnalysisTestCase < MiniTest::Unit::TestCase
+class StatsampleAnalysisTestCase < Minitest::Test
context(Statsample::Analysis) do
setup do
Statsample::Analysis.clear_analysis
end
- should "store() should create and store Statsample::Analysis::Suite" do
+ should 'store() should create and store Statsample::Analysis::Suite' do
Statsample::Analysis.store(:first) do
- a=1
+ a = 1
end
assert(Statsample::Analysis.stored_analysis[:first])
assert(Statsample::Analysis.stored_analysis[:first].is_a? Statsample::Analysis::Suite)
end
-
- should "ss_analysis should create an Statsample::Analysis" do
- ss_analysis(:first) {a=1}
+
+ should 'ss_analysis should create an Statsample::Analysis' do
+ ss_analysis(:first) { a = 1 }
end
- should "store last created analysis" do
- an=Statsample::Analysis.store(:first) do
- a=1
+ should 'store last created analysis' do
+ an = Statsample::Analysis.store(:first) do
+ a = 1
end
- assert_equal(an,Statsample::Analysis.last)
+ assert_equal(an, Statsample::Analysis.last)
end
-
- should "add_to_reportbuilder() add sections to reportbuilder object" do
- rb=mock()
- rb.expects(:add).with {|value| value.is_a? ReportBuilder::Section and value.name==:first}
- rb.expects(:add).with {|value| value.is_a? ReportBuilder::Section and value.name==:second}
-
+
+ should 'add_to_reportbuilder() add sections to reportbuilder object' do
+ rb = mock
+ rb.expects(:add).with { |value| value.is_a? ReportBuilder::Section and value.name == :first }
+ rb.expects(:add).with { |value| value.is_a? ReportBuilder::Section and value.name == :second }
+
Statsample::Analysis.store(:first) do
- echo "first","second"
+ echo 'first', 'second'
end
Statsample::Analysis.store(:second) do
- echo "third"
+ echo 'third'
end
- Statsample::Analysis.add_to_reportbuilder(rb,:first,:second)
+ Statsample::Analysis.add_to_reportbuilder(rb, :first, :second)
end
- should "to_text returns the same as a normal ReportBuilder object" do
- rb=ReportBuilder.new(:name=>:test)
- section=ReportBuilder::Section.new(:name=>"first")
- a=[1,2,3].to_scale
- section.add("first")
+ should 'to_text returns the same as a normal ReportBuilder object' do
+ rb = ReportBuilder.new(name: :test)
+ section = ReportBuilder::Section.new(name: 'first')
+ a = Daru::Vector.new([1, 2, 3])
+ section.add('first')
section.add(a)
rb.add(section)
- exp=rb.to_text
- an=ss_analysis(:first) {
+ exp = rb.to_text
+ an = ss_analysis(:first) {
echo 'first'
summary(a)
}
- obs=Statsample::Analysis.to_text(:first)
-
- assert_equal(exp.split("\n")[1,exp.size], obs.split("\n")[1,obs.size])
+ obs = Statsample::Analysis.to_text(:first)
+
+ assert_equal(exp.split("\n")[1, exp.size], obs.split("\n")[1, obs.size])
end
-
- should "run() execute all analysis by default" do
- m1=mock()
+
+ should 'run() execute all analysis by default' do
+ m1 = mock
m1.expects(:run).once
m1.expects(:hide).once
-
+
Statsample::Analysis.store(:first) do
m1.run
end
Statsample::Analysis.store(:second) do
m1.hide
end
-
+
# Should run all test
Statsample::Analysis.run
end
-
- should "run() execute blocks specificed on parameters" do
- m1=mock()
- m1.expects(:run).once
+
+ should 'run() execute blocks specificed on parameters' do
+ m1 = mock
+ m1.expects(:run).once
m1.expects(:hide).never
Statsample::Analysis.store(:first) do
m1.run
@@ -82,78 +82,78 @@ class StatsampleAnalysisTestCase < MiniTest::Unit::TestCase
# Should run all test
Statsample::Analysis.run(:first)
end
-
+
context(Statsample::Analysis::Suite) do
- should "echo() uses output#puts with same arguments" do
- an=Statsample::Analysis::Suite.new(:output)
- obj=mock()
- obj.expects(:puts).with(:first,:second).once
- an.output=obj
- an.echo(:first,:second)
- end
- should "summary() should call object.summary" do
- an=Statsample::Analysis::Suite.new(:summary)
- obj=stub('summarizable',:summary=>'summary')
- assert_equal(obj.summary,an.summary(obj))
- end
- should "attach() allows to call objects on objects which respond to fields" do
- an=Statsample::Analysis::Suite.new(:summary)
- ds={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)}
- ds.expects(:fields).returns(%w{x y}).at_least_once
+ should 'echo() uses output#puts with same arguments' do
+ an = Statsample::Analysis::Suite.new(:output)
+ obj = mock
+ obj.expects(:puts).with(:first, :second).once
+ an.output = obj
+ an.echo(:first, :second)
+ end
+ should 'summary() should call object.summary' do
+ an = Statsample::Analysis::Suite.new(:summary)
+ obj = stub('summarizable', summary: 'summary')
+ assert_equal(obj.summary, an.summary(obj))
+ end
+ should 'attach() allows to call objects on objects which respond to fields' do
+ an = Statsample::Analysis::Suite.new(:summary)
+ ds = { :x => stub(mean: 10), :y => stub(mean: 12) }
+ ds.expects(:vectors).returns([:x, :y]).at_least_once
an.attach(ds)
- assert_equal(10,an.x.mean)
- assert_equal(12,an.y.mean)
+ assert_equal(10, an.x.mean)
+ assert_equal(12, an.y.mean)
assert_raise(RuntimeError) {
an.z
}
end
- should "attached objects should be called LIFO" do
- an=Statsample::Analysis::Suite.new(:summary)
- ds1={'x'=>stub(:mean=>100),'y'=>stub(:mean=>120),'z'=>stub(:mean=>13)}
- ds1.expects(:fields).returns(%w{x y z}).at_least_once
- ds2={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)}
- ds2.expects(:fields).returns(%w{x y}).at_least_once
+ should 'attached objects should be called LIFO' do
+ an = Statsample::Analysis::Suite.new(:summary)
+ ds1 = { :x => stub(mean: 100), :y => stub(mean: 120), :z => stub(mean: 13) }
+ ds1.expects(:vectors).returns([:x, :y, :z]).at_least_once
+ ds2 = { :x => stub(mean: 10), :y => stub(mean: 12) }
+ ds2.expects(:vectors).returns([:x, :y]).at_least_once
an.attach(ds1)
an.attach(ds2)
- assert_equal(10,an.x.mean)
- assert_equal(12,an.y.mean)
- assert_equal(13,an.z.mean)
- end
-
- should "detach() without arguments drop latest object" do
- an=Statsample::Analysis::Suite.new(:summary)
- ds1={'x'=>stub(:mean=>100),'y'=>stub(:mean=>120),'z'=>stub(:mean=>13)}
- ds1.expects(:fields).returns(%w{x y z}).at_least_once
- ds2={'x'=>stub(:mean=>10),'y'=>stub(:mean=>12)}
- ds2.expects(:fields).returns(%w{x y}).at_least_once
+ assert_equal(10, an.x.mean)
+ assert_equal(12, an.y.mean)
+ assert_equal(13, an.z.mean)
+ end
+
+ should 'detach() without arguments drop latest object' do
+ an = Statsample::Analysis::Suite.new(:summary)
+ ds1 = { :x => stub(mean: 100), :y => stub(mean: 120), :z => stub(mean: 13) }
+ ds1.expects(:vectors).returns([:x, :y, :z]).at_least_once
+ ds2 = { :x => stub(mean: 10), :y => stub(mean: 12) }
+ ds2.expects(:vectors).returns([:x, :y]).at_least_once
an.attach(ds1)
an.attach(ds2)
- assert_equal(10,an.x.mean)
+ assert_equal(10, an.x.mean)
an.detach
assert_equal(100, an.x.mean)
end
- should "detach() with argument drop select object" do
- an=Statsample::Analysis::Suite.new(:summary)
- ds1={'x'=>1}
- ds1.expects(:fields).returns(%w{x}).at_least_once
- ds2={'x'=>2,'y'=>3}
- ds2.expects(:fields).returns(%w{x y}).at_least_once
- ds3={'y'=>4}
- ds3.expects(:fields).returns(%w{y}).at_least_once
-
+ should 'detach() with argument drop select object' do
+ an = Statsample::Analysis::Suite.new(:summary)
+ ds1 = { :x => 1 }
+ ds1.expects(:vectors).returns([:x]).at_least_once
+ ds2 = { :x => 2, :y => 3 }
+ ds2.expects(:vectors).returns([:x, :y]).at_least_once
+ ds3 = { :y => 4 }
+ ds3.expects(:vectors).returns([:y]).at_least_once
+
an.attach(ds3)
an.attach(ds2)
an.attach(ds1)
- assert_equal(1,an.x)
- assert_equal(3,an.y)
+ assert_equal(1, an.x)
+ assert_equal(3, an.y)
an.detach(ds2)
- assert_equal(4,an.y)
+ assert_equal(4, an.y)
end
- should "perform a simple analysis" do
- output=mock()
+ should 'perform a simple analysis' do
+ output = mock
output.expects(:puts).with(5.5)
- an=Statsample::Analysis.store(:simple, :output=>output) do
- ds=data_frame(:x=>vector(1..10),:y=>vector(1..10))
+ an = Statsample::Analysis.store(:simple, output: output) do
+ ds = data_frame(x: vector(1..10), y: vector(1..10))
attach(ds)
echo x.mean
end
@@ -161,17 +161,16 @@ class StatsampleAnalysisTestCase < MiniTest::Unit::TestCase
end
end
context(Statsample::Analysis::SuiteReportBuilder) do
- should "echo() use add on rb object" do
- an=Statsample::Analysis::SuiteReportBuilder.new(:puts_to_add)
+ should 'echo() use add on rb object' do
+ an = Statsample::Analysis::SuiteReportBuilder.new(:puts_to_add)
an.rb.expects(:add).with(:first).twice
an.echo(:first, :first)
end
- should "summary() uses add on rb object" do
- an=Statsample::Analysis::SuiteReportBuilder.new(:summary_to_add)
+ should 'summary() uses add on rb object' do
+ an = Statsample::Analysis::SuiteReportBuilder.new(:summary_to_add)
an.rb.expects(:add).with(:first).once
an.summary(:first)
end
end
-
end
end
diff --git a/test/test_anova_contrast.rb b/test/test_anova_contrast.rb
index a335149..36ccc60 100644
--- a/test/test_anova_contrast.rb
+++ b/test/test_anova_contrast.rb
@@ -1,36 +1,36 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleAnovaContrastTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleAnovaContrastTestCase < Minitest::Test
context(Statsample::Anova::Contrast) do
setup do
- constant=[12,13,11,12,12].to_scale
- frequent=[9,10,9,13,14].to_scale
- infrequent=[15,16,17,16,16].to_scale
- never=[17,18,12,18,20].to_scale
- @vectors=[constant, frequent, infrequent, never]
- @c=Statsample::Anova::Contrast.new(:vectors=>@vectors)
+ constant = Daru::Vector.new([12, 13, 11, 12, 12])
+ frequent = Daru::Vector.new([9, 10, 9, 13, 14])
+ infrequent = Daru::Vector.new([15, 16, 17, 16, 16])
+ never = Daru::Vector.new([17, 18, 12, 18, 20])
+ @vectors = [constant, frequent, infrequent, never]
+ @c = Statsample::Anova::Contrast.new(vectors: @vectors)
end
- should "return correct value using c" do
- @c.c([1,-1.quo(3),-1.quo(3),-1.quo(3)])
- #@c.c([1,-0.333,-0.333,-0.333])
+ should 'return correct value using c' do
+ @c.c([1, -1.quo(3), -1.quo(3), -1.quo(3)])
+ # @c.c([1,-0.333,-0.333,-0.333])
assert_in_delta(-2.6667, @c.psi, 0.0001)
assert_in_delta(1.0165, @c.se, 0.0001)
assert_in_delta(-2.623, @c.t, 0.001)
- assert_in_delta(-4.82, @c.confidence_interval[0],0.01)
- assert_in_delta(-0.51, @c.confidence_interval[1],0.01)
- assert(@c.summary.size>0)
+ assert_in_delta(-4.82, @c.confidence_interval[0], 0.01)
+ assert_in_delta(-0.51, @c.confidence_interval[1], 0.01)
+ assert(@c.summary.size > 0)
end
- should "return correct values using c_by_index" do
- @c.c_by_index([0],[1,2,3])
+ should 'return correct values using c_by_index' do
+ @c.c_by_index([0], [1, 2, 3])
assert_in_delta(-2.6667, @c.psi, 0.0001)
assert_in_delta(1.0165, @c.se, 0.0001)
assert_in_delta(-2.623, @c.t, 0.001)
end
- should "return correct values using incomplete c_by_index" do
- c1=Statsample::Anova::Contrast.new(:vectors=>@vectors, :c=>[0.5,0.5,-1,0])
- c2=Statsample::Anova::Contrast.new(:vectors=>@vectors, :c1=>[0,1],:c2=>[2])
- assert_equal(c1.psi,c2.psi)
- assert_equal(c1.se,c2.se)
- assert_equal(c1.t,c2.t)
+ should 'return correct values using incomplete c_by_index' do
+ c1 = Statsample::Anova::Contrast.new(vectors: @vectors, c: [0.5, 0.5, -1, 0])
+ c2 = Statsample::Anova::Contrast.new(vectors: @vectors, c1: [0, 1], c2: [2])
+ assert_equal(c1.psi, c2.psi)
+ assert_equal(c1.se, c2.se)
+ assert_equal(c1.t, c2.t)
end
end
end
diff --git a/test/test_anovaoneway.rb b/test/test_anovaoneway.rb
index 2f0e1e5..17c86cb 100644
--- a/test/test_anovaoneway.rb
+++ b/test/test_anovaoneway.rb
@@ -1,26 +1,26 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleAnovaOneWayTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleAnovaOneWayTestCase < Minitest::Test
context(Statsample::Anova::OneWay) do
setup do
- @ss_num=30.08
- @ss_den=87.88
- @df_num=2
- @df_den=21
- @anova=Statsample::Anova::OneWay.new(:ss_num=>@ss_num, :ss_den=>@ss_den, :df_num=>@df_num, :df_den=>@df_den)
+ @ss_num = 30.08
+ @ss_den = 87.88
+ @df_num = 2
+ @df_den = 21
+ @anova = Statsample::Anova::OneWay.new(ss_num: @ss_num, ss_den: @ss_den, df_num: @df_num, df_den: @df_den)
end
- should "Statsample::Anova.oneway respond to #oneway" do
+ should 'Statsample::Anova.oneway respond to #oneway' do
assert(Statsample::Anova.respond_to? :oneway)
end
- should "return correct value for ms_num and ms_den" do
+ should 'return correct value for ms_num and ms_den' do
assert_in_delta(15.04, @anova.ms_num, 0.01)
assert_in_delta(4.18, @anova.ms_den, 0.01)
end
- should "return correct value for f" do
+ should 'return correct value for f' do
assert_in_delta(3.59, @anova.f, 0.01)
end
- should "respond to summary" do
+ should 'respond to summary' do
assert(@anova.respond_to? :summary)
- assert(@anova.summary.size>0)
+ assert(@anova.summary.size > 0)
end
end
end
diff --git a/test/test_anovatwoway.rb b/test/test_anovatwoway.rb
index aa88194..db110c4 100644
--- a/test/test_anovatwoway.rb
+++ b/test/test_anovatwoway.rb
@@ -1,38 +1,37 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleAnovaTwoWayTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleAnovaTwoWayTestCase < Minitest::Test
context(Statsample::Anova::TwoWay) do
setup do
- @ss_a=192.2
- @ss_b=57.8
- @ss_axb=168.2
- @ss_within=75.6
- @df_a=@df_b=1
- @df_within=16
- @anova=Statsample::Anova::TwoWay.new(:ss_a=>@ss_a, :ss_b=>@ss_b, :ss_axb=>@ss_axb, :ss_within=>@ss_within , :df_a=>@df_a, :df_b=>@df_b, :df_within=>@df_within)
+ @ss_a = 192.2
+ @ss_b = 57.8
+ @ss_axb = 168.2
+ @ss_within = 75.6
+ @df_a = @df_b = 1
+ @df_within = 16
+ @anova = Statsample::Anova::TwoWay.new(ss_a: @ss_a, ss_b: @ss_b, ss_axb: @ss_axb, ss_within: @ss_within, df_a: @df_a, df_b: @df_b, df_within: @df_within)
end
- should "Statsample::Anova.twoway respond to #twoway" do
- assert(Statsample::Anova.respond_to? :twoway)
+ should 'Statsample::Anova.twoway respond to #twoway' do
+ assert(Statsample::Anova.respond_to? :twoway)
end
- should "return correct value for ms_a, ms_b and ms_axb" do
+ should 'return correct value for ms_a, ms_b and ms_axb' do
assert_in_delta(192.2, @anova.ms_a, 0.01)
assert_in_delta(57.8, @anova.ms_b, 0.01)
assert_in_delta(168.2, @anova.ms_axb, 0.01)
-
end
- should "return correct value for f " do
+ should 'return correct value for f ' do
assert_in_delta(40.68, @anova.f_a, 0.01)
assert_in_delta(12.23, @anova.f_b, 0.01)
assert_in_delta(35.60, @anova.f_axb, 0.01)
end
- should "return correct value for probability for f " do
+ should 'return correct value for probability for f ' do
assert(@anova.f_a_probability < 0.05)
assert(@anova.f_b_probability < 0.05)
assert(@anova.f_axb_probability < 0.05)
end
- should "respond to summary" do
+ should 'respond to summary' do
assert(@anova.respond_to? :summary)
- assert(@anova.summary.size>0)
+ assert(@anova.summary.size > 0)
end
end
end
diff --git a/test/test_anovatwowaywithdataset.rb b/test/test_anovatwowaywithdataset.rb
index a08eb7d..ee69a3e 100644
--- a/test/test_anovatwowaywithdataset.rb
+++ b/test/test_anovatwowaywithdataset.rb
@@ -1,49 +1,47 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
# Reference:
# * http://www.uwsp.edu/psych/Stat/13/anova-2w.htm#III
-class StatsampleAnovaTwoWayWithVectorsTestCase < MiniTest::Unit::TestCase
+class StatsampleAnovaTwoWayWithVectorsTestCase < Minitest::Test
context(Statsample::Anova::TwoWayWithVectors) do
setup do
- @pa=[5,4,3,4,2,18,19,14,12,15,6,7,5,8,4,6,9,5,9,3].to_scale
- @pa.name="Passive Avoidance"
- @a=[0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,1,1,1,1,1].to_vector
- @a.labels={0=>'0%',1=>'35%'}
- @a.name='Diet'
- @b=[0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1].to_vector
- @b.labels={0=>'Young',1=>'Older'}
- @b.name="Age"
- @anova=Statsample::Anova::TwoWayWithVectors.new(:a=>@a,:b=>@b, :dependent=>@pa)
+ @pa = Daru::Vector.new [5, 4, 3, 4, 2, 18, 19, 14, 12, 15, 6, 7, 5, 8, 4, 6, 9, 5, 9, 3]
+ @pa.rename 'Passive Avoidance'
+ @a = Daru::Vector.new [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+ # @a.labels = { 0 => '0%', 1 => '35%' }
+ @a.rename 'Diet'
+ @b = Daru::Vector.new [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+ # @b.labels = { 0 => 'Young', 1 => 'Older' }
+ @b.rename 'Age'
+ @anova = Statsample::Anova::TwoWayWithVectors.new(a: @a, b: @b, dependent: @pa)
end
- should "Statsample::Anova respond to #twoway_with_vectors" do
- assert(Statsample::Anova.respond_to? :twoway_with_vectors)
+ should 'Statsample::Anova respond to #twoway_with_vectors' do
+ assert(Statsample::Anova.respond_to? :twoway_with_vectors)
end
- should "#new returns the same as Statsample::Anova.twoway_with_vectors" do
- @anova2=Statsample::Anova.twoway_with_vectors(:a=>@a,:b=>@b, :dependent=>@pa)
+ should '#new returns the same as Statsample::Anova.twoway_with_vectors' do
+ @anova2 = Statsample::Anova.twoway_with_vectors(a: @a, b: @b, dependent: @pa)
assert_equal(@anova.summary, @anova2.summary)
end
- should "return correct value for ms_a, ms_b and ms_axb" do
+ should 'return correct value for ms_a, ms_b and ms_axb' do
assert_in_delta(192.2, @anova.ms_a, 0.01)
assert_in_delta(57.8, @anova.ms_b, 0.01)
assert_in_delta(168.2, @anova.ms_axb, 0.01)
-
end
- should "return correct value for f " do
+ should 'return correct value for f ' do
assert_in_delta(40.68, @anova.f_a, 0.01)
assert_in_delta(12.23, @anova.f_b, 0.01)
assert_in_delta(35.60, @anova.f_axb, 0.01)
end
- should "return correct value for probability for f " do
+ should 'return correct value for probability for f ' do
assert(@anova.f_a_probability < 0.05)
assert(@anova.f_b_probability < 0.05)
assert(@anova.f_axb_probability < 0.05)
end
- should "respond to summary" do
-
- @anova.summary_descriptives=true
- @anova.summary_levene=true
+ should 'respond to summary' do
+ @anova.summary_descriptives = true
+ @anova.summary_levene = true
assert(@anova.respond_to? :summary)
- assert(@anova.summary.size>0)
+ assert(@anova.summary.size > 0)
end
end
end
diff --git a/test/test_anovawithvectors.rb b/test/test_anovawithvectors.rb
index b85c074..9da0380 100644
--- a/test/test_anovawithvectors.rb
+++ b/test/test_anovawithvectors.rb
@@ -1,102 +1,100 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleAnovaOneWayWithVectorsTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleAnovaOneWayWithVectorsTestCase < Minitest::Test
context(Statsample::Anova::OneWayWithVectors) do
-
- context("when initializing") do
+ context('when initializing') do
setup do
- @v1=10.times.map {rand(100)}.to_scale
- @v2=10.times.map {rand(100)}.to_scale
- @v3=10.times.map {rand(100)}.to_scale
+ @v1 = Daru::Vector.new(10.times.map { rand(100) })
+ @v2 = Daru::Vector.new(10.times.map { rand(100) })
+ @v3 = Daru::Vector.new(10.times.map { rand(100) })
end
- should "be the same using [] or args*" do
- a1=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3)
- a2=Statsample::Anova::OneWayWithVectors.new([@v1,@v2,@v3])
- assert_equal(a1.f,a2.f)
+ should 'be the same using [] or args*' do
+ a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3)
+ a2 = Statsample::Anova::OneWayWithVectors.new([@v1, @v2, @v3])
+ assert_equal(a1.f, a2.f)
end
- should "be the same using module method or object instantiation" do
- a1=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3)
- a2=Statsample::Anova.oneway_with_vectors(@v1,@v2,@v3)
- assert_equal(a1.f,a2.f)
+ should 'be the same using module method or object instantiation' do
+ a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3)
+ a2 = Statsample::Anova.oneway_with_vectors(@v1, @v2, @v3)
+ assert_equal(a1.f, a2.f)
end
- should "detect optional hash" do
- a1=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3, {:name=>'aaa'})
+ should 'detect optional hash' do
+ a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3, name: 'aaa')
assert_equal('aaa', a1.name)
end
- should "omit incorrect arguments" do
- a1=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3, {:name=>'aaa'})
- a2=Statsample::Anova::OneWayWithVectors.new(@v1,nil,nil,@v2,@v3, {:name=>'aaa'})
- assert_equal(a1.f,a2.f)
+ should 'omit incorrect arguments' do
+ a1 = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3, name: 'aaa')
+ a2 = Statsample::Anova::OneWayWithVectors.new(@v1, nil, nil, @v2, @v3, name: 'aaa')
+ assert_equal(a1.f, a2.f)
end
end
setup do
- @v1=[3,3,2,3,6].to_vector(:scale)
- @v2=[7,6,5,6,7].to_vector(:scale)
- @v3=[9,8,9,7,8].to_vector(:scale)
- @name="Anova testing"
- @anova=Statsample::Anova::OneWayWithVectors.new(@v1,@v2,@v3, :name=>@name)
- end
- should "store correctly contrasts" do
- c1=Statsample::Anova::Contrast.new(:vectors=>[@v1,@v2,@v3], :c=>[1,-0.5, -0.5])
-
- c2=@anova.contrast(:c=>[1,-0.5,-0.5])
- assert_equal(c1.t,c2.t)
-
- end
- should "respond to #summary" do
+ @v1 = Daru::Vector.new([3, 3, 2, 3, 6])
+ @v2 = Daru::Vector.new([7, 6, 5, 6, 7])
+ @v3 = Daru::Vector.new([9, 8, 9, 7, 8])
+ @name = 'Anova testing'
+ @anova = Statsample::Anova::OneWayWithVectors.new(@v1, @v2, @v3, name: @name)
+ end
+ should 'store correctly contrasts' do
+ c1 = Statsample::Anova::Contrast.new(vectors: [@v1, @v2, @v3], c: [1, -0.5, -0.5])
+
+ c2 = @anova.contrast(c: [1, -0.5, -0.5])
+ assert_equal(c1.t, c2.t)
+ end
+ should 'respond to #summary' do
assert(@anova.respond_to? :summary)
end
- should "have correct name of analysis on #summary" do
+ should 'have correct name of analysis on #summary' do
assert_match(/#{@name}/, @anova.summary)
end
- should "returns same levene values as direct Levene creation" do
- assert_equal(@anova.levene.f, Statsample::Test.levene([@v1,@v2,@v3]).f)
+ should 'returns same levene values as direct Levene creation' do
+ assert_equal(@anova.levene.f, Statsample::Test.levene([@v1, @v2, @v3]).f)
end
- should "have correct value for levene" do
- assert_in_delta(0.604,@anova.levene.f, 0.001)
- assert_in_delta(0.562,@anova.levene.probability, 0.001)
+ should 'have correct value for levene' do
+ assert_in_delta(0.604, @anova.levene.f, 0.001)
+ assert_in_delta(0.562, @anova.levene.probability, 0.001)
end
- should "have correct value for sst" do
- assert_in_delta(72.933, @anova.sst,0.001)
+ should 'have correct value for sst' do
+ assert_in_delta(72.933, @anova.sst, 0.001)
end
- should "have correct value for sswg" do
- assert_in_delta(14.8,@anova.sswg,0.001)
+ should 'have correct value for sswg' do
+ assert_in_delta(14.8, @anova.sswg, 0.001)
end
- should "have correct value for ssb" do
- assert_in_delta(58.133,@anova.ssbg,0.001)
+ should 'have correct value for ssb' do
+ assert_in_delta(58.133, @anova.ssbg, 0.001)
end
- should "sst=sswg+ssbg" do
- assert_in_delta(@anova.sst,@anova.sswg+@anova.ssbg,0.00001)
+ should 'sst=sswg+ssbg' do
+ assert_in_delta(@anova.sst, @anova.sswg + @anova.ssbg, 0.00001)
end
- should "df total equal to number of n-1" do
- assert_equal(@v1.n+@v2.n+@v3.n-1,@anova.df_total)
+ should 'df total equal to number of n-1' do
+ assert_equal(@v1.size + @v2.size + @v3.size - 1, @anova.df_total)
end
- should "df wg equal to number of n-k" do
- assert_equal(@v1.n+@v2.n+@v3.n-3,@anova.df_wg)
+ should 'df wg equal to number of n-k' do
+ assert_equal(@v1.size + @v2.size + @v3.size - 3, @anova.df_wg)
end
- should "df bg equal to number of k-1" do
- assert_equal(2,@anova.df_bg)
+ should 'df bg equal to number of k-1' do
+ assert_equal(2, @anova.df_bg)
end
- should "f=(ssbg/df_bg)/(sswt/df_wt)" do
- assert_in_delta((@anova.ssbg.quo(@anova.df_bg)).quo( @anova.sswg.quo(@anova.df_wg)), @anova.f, 0.001)
+ should 'f=(ssbg/df_bg)/(sswt/df_wt)' do
+ assert_in_delta((@anova.ssbg.quo(@anova.df_bg)).quo(@anova.sswg.quo(@anova.df_wg)), @anova.f, 0.001)
end
- should "p be correct" do
- assert(@anova.probability<0.01)
+ should 'p be correct' do
+ assert(@anova.probability < 0.01)
end
- should "be correct using different test values" do
- anova2=Statsample::Anova::OneWayWithVectors.new([@v1,@v1,@v1,@v1,@v2])
- assert_in_delta(3.960, anova2.f,0.001)
- assert_in_delta(0.016, anova2.probability,0.001)
+ should 'be correct using different test values' do
+ anova2 = Statsample::Anova::OneWayWithVectors.new([@v1, @v1, @v1, @v1, @v2])
+ assert_in_delta(3.960, anova2.f, 0.001)
+ assert_in_delta(0.016, anova2.probability, 0.001)
end
- context "with extra information on summary" do
+ context 'with extra information on summary' do
setup do
- @anova.summary_descriptives=true
- @anova.summary_levene=true
- @summary=@anova.summary
+ @anova.summary_descriptives = true
+ @anova.summary_levene = true
+ @summary = @anova.summary
end
- should "have section with levene statistics" do
+ should 'have section with levene statistics' do
assert_match(/Levene/, @summary)
end
- should "have section with descriptives" do
+ should 'have section with descriptives' do
assert_match(/Min/, @summary)
end
end
diff --git a/test/test_awesome_print_bug.rb b/test/test_awesome_print_bug.rb
index 065d3e7..ceccd72 100644
--- a/test/test_awesome_print_bug.rb
+++ b/test/test_awesome_print_bug.rb
@@ -1,14 +1,14 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleAwesomePrintBug < MiniTest::Test
- context("Awesome Print integration") do
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleAwesomePrintBug < Minitest::Test
+ context('Awesome Print integration') do
setup do
- require "awesome_print"
+ require 'awesome_print'
end
- should "should be flawless" do
- a=[1,2,3].to_scale
-
- assert(a!=[1,2,3])
- assert_nothing_raised do
+ should 'should be flawless' do
+ a = Daru::Vector.new([1, 2, 3])
+
+ assert(a != [1, 2, 3])
+ assert_nothing_raised do
ap a
end
end
diff --git a/test/test_bartlettsphericity.rb b/test/test_bartlettsphericity.rb
index 02f43ce..3865259 100644
--- a/test/test_bartlettsphericity.rb
+++ b/test/test_bartlettsphericity.rb
@@ -1,25 +1,25 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-class StatsampleBartlettSphericityTestCase < MiniTest::Test
+class StatsampleBartlettSphericityTestCase < Minitest::Test
include Statsample::Test
context Statsample::Test::BartlettSphericity do
setup do
- @v1=[1 ,2 ,3 ,4 ,7 ,8 ,9 ,10,14,15,20,50,60,70].to_scale
- @v2=[5 ,6 ,11,12,13,16,17,18,19,20,30,0,0,0].to_scale
- @v3=[10,3 ,20,30,40,50,80,10,20,30,40,2,3,4].to_scale
+ @v1 = Daru::Vector.new([1, 2, 3, 4, 7, 8, 9, 10, 14, 15, 20, 50, 60, 70])
+ @v2 = Daru::Vector.new([5, 6, 11, 12, 13, 16, 17, 18, 19, 20, 30, 0, 0, 0])
+ @v3 = Daru::Vector.new([10, 3, 20, 30, 40, 50, 80, 10, 20, 30, 40, 2, 3, 4])
# KMO: 0.490
- ds={'v1'=>@v1,'v2'=>@v2,'v3'=>@v3}.to_dataset
- cor=Statsample::Bivariate.correlation_matrix(ds)
- @bs=Statsample::Test::BartlettSphericity.new(cor, 14)
+ ds = Daru::DataFrame.new({ :v1 => @v1, :v2 => @v2, :v3 => @v3 })
+ cor = Statsample::Bivariate.correlation_matrix(ds)
+ @bs = Statsample::Test::BartlettSphericity.new(cor, 14)
end
- should "have correct value for chi" do
- assert_in_delta(9.477, @bs.value,0.001)
+ should 'have correct value for chi' do
+ assert_in_delta(9.477, @bs.value, 0.001)
end
- should "have correct value for df" do
+ should 'have correct value for df' do
assert_equal(3, @bs.df)
end
- should "have correct value for probability" do
- assert_in_delta(0.024,@bs.probability,0.001)
+ should 'have correct value for probability' do
+ assert_in_delta(0.024, @bs.probability, 0.001)
end
end
end
diff --git a/test/test_bivariate.rb b/test/test_bivariate.rb
index 2b745cd..8d20917 100644
--- a/test/test_bivariate.rb
+++ b/test/test_bivariate.rb
@@ -1,163 +1,164 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleBivariateTestCase < MiniTest::Test
- should "method sum of squares should be correct" do
- v1=[1,2,3,4,5,6].to_vector(:scale)
- v2=[6,2,4,10,12,8].to_vector(:scale)
- assert_equal(23.0, Statsample::Bivariate.sum_of_squares(v1,v2))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleBivariateTestCase < Minitest::Test
+ should 'method sum of squares should be correct' do
+ v1 = Daru::Vector.new([1, 2, 3, 4, 5, 6])
+ v2 = Daru::Vector.new([6, 2, 4, 10, 12, 8])
+ assert_equal(23.0, Statsample::Bivariate.sum_of_squares(v1, v2))
end
- should_with_gsl "return same covariance with ruby and gls implementation" do
- v1=20.times.collect {|a| rand()}.to_scale
- v2=20.times.collect {|a| rand()}.to_scale
- assert_in_delta(Statsample::Bivariate.covariance(v1,v2), Statsample::Bivariate.covariance_slow(v1,v2), 0.001)
+ should_with_gsl 'return same covariance with ruby and gls implementation' do
+ v1 = Daru::Vector.new(20.times.collect { |_a| rand })
+ v2 = Daru::Vector.new(20.times.collect { |_a| rand })
+ assert_in_delta(Statsample::Bivariate.covariance(v1, v2), Statsample::Bivariate.covariance_slow(v1, v2), 0.001)
end
- should_with_gsl "return same correlation with ruby and gls implementation" do
- v1=20.times.collect {|a| rand()}.to_scale
- v2=20.times.collect {|a| rand()}.to_scale
+ should_with_gsl 'return same correlation with ruby and gls implementation' do
+ v1 = Daru::Vector.new(20.times.collect { |_a| rand })
+ v2 = Daru::Vector.new(20.times.collect { |_a| rand })
- assert_in_delta(GSL::Stats::correlation(v1.gsl, v2.gsl), Statsample::Bivariate.pearson_slow(v1,v2), 1e-10)
+ assert_in_delta(GSL::Stats.correlation(v1.to_gsl, v2.to_gsl), Statsample::Bivariate.pearson_slow(v1, v2), 1e-10)
end
- should "return correct pearson correlation" do
- v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
- v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
- assert_in_delta(0.525,Statsample::Bivariate.pearson(v1,v2), 0.001)
- assert_in_delta(0.525,Statsample::Bivariate.pearson_slow(v1,v2), 0.001)
+ should 'return correct pearson correlation' do
+ v1 = Daru::Vector.new([6, 5, 4, 7, 8, 4, 3, 2])
+ v2 = Daru::Vector.new([2, 3, 7, 8, 6, 4, 3, 2])
+ assert_in_delta(0.525, Statsample::Bivariate.pearson(v1, v2), 0.001)
+ assert_in_delta(0.525, Statsample::Bivariate.pearson_slow(v1, v2), 0.001)
- v3=[6,2, 1000,1000,5,4,7,8,4,3,2,nil].to_vector(:scale)
- v4=[2,nil,nil,nil, 3,7,8,6,4,3,2,500].to_vector(:scale)
- assert_in_delta(0.525,Statsample::Bivariate.pearson(v3,v4),0.001)
+ v3 = Daru::Vector.new([6, 2, 1000, 1000, 5, 4, 7, 8, 4, 3, 2, nil])
+ v4 = Daru::Vector.new([2, nil, nil, nil, 3, 7, 8, 6, 4, 3, 2, 500])
+ assert_in_delta(0.525, Statsample::Bivariate.pearson(v3, v4), 0.001)
# Test ruby method
- v3a,v4a=Statsample.only_valid v3, v4
- assert_in_delta(0.525, Statsample::Bivariate.pearson_slow(v3a,v4a),0.001)
+ v3a, v4a = Statsample.only_valid v3, v4
+ assert_in_delta(0.525, Statsample::Bivariate.pearson_slow(v3a, v4a), 0.001)
end
- should "return correct values for t_pearson and prop_pearson" do
- v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
- v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
- r=Statsample::Bivariate::Pearson.new(v1,v2)
- assert_in_delta(0.525,r.r, 0.001)
- assert_in_delta(Statsample::Bivariate.t_pearson(v1,v2), r.t, 0.001)
- assert_in_delta(Statsample::Bivariate.prop_pearson(r.t,8,:both), r.probability, 0.001)
- assert(r.summary.size>0)
+ should 'return correct values for t_pearson and prop_pearson' do
+ v1 = Daru::Vector.new([6, 5, 4, 7, 8, 4, 3, 2])
+ v2 = Daru::Vector.new([2, 3, 7, 8, 6, 4, 3, 2])
+ r = Statsample::Bivariate::Pearson.new(v1, v2)
+ assert_in_delta(0.525, r.r, 0.001)
+ assert_in_delta(Statsample::Bivariate.t_pearson(v1, v2), r.t, 0.001)
+ assert_in_delta(Statsample::Bivariate.prop_pearson(r.t, 8, :both), r.probability, 0.001)
+ assert(r.summary.size > 0)
end
- should "return correct correlation_matrix with nils values" do
- v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
- v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
- v3=[6,2, 1000,1000,5,4,7,8].to_vector(:scale)
- v4=[2,nil,nil,nil, 3,7,8,6].to_vector(:scale)
- ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4}.to_dataset
- c=Proc.new {|n1,n2|Statsample::Bivariate.pearson(n1,n2)}
- expected=Matrix[ [c.call(v1,v1),c.call(v1,v2),c.call(v1,v3),c.call(v1,v4)], [c.call(v2,v1),c.call(v2,v2),c.call(v2,v3),c.call(v2,v4)], [c.call(v3,v1),c.call(v3,v2),c.call(v3,v3),c.call(v3,v4)],
- [c.call(v4,v1),c.call(v4,v2),c.call(v4,v3),c.call(v4,v4)]
+ should 'return correct correlation_matrix with nils values' do
+ v1 = Daru::Vector.new([6, 5, 4, 7, 8, 4, 3, 2])
+ v2 = Daru::Vector.new([2, 3, 7, 8, 6, 4, 3, 2])
+ v3 = Daru::Vector.new([6, 2, 1000, 1000, 5, 4, 7, 8])
+ v4 = Daru::Vector.new([2, nil, nil, nil, 3, 7, 8, 6])
+ ds = Daru::DataFrame.new({ :v1 => v1, :v2 => v2, :v3 => v3, :v4 => v4 })
+ c = proc { |n1, n2| Statsample::Bivariate.pearson(n1, n2) }
+ expected = Matrix[[c.call(v1, v1), c.call(v1, v2), c.call(v1, v3), c.call(v1, v4)], [c.call(v2, v1), c.call(v2, v2), c.call(v2, v3), c.call(v2, v4)], [c.call(v3, v1), c.call(v3, v2), c.call(v3, v3), c.call(v3, v4)],
+ [c.call(v4, v1), c.call(v4, v2), c.call(v4, v3), c.call(v4, v4)]
]
- obt=Statsample::Bivariate.correlation_matrix(ds)
+ obt = Statsample::Bivariate.correlation_matrix(ds)
for i in 0...expected.row_size
for j in 0...expected.column_size
- #puts expected[i,j].inspect
- #puts obt[i,j].inspect
- assert_in_delta(expected[i,j], obt[i,j],0.0001, "#{expected[i,j].class}!=#{obt[i,j].class} ")
+ # puts expected[i,j].inspect
+ # puts obt[i,j].inspect
+ assert_in_delta(expected[i, j], obt[i, j], 0.0001, "#{expected[i, j].class}!=#{obt[i, j].class} ")
end
end
- #assert_equal(expected,obt)
+ # assert_equal(expected,obt)
end
- should_with_gsl "return same values for optimized and pairwise covariance matrix" do
- cases=100
- v1=Statsample::Vector.new_scale(cases) {rand()}
- v2=Statsample::Vector.new_scale(cases) {rand()}
- v3=Statsample::Vector.new_scale(cases) {rand()}
- v4=Statsample::Vector.new_scale(cases) {rand()}
- v5=Statsample::Vector.new_scale(cases) {rand()}
+ should_with_gsl 'return same values for optimized and pairwise covariance matrix' do
+ cases = 100
+ v1 = Daru::Vector.new_with_size(cases) { rand }
+ v2 = Daru::Vector.new_with_size(cases) { rand }
+ v3 = Daru::Vector.new_with_size(cases) { rand }
+ v4 = Daru::Vector.new_with_size(cases) { rand }
+ v5 = Daru::Vector.new_with_size(cases) { rand }
- ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'v5'=>v5}.to_dataset
-
- cor_opt=Statsample::Bivariate.covariance_matrix_optimized(ds)
-
- cor_pw =Statsample::Bivariate.covariance_matrix_pairwise(ds)
- assert_equal_matrix(cor_opt,cor_pw,1e-15)
+ ds = Daru::DataFrame.new({ :v1 => v1, :v2 => v2, :v3 => v3, :v4 => v4, :v5 => v5 })
+
+ cor_opt = Statsample::Bivariate.covariance_matrix_optimized(ds)
+
+ cor_pw = Statsample::Bivariate.covariance_matrix_pairwise(ds)
+ assert_equal_matrix(cor_opt, cor_pw, 1e-15)
end
- should_with_gsl "return same values for optimized and pairwise correlation matrix" do
-
- cases=100
- v1=Statsample::Vector.new_scale(cases) {rand()}
- v2=Statsample::Vector.new_scale(cases) {rand()}
- v3=Statsample::Vector.new_scale(cases) {rand()}
- v4=Statsample::Vector.new_scale(cases) {rand()}
- v5=Statsample::Vector.new_scale(cases) {rand()}
+ should_with_gsl 'return same values for optimized and pairwise correlation matrix' do
+ cases = 100
+ v1 = Daru::Vector.new_with_size(cases) { rand }
+ v2 = Daru::Vector.new_with_size(cases) { rand }
+ v3 = Daru::Vector.new_with_size(cases) { rand }
+ v4 = Daru::Vector.new_with_size(cases) { rand }
+ v5 = Daru::Vector.new_with_size(cases) { rand }
+
+ ds = Daru::DataFrame.new({
+ :v1 => v1, :v2 => v2, :v3 => v3, :v4 => v4, :v5 => v5 })
+
+ cor_opt = Statsample::Bivariate.correlation_matrix_optimized(ds)
- ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'v5'=>v5}.to_dataset
-
- cor_opt=Statsample::Bivariate.correlation_matrix_optimized(ds)
-
- cor_pw =Statsample::Bivariate.correlation_matrix_pairwise(ds)
- assert_equal_matrix(cor_opt,cor_pw,1e-15)
-
+ cor_pw = Statsample::Bivariate.correlation_matrix_pairwise(ds)
+ assert_equal_matrix(cor_opt, cor_pw, 1e-15)
end
- should "return correct correlation_matrix without nils values" do
- v1=[6,5,4,7,8,4,3,2].to_vector(:scale)
- v2=[2,3,7,8,6,4,3,2].to_vector(:scale)
- v3=[6,2, 1000,1000,5,4,7,8].to_vector(:scale)
- v4=[2,4,6,7, 3,7,8,6].to_vector(:scale)
- ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4}.to_dataset
- c=Proc.new {|n1,n2|Statsample::Bivariate.pearson(n1,n2)}
- expected=Matrix[ [c.call(v1,v1),c.call(v1,v2),c.call(v1,v3),c.call(v1,v4)], [c.call(v2,v1),c.call(v2,v2),c.call(v2,v3),c.call(v2,v4)], [c.call(v3,v1),c.call(v3,v2),c.call(v3,v3),c.call(v3,v4)],
- [c.call(v4,v1),c.call(v4,v2),c.call(v4,v3),c.call(v4,v4)]
+ should 'return correct correlation_matrix without nils values' do
+ v1 = Daru::Vector.new([6, 5, 4, 7, 8, 4, 3, 2])
+ v2 = Daru::Vector.new([2, 3, 7, 8, 6, 4, 3, 2])
+ v3 = Daru::Vector.new([6, 2, 1000, 1000, 5, 4, 7, 8])
+ v4 = Daru::Vector.new([2, 4, 6, 7, 3, 7, 8, 6])
+ ds = Daru::DataFrame.new({ :v1 => v1, :v2 => v2, :v3 => v3, :v4 => v4 })
+ c = proc { |n1, n2| Statsample::Bivariate.pearson(n1, n2) }
+ expected = Matrix[[c.call(v1, v1), c.call(v1, v2), c.call(v1, v3), c.call(v1, v4)], [c.call(v2, v1), c.call(v2, v2), c.call(v2, v3), c.call(v2, v4)], [c.call(v3, v1), c.call(v3, v2), c.call(v3, v3), c.call(v3, v4)],
+ [c.call(v4, v1), c.call(v4, v2), c.call(v4, v3), c.call(v4, v4)]
]
- obt=Statsample::Bivariate.correlation_matrix(ds)
+ obt = Statsample::Bivariate.correlation_matrix(ds)
for i in 0...expected.row_size
for j in 0...expected.column_size
- #puts expected[i,j].inspect
- #puts obt[i,j].inspect
- assert_in_delta(expected[i,j], obt[i,j],0.0001, "#{expected[i,j].class}!=#{obt[i,j].class} ")
+ # puts expected[i,j].inspect
+ # puts obt[i,j].inspect
+ assert_in_delta(expected[i, j], obt[i, j], 0.0001, "#{expected[i, j].class}!=#{obt[i, j].class} ")
end
end
- #assert_equal(expected,obt)
+ # assert_equal(expected,obt)
end
-
- should "return correct value for prop pearson" do
- assert_in_delta(0.42, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.084,94), 94),0.01)
- assert_in_delta(0.65, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.046,95), 95),0.01)
- r=0.9
- n=100
- t=Statsample::Bivariate.t_r(r,n)
- assert(Statsample::Bivariate.prop_pearson(t,n,:both)<0.05)
- assert(Statsample::Bivariate.prop_pearson(t,n,:right)<0.05)
- assert(Statsample::Bivariate.prop_pearson(t,n,:left)>0.05)
+ should 'return correct value for prop pearson' do
+ assert_in_delta(0.42, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.084, 94), 94), 0.01)
+ assert_in_delta(0.65, Statsample::Bivariate.prop_pearson(Statsample::Bivariate.t_r(0.046, 95), 95), 0.01)
+ r = 0.9
+ n = 100
+ t = Statsample::Bivariate.t_r(r, n)
+ assert(Statsample::Bivariate.prop_pearson(t, n, :both) < 0.05)
+ assert(Statsample::Bivariate.prop_pearson(t, n, :right) < 0.05)
+ assert(Statsample::Bivariate.prop_pearson(t, n, :left) > 0.05)
- r=-0.9
- n=100
- t=Statsample::Bivariate.t_r(r,n)
- assert(Statsample::Bivariate.prop_pearson(t,n,:both)<0.05)
- assert(Statsample::Bivariate.prop_pearson(t,n,:right)>0.05)
- assert(Statsample::Bivariate.prop_pearson(t,n,:left)<0.05)
+ r = -0.9
+ n = 100
+ t = Statsample::Bivariate.t_r(r, n)
+ assert(Statsample::Bivariate.prop_pearson(t, n, :both) < 0.05)
+ assert(Statsample::Bivariate.prop_pearson(t, n, :right) > 0.05)
+ assert(Statsample::Bivariate.prop_pearson(t, n, :left) < 0.05)
end
should "return correct value for Spearman's rho" do
- v1=[86,97,99,100,101,103,106,110,112,113].to_vector(:scale)
- v2=[0,20,28,27,50,29,7,17,6,12].to_vector(:scale)
- assert_in_delta(-0.175758,Statsample::Bivariate.spearman(v1,v2),0.0001)
-
+ v1 =Daru::Vector.new( [86, 97, 99, 100, 101, 103, 106, 110, 112, 113])
+ v2 =Daru::Vector.new( [0, 20, 28, 27, 50, 29, 7, 17, 6, 12])
+ assert_in_delta(-0.175758, Statsample::Bivariate.spearman(v1, v2), 0.0001)
end
- should "return correct value for point_biserial correlation" do
- c=[1,3,5,6,7,100,200,300,400,300].to_vector(:scale)
- d=[1,1,1,1,1,0,0,0,0,0].to_vector(:scale)
+ should 'return correct value for point_biserial correlation' do
+ c = Daru::Vector.new([1, 3, 5, 6, 7, 100, 200, 300, 400, 300])
+ d = Daru::Vector.new([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
assert_raises TypeError do
- Statsample::Bivariate.point_biserial(c,d)
+ Statsample::Bivariate.point_biserial(c, d)
end
- assert_in_delta(Statsample::Bivariate.point_biserial(d,c), Statsample::Bivariate.pearson(d,c), 0.0001)
+ assert_in_delta(Statsample::Bivariate.point_biserial(d, c), Statsample::Bivariate.pearson(d, c), 0.0001)
+ end
+ should 'return correct value for tau_a and tau_b' do
+ v1 = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+ v2 = Daru::Vector.new([1, 3, 4, 5, 7, 8, 2, 9, 10, 6, 11])
+ assert_in_delta(0.6727, Statsample::Bivariate.tau_a(v1, v2), 0.001)
+ assert_in_delta(0.6727, Statsample::Bivariate.tau_b((Statsample::Crosstab.new(v1, v2).to_matrix)), 0.001)
+ v1 = Daru::Vector.new([12, 14, 14, 17, 19, 19, 19, 19, 19, 20, 21, 21, 21, 21, 21, 22, 23, 24, 24, 24, 26, 26, 27])
+ v2 = Daru::Vector.new([11, 4, 4, 2, 0, 0, 0, 0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0])
+ assert_in_delta(-0.376201540231705, Statsample::Bivariate.tau_b(Statsample::Crosstab.new(v1, v2).to_matrix), 0.001)
end
- should "return correct value for tau_a and tau_b" do
- v1=[1,2,3,4,5,6,7,8,9,10,11].to_vector(:ordinal)
- v2=[1,3,4,5,7,8,2,9,10,6,11].to_vector(:ordinal)
- assert_in_delta(0.6727,Statsample::Bivariate.tau_a(v1,v2),0.001)
- assert_in_delta(0.6727,Statsample::Bivariate.tau_b((Statsample::Crosstab.new(v1,v2).to_matrix)),0.001)
- v1=[12,14,14,17,19,19,19,19,19,20,21,21,21,21,21,22,23,24,24,24,26,26,27].to_vector(:ordinal)
- v2=[11,4,4,2,0,0,0,0,0,0,4,0,4,0,0,0,0,4,0,0,0,0,0].to_vector(:ordinal)
- assert_in_delta(-0.376201540231705, Statsample::Bivariate.tau_b(Statsample::Crosstab.new(v1,v2).to_matrix),0.001)
+ should 'return correct value for gamma correlation' do
+ m = Matrix[[10, 5, 2], [10, 15, 20]]
+ assert_in_delta(0.636, Statsample::Bivariate.gamma(m), 0.001)
+ m2 = Matrix[[15, 12, 6, 5], [12, 8, 10, 8], [4, 6, 9, 10]]
+ assert_in_delta(0.349, Statsample::Bivariate.gamma(m2), 0.001)
end
- should "return correct value for gamma correlation" do
- m=Matrix[[10,5,2],[10,15,20]]
- assert_in_delta(0.636,Statsample::Bivariate.gamma(m),0.001)
- m2=Matrix[[15,12,6,5],[12,8,10,8],[4,6,9,10]]
- assert_in_delta(0.349,Statsample::Bivariate.gamma(m2),0.001)
+
+ should 'return correct residuals' do
+ # TODO: test Statsample::Bivariate.residuals
end
end
diff --git a/test/test_codification.rb b/test/test_codification.rb
index 2049d06..21121bc 100644
--- a/test/test_codification.rb
+++ b/test/test_codification.rb
@@ -1,76 +1,78 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleCodificationTestCase < MiniTest::Unit::TestCase
-
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleCodificationTestCase < Minitest::Test
def initialize(*args)
- v1=%w{run walk,run walking running sleep sleeping,dreaming sleep,dream}.to_vector
- @dict={'run'=>'r','walk'=>'w','walking'=>'w','running'=>'r','sleep'=>'s', 'sleeping'=>'s', 'dream'=>'d', 'dreaming'=>'d'}
- @ds={"v1"=>v1}.to_dataset
+ v1 = Daru::Vector.new(%w(run walk,run walking running sleep sleeping,dreaming sleep,dream))
+ @dict = { 'run' => 'r', 'walk' => 'w', 'walking' => 'w', 'running' => 'r', 'sleep' => 's', 'sleeping' => 's', 'dream' => 'd', 'dreaming' => 'd' }
+ @ds = Daru::DataFrame.new({ :v1 => v1 })
super
end
+
def test_create_hash
- expected_keys_v1=%w{run walk walking running sleep sleeping dream dreaming}.sort
- hash=Statsample::Codification.create_hash(@ds,['v1'])
- assert_equal(['v1'],hash.keys)
- assert_equal(expected_keys_v1,hash['v1'].keys.sort)
- assert_equal(expected_keys_v1,hash['v1'].values.sort)
+ expected_keys_v1 = %w(run walk walking running sleep sleeping dream dreaming).sort
+ hash = Statsample::Codification.create_hash(@ds, [:v1])
+ assert_equal([:v1], hash.keys)
+ assert_equal(expected_keys_v1, hash[:v1].keys.sort)
+ assert_equal(expected_keys_v1, hash[:v1].values.sort)
end
+
def test_create_excel
- filename=Dir::tmpdir+"/test_excel"+Time.now().to_s+".xls"
- #filename = Tempfile.new("test_codification_"+Time.now().to_s)
+ filename = Dir.tmpdir + '/test_excel' + Time.now.to_s + '.xls'
+ # filename = Tempfile.new("test_codification_"+Time.now().to_s)
Statsample::Codification.create_excel(@ds, ['v1'], filename)
- field=(["v1"]*8).to_vector
- keys=%w{dream dreaming run running sleep sleeping walk walking}.to_vector
- ds=Statsample::Excel.read(filename)
- assert_equal(field, ds['field'])
- assert_equal(keys, ds['original'])
- assert_equal(keys, ds['recoded'])
- hash=Statsample::Codification.excel_to_recoded_hash(filename)
- assert_equal(keys.data, hash['v1'].keys.sort)
- assert_equal(keys.data, hash['v1'].values.sort)
-
+ field = Daru::Vector.new(['v1'] * 8, name: :field)
+ keys = Daru::Vector.new(%w(dream dreaming run running sleep sleeping walk walking))
+ ds = Daru::DataFrame.from_excel(filename)
+ assert_equal(field, ds[:field])
+ assert_equal(keys, ds[:original])
+ assert_equal(keys, ds[:recoded])
+ hash = Statsample::Codification.excel_to_recoded_hash(filename)
+ assert_equal(keys.to_a, hash[:v1].keys.sort)
+ assert_equal(keys.to_a, hash[:v1].values.sort)
end
+
def test_create_yaml
- assert_raise ArgumentError do
- Statsample::Codification.create_yaml(@ds,[])
+ assert_raise ArgumentError do
+ Statsample::Codification.create_yaml(@ds, [])
end
- expected_keys_v1=%w{run walk walking running sleep sleeping dream dreaming}.sort
- yaml_hash=Statsample::Codification.create_yaml(@ds,['v1'])
- h=YAML::load(yaml_hash)
- assert_equal(['v1'],h.keys)
- assert_equal(expected_keys_v1,h['v1'].keys.sort)
- tf = Tempfile.new("test_codification")
- yaml_hash=Statsample::Codification.create_yaml(@ds,['v1'],tf, Statsample::SPLIT_TOKEN)
+ expected_keys_v1 = %w(run walk walking running sleep sleeping dream dreaming).sort
+ yaml_hash = Statsample::Codification.create_yaml(@ds, [:v1])
+ h = YAML.load(yaml_hash)
+ assert_equal([:v1], h.keys)
+ assert_equal(expected_keys_v1, h[:v1].keys.sort)
+ tf = Tempfile.new('test_codification')
+ yaml_hash = Statsample::Codification.create_yaml(@ds, [:v1], tf, Statsample::SPLIT_TOKEN)
tf.close
tf.open
- h=YAML::load(tf)
- assert_equal(['v1'],h.keys)
- assert_equal(expected_keys_v1,h['v1'].keys.sort)
+ h = YAML.load(tf)
+ assert_equal([:v1], h.keys)
+ assert_equal(expected_keys_v1, h[:v1].keys.sort)
tf.close(true)
end
+
def test_recodification
- expected=[['r'],['w','r'],['w'],['r'],['s'],['s','d'], ['s','d']]
- assert_equal(expected,Statsample::Codification.recode_vector(@ds['v1'],@dict))
- v2=['run','walk,dreaming',nil,'walk,dream,dreaming,walking'].to_vector
- expected=[['r'],['w','d'],nil,['w','d']]
- assert_equal(expected,Statsample::Codification.recode_vector(v2,@dict))
+ expected = [['r'], %w(w r), ['w'], ['r'], ['s'], %w(s d), %w(s d)]
+ assert_equal(expected, Statsample::Codification.recode_vector(@ds[:v1], @dict))
+ v2 = Daru::Vector.new(['run', 'walk,dreaming', nil, 'walk,dream,dreaming,walking'])
+ expected = [['r'], %w(w d), nil, %w(w d)]
+ assert_equal(expected, Statsample::Codification.recode_vector(v2, @dict))
end
+
def test_recode_dataset_simple
- Statsample::Codification.recode_dataset_simple!(@ds,{'v1'=>@dict})
- expected_vector=['r','w,r','w','r','s','s,d', 's,d'].to_vector
- assert_not_equal(expected_vector,@ds['v1'])
- assert_equal(expected_vector,@ds['v1_recoded'])
+ Statsample::Codification.recode_dataset_simple!(@ds, :v1 => @dict)
+ expected_vector = Daru::Vector.new(['r', 'w,r', 'w', 'r', 's', 's,d', 's,d'])
+ assert_not_equal(expected_vector, @ds[:v1])
+ assert_equal(expected_vector, @ds[:v1_recoded])
end
- def test_recode_dataset_split
- Statsample::Codification.recode_dataset_split!(@ds,{'v1'=>@dict})
- e={}
- e['r']=[1,1,0,1,0,0,0].to_vector
- e['w']=[0,1,1,0,0,0,0].to_vector
- e['s']=[0,0,0,0,1,1,1].to_vector
- e['d']=[0,0,0,0,0,1,1].to_vector
- e.each{|k,expected|
- assert_equal(expected,@ds['v1_'+k],"Error on key #{k}")
+ def test_recode_dataset_split
+ Statsample::Codification.recode_dataset_split!(@ds, :v1 => @dict)
+ e = {}
+ e['r'] = Daru::Vector.new([1, 1, 0, 1, 0, 0, 0])
+ e['w'] = Daru::Vector.new([0, 1, 1, 0, 0, 0, 0])
+ e['s'] = Daru::Vector.new([0, 0, 0, 0, 1, 1, 1])
+ e['d'] = Daru::Vector.new([0, 0, 0, 0, 0, 1, 1])
+ e.each { |k, expected|
+ assert_equal(expected, @ds[('v1_' + k).to_sym], "Error on key #{k}")
}
end
-
end
diff --git a/test/test_crosstab.rb b/test/test_crosstab.rb
index 2eef2b1..8f39460 100644
--- a/test/test_crosstab.rb
+++ b/test/test_crosstab.rb
@@ -1,63 +1,67 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleCrosstabTestCase < MiniTest::Unit::TestCase
-
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleCrosstabTestCase < Minitest::Test
def initialize(*args)
- @v1=%w{black blonde black black red black brown black blonde black red black blonde}.to_vector
- @v2=%w{woman man man woman man man man woman man woman woman man man}.to_vector
- @ct=Statsample::Crosstab.new(@v1,@v2)
+ @v1 =Daru::Vector.new( %w(black blonde black black red black brown black blonde black red black blonde))
+ @v2 =Daru::Vector.new( %w(woman man man woman man man man woman man woman woman man man))
+ @ct = Statsample::Crosstab.new(@v1, @v2)
super
end
+
def test_crosstab_errors
- e1=%w{black blonde black black red black brown black blonde black}
+ e1 = %w(black blonde black black red black brown black blonde black)
assert_raise ArgumentError do
- Statsample::Crosstab.new(e1,@v2)
+ Statsample::Crosstab.new(e1, @v2)
end
- e2=%w{black blonde black black red black brown black blonde black black}.to_vector
+ e2 = Daru::Vector.new(%w(black blonde black black red black brown black blonde black black))
assert_raise ArgumentError do
- Statsample::Crosstab.new(e2,@v2)
+ Statsample::Crosstab.new(e2, @v2)
end
assert_nothing_raised do
- Statsample::Crosstab.new(@v1,@v2)
+ Statsample::Crosstab.new(@v1, @v2)
end
end
+
def test_crosstab_basic
- assert_equal(%w{black blonde brown red}, @ct.rows_names)
- assert_equal(%w{man woman}, @ct.cols_names)
- assert_equal({'black'=>7,'blonde'=>3,'red'=>2,'brown'=>1}, @ct.rows_total)
- assert_equal({'man'=>8,'woman'=>5}, @ct.cols_total)
+ assert_equal(Daru::Vector.new(%w(black blonde brown red)), @ct.rows_names)
+ assert_equal(Daru::Vector.new(%w(man woman)), @ct.cols_names)
+ assert_equal({ 'black' => 7, 'blonde' => 3, 'red' => 2, 'brown' => 1 }, @ct.rows_total)
+ assert_equal({ 'man' => 8, 'woman' => 5 }, @ct.cols_total)
end
+
def test_crosstab_frequencies
- fq=@ct.frequencies
- assert_equal(8,fq.size)
- sum=fq.inject(0) {|s,x| s+x[1]}
- assert_equal(13,sum)
- fr=@ct.frequencies_by_row
- assert_equal(4,fr.size)
- assert_equal(%w{black blonde brown red},fr.keys.sort)
- fc=@ct.frequencies_by_col
- assert_equal(2,fc.size)
- assert_equal(%w{man woman},fc.keys.sort)
- assert_equal(Matrix.rows([[3,4],[3,0],[1,0],[1,1]]),@ct.to_matrix)
+ fq = @ct.frequencies
+ assert_equal(8, fq.size)
+ sum = fq.inject(0) { |s, x| s + x[1] }
+ assert_equal(13, sum)
+ fr = @ct.frequencies_by_row
+ assert_equal(4, fr.size)
+ assert_equal(%w(black blonde brown red), fr.keys.sort)
+ fc = @ct.frequencies_by_col
+ assert_equal(2, fc.size)
+ assert_equal(%w(man woman), fc.keys.sort)
+ assert_equal(Matrix.rows([[3, 4], [3, 0], [1, 0], [1, 1]]), @ct.to_matrix)
end
+
def test_summary
- @ct.percentage_row=true
- @ct.percentage_column=true
- @ct.percentage_total=true
- assert(@ct.summary.size>0)
+ @ct.percentage_row = true
+ @ct.percentage_column = true
+ @ct.percentage_total = true
+ assert(@ct.summary.size > 0)
end
+
def test_expected
- v1=%w{1 1 1 1 1 0 0 0 0 0}.to_vector
- v2=%w{0 0 0 0 0 1 1 1 1 1}.to_vector
- ct=Statsample::Crosstab.new(v1,v2)
- assert_equal(Matrix[[2.5,2.5],[2.5,2.5]],ct.matrix_expected)
+ v1 = Daru::Vector.new(%w(1 1 1 1 1 0 0 0 0 0))
+ v2 = Daru::Vector.new(%w(0 0 0 0 0 1 1 1 1 1))
+ ct = Statsample::Crosstab.new(v1, v2)
+ assert_equal(Matrix[[2.5, 2.5], [2.5, 2.5]], ct.matrix_expected)
end
+
def test_crosstab_with_scale
- v1=%w{1 1 1 1 1 0 0 0 0 0}.to_scale
- v2=%w{0 0 0 0 0 1 1 1 1 1}.to_scale
- ct=Statsample::Crosstab.new(v1,v2)
- assert_equal(Matrix[[0,5],[5,0]],ct.to_matrix)
- assert_nothing_raised { ct.summary }
+ v1 = Daru::Vector.new(%w(1 1 1 1 1 0 0 0 0 0))
+ v2 = Daru::Vector.new(%w(0 0 0 0 0 1 1 1 1 1))
+ ct = Statsample::Crosstab.new(v1, v2)
+ assert_equal(Matrix[[0, 5], [5, 0]], ct.to_matrix)
+ assert_nothing_raised { ct.summary }
end
-
end
diff --git a/test/test_csv.rb b/test/test_csv.rb
deleted file mode 100644
index 283dadd..0000000
--- a/test/test_csv.rb
+++ /dev/null
@@ -1,81 +0,0 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleCSVTestCase < MiniTest::Unit::TestCase
- def setup
- @ds=Statsample::CSV.read(File.dirname(__FILE__)+"/fixtures/test_csv.csv")
- end
- def test_read
- assert_equal(6,@ds.cases)
- assert_equal(%w{id name age city a1}, @ds.fields)
- id=[1,2,3,4,5,6].to_vector(:scale)
- name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal)
- age=[20,23,25,27,5.5,nil].to_vector(:scale)
- city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal)
- a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal)
- ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1})
- ds_exp.fields.each{|f|
- assert_equal(ds_exp[f],@ds[f])
- }
- assert_equal(ds_exp,@ds)
- end
- def test_nil
- assert_equal(nil,@ds['age'][5])
- end
- def test_repeated
- ds=Statsample::CSV.read(File.dirname(__FILE__)+"/fixtures/repeated_fields.csv")
- assert_equal(%w{id name_1 age_1 city a1 name_2 age_2},ds.fields)
- age=[3,4,5,6,nil,8].to_vector(:scale)
- assert_equal(age,ds['age_2'])
- end
- def test_write
- filename=Tempfile.new("afile")
- # filename=Dir::tmpdir+"/test_write.csv"
- Statsample::CSV.write(@ds, filename.path)
- ds2=Statsample::CSV.read(filename.path)
- i=0
- ds2.each_array{|row|
- assert_equal(@ds.case_as_array(i),row)
- i+=1
- }
- end
-end
-=begin
-class StatsampleCSVTestCase2 < MiniTest::Unit::TestCase
- def setup
- @ds=Statsample::CSV.read19(File.dirname(__FILE__)+"/fixtures/test_csv.csv")
- end
- def test_read
- assert_equal(6,@ds.cases)
- assert_equal(%w{id name age city a1}, @ds.fields)
- id=[1,2,3,4,5,6].to_vector(:scale)
- name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal)
- age=[20,23,25,27,5.5,nil].to_vector(:scale)
- city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal)
- a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal)
- ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1})
- ds_exp.fields.each{|f|
- assert_equal(ds_exp[f],@ds[f])
- }
- assert_equal(ds_exp,@ds)
- end
- def test_nil
- assert_equal(nil,@ds['age'][5])
- end
- def test_repeated
- ds=Statsample::CSV.read19(File.dirname(__FILE__)+"/fixtures/repeated_fields.csv")
- assert_equal(%w{id name_1 age_1 city a1 name_2 age_2},ds.fields)
- age=[3,4,5,6,nil,8].to_vector(:scale)
- assert_equal(age,ds['age_2'])
- end
- def test_write
- filename=Tempfile.new("afile")
- # filename=Dir::tmpdir+"/test_write.csv"
- Statsample::CSV.write(@ds, filename.path)
- ds2=Statsample::CSV.read19(filename.path)
- i=0
- ds2.each_array{|row|
- assert_equal(@ds.case_as_array(i),row)
- i+=1
- }
- end
-end
-=end
diff --git a/test/test_dataset.rb b/test/test_dataset.rb
deleted file mode 100644
index c6fb979..0000000
--- a/test/test_dataset.rb
+++ /dev/null
@@ -1,462 +0,0 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleDatasetTestCase < MiniTest::Unit::TestCase
- def setup
- @ds=Statsample::Dataset.new({'id' => Statsample::Vector.new([1,2,3,4,5]), 'name'=>Statsample::Vector.new(%w{Alex Claude Peter Franz George}), 'age'=>Statsample::Vector.new([20,23,25,27,5]),
- 'city'=>Statsample::Vector.new(['New York','London','London','Paris','Tome']),
- 'a1'=>Statsample::Vector.new(['a,b','b,c','a',nil,'a,b,c'])}, ['id','name','age','city','a1'])
- end
- def test_nest
- ds={
- 'a'=>%w{a a a b b b}.to_vector,
- 'b'=>%w{c c d d e e}.to_vector,
- 'c'=>%w{f g h i j k}.to_vector
- }.to_dataset
- nest=ds.nest('a','b')
- assert_equal([{'c'=>'f'},{'c'=>'g'}], nest['a']['c'])
- assert_equal([{'c'=>'h'}], nest['a']['d'])
- assert_equal([{'c'=>'j'},{'c'=>'k'}], nest['b']['e'])
-
- end
- def test_should_have_summary
- assert(@ds.summary.size>0)
- end
- def test_basic
- assert_equal(5,@ds.cases)
- assert_equal(%w{id name age city a1}, @ds.fields)
- end
- def test_saveload
- outfile=Tempfile.new("dataset.ds")
- @ds.save(outfile.path)
- a=Statsample.load(outfile.path)
- assert_equal(@ds,a)
- end
- def test_gsl
- if Statsample.has_gsl?
- matrix=GSL::Matrix[[1,2],[3,4],[5,6]]
- ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector)
- assert_equal(matrix,ds.to_gsl)
- else
- skip("Gsl needed")
- end
- end
- def test_matrix
- matrix=Matrix[[1,2],[3,4],[5,6]]
- ds=Statsample::Dataset.new('v1'=>[1,3,5].to_vector,'v2'=>[2,4,6].to_vector)
- assert_equal(matrix,ds.to_matrix)
- end
-
- def test_fields
- @ds.fields=%w{name a1 id age city}
- assert_equal(%w{name a1 id age city}, @ds.fields)
- @ds.fields=%w{id name age}
- assert_equal(%w{id name age a1 city}, @ds.fields)
- end
- def test_merge
- a=[1,2,3].to_scale
- b=[3,4,5].to_vector
- c=[4,5,6].to_scale
- d=[7,8,9].to_vector
- e=[10,20,30].to_vector
- ds1={'a'=>a,'b'=>b}.to_dataset
- ds2={'c'=>c,'d'=>d}.to_dataset
- exp={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
-
- assert_equal(exp,ds1.merge(ds2))
- exp.fields=%w{c d a b}
- assert_equal(exp,ds2.merge(ds1))
- ds3={'a'=>e}.to_dataset
- exp={'a_1'=>a,'b'=>b,'a_2'=>e}.to_dataset
- exp.fields=%w{a_1 b a_2}
- assert_equal(exp,ds1.merge(ds3))
- end
- def test_each_vector
- a=[1,2,3].to_vector
- b=[3,4,5].to_vector
- fields=["a","b"]
- ds=Statsample::Dataset.new({'a'=>a,'b'=>b},fields)
- res=[]
- ds.each_vector{|k,v|
- res.push([k,v])
- }
- assert_equal([["a",a],["b",b]],res)
- ds.fields=["b","a"]
- res=[]
- ds.each_vector{|k,v|
- res.push([k,v])
- }
- assert_equal([["b",b],["a",a]],res)
- end
- def test_equality
- v1=[1,2,3,4].to_vector
- v2=[5,6,7,8].to_vector
- ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1})
- v3=[1,2,3,4].to_vector
- v4=[5,6,7,8].to_vector
- ds2=Statsample::Dataset.new({'v1'=>v3,'v2'=>v4}, %w{v2 v1})
- assert_equal(ds1,ds2)
- ds2.fields=%w{v1 v2}
- assert_not_equal(ds1,ds2)
- end
- def test_add_vector
- v=Statsample::Vector.new(%w{a b c d e})
- @ds.add_vector('new',v)
- assert_equal(%w{id name age city a1 new},@ds.fields)
- x=Statsample::Vector.new(%w{a b c d e f g})
- assert_raise ArgumentError do
- @ds.add_vector('new2',x)
- end
- end
- def test_vector_by_calculation
- a1=[1,2,3,4,5,6,7].to_vector(:scale)
- a2=[10,20,30,40,50,60,70].to_vector(:scale)
- a3=[100,200,300,400,500,600,700].to_vector(:scale)
- ds={'a1'=>a1,'a2'=>a2,'a3'=>a3}.to_dataset
- total=ds.vector_by_calculation() {|row|
- row['a1']+row['a2']+row['a3']
- }
- expected=[111,222,333,444,555,666,777].to_vector(:scale)
- assert_equal(expected,total)
- end
- def test_vector_sum
- a1=[1 ,2 ,3 ,4 , 5,nil].to_vector(:scale)
- a2=[10 ,10,20,20 ,20,30].to_vector(:scale)
- b1=[nil,1 ,1 ,1 ,1 ,2].to_vector(:scale)
- b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
- ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2}.to_dataset
- total=ds.vector_sum
- a=ds.vector_sum(['a1','a2'])
- b=ds.vector_sum(['b1','b2'])
- expected_a=[11,12,23,24,25,nil].to_vector(:scale)
- expected_b=[nil,3,3,nil,3,5].to_vector(:scale)
- expected_total=[nil,15,26,nil,28,nil].to_vector(:scale)
- assert_equal(expected_a, a)
- assert_equal(expected_b, b)
- assert_equal(expected_total, total)
- end
- def test_vector_missing_values
- a1=[1 ,nil ,3 ,4 , 5,nil].to_vector(:scale)
- a2=[10 ,nil ,20,20 ,20,30].to_vector(:scale)
- b1=[nil,nil ,1 ,1 ,1 ,2].to_vector(:scale)
- b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
- c= [nil,2 , 4,2 ,2 ,2].to_vector(:scale)
- ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
- mva=[2,3,0,1,0,1].to_vector(:scale)
- assert_equal(mva,ds.vector_missing_values)
- end
-
- def test_has_missing_values
- a1=[1 ,nil ,3 ,4 , 5,nil].to_vector(:scale)
- a2=[10 ,nil ,20,20 ,20,30].to_vector(:scale)
- b1=[nil,nil ,1 ,1 ,1 ,2].to_vector(:scale)
- b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
- c= [nil,2 , 4,2 ,2 ,2].to_vector(:scale)
- ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
- assert(ds.has_missing_data?)
- clean=ds.dup_only_valid
- assert(!clean.has_missing_data?)
- end
-
-
- def test_vector_count_characters
- a1=[1 ,"abcde" ,3 ,4 , 5,nil].to_vector(:scale)
- a2=[10 ,20.3 ,20 ,20 ,20,30].to_vector(:scale)
- b1=[nil,"343434" ,1 ,1 ,1 ,2].to_vector(:scale)
- b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
- c= [nil,2 ,"This is a nice example",2 ,2 ,2].to_vector(:scale)
- ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
- exp=[4,17,27,5,6,5].to_vector(:scale)
- assert_equal(exp,ds.vector_count_characters)
-
- end
- def test_vector_mean
- a1=[1 ,2 ,3 ,4 , 5,nil].to_vector(:scale)
- a2=[10 ,10,20,20 ,20,30].to_vector(:scale)
- b1=[nil,1 ,1 ,1 ,1 ,2].to_vector(:scale)
- b2=[2 ,2 ,2 ,nil,2 ,3].to_vector(:scale)
- c= [nil,2, 4,2 ,2 ,2].to_vector(:scale)
- ds={'a1'=>a1,'a2'=>a2,'b1'=>b1,'b2'=>b2,'c'=>c}.to_dataset
- total=ds.vector_mean
- a=ds.vector_mean(['a1','a2'],1)
- b=ds.vector_mean(['b1','b2'],1)
- c=ds.vector_mean(['b1','b2','c'],1)
- expected_a=[5.5,6,11.5,12,12.5,30].to_vector(:scale)
- expected_b=[2,1.5,1.5,1,1.5,2.5].to_vector(:scale)
- expected_c=[nil, 5.0/3,7.0/3,1.5,5.0/3,7.0/3].to_vector(:scale)
- expected_total=[nil,3.4,6,nil,6.0,nil].to_vector(:scale)
- assert_equal(expected_a, a)
- assert_equal(expected_b, b)
- assert_equal(expected_c, c)
- assert_equal(expected_total, total)
- end
-
- def test_each_array
- expected=[[1,'Alex',20,'New York','a,b'], [2,'Claude',23,'London','b,c'], [3,'Peter',25,'London','a'],[4,'Franz', 27,'Paris',nil],[5,'George',5,'Tome','a,b,c']]
- out=[]
- @ds.each_array{ |a|
- out.push(a)
- }
- assert_equal(expected,out)
- end
- def test_recode
- @ds['age'].type=:scale
- @ds.recode!("age") {|c| c['id']*2}
- expected=[2,4,6,8,10].to_vector(:scale)
- assert_equal(expected,@ds['age'])
- end
- def test_case_as
- assert_equal({'id'=>1,'name'=>'Alex','city'=>'New York','age'=>20,'a1'=>'a,b'},@ds.case_as_hash(0))
- assert_equal([5,'George',5,'Tome','a,b,c'],@ds.case_as_array(4))
- # Native methods
- assert_equal({'id'=>1,'name'=>'Alex','city'=>'New York','age'=>20,'a1'=>'a,b'},@ds._case_as_hash(0))
- assert_equal([5,'George',5,'Tome','a,b,c'],@ds._case_as_array(4))
-
-
-
- end
- def test_delete_vector
- @ds.delete_vector('name')
- assert_equal(%w{id age city a1},@ds.fields)
- assert_equal(%w{a1 age city id},@ds.vectors.keys.sort)
- end
- def test_change_type
- @ds.col('age').type=:scale
- assert_equal(:scale,@ds.col('age').type)
- end
- def test_split_by_separator_recode
- @ds.add_vectors_by_split_recode("a1","_")
- assert_equal(%w{id name age city a1 a1_1 a1_2 a1_3},@ds.fields)
- assert_equal([1,0,1,nil,1],@ds.col('a1_1').to_a)
- assert_equal([1,1,0,nil,1],@ds.col('a1_2').to_a)
- assert_equal([0,1,0,nil,1],@ds.col('a1_3').to_a)
- {'a1_1'=>'a1:a', 'a1_2'=>'a1:b', 'a1_3'=>'a1:c'}.each do |k,v|
- assert_equal(v, @ds[k].name)
- end
- end
- def test_split_by_separator
- @ds.add_vectors_by_split("a1","_")
- assert_equal(%w{id name age city a1 a1_a a1_b a1_c},@ds.fields)
- assert_equal([1,0,1,nil,1],@ds.col('a1_a').to_a)
- assert_equal([1,1,0,nil,1],@ds.col('a1_b').to_a)
- assert_equal([0,1,0,nil,1],@ds.col('a1_c').to_a)
- end
- def test_percentiles
- v1=(1..100).to_a.to_scale
- assert_equal(50.5,v1.median)
- assert_equal(25.5, v1.percentil(25))
- v2=(1..99).to_a.to_scale
- assert_equal(50,v2.median)
- assert_equal(25,v2.percentil(25))
- v3=(1..50).to_a.to_scale
- assert_equal(25.5, v3.median)
- assert_equal(13, v3.percentil(25))
-
- end
- def test_add_case
- ds=Statsample::Dataset.new({'a'=>[].to_vector, 'b'=>[].to_vector, 'c'=>[].to_vector})
- ds.add_case([1,2,3])
- ds.add_case({'a'=>4,'b'=>5,'c'=>6})
- ds.add_case([[7,8,9],%w{a b c}])
- assert_equal({'a'=>1,'b'=>2,'c'=>3},ds.case_as_hash(0))
- assert_equal([4,5,6],ds.case_as_array(1))
- assert_equal([7,8,9],ds.case_as_array(2))
- assert_equal(['a','b','c'],ds.case_as_array(3))
- ds.add_case_array([6,7,1])
- ds.update_valid_data
- assert_equal([6,7,1],ds.case_as_array(4))
-
- end
- def test_marshaling
- ds_marshal=Marshal.load(Marshal.dump(@ds))
- assert_equal(ds_marshal,@ds)
- end
- def test_range
- v1=[1,2,3,4].to_vector
- v2=[5,6,7,8].to_vector
- v3=[9,10,11,12].to_vector
- ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3}, %w{v3 v2 v1})
- assert_same(v1,ds1['v1'])
- ds2=ds1["v2".."v1"]
- assert_equal(%w{v2 v1},ds2.fields)
- assert_same(ds1['v1'],ds2['v1'])
- assert_same(ds1['v2'],ds2['v2'])
-
-
- end
- def test_clone
- v1=[1,2,3,4].to_vector
- v2=[5,6,7,8].to_vector
- ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1})
- ds2=ds1.clone
- assert_equal(ds1,ds2)
- assert_not_same(ds1,ds2)
- assert_equal(ds1['v1'],ds2['v1'])
- assert_same(ds1['v1'], ds2['v1'])
- assert_equal(ds1.fields,ds2.fields)
- assert_not_same(ds1.fields,ds2.fields)
- assert_equal(ds1.cases,ds2.cases)
-
- # partial clone
- ds3=ds1.clone('v1')
- ds_exp=Statsample::Dataset.new({'v1'=>v1},%w{v1})
- assert_equal(ds_exp,ds3)
- assert_not_same(ds_exp,ds3)
- assert_equal(ds3['v1'],ds_exp['v1'])
- assert_same(ds3['v1'],ds_exp['v1'])
- assert_equal(ds3.fields,ds_exp.fields)
- assert_equal(ds3.cases,ds_exp.cases)
-
- assert_not_same(ds3.fields,ds_exp.fields)
-
- end
- def test_dup
- v1=[1,2,3,4].to_vector
- v2=[5,6,7,8].to_vector
- ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2}, %w{v2 v1})
- ds2=ds1.dup
- assert_equal(ds1,ds2)
- assert_not_same(ds1,ds2)
- assert_equal(ds1['v1'],ds2['v1'])
- assert_not_same(ds1['v1'],ds2['v1'])
- assert_equal(ds1.cases,ds2.cases)
-
- assert_equal(ds1.fields,ds2.fields)
- assert_not_same(ds1.fields,ds2.fields)
- ds1['v1'].type=:scale
- # dup partial
- ds3=ds1.dup('v1')
- ds_exp=Statsample::Dataset.new({'v1'=>v1},%w{v1})
- assert_equal(ds_exp,ds3)
- assert_not_same(ds_exp,ds3)
- assert_equal(ds3['v1'],ds_exp['v1'])
- assert_not_same(ds3['v1'],ds_exp['v1'])
- assert_equal(ds3.fields,ds_exp.fields)
- assert_equal(ds3.cases,ds_exp.cases)
-
- assert_not_same(ds3.fields,ds_exp.fields)
-
-
- # empty
- ds3=ds1.dup_empty
- assert_not_equal(ds1,ds3)
- assert_not_equal(ds1['v1'],ds3['v1'])
- assert_equal([],ds3['v1'].data)
- assert_equal([],ds3['v2'].data)
- assert_equal(:scale,ds3['v1'].type)
- assert_equal(ds1.fields,ds2.fields)
- assert_not_same(ds1.fields,ds2.fields)
- end
- def test_from_to
- assert_equal(%w{name age city}, @ds.from_to("name","city"))
- assert_raise ArgumentError do
- @ds.from_to("name","a2")
- end
- end
- def test_each_array_with_nils
- v1=[1,-99,3,4,"na"].to_vector(:scale,:missing_values=>[-99,"na"])
- v2=[5,6,-99,8,20].to_vector(:scale,:missing_values=>[-99])
- v3=[9,10,11,12,20].to_vector(:scale,:missing_values=>[-99])
- ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3})
- ds2=ds1.dup_empty
- ds1.each_array_with_nils {|row|
- ds2.add_case_array(row)
- }
- ds2.update_valid_data
- assert_equal([1,nil,3,4,nil],ds2['v1'].data)
- assert_equal([5,6,nil,8,20],ds2['v2'].data)
- end
- def test_dup_only_valid
- v1=[1,nil,3,4].to_vector(:scale)
- v2=[5,6,nil,8].to_vector(:scale)
- v3=[9,10,11,12].to_vector(:scale)
- ds1=Statsample::Dataset.new({'v1'=>v1,'v2'=>v2,'v3'=>v3})
- ds2=ds1.dup_only_valid
- expected=Statsample::Dataset.new({'v1'=>[1,4].to_vector(:scale), 'v2'=> [5,8].to_vector(:scale), 'v3'=>[9, 12].to_vector(:scale)})
- assert_equal(expected,ds2)
- assert_equal(expected.vectors.values,Statsample::only_valid(v1,v2,v3))
- expected_partial=Statsample::Dataset.new({'v1'=>[1,3,4].to_vector(:scale), 'v3'=>[9, 11,12].to_vector(:scale)})
- assert_equal(expected_partial, ds1.dup_only_valid(%w{v1 v3}))
-
-
- end
- def test_filter
- @ds['age'].type=:scale
- filtered=@ds.filter{|c| c['id']==2 or c['id']==4}
- expected=Statsample::Dataset.new({'id' => Statsample::Vector.new([2,4]), 'name'=>Statsample::Vector.new(%w{Claude Franz}), 'age'=>Statsample::Vector.new([23,27],:scale),
- 'city'=>Statsample::Vector.new(['London','Paris']),
- 'a1'=>Statsample::Vector.new(['b,c',nil,])}, ['id','name','age','city','a1'])
- assert_equal(expected,filtered)
- end
- def test_filter_field
- @ds['age'].type=:scale
- filtered=@ds.filter_field('id') {|c| c['id']==2 or c['id']==4}
- expected=[2,4].to_vector
- assert_equal(expected,filtered)
-
- end
- def test_verify
- name=%w{r1 r2 r3 r4}.to_vector(:nominal)
- v1=[1,2,3,4].to_vector(:scale)
- v2=[4,3,2,1].to_vector(:scale)
- v3=[10,20,30,40].to_vector(:scale)
- v4=%w{a b a b}.to_vector(:nominal)
- ds={'v1'=>v1,'v2'=>v2,'v3'=>v3,'v4'=>v4,'id'=>name}.to_dataset
- ds.fields=%w{v1 v2 v3 v4 id}
- #Correct
- t1=create_test("If v4=a, v1 odd") {|r| r['v4']=='b' or (r['v4']=='a' and r['v1']%2==1)}
- t2=create_test("v3=v1*10") {|r| r['v3']==r['v1']*10}
- # Fail!
- t3=create_test("v4='b'") {|r| r['v4']=='b'}
- exp1=["1 [1]: v4='b'", "3 [3]: v4='b'"]
- exp2=["1 [r1]: v4='b'", "3 [r3]: v4='b'"]
- res=ds.verify(t3,t1,t2)
- assert_equal(exp1,res)
- res=ds.verify('id',t1,t2,t3)
- assert_equal(exp2,res)
- end
- def test_compute_operation
- v1=[1,2,3,4].to_vector(:scale)
- v2=[4,3,2,1].to_vector(:scale)
- v3=[10,20,30,40].to_vector(:scale)
- vscale=[1.quo(2),1,3.quo(2),2].to_vector(:scale)
- vsum=[1+4+10.0,2+3+20.0,3+2+30.0,4+1+40.0].to_vector(:scale)
- vmult=[1*4,2*3,3*2,4*1].to_vector(:scale)
- ds={'v1'=>v1,'v2'=>v2,'v3'=>v3}.to_dataset
- assert_equal(vscale,ds.compute("v1/2"))
- assert_equal(vsum,ds.compute("v1+v2+v3"))
- assert_equal(vmult,ds.compute("v1*v2"))
-
- end
- def test_crosstab_with_asignation
- v1=%w{a a a b b b c c c}.to_vector
- v2=%w{a b c a b c a b c}.to_vector
- v3=%w{0 1 0 0 1 1 0 0 1}.to_scale
- ds=Statsample::Dataset.crosstab_by_asignation(v1,v2,v3)
- assert_equal(:nominal, ds['_id'].type)
- assert_equal(:scale, ds['a'].type)
- assert_equal(:scale, ds['b'].type)
- ev_id=%w{a b c}.to_vector
- ev_a =%w{0 0 0}.to_scale
- ev_b =%w{1 1 0}.to_scale
- ev_c =%w{0 1 1}.to_scale
- ds2={'_id'=>ev_id, 'a'=>ev_a, 'b'=>ev_b, 'c'=>ev_c}.to_dataset
- assert_equal(ds, ds2)
- end
- def test_one_to_many
- cases=[
- ['1','george','red',10,'blue',20,nil,nil],
- ['2','fred','green',15,'orange',30,'white',20],
- ['3','alfred',nil,nil,nil,nil,nil,nil]
- ]
- ds=Statsample::Dataset.new(%w{id name car_color1 car_value1 car_color2 car_value2 car_color3 car_value3})
- cases.each {|c| ds.add_case_array c }
- ds.update_valid_data
- ids=%w{1 1 2 2 2}.to_vector
- colors=%w{red blue green orange white}.to_vector
- values=[10,20,15,30,20].to_vector
- col_ids=[1,2,1,2,3].to_scale
- ds_expected={'id'=>ids, '_col_id'=>col_ids, 'color'=>colors, 'value'=>values}.to_dataset(['id','_col_id', 'color','value'])
- assert_equal(ds_expected, ds.one_to_many(%w{id}, "car_%v%n"))
-
- end
-
-end
diff --git a/test/test_dominance_analysis.rb b/test/test_dominance_analysis.rb
index 803262a..012d1a6 100644
--- a/test/test_dominance_analysis.rb
+++ b/test/test_dominance_analysis.rb
@@ -1,41 +1,39 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleDominanceAnalysisTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleDominanceAnalysisTestCase < Minitest::Test
def test_dominance_univariate
# Example from Budescu (1993)
- m=Matrix[[1, 0.683, 0.154, 0.460, 0.618],[0.683, 1, -0.050, 0.297, 0.461], [0.154, -0.050, 1, 0.006, 0.262],[0.460, 0.297, 0.006, 1, 0.507],[0.618, 0.461, 0.262, 0.507, 1]]
+ m = Matrix[[1, 0.683, 0.154, 0.460, 0.618], [0.683, 1, -0.050, 0.297, 0.461], [0.154, -0.050, 1, 0.006, 0.262], [0.460, 0.297, 0.006, 1, 0.507], [0.618, 0.461, 0.262, 0.507, 1]]
m.extend Statsample::CovariateMatrix
- m.fields=%w{x1 x2 x3 x4 y}
- da=Statsample::DominanceAnalysis.new(m,'y')
+ m.fields = %w(x1 x2 x3 x4 y)
+ da = Statsample::DominanceAnalysis.new(m, 'y')
- contr_x1={'x2'=>0.003, 'x3'=>0.028, 'x4'=>0.063}
- contr_x1.each do |k,v|
+ contr_x1 = { 'x2' => 0.003, 'x3' => 0.028, 'x4' => 0.063 }
+ contr_x1.each do |k, v|
assert_in_delta(v, da.models_data[['x1']].contributions[k], 0.001)
end
- assert_in_delta(0.052, da.models_data[['x2','x3','x4']].contributions['x1'], 0.001)
- expected_dominances=[1, 1, 0.5, 0.5, 0,0]
- expected_g_dominances=[1, 1, 1, 1, 0,0]
+ assert_in_delta(0.052, da.models_data[%w(x2 x3 x4)].contributions['x1'], 0.001)
+ expected_dominances = [1, 1, 0.5, 0.5, 0, 0]
+ expected_g_dominances = [1, 1, 1, 1, 0, 0]
- da.pairs.each_with_index do |a,i|
- assert_equal(expected_dominances[i], da.total_dominance_pairwise(a[0],a[1]))
- assert_equal(expected_dominances[i], da.conditional_dominance_pairwise(a[0],a[1]))
- assert_equal(expected_g_dominances[i], da.general_dominance_pairwise(a[0],a[1]))
+ da.pairs.each_with_index do |a, i|
+ assert_equal(expected_dominances[i], da.total_dominance_pairwise(a[0], a[1]))
+ assert_equal(expected_dominances[i], da.conditional_dominance_pairwise(a[0], a[1]))
+ assert_equal(expected_g_dominances[i], da.general_dominance_pairwise(a[0], a[1]))
end
- assert(da.summary.size>0)
+ assert(da.summary.size > 0)
end
+
def test_dominance_multivariate
- m=Matrix[[1.0, -0.19, -0.358, -0.343, 0.359, 0.257], [-0.19, 1.0, 0.26, 0.29, -0.11, -0.11], [-0.358, 0.26, 1.0, 0.54, -0.49, -0.23], [-0.343, 0.29, 0.54, 1.0, -0.22, -0.41], [0.359, -0.11, -0.49, -0.22, 1.0, 0.62], [0.257, -0.11, -0.23, -0.41, 0.62, 1]]
+ m = Matrix[[1.0, -0.19, -0.358, -0.343, 0.359, 0.257], [-0.19, 1.0, 0.26, 0.29, -0.11, -0.11], [-0.358, 0.26, 1.0, 0.54, -0.49, -0.23], [-0.343, 0.29, 0.54, 1.0, -0.22, -0.41], [0.359, -0.11, -0.49, -0.22, 1.0, 0.62], [0.257, -0.11, -0.23, -0.41, 0.62, 1]]
m.extend Statsample::CovariateMatrix
- m.fields=%w{y1 y2 x1 x2 x3 x4}
- m2=m.submatrix(%w{y1 x1 x2 x3 x4})
-
+ m.fields = %w(y1 y2 x1 x2 x3 x4)
+ m2 = m.submatrix(%w(y1 x1 x2 x3 x4))
- da=Statsample::DominanceAnalysis.new(m, ['y1','y2'], :cases=>683, :method_association=>:p2yx)
+ da = Statsample::DominanceAnalysis.new(m, %w(y1 y2), cases: 683, method_association: :p2yx)
- contr_x1={'x2'=>0.027, 'x3'=>0.024, 'x4'=>0.017}
- contr_x1.each do |k,v|
+ contr_x1 = { 'x2' => 0.027, 'x3' => 0.024, 'x4' => 0.017 }
+ contr_x1.each do |k, v|
assert_in_delta(v, da.models_data[['x1']].contributions[k], 0.003)
end
-
-
end
end
diff --git a/test/test_factor.rb b/test/test_factor.rb
index 1884f4e..b724091 100644
--- a/test/test_factor.rb
+++ b/test/test_factor.rb
@@ -1,222 +1,228 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-#require 'rserve'
-#require 'statsample/rserve_extension'
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+# require 'rserve'
+# require 'statsample/rserve_extension'
-class StatsampleFactorTestCase < MiniTest::Unit::TestCase
+class StatsampleFactorTestCase < Minitest::Test
include Statsample::Fixtures
# Based on Hardle and Simar
def setup
- @fixtures_dir=File.expand_path(File.dirname(__FILE__)+"/fixtures")
+ @fixtures_dir = File.expand_path(File.dirname(__FILE__) + '/fixtures')
end
+
# Based on Hurdle example
def test_covariance_matrix
- ds=Statsample::PlainText.read(@fixtures_dir+"/bank2.dat", %w{v1 v2 v3 v4 v5 v6})
- ds.fields.each {|f|
- ds[f]=ds[f].centered
+ ds = Daru::DataFrame.from_plaintext(@fixtures_dir + '/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6])
+ ds.vectors.each {|f|
+ ds[f] = ds[f].center
}
- cm=ds.covariance_matrix
- pca =Statsample::Factor::PCA.new( cm, :m=>6)
- #puts pca.summary
- #puts pca.feature_matrix
- exp_eig=[2.985, 0.931,0.242, 0.194, 0.085, 0.035].to_scale
- assert_similar_vector(exp_eig, pca.eigenvalues.to_scale, 0.1)
- pcs=pca.principal_components(ds)
- k=6
- comp_matrix=pca.component_matrix()
+ cm = Statsample::Bivariate.covariance_matrix ds
+ pca = Statsample::Factor::PCA.new(cm, m: 6)
+ # puts pca.summary
+ # puts pca.feature_matrix
+ exp_eig = Daru::Vector.new([2.985, 0.931, 0.242, 0.194, 0.085, 0.035])
+ assert_similar_vector(exp_eig, Daru::Vector.new(pca.eigenvalues), 0.1)
+ pcs = pca.principal_components(ds)
+ k = 6
+ comp_matrix = pca.component_matrix
k.times {|i|
- pc_id="PC_#{i+1}"
+ pc_id = "PC_#{i + 1}".to_sym
k.times {|j| # variable
- ds_id="v#{j+1}"
- r= Statsample::Bivariate.correlation(ds[ds_id], pcs[pc_id])
- assert_in_delta( r, comp_matrix[j,i])
- }
+ ds_id = "v#{j + 1}".to_sym
+ r = Statsample::Bivariate.correlation(ds[ds_id], pcs[pc_id])
+ assert_in_delta(r, comp_matrix[j, i])
+ }
}
-
end
+
def test_principalcomponents_ruby_gsl
-
- ran=Distribution::Normal.rng
-
-# @r=::Rserve::Connection.new
-
- samples=20
- [3,5,7].each {|k|
- v={}
- v["x0"]=samples.times.map { ran.call()}.to_scale.centered
- (1...k).each {|i|
- v["x#{i}"]=samples.times.map {|ii| ran.call()*0.5+v["x#{i-1}"][ii]*0.5}.to_scale.centered
- }
-
- ds=v.to_dataset
- cm=ds.covariance_matrix
-# @r.assign('ds',ds)
-# @r.eval('cm<-cor(ds);sm<-eigen(cm, sym=TRUE);v<-sm$vectors')
-# puts "eigenvalues"
-# puts @r.eval('v').to_ruby.to_s
- pca_ruby=Statsample::Factor::PCA.new( cm, :m=>k, :use_gsl=>false )
- pca_gsl =Statsample::Factor::PCA.new( cm, :m=>k, :use_gsl=>true )
- pc_ruby = pca_ruby.principal_components(ds)
- pc_gsl = pca_gsl.principal_components(ds)
- # Test component matrix correlation!
- cm_ruby=pca_ruby.component_matrix
- #puts cm_ruby.summary
- k.times {|i|
- pc_id="PC_#{i+1}"
- assert_in_delta(pca_ruby.eigenvalues[i], pca_gsl.eigenvalues[i],1e-10)
- # Revert gsl component values
- pc_gsl_data= (pc_gsl[pc_id][0]-pc_ruby[pc_id][0]).abs>1e-6 ? pc_gsl[pc_id].recode {|v| -v} : pc_gsl[pc_id]
- assert_similar_vector(pc_gsl_data, pc_ruby[pc_id], 1e-6,"PC for #{k} variables")
- if false
- k.times {|j| # variable
- ds_id="x#{j}"
- r= Statsample::Bivariate.correlation(ds[ds_id],pc_ruby[pc_id])
- puts "#{pc_id}-#{ds_id}:#{r}"
+ if Statsample.has_gsl?
+ ran = Distribution::Normal.rng
+
+ # @r=::Rserve::Connection.new
+
+ samples = 20
+ [3, 5, 7].each {|k|
+ v = {}
+ v[:x0] = Daru::Vector.new(samples.times.map { ran.call }).center
+ (1...k).each { |i|
+ v["x#{i}".to_sym] = Daru::Vector.new(samples.times.map { |ii| ran.call * 0.5 + v["x#{i - 1}".to_sym][ii] * 0.5 }).center
+ }
+
+ ds = Daru::DataFrame.new(v)
+ cm = Statsample::Bivariate.covariance_matrix ds
+ # @r.assign('ds',ds)
+ # @r.eval('cm<-cor(ds);sm<-eigen(cm, sym=TRUE);v<-sm$vectors')
+ # puts "eigenvalues"
+ # puts @r.eval('v').to_ruby.to_s
+ pca_ruby = Statsample::Factor::PCA.new(cm, m: k, use_gsl: false)
+ pca_gsl = Statsample::Factor::PCA.new(cm, m: k, use_gsl: true)
+ pc_ruby = pca_ruby.principal_components(ds)
+ pc_gsl = pca_gsl.principal_components(ds)
+ # Test component matrix correlation!
+ cm_ruby = pca_ruby.component_matrix
+ # puts cm_ruby.summary
+ k.times {|i|
+ pc_id = "PC_#{i + 1}".to_sym
+ assert_in_delta(pca_ruby.eigenvalues[i], pca_gsl.eigenvalues[i], 1e-10)
+ # Revert gsl component values
+ pc_gsl_data = (pc_gsl[pc_id][0] - pc_ruby[pc_id][0]).abs > 1e-6 ? pc_gsl[pc_id].recode(&:-@) : pc_gsl[pc_id]
+ assert_similar_vector(pc_gsl_data, pc_ruby[pc_id], 1e-6, "PC for #{k} variables")
+ if false
+ k.times {|j| # variable
+ ds_id = "x#{j}".to_sym
+ r = Statsample::Bivariate.correlation(ds[ds_id], pc_ruby[pc_id])
+ puts "#{pc_id}-#{ds_id}:#{r}"
+ }
+ end
}
- end
}
- }
- #@r.close
+ end
+ # @r.close
end
- def test_principalcomponents()
- principalcomponents(true)
- principalcomponents(false)
-
- end
+
+ def test_principalcomponents
+ if Statsample.has_gsl?
+ principalcomponents(true)
+ else
+ skip "Require GSL"
+ end
+ principalcomponents(false)
+ end
+
def principalcomponents(gsl)
- ran=Distribution::Normal.rng
- samples=50
- x1=samples.times.map { ran.call()}.to_scale
- x2=samples.times.map {|i| ran.call()*0.5+x1[i]*0.5}.to_scale
- ds={'x1'=>x1,'x2'=>x2}.to_dataset
-
- cm=ds.correlation_matrix
- r=cm[0,1]
- pca=Statsample::Factor::PCA.new(cm,:m=>2,:use_gsl=>gsl)
- assert_in_delta(1+r,pca.eigenvalues[0],1e-10)
- assert_in_delta(1-r,pca.eigenvalues[1],1e-10)
- hs=1.0 / Math.sqrt(2)
- assert_equal_vector(Vector[1, 1]*hs, pca.eigenvectors[0])
- m_1=gsl ? Vector[-1,1] : Vector[1,-1]
-
- assert_equal_vector(hs*m_1, pca.eigenvectors[1])
-
- pcs=pca.principal_components(ds)
- exp_pc_1=ds.collect_with_index {|row,i|
- hs*(row['x1']+row['x2'])
- }
- exp_pc_2=ds.collect_with_index {|row,i|
- gsl ? hs*(row['x2']-row['x1']) : hs*(row['x1']-row['x2'])
+ ran = Distribution::Normal.rng
+ samples = 50
+ x1 = Daru::Vector.new(samples.times.map { ran.call })
+ x2 = Daru::Vector.new(samples.times.map { |i| ran.call * 0.5 + x1[i] * 0.5 })
+ ds = Daru::DataFrame.new({ :x1 => x1, :x2 => x2 })
+
+ cm = Statsample::Bivariate.correlation_matrix ds
+ r = cm[0, 1]
+ pca = Statsample::Factor::PCA.new(cm, m: 2, use_gsl: gsl)
+ assert_in_delta(1 + r, pca.eigenvalues[0], 1e-10)
+ assert_in_delta(1 - r, pca.eigenvalues[1], 1e-10)
+ hs = 1.0 / Math.sqrt(2)
+ assert_equal_vector(Vector[1, 1] * hs, pca.eigenvectors[0])
+ m_1 = gsl ? Vector[-1, 1] : Vector[1, -1]
+ assert_equal_vector(hs * m_1, pca.eigenvectors[1])
+
+ pcs = pca.principal_components(ds)
+ exp_pc_1 = ds.collect_row_with_index {|row, _i|
+ hs * (row[:x1] + row[:x2])
+ }
+ exp_pc_2 = ds.collect_row_with_index {|row, _i|
+ gsl ? hs * (row[:x2] - row[:x1]) : hs * (row[:x1] - row[:x2])
}
- assert_similar_vector(exp_pc_1, pcs["PC_1"])
- assert_similar_vector(exp_pc_2, pcs["PC_2"])
+ assert_similar_vector(exp_pc_1, pcs[:PC_1])
+ assert_similar_vector(exp_pc_2, pcs[:PC_2])
end
+
def test_antiimage
- cor=Matrix[[1,0.964, 0.312],[0.964,1,0.411],[0.312,0.411,1]]
- expected=Matrix[[0.062,-0.057, 0.074],[-0.057, 0.057, -0.089], [0.074, -0.089, 0.729]]
- ai=Statsample::Factor.anti_image_covariance_matrix(cor)
- assert(Matrix.equal_in_delta?(expected, ai, 0.01), "#{expected.to_s} not equal to #{ai.to_s}")
+ cor = Matrix[[1, 0.964, 0.312], [0.964, 1, 0.411], [0.312, 0.411, 1]]
+ expected = Matrix[[0.062, -0.057, 0.074], [-0.057, 0.057, -0.089], [0.074, -0.089, 0.729]]
+ ai = Statsample::Factor.anti_image_covariance_matrix(cor)
+ assert(Matrix.equal_in_delta?(expected, ai, 0.01), "#{expected} not equal to #{ai}")
end
+
def test_kmo
- @v1=[1 ,2 ,3 ,4 ,7 ,8 ,9 ,10,14,15,20,50,60,70].to_scale
- @v2=[5 ,6 ,11,12,13,16,17,18,19,20,30,0,0,0].to_scale
- @v3=[10,3 ,20,30,40,50,80,10,20,30,40,2,3,4].to_scale
- # KMO: 0.490
- ds={'v1'=>@v1,'v2'=>@v2,'v3'=>@v3}.to_dataset
- cor=Statsample::Bivariate.correlation_matrix(ds)
- kmo=Statsample::Factor.kmo(cor)
- assert_in_delta(0.667, kmo,0.001)
- assert_in_delta(0.81, Statsample::Factor.kmo(harman_817),0.01)
-
+ @v1 = Daru::Vector.new([1, 2, 3, 4, 7, 8, 9, 10, 14, 15, 20, 50, 60, 70])
+ @v2 = Daru::Vector.new([5, 6, 11, 12, 13, 16, 17, 18, 19, 20, 30, 0, 0, 0])
+ @v3 = Daru::Vector.new([10, 3, 20, 30, 40, 50, 80, 10, 20, 30, 40, 2, 3, 4])
+ # KMO: 0.490
+ ds = Daru::DataFrame.new({ :v1 => @v1, :v2 => @v2, :v3 => @v3 })
+ cor = Statsample::Bivariate.correlation_matrix(ds)
+ kmo = Statsample::Factor.kmo(cor)
+ assert_in_delta(0.667, kmo, 0.001)
+ assert_in_delta(0.81, Statsample::Factor.kmo(harman_817), 0.01)
end
+
def test_kmo_univariate
- m=harman_817
- expected=[0.73,0.76,0.84,0.87,0.53,0.93,0.78,0.86]
+ m = harman_817
+ expected = [0.73, 0.76, 0.84, 0.87, 0.53, 0.93, 0.78, 0.86]
m.row_size.times.map {|i|
- assert_in_delta(expected[i], Statsample::Factor.kmo_univariate(m,i),0.01)
+ assert_in_delta(expected[i], Statsample::Factor.kmo_univariate(m, i), 0.01)
}
end
# Tested with SPSS and R
def test_pca
- a=[2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1].to_scale
- b=[2.4, 0.7, 2.9, 2.2, 3.0, 2.7, 1.6, 1.1, 1.6, 0.9].to_scale
- a.recode! {|c| c-a.mean}
- b.recode! {|c| c-b.mean}
- ds={'a'=>a,'b'=>b}.to_dataset
- cov_matrix=Statsample::Bivariate.covariance_matrix(ds)
- if Statsample.has_gsl?
- pca=Statsample::Factor::PCA.new(cov_matrix,:use_gsl=>true)
- pca_set(pca,"gsl")
- else
- skip("Eigenvalues could be calculated with GSL (requires gsl)")
- end
- pca=Statsample::Factor::PCA.new(cov_matrix,:use_gsl=>false)
- pca_set(pca,"ruby")
+ dtype = Statsample.has_gsl? ? :gsl : :array
+ a = Daru::Vector.new([2.5, 0.5, 2.2, 1.9, 3.1, 2.3, 2.0, 1.0, 1.5, 1.1], dtype: dtype)
+ b = Daru::Vector.new([2.4, 0.7, 2.9, 2.2, 3.0, 2.7, 1.6, 1.1, 1.6, 0.9], dtype: dtype)
+ a = a - a.mean
+ b = b - b.mean
+ ds = Daru::DataFrame.new({ :a => a, :b => b })
+
+ cov_matrix = Statsample::Bivariate.covariance_matrix(ds)
+ if Statsample.has_gsl?
+ pca = Statsample::Factor::PCA.new(cov_matrix, use_gsl: true)
+ pca_set(pca, 'gsl')
+ else
+ skip('Eigenvalues could be calculated with GSL (requires gsl)')
+ end
+ pca = Statsample::Factor::PCA.new(cov_matrix, use_gsl: false)
+ pca_set(pca, 'ruby')
end
- def pca_set(pca,type)
- expected_eigenvalues=[1.284, 0.0490]
- expected_eigenvalues.each_with_index{|ev,i|
- assert_in_delta(ev,pca.eigenvalues[i],0.001)
- }
- expected_communality=[0.590, 0.694]
- expected_communality.each_with_index{|ev,i|
- assert_in_delta(ev,pca.communalities[i],0.001)
- }
- expected_cm=[0.768, 0.833]
- obs=pca.component_matrix_correlation(1).column(0).to_a
- expected_cm.each_with_index{|ev,i|
- assert_in_delta(ev,obs[i],0.001)
- }
- assert(pca.summary)
+ def pca_set(pca, _type)
+ expected_eigenvalues = [1.284, 0.0490]
+ expected_eigenvalues.each_with_index{|ev, i|
+ assert_in_delta(ev, pca.eigenvalues[i], 0.001)
+ }
+ expected_communality = [0.590, 0.694]
+ expected_communality.each_with_index{|ev, i|
+ assert_in_delta(ev, pca.communalities[i], 0.001)
+ }
+ expected_cm = [0.768, 0.833]
+ obs = pca.component_matrix_correlation(1).column(0).to_a
+ expected_cm.each_with_index{|ev, i|
+ assert_in_delta(ev, obs[i], 0.001)
+ }
+
+ assert(pca.summary)
end
# Tested with R
def test_principalaxis
- matrix=::Matrix[
- [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]]
-
-
- fa=Statsample::Factor::PrincipalAxis.new(matrix,:m=>1, :max_iterations=>50)
-
- cm=::Matrix[[0.923],[0.912],[0.507],[0.483]]
-
- assert_equal_matrix(cm,fa.component_matrix,0.001)
-
- h2=[0.852,0.832,0.257,0.233]
- h2.each_with_index{|ev,i|
- assert_in_delta(ev,fa.communalities[i],0.001)
- }
- eigen1=2.175
- assert_in_delta(eigen1, fa.eigenvalues[0],0.001)
- assert(fa.summary.size>0)
- fa=Statsample::Factor::PrincipalAxis.new(matrix,:smc=>false)
-
- assert_raise RuntimeError do
- fa.iterate
- end
+ matrix = ::Matrix[
+ [1.0, 0.709501601093587, 0.877596585880047, 0.272219316266807], [0.709501601093587, 1.0, 0.291633797330304, 0.871141831433844], [0.877596585880047, 0.291633797330304, 1.0, -0.213373722977167], [0.272219316266807, 0.871141831433844, -0.213373722977167, 1.0]]
- end
+ fa = Statsample::Factor::PrincipalAxis.new(matrix, m: 1, max_iterations: 50)
+
+ cm = ::Matrix[[0.923], [0.912], [0.507], [0.483]]
+ assert_equal_matrix(cm, fa.component_matrix, 0.001)
+
+ h2 = [0.852, 0.832, 0.257, 0.233]
+ h2.each_with_index{|ev, i|
+ assert_in_delta(ev, fa.communalities[i], 0.001)
+ }
+ eigen1 = 2.175
+ assert_in_delta(eigen1, fa.eigenvalues[0], 0.001)
+ assert(fa.summary.size > 0)
+ fa = Statsample::Factor::PrincipalAxis.new(matrix, smc: false)
+
+ assert_raise RuntimeError do
+ fa.iterate
+ end
+ end
def test_rotation_varimax
- a = Matrix[ [ 0.4320, 0.8129, 0.3872] ,
- [0.7950, -0.5416, 0.2565] ,
- [0.5944, 0.7234, -0.3441],
- [0.8945, -0.3921, -0.1863] ]
-
- expected= Matrix[[-0.0204423, 0.938674, -0.340334],
- [0.983662, 0.0730206, 0.134997],
- [0.0826106, 0.435975, -0.893379],
- [0.939901, -0.0965213, -0.309596]]
- varimax=Statsample::Factor::Varimax.new(a)
+ a = Matrix[[0.4320, 0.8129, 0.3872],
+ [0.7950, -0.5416, 0.2565],
+ [0.5944, 0.7234, -0.3441],
+ [0.8945, -0.3921, -0.1863]]
+
+ expected = Matrix[[-0.0204423, 0.938674, -0.340334],
+ [0.983662, 0.0730206, 0.134997],
+ [0.0826106, 0.435975, -0.893379],
+ [0.939901, -0.0965213, -0.309596]]
+ varimax = Statsample::Factor::Varimax.new(a)
assert(!varimax.rotated.nil?, "Rotated shouldn't be empty")
assert(!varimax.component_transformation_matrix.nil?, "Component matrix shouldn't be empty")
assert(!varimax.h2.nil?, "H2 shouldn't be empty")
-
- assert_equal_matrix(expected,varimax.rotated,1e-6)
- assert(varimax.summary.size>0)
- end
-
+ assert_equal_matrix(expected, varimax.rotated, 1e-6)
+ assert(varimax.summary.size > 0)
+ end
end
diff --git a/test/test_factor_map.rb b/test/test_factor_map.rb
index 05c94d5..69610bc 100644
--- a/test/test_factor_map.rb
+++ b/test/test_factor_map.rb
@@ -1,43 +1,38 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-#require 'rserve'
-#require 'statsample/rserve_extension'
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+# require 'rserve'
+# require 'statsample/rserve_extension'
-class StatsampleFactorMpaTestCase < MiniTest::Unit::TestCase
+class StatsampleFactorMpaTestCase < Minitest::Test
context Statsample::Factor::MAP do
setup do
- m=Matrix[
- [ 1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382],
- [ 0.846, 1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.415],
- [ 0.805, 0.881, 1, 0.801, 0.38, 0.319, 0.237, 0.345],
- [ 0.859, 0.826, 0.801, 1, 0.436, 0.329, 0.327, 0.365],
- [ 0.473, 0.376, 0.38, 0.436, 1, 0.762, 0.73, 0.629],
- [ 0.398, 0.326, 0.319, 0.329, 0.762, 1, 0.583, 0.577],
- [ 0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 1, 0.539],
- [ 0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 1]
+ m = Matrix[
+ [1, 0.846, 0.805, 0.859, 0.473, 0.398, 0.301, 0.382],
+ [0.846, 1, 0.881, 0.826, 0.376, 0.326, 0.277, 0.415],
+ [0.805, 0.881, 1, 0.801, 0.38, 0.319, 0.237, 0.345],
+ [0.859, 0.826, 0.801, 1, 0.436, 0.329, 0.327, 0.365],
+ [0.473, 0.376, 0.38, 0.436, 1, 0.762, 0.73, 0.629],
+ [0.398, 0.326, 0.319, 0.329, 0.762, 1, 0.583, 0.577],
+ [0.301, 0.277, 0.237, 0.327, 0.73, 0.583, 1, 0.539],
+ [0.382, 0.415, 0.345, 0.365, 0.629, 0.577, 0.539, 1]
]
- @map=Statsample::Factor::MAP.new(m)
+ @map = Statsample::Factor::MAP.new(m)
end
- should "return correct values with pure ruby" do
- @map.use_gsl=false
+ should 'return correct values with pure ruby' do
+ @map.use_gsl = false
map_assertions(@map)
end
- should_with_gsl "return correct values with gsl" do
- #require 'ruby-prof'
+ should_with_gsl 'return correct values with gsl' do
+ # require 'ruby-prof'
- @map.use_gsl=true
- map_assertions(@map)
+ @map.use_gsl = true
+ map_assertions(@map)
end
-
-
end
-
+
def map_assertions(map)
- assert_in_delta(map.minfm, 0.066445,0.00001)
- assert_equal(map.number_of_factors, 2)
- assert_in_delta(map.fm[0], 0.312475,0.00001)
- assert_in_delta(map.fm[1], 0.245121,0.00001)
+ assert_in_delta(map.minfm, 0.066445, 0.00001)
+ assert_equal(map.number_of_factors, 2)
+ assert_in_delta(map.fm[0], 0.312475, 0.00001)
+ assert_in_delta(map.fm[1], 0.245121, 0.00001)
end
-
-
end
-
diff --git a/test/test_factor_pa.rb b/test/test_factor_pa.rb
index b1332ba..e2df935 100644
--- a/test/test_factor_pa.rb
+++ b/test/test_factor_pa.rb
@@ -1,52 +1,56 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-#require 'rserve'
-#require 'statsample/rserve_extension'
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+# require 'rserve'
+# require 'statsample/rserve_extension'
-class StatsampleFactorTestCase < MiniTest::Unit::TestCase
+class StatsampleFactorTestCase < Minitest::Test
include Statsample::Fixtures
# Based on Hardle and Simar
def setup
- @fixtures_dir=File.expand_path(File.dirname(__FILE__)+"/fixtures")
+ @fixtures_dir = File.expand_path(File.dirname(__FILE__) + '/fixtures')
end
+
def test_parallelanalysis_with_data
if Statsample.has_gsl?
- samples=100
- variables=10
- iterations=50
+ samples = 100
+ variables = 10
+ iterations = 50
rng = Distribution::Normal.rng
- f1=samples.times.collect {rng.call}.to_scale
- f2=samples.times.collect {rng.call}.to_scale
- vectors={}
+ f1 = Daru::Vector.new(samples.times.collect { rng.call })
+ f2 = Daru::Vector.new(samples.times.collect { rng.call })
+ vectors = {}
variables.times do |i|
- if i<5
- vectors["v#{i}"]=samples.times.collect {|nv|
- f1[nv]*5+f2[nv]*2+rng.call
- }.to_scale
+ if i < 5
+ vectors["v#{i}".to_sym] = Daru::Vector.new(
+ samples.times.collect { |nv|
+ f1[nv] * 5 + f2[nv] * 2 + rng.call
+ }
+ )
else
- vectors["v#{i}"]=samples.times.collect {|nv|
- f2[nv]*5+f1[nv]*2+rng.call
- }.to_scale
+ vectors["v#{i}".to_sym] = Daru::Vector.new(
+ samples.times.collect { |nv|
+ f2[nv] * 5 + f1[nv] * 2 + rng.call
+ }
+ )
end
-
end
- ds=vectors.to_dataset
-
- pa1=Statsample::Factor::ParallelAnalysis.new(ds, :bootstrap_method=>:data, :iterations=>iterations)
- pa2=Statsample::Factor::ParallelAnalysis.with_random_data(samples,variables,:iterations=>iterations,:percentil=>95)
+ ds = Daru::DataFrame.new(vectors)
+
+ pa1 = Statsample::Factor::ParallelAnalysis.new(ds, bootstrap_method: :data, iterations: iterations)
+ pa2 = Statsample::Factor::ParallelAnalysis.with_random_data(samples, variables, iterations: iterations, percentil: 95)
3.times do |n|
- var="ev_0000#{n+1}"
- assert_in_delta(pa1.ds_eigenvalues[var].mean, pa2.ds_eigenvalues[var].mean,0.05)
+ var = "ev_0000#{n + 1}".to_sym
+ assert_in_delta(pa1.ds_eigenvalues[var].mean, pa2.ds_eigenvalues[var].mean, 0.07)
end
else
- skip("Too slow without GSL")
+ skip('Too slow without GSL')
end
-
end
+
def test_parallelanalysis
- pa=Statsample::Factor::ParallelAnalysis.with_random_data(305,8,:iterations=>100,:percentil=>95)
- assert_in_delta(1.2454, pa.ds_eigenvalues['ev_00001'].mean, 0.01)
- assert_in_delta(1.1542, pa.ds_eigenvalues['ev_00002'].mean, 0.01)
- assert_in_delta(1.0836, pa.ds_eigenvalues['ev_00003'].mean, 0.01)
- assert(pa.summary.size>0)
- end
+ pa = Statsample::Factor::ParallelAnalysis.with_random_data(305, 8, iterations: 100, percentil: 95)
+ assert_in_delta(1.2454, pa.ds_eigenvalues[:ev_00001].mean, 0.05)
+ assert_in_delta(1.1542, pa.ds_eigenvalues[:ev_00002].mean, 0.01)
+ assert_in_delta(1.0836, pa.ds_eigenvalues[:ev_00003].mean, 0.01)
+ assert(pa.summary.size > 0)
+ end
end
diff --git a/test/test_fit_model.rb b/test/test_fit_model.rb
new file mode 100644
index 0000000..e7be554
--- /dev/null
+++ b/test/test_fit_model.rb
@@ -0,0 +1,88 @@
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+require 'minitest/autorun'
+
+describe Statsample::FitModel do
+ before do
+ @df = Daru::DataFrame.from_csv 'test/fixtures/df.csv'
+ @df.to_category 'c', 'd', 'e'
+ end
+ context '#df_for_regression' do
+ context 'no interaction' do
+ it { assert_vectors_from_formula 'y~a+e', %w[a e_B e_C y] }
+ end
+
+ context '2-way interaction' do
+ context 'interaction of numerical with numerical' do
+ context 'none reoccur' do
+ it { assert_vectors_from_formula 'y~a:b', %w[a:b y] }
+ end
+
+ context 'one reoccur' do
+ it { assert_vectors_from_formula 'y~a+a:b', %w[a a:b y] }
+ end
+
+ context 'both reoccur' do
+ it { assert_vectors_from_formula 'y~a+b+a:b', %w[a a:b b y] }
+ end
+ end
+
+ context 'interaction of category with numerical' do
+ context 'none reoccur' do
+ it { assert_vectors_from_formula 'y~a:e', %w[e_A:a e_B:a e_C:a y] }
+ end
+
+ context 'one reoccur' do
+ context 'numeric occur' do
+ it { assert_vectors_from_formula 'y~a+a:e', %w[a e_B:a e_C:a y] }
+ end
+
+ context 'category occur' do
+ it { assert_vectors_from_formula 'y~e+a:e',
+ %w[e_B e_C e_A:a e_B:a e_C:a y] }
+ end
+ end
+
+ context 'both reoccur' do
+ it { assert_vectors_from_formula 'y~a+e+a:e',
+ %w[a e_B e_C e_B:a e_C:a y] }
+ end
+ end
+
+ context 'interaction of category with category' do
+ context 'none reoccur' do
+ it { assert_vectors_from_formula 'y~c:e',
+ %w[e_B e_C c_yes:e_A c_yes:e_B c_yes:e_C y] }
+ end
+
+ context 'one reoccur' do
+ it { assert_vectors_from_formula 'y~e+c:e',
+ %w[e_B e_C c_yes:e_A c_yes:e_B c_yes:e_C y] }
+ end
+
+ context 'both reoccur' do
+ it { assert_vectors_from_formula 'y~c+e+c:e',
+ %w[c_yes e_B e_C c_yes:e_B c_yes:e_C y] }
+ end
+ end
+ end
+
+ context 'corner case' do
+ context 'example 1' do
+ it { assert_vectors_from_formula 'y~d:a+d:e',
+ %w[e_B e_C d_male:e_A d_male:e_B d_male:e_C d_female:a d_male:a y] }
+ end
+ end
+
+ context 'complex examples' do
+ context 'random example 1' do
+ it { assert_vectors_from_formula 'y~a+e+c:d+e:d',
+ %w[e_B e_C d_male c_yes:d_female c_yes:d_male e_B:d_male e_C:d_male a y] }
+ end
+
+ context 'random example 2' do
+ it { assert_vectors_from_formula 'y~e+b+c+d:e+b:e+a:e+0',
+ %w[e_A e_B e_C c_yes d_male:e_A d_male:e_B d_male:e_C b e_B:b e_C:b e_A:a e_B:a e_C:a y] }
+ end
+ end
+ end
+end
diff --git a/test/test_ggobi.rb b/test/test_ggobi.rb
index ecef32c..6f1724a 100644
--- a/test/test_ggobi.rb
+++ b/test/test_ggobi.rb
@@ -1,24 +1,25 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
require 'ostruct'
-class StatsampleGGobiTestCase < MiniTest::Unit::TestCase
-
+class StatsampleGGobiTestCase < Minitest::Test
def setup
- v1=([10.2,20.3,10,20,30,40,30,20,30,40]*10).to_vector(:scale)
- @v2=(%w{a b c a a a b b c d}*10).to_vector(:nominal)
- @v2.labels={"a"=>"letter a","d"=>"letter d"}
- v3=([1,2,3,4,5,4,3,2,1,2]*10).to_vector(:ordinal)
- @ds={'v1'=>v1,'v2'=>@v2,'v3'=>v3}.to_dataset
+ v1 = Daru::Vector.new([10.2, 20.3, 10, 20, 30, 40, 30, 20, 30, 40] * 10)
+ @v2 = Daru::Vector.new(%w(a b c a a a b b c d) * 10)
+ @v2.labels = { 'a' => 'letter a', 'd' => 'letter d' }
+ v3 = Daru::Vector.new([1, 2, 3, 4, 5, 4, 3, 2, 1, 2] * 10)
+ @ds = Daru::DataFrame.new({ :v1 => v1, :v2 => @v2, :v3 => v3 })
end
+
def test_values_definition
- a=[1.0,2,"a",nil]
- assert_equal("1.0 2 a NA", Statsample::GGobi.values_definition(a,"NA"))
+ a = [1.0, 2, 'a', nil]
+ assert_equal('1.0 2 a NA', Statsample::GGobi.values_definition(a, 'NA'))
end
+
def test_variable_definition
- carrier=OpenStruct.new
- carrier.categorials=[]
- carrier.conversions={}
- real_var_definition=Statsample::GGobi.variable_definition(carrier,@v2,'variable 2',"v2")
- expected=<<-EOS
+ carrier = OpenStruct.new
+ carrier.categorials = []
+ carrier.conversions = {}
+ real_var_definition = Statsample::GGobi.variable_definition(carrier, @v2, 'variable 2', 'v2')
+ expected = <<-EOS
letter a
@@ -27,8 +28,8 @@ def test_variable_definition
letter d
EOS
- assert_equal(expected.gsub(/\s/," "),real_var_definition.gsub(/\s/," "))
- assert_equal({'variable 2'=>{'a'=>1,'b'=>2,'c'=>3,'d'=>4}},carrier.conversions)
- assert_equal(['variable 2'],carrier.categorials)
+ assert_equal(expected.gsub(/\s/, ' '), real_var_definition.gsub(/\s/, ' '))
+ assert_equal({ 'variable 2' => { 'a' => 1, 'b' => 2, 'c' => 3, 'd' => 4 } }, carrier.conversions)
+ assert_equal(['variable 2'], carrier.categorials)
end
end
diff --git a/test/test_gsl.rb b/test/test_gsl.rb
index 2d841aa..261b9cf 100644
--- a/test/test_gsl.rb
+++ b/test/test_gsl.rb
@@ -1,17 +1,15 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleGSLTestCase < MiniTest::Unit::TestCase
- should_with_gsl "matrix with gsl" do
- a=[1,2,3,4,20].to_vector(:scale)
- b=[3,2,3,4,50].to_vector(:scale)
- c=[6,2,3,4,3].to_vector(:scale)
- ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
- gsl=ds.to_matrix.to_gsl
- assert_equal(5,gsl.size1)
- assert_equal(3,gsl.size2)
- matrix=gsl.to_matrix
- assert_equal(5,matrix.row_size)
- assert_equal(3,matrix.column_size)
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleGSLTestCase < Minitest::Test
+ should_with_gsl 'matrix with gsl' do
+ a = Daru::Vector.new([1, 2, 3, 4, 20])
+ b = Daru::Vector.new([3, 2, 3, 4, 50])
+ c = Daru::Vector.new([6, 2, 3, 4, 3])
+ ds = Daru::DataFrame.new({ :a => a, :b => b, :c => c })
+ gsl = ds.to_matrix.to_gsl
+ assert_equal(5, gsl.size1)
+ assert_equal(3, gsl.size2)
+ matrix = gsl.to_matrix
+ assert_equal(5, matrix.row_size)
+ assert_equal(3, matrix.column_size)
end
end
-
-
diff --git a/test/test_histogram.rb b/test/test_histogram.rb
index 1a086e0..5db9101 100644
--- a/test/test_histogram.rb
+++ b/test/test_histogram.rb
@@ -1,112 +1,109 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-
-class StatsampleHistogramTestCase < MiniTest::Unit::TestCase
+class StatsampleHistogramTestCase < Minitest::Test
context Statsample::Histogram do
- should "alloc correctly with integer" do
+ should 'alloc correctly with integer' do
h = Statsample::Histogram.alloc(4)
- assert_equal([0.0]*4, h.bin)
- assert_equal([0.0]*5, h.range)
+ assert_equal([0.0] * 4, h.bin)
+ assert_equal([0.0] * 5, h.range)
end
- should "alloc correctly with array" do
+ should 'alloc correctly with array' do
h = Statsample::Histogram.alloc([1, 3, 7, 9, 20])
- assert_equal([0.0]*4, h.bin)
- assert_equal([1,3,7,9,20], h.range)
+ assert_equal([0.0] * 4, h.bin)
+ assert_equal([1, 3, 7, 9, 20], h.range)
end
- should "alloc correctly with integer and min, max array" do
+ should 'alloc correctly with integer and min, max array' do
h = Statsample::Histogram.alloc(5, [0, 5])
- assert_equal([0.0,1.0,2.0,3.0,4.0,5.0], h.range)
- assert_equal([0.0]*5,h.bin)
+ assert_equal([0.0, 1.0, 2.0, 3.0, 4.0, 5.0], h.range)
+ assert_equal([0.0] * 5, h.bin)
end
- should "bin() method return correct number of bins" do
+ should 'bin() method return correct number of bins' do
h = Statsample::Histogram.alloc(4)
- assert_equal(4,h.bins)
+ assert_equal(4, h.bins)
end
- should "increment correctly" do
- h = Statsample::Histogram.alloc(5, [0, 5])
+ should 'increment correctly' do
+ h = Statsample::Histogram.alloc(5, [0, 5])
h.increment 2.5
- assert_equal([0.0,0.0,1.0,0.0,0.0], h.bin)
- h.increment [0.5,0.5,3.5,3.5]
- assert_equal([2.0,0.0,1.0,2.0,0.0], h.bin)
+ assert_equal([0.0, 0.0, 1.0, 0.0, 0.0], h.bin)
+ h.increment [0.5, 0.5, 3.5, 3.5]
+ assert_equal([2.0, 0.0, 1.0, 2.0, 0.0], h.bin)
h.increment 0
- assert_equal([3.0,0.0,1.0,2.0,0.0], h.bin)
+ assert_equal([3.0, 0.0, 1.0, 2.0, 0.0], h.bin)
h.increment 5
- assert_equal([3.0,0.0,1.0,2.0,0.0], h.bin)
+ assert_equal([3.0, 0.0, 1.0, 2.0, 0.0], h.bin)
end
-
- should "alloc_uniform correctly with n, min,max" do
- h = Statsample::Histogram.alloc_uniform(5,0,10)
- assert_equal(5,h.bins)
- assert_equal([0.0]*5,h.bin)
- assert_equal([0.0,2.0,4.0,6.0,8.0,10.0], h.range)
+
+ should 'alloc_uniform correctly with n, min,max' do
+ h = Statsample::Histogram.alloc_uniform(5, 0, 10)
+ assert_equal(5, h.bins)
+ assert_equal([0.0] * 5, h.bin)
+ assert_equal([0.0, 2.0, 4.0, 6.0, 8.0, 10.0], h.range)
end
- should "alloc_uniform correctly with n, [min,max]" do
+ should 'alloc_uniform correctly with n, [min,max]' do
h = Statsample::Histogram.alloc_uniform(5, [0, 10])
- assert_equal(5,h.bins)
- assert_equal([0.0]*5,h.bin)
- assert_equal([0.0,2.0,4.0,6.0,8.0,10.0], h.range)
+ assert_equal(5, h.bins)
+ assert_equal([0.0] * 5, h.bin)
+ assert_equal([0.0, 2.0, 4.0, 6.0, 8.0, 10.0], h.range)
end
- should "get_range()" do
- h = Statsample::Histogram.alloc_uniform(5,2,12)
+ should 'get_range()' do
+ h = Statsample::Histogram.alloc_uniform(5, 2, 12)
5.times {|i|
- assert_equal([2+i*2, 4+i*2], h.get_range(i))
+ assert_equal([2 + i * 2, 4 + i * 2], h.get_range(i))
+ }
+ end
+ should 'min() and max()' do
+ h = Statsample::Histogram.alloc_uniform(5, 2, 12)
+ assert_equal(2, h.min)
+ assert_equal(12, h.max)
+ end
+ should 'max_val()' do
+ h = Statsample::Histogram.alloc(5, [0, 5])
+ 100.times { h.increment(rand * 5) }
+ max = h.bin[0]
+ (1..4).each {|i|
+ max = h.bin[i] if h.bin[i] > max
}
+ assert_equal(max, h.max_val)
end
- should "min() and max()" do
- h=Statsample::Histogram.alloc_uniform(5,2,12)
- assert_equal(2,h.min)
- assert_equal(12,h.max)
- end
- should "max_val()" do
- h = Statsample::Histogram.alloc(5, [0, 5])
- 100.times {h.increment(rand*5)}
- max=h.bin[0]
- (1..4).each {|i|
- max = h.bin[i] if h.bin[i] > max
- }
- assert_equal(max,h.max_val)
- end
- should "min_val()" do
- h = Statsample::Histogram.alloc(5, [0, 5])
- 100.times {h.increment(rand*5)}
- min=h.bin[0]
- (1..4).each {|i|
- min = h.bin[i] if h.bin[i]x1,'x2'=>x2}.to_dataset
- ds.name="test"
- obs=m.to_dataset
- assert_equal(ds['x1'],obs['x1'])
- assert_equal(ds['x2'],obs['x2'])
- assert_equal(ds['x1'].mean,obs['x1'].mean)
-
-
+ m.fields_y = [:x1, :x2]
+ m.name = 'test'
+ samples = 100
+ x1 =Daru::Vector.new([1, 2, 3])
+ x2 =Daru::Vector.new([4, 5, 6])
+ ds = Daru::DataFrame.new({ :x1 => x1, :x2 => x2 })
+ ds.rename 'test'
+ obs = m.to_dataframe
+ assert_equal(ds[:x1], obs[:x1])
+ assert_equal(ds[:x2], obs[:x2])
+ assert_equal(ds[:x1].mean, obs[:x1].mean)
end
+
def test_covariate
- a=Matrix[[1.0, 0.3, 0.2], [0.3, 1.0, 0.5], [0.2, 0.5, 1.0]]
+ a = Matrix[[1.0, 0.3, 0.2], [0.3, 1.0, 0.5], [0.2, 0.5, 1.0]]
a.extend Statsample::CovariateMatrix
- a.fields=%w{a b c}
+ a.fields = %w(a b c)
assert_equal(:correlation, a._type)
- assert_equal(Matrix[[0.5],[0.3]], a.submatrix(%w{c a}, %w{b}))
- assert_equal(Matrix[[1.0, 0.2] , [0.2, 1.0]], a.submatrix(%w{c a}))
- assert_equal(:correlation, a.submatrix(%w{c a})._type)
+ assert_equal(Matrix[[0.5], [0.3]], a.submatrix(%w(c a), %w(b)))
+ assert_equal(Matrix[[1.0, 0.2], [0.2, 1.0]], a.submatrix(%w(c a)))
+ assert_equal(:correlation, a.submatrix(%w(c a))._type)
- a=Matrix[[20,30,10], [30,60,50], [10,50,50]]
+ a = Matrix[[20, 30, 10], [30, 60, 50], [10, 50, 50]]
a.extend Statsample::CovariateMatrix
assert_equal(:covariance, a._type)
- a=50.times.collect {rand()}.to_scale
- b=50.times.collect {rand()}.to_scale
- c=50.times.collect {rand()}.to_scale
- ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
- corr=Statsample::Bivariate.correlation_matrix(ds)
- real=Statsample::Bivariate.covariance_matrix(ds).correlation
+ a = Daru::Vector.new(50.times.collect { rand })
+ b = Daru::Vector.new(50.times.collect { rand })
+ c = Daru::Vector.new(50.times.collect { rand })
+ ds = Daru::DataFrame.new({ :a => a, :b => b, :c => c })
+ corr = Statsample::Bivariate.correlation_matrix(ds)
+ real = Statsample::Bivariate.covariance_matrix(ds).correlation
corr.row_size.times do |i|
corr.column_size.times do |j|
- assert_in_delta(corr[i,j], real[i,j],1e-15)
+ assert_in_delta(corr[i, j], real[i, j], 1e-15)
end
end
- end
+ end
end
diff --git a/test/test_multiset.rb b/test/test_multiset.rb
index 2c5487c..0e47477 100644
--- a/test/test_multiset.rb
+++ b/test/test_multiset.rb
@@ -1,158 +1,176 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-
-class StatsampleMultisetTestCase < MiniTest::Unit::TestCase
+class StatsampleMultisetTestCase < Minitest::Test
def setup
- @x=%w{a a a a b b b b}.to_vector
- @y=[1,2,3,4,5,6,7,8].to_scale
- @z=[10,11,12,13,14,15,16,17].to_scale
- @ds={'x'=>@x,'y'=>@y,'z'=>@z}.to_dataset
- @ms=@ds.to_multiset_by_split('x')
+ @x = Daru::Vector.new(%w(a a a a b b b b))
+ @y = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7, 8])
+ @z = Daru::Vector.new([10, 11, 12, 13, 14, 15, 16, 17])
+ @ds = Daru::DataFrame.new({ :x => @x, :y => @y, :z => @z })
+ @ms = @ds.to_multiset_by_split(:x)
end
+
def test_creation
- v1a=[1,2,3,4,5].to_vector
- v2b=[11,21,31,41,51].to_vector
- v3c=[21,23,34,45,56].to_vector
- ds1={'v1'=>v1a,'v2'=>v2b,'v3'=>v3c}.to_dataset
- v1b=[15,25,35,45,55].to_vector
- v2b=[11,21,31,41,51].to_vector
- v3b=[21,23,34,45,56].to_vector
- ds2={'v1'=>v1b,'v2'=>v2b,'v3'=>v3b}.to_dataset
- ms=Statsample::Multiset.new(['v1','v2','v3'])
- ms.add_dataset('ds1',ds1)
- ms.add_dataset('ds2',ds2)
- assert_equal(ds1,ms['ds1'])
- assert_equal(ds2,ms['ds2'])
- assert_equal(v1a,ms['ds1']['v1'])
- assert_not_equal(v1b,ms['ds1']['v1'])
- ds3={'v1'=>v1b,'v2'=>v2b}.to_dataset
+ v1a = Daru::Vector.new([1, 2, 3, 4, 5])
+ v2b = Daru::Vector.new([11, 21, 31, 41, 51])
+ v3c = Daru::Vector.new([21, 23, 34, 45, 56])
+ ds1 = Daru::DataFrame.new({ :v1 => v1a, :v2 => v2b, :v3 => v3c })
+ v1b = Daru::Vector.new([15, 25, 35, 45, 55])
+ v2b = Daru::Vector.new([11, 21, 31, 41, 51])
+ v3b = Daru::Vector.new([21, 23, 34, 45, 56])
+ ds2 = Daru::DataFrame.new({ :v1 => v1b, :v2 => v2b, :v3 => v3b })
+ ms = Statsample::Multiset.new([:v1, :v2, :v3])
+ ms.add_dataset(:ds1, ds1)
+ ms.add_dataset(:ds2, ds2)
+ assert_equal(ds1, ms[:ds1])
+ assert_equal(ds2, ms[:ds2])
+ assert_equal(v1a, ms[:ds1][:v1])
+ assert_not_equal(v1b, ms[:ds1][:v1])
+ ds3 = Daru::DataFrame.new({ :v1 => v1b, :v2 => v2b })
assert_raise ArgumentError do
ms.add_dataset(ds3)
end
end
+
def test_creation_empty
- ms=Statsample::Multiset.new_empty_vectors(%w{id age name},%w{male female})
- ds_male={'id'=>[].to_vector,'age'=>[].to_vector, 'name'=>[].to_vector}.to_dataset(%w{id age name})
- ds_female={'id'=>[].to_vector,'age'=>[].to_vector, 'name'=>[].to_vector}.to_dataset(%w{id age name})
- ms2=Statsample::Multiset.new(%w{id age name})
- ms2.add_dataset('male',ds_male)
- ms2.add_dataset('female',ds_female)
- assert_equal(ms2.fields,ms.fields)
- assert_equal(ms2['male'],ms['male'])
- assert_equal(ms2['female'],ms['female'])
+ ms = Statsample::Multiset.new_empty_vectors([:id, :age, :name], [:male, :female])
+ ds_male = Daru::DataFrame.new({
+ :id => Daru::Vector.new([]),
+ :age => Daru::Vector.new([]),
+ :name => Daru::Vector.new([])
+ }, order: [:id, :age, :name])
+
+ ds_female = Daru::DataFrame.new({
+ :id => Daru::Vector.new([]),
+ :age => Daru::Vector.new([]),
+ :name => Daru::Vector.new([])
+ }, order: [:id, :age, :name])
+
+ ms2 = Statsample::Multiset.new([:id, :age, :name])
+ ms2.add_dataset(:male, ds_male)
+ ms2.add_dataset(:female, ds_female)
+ assert_equal(ms2.fields, ms.fields)
+ assert_equal(ms2[:male], ms[:male])
+ assert_equal(ms2[:female], ms[:female])
end
+
def test_to_multiset_by_split_one
- sex=%w{m m m m m f f f f m}.to_vector(:nominal)
- city=%w{London Paris NY London Paris NY London Paris NY Tome}.to_vector(:nominal)
- age=[10,10,20,30,34,34,33,35,36,40].to_vector(:scale)
- ds={'sex'=>sex,'city'=>city,'age'=>age}.to_dataset
- ms=ds.to_multiset_by_split('sex')
- assert_equal(2,ms.n_datasets)
- assert_equal(%w{f m},ms.datasets.keys.sort)
- assert_equal(6,ms['m'].cases)
- assert_equal(4,ms['f'].cases)
- assert_equal(%w{London Paris NY London Paris Tome},ms['m']['city'].to_a)
- assert_equal([34,33,35,36],ms['f']['age'].to_a)
+ sex = Daru::Vector.new(%w(m m m m m f f f f m))
+ city = Daru::Vector.new(%w(London Paris NY London Paris NY London Paris NY Tome))
+ age = Daru::Vector.new([10, 10, 20, 30, 34, 34, 33, 35, 36, 40])
+ ds = Daru::DataFrame.new({ :sex => sex, :city => city, :age => age })
+ ms = ds.to_multiset_by_split(:sex)
+ assert_equal(2, ms.n_datasets)
+ assert_equal(%w(f m), ms.datasets.keys.sort)
+ assert_equal(6, ms['m'].nrows)
+ assert_equal(4, ms['f'].nrows)
+ assert_equal(%w(London Paris NY London Paris Tome), ms['m'][:city].to_a)
+ assert_equal([34, 33, 35, 36], ms['f'][:age].to_a)
end
+
def test_to_multiset_by_split_multiple
- sex=%w{m m m m m m m m m m f f f f f f f f f f}.to_vector(:nominal)
- city=%w{London London London Paris Paris London London London Paris Paris London London London Paris Paris London London London Paris Paris}.to_vector(:nominal)
- hair=%w{blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black}.to_vector(:nominal)
- age=[10,10,20,30,34,34,33,35,36,40, 10,10,20,30,34,34,33,35,36,40].to_vector(:scale)
- ds={'sex'=>sex,'city'=>city,'hair'=>hair,'age'=>age}.to_dataset(%w{sex city hair age})
- ms=ds.to_multiset_by_split('sex','city','hair')
- assert_equal(8,ms.n_datasets)
- assert_equal(3,ms[%w{m London blonde}].cases)
- assert_equal(3,ms[%w{m London blonde}].cases)
- assert_equal(1,ms[%w{m Paris black}].cases)
+ sex = Daru::Vector.new(%w(m m m m m m m m m m f f f f f f f f f f))
+ city = Daru::Vector.new(%w(London London London Paris Paris London London London Paris Paris London London London Paris Paris London London London Paris Paris))
+ hair = Daru::Vector.new(%w(blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black blonde blonde black black))
+ age = Daru::Vector.new([10, 10, 20, 30, 34, 34, 33, 35, 36, 40, 10, 10, 20, 30, 34, 34, 33, 35, 36, 40])
+ ds = Daru::DataFrame.new({
+ :sex => sex, :city => city, :hair => hair, :age => age
+ }, order: [:sex, :city, :hair, :age])
+ ms = ds.to_multiset_by_split(:sex, :city, :hair)
+ assert_equal(8, ms.n_datasets)
+ assert_equal(3, ms[%w(m London blonde)].nrows)
+ assert_equal(3, ms[%w(m London blonde)].nrows)
+ assert_equal(1, ms[%w(m Paris black)].nrows)
end
def test_stratum_proportion
- ds1={'q1'=>[1,1,1,1,1,0,0,0,0,0,0,0].to_vector}.to_dataset
- ds2={'q1'=>[1,1,1,1,1,1,1,0,0].to_vector}.to_dataset
- assert_equal(5.0/12, ds1['q1'].proportion )
- assert_equal(7.0/9, ds2['q1'].proportion )
- ms=Statsample::Multiset.new(['q1'])
- ms.add_dataset('d1',ds1)
- ms.add_dataset('d2',ds2)
- ss=Statsample::StratifiedSample.new(ms,{'d1'=>50,'d2'=>100})
- assert_in_delta(0.655, ss.proportion('q1'),0.01)
- assert_in_delta(0.345, ss.proportion('q1',0),0.01)
-
+ ds1 = Daru::DataFrame.new({ :q1 => Daru::Vector.new([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]) })
+ ds2 = Daru::DataFrame.new({ :q1 => Daru::Vector.new([1, 1, 1, 1, 1, 1, 1, 0, 0]) })
+ assert_equal(5.0 / 12, ds1[:q1].proportion)
+ assert_equal(7.0 / 9, ds2[:q1].proportion)
+ ms = Statsample::Multiset.new([:q1])
+ ms.add_dataset(:d1, ds1)
+ ms.add_dataset(:d2, ds2)
+ ss = Statsample::StratifiedSample.new(ms, :d1 => 50, :d2 => 100)
+ assert_in_delta(0.655, ss.proportion(:q1), 0.01)
+ assert_in_delta(0.345, ss.proportion(:q1, 0), 0.01)
end
+
def test_stratum_scale
- boys={'test'=>[50, 55, 60, 62, 62, 65, 67, 67, 70, 70, 73, 73, 75, 78, 78, 80, 85, 90].to_vector(:scale)}.to_dataset
- girls={'test'=>[70, 70, 72, 72, 75, 75, 78, 78, 80, 80, 82, 82, 85, 85, 88, 88, 90, 90].to_vector(:scale)}.to_dataset
- ms=Statsample::Multiset.new(['test'])
- ms.add_dataset('boys',boys)
- ms.add_dataset('girls',girls)
- ss=Statsample::StratifiedSample.new(ms,{'boys'=>10000,'girls'=>10000})
- assert_equal(2,ss.strata_number)
- assert_equal(20000,ss.population_size)
- assert_equal(10000,ss.stratum_size('boys'))
- assert_equal(10000,ss.stratum_size('girls'))
- assert_equal(36,ss.sample_size)
- assert_equal(75,ss.mean('test'))
- assert_in_delta(1.45,ss.standard_error_wor('test'),0.01)
- assert_in_delta(ss.standard_error_wor('test'), ss.standard_error_wor_2('test'),0.00001)
+ boys = Daru::DataFrame.new({ :test => Daru::Vector.new([50, 55, 60, 62, 62, 65, 67, 67, 70, 70, 73, 73, 75, 78, 78, 80, 85, 90]) })
+ girls =Daru::DataFrame.new({ :test => Daru::Vector.new( [70, 70, 72, 72, 75, 75, 78, 78, 80, 80, 82, 82, 85, 85, 88, 88, 90, 90]) })
+ ms = Statsample::Multiset.new([:test])
+ ms.add_dataset(:boys, boys)
+ ms.add_dataset(:girls, girls)
+ ss = Statsample::StratifiedSample.new(ms, :boys => 10_000, :girls => 10_000)
+ assert_equal(2, ss.strata_number)
+ assert_equal(20_000, ss.population_size)
+ assert_equal(10_000, ss.stratum_size(:boys))
+ assert_equal(10_000, ss.stratum_size(:girls))
+ assert_equal(36, ss.sample_size)
+ assert_equal(75, ss.mean(:test))
+ assert_in_delta(1.45, ss.standard_error_wor(:test), 0.01)
+ assert_in_delta(ss.standard_error_wor(:test), ss.standard_error_wor_2(:test), 0.00001)
end
+
def test_each
- xpe={
- 'a'=>%w{a a a a}.to_vector,
- 'b'=>%w{b b b b}.to_vector
+ xpe = {
+ 'a' => Daru::Vector.new(%w(a a a a)),
+ 'b' => Daru::Vector.new(%w(b b b b))
}
- ype={
- 'a'=>[1,2,3,4].to_scale,
- 'b'=>[5,6,7,8].to_scale,
+ ype = {
+ 'a' => Daru::Vector.new([1, 2, 3, 4]),
+ 'b' => Daru::Vector.new([5, 6, 7, 8])
}
- zpe={
- 'a'=>[10,11,12,13].to_scale,
- 'b'=>[14,15,16,17].to_scale,
+ zpe = {
+ 'a' => Daru::Vector.new([10, 11, 12, 13]),
+ 'b' => Daru::Vector.new([14, 15, 16, 17])
}
- xp,yp,zp=Hash.new(),Hash.new(),Hash.new()
- @ms.each {|k,ds|
- xp[k]=ds['x']
- yp[k]=ds['y']
- zp[k]=ds['z']
+ xp, yp, zp = {}, {}, {}
+ @ms.each {|k, ds|
+ xp[k] = ds[:x]
+ yp[k] = ds[:y]
+ zp[k] = ds[:z]
}
- assert_equal(xpe,xp)
- assert_equal(ype,yp)
- assert_equal(zpe,zp)
-
+ assert_equal(xpe, xp)
+ assert_equal(ype, yp)
+ assert_equal(zpe, zp)
end
+
def test_multiset_union_with_block
-
- r1=rand()
- r2=rand()
- ye=[1*r1,2*r1,3*r1,4*r1,5*r2,6*r2,7*r2,8*r2].to_scale
-
- ze=[10*r1,11*r1,12*r1,13*r1, 14*r2,15*r2,16*r2,17*r2].to_scale
-
- ds2=@ms.union {|k,ds|
- ds['y'].recode!{|v|
- k=='a' ? v*r1 : v*r2}
- ds['z'].recode!{|v|
- k=='a' ? v*r1 : v*r2}
+ r1 = rand
+ r2 = rand
+ ye = Daru::Vector.new([1 * r1, 2 * r1, 3 * r1, 4 * r1, 5 * r2, 6 * r2, 7 * r2, 8 * r2])
+
+ ze = Daru::Vector.new([10 * r1, 11 * r1, 12 * r1, 13 * r1, 14 * r2, 15 * r2, 16 * r2, 17 * r2])
+
+ ds2 = @ms.union {|k, ds|
+ ds[:y].recode!{|v|
+ k == 'a' ? v * r1 : v * r2
+ }
+ ds[:z].recode!{|v|
+ k == 'a' ? v * r1 : v * r2
+ }
}
- assert_equal(ye,ds2['y'])
- assert_equal(ze,ds2['z'])
+ assert_equal(ye, ds2[:y])
+ assert_equal(ze, ds2[:z])
end
+
def test_multiset_union
- r1=rand()
- r2=rand()
- ye=[1*r1,2*r1,3*r1,4*r1,5*r2,6*r2,7*r2,8*r2].to_scale
-
- ze=[10*r1,11*r1,12*r1,13*r1, 14*r2,15*r2,16*r2,17*r2].to_scale
- @ms.each {|k,ds|
- ds['y'].recode!{|v|
- k=='a' ? v*r1 : v*r2}
- ds['z'].recode!{|v|
- k=='a' ? v*r1 : v*r2}
-
- }
- ds2=@ms.union
- assert_equal(ye,ds2['y'])
- assert_equal(ze,ds2['z'])
-
+ r1 = rand
+ r2 = rand
+ ye = Daru::Vector.new([1 * r1, 2 * r1, 3 * r1, 4 * r1, 5 * r2, 6 * r2, 7 * r2, 8 * r2])
+ ze = Daru::Vector.new([10 * r1, 11 * r1, 12 * r1, 13 * r1, 14 * r2, 15 * r2, 16 * r2, 17 * r2])
+
+ @ms.each do |k, ds|
+ ds[:y].recode! { |v|
+ k == 'a' ? v * r1 : v * r2
+ }
+ ds[:z].recode! {|v|
+ k == 'a' ? v * r1 : v * r2
+ }
+ end
+ ds2 = @ms.union
+ assert_equal(ye, ds2[:y])
+ assert_equal(ze, ds2[:z])
end
end
diff --git a/test/test_regression.rb b/test/test_regression.rb
index 8405703..8c23bc0 100644
--- a/test/test_regression.rb
+++ b/test/test_regression.rb
@@ -1,215 +1,215 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-class StatsampleRegressionTestCase < MiniTest::Unit::TestCase
- context "Example with missing data" do
- setup do
- @x=[0.285714285714286, 0.114285714285714, 0.314285714285714, 0.2, 0.2, 0.228571428571429, 0.2, 0.4, 0.714285714285714, 0.285714285714286, 0.285714285714286, 0.228571428571429, 0.485714285714286, 0.457142857142857, 0.257142857142857, 0.228571428571429, 0.285714285714286, 0.285714285714286, 0.285714285714286, 0.142857142857143, 0.285714285714286, 0.514285714285714, 0.485714285714286, 0.228571428571429, 0.285714285714286, 0.342857142857143, 0.285714285714286, 0.0857142857142857].to_scale
+class StatsampleRegressionTestCase < Minitest::Test
+ context 'Example with missing data' do
+ setup do
+ @x = Daru::Vector.new([0.285714285714286, 0.114285714285714, 0.314285714285714, 0.2, 0.2, 0.228571428571429, 0.2, 0.4, 0.714285714285714, 0.285714285714286, 0.285714285714286, 0.228571428571429, 0.485714285714286, 0.457142857142857, 0.257142857142857, 0.228571428571429, 0.285714285714286, 0.285714285714286, 0.285714285714286, 0.142857142857143, 0.285714285714286, 0.514285714285714, 0.485714285714286, 0.228571428571429, 0.285714285714286, 0.342857142857143, 0.285714285714286, 0.0857142857142857])
- @y=[nil, 0.233333333333333, nil, 0.266666666666667, 0.366666666666667, nil, 0.333333333333333, 0.3, 0.666666666666667, 0.0333333333333333, 0.333333333333333, nil, nil, 0.533333333333333, 0.433333333333333, 0.4, 0.4, 0.5, 0.4, 0.266666666666667, 0.166666666666667, 0.666666666666667, 0.433333333333333, 0.166666666666667, nil, 0.4, 0.366666666666667, nil].to_scale
- @ds={'x'=>@x,'y'=>@y}.to_dataset
- @lr=Statsample::Regression::Multiple::RubyEngine.new(@ds,'y')
+ @y = Daru::Vector.new([nil, 0.233333333333333, nil, 0.266666666666667, 0.366666666666667, nil, 0.333333333333333, 0.3, 0.666666666666667, 0.0333333333333333, 0.333333333333333, nil, nil, 0.533333333333333, 0.433333333333333, 0.4, 0.4, 0.5, 0.4, 0.266666666666667, 0.166666666666667, 0.666666666666667, 0.433333333333333, 0.166666666666667, nil, 0.4, 0.366666666666667, nil])
+ @ds = Daru::DataFrame.new({ :x => @x, :y => @y })
+ @lr = Statsample::Regression::Multiple::RubyEngine.new(@ds, :y)
+ end
+ should 'have correct values' do
+ assert_in_delta(0.455, @lr.r2, 0.001)
+ assert_in_delta(0.427, @lr.r2_adjusted, 0.001)
+ assert_in_delta(0.1165, @lr.se_estimate, 0.001)
+ assert_in_delta(15.925, @lr.f, 0.0001)
+ assert_in_delta(0.675, @lr.standarized_coeffs[:x], 0.001)
+ assert_in_delta(0.778, @lr.coeffs[:x], 0.001, 'coeff x')
+ assert_in_delta(0.132, @lr.constant, 0.001, 'constant')
+ assert_in_delta(0.195, @lr.coeffs_se[:x], 0.001, 'coeff x se')
+ assert_in_delta(0.064, @lr.constant_se, 0.001, 'constant se')
end
- should "have correct values" do
- assert_in_delta(0.455,@lr.r2,0.001)
- assert_in_delta(0.427,@lr.r2_adjusted, 0.001)
- assert_in_delta(0.1165,@lr.se_estimate,0.001)
- assert_in_delta(15.925,@lr.f,0.0001)
- assert_in_delta(0.675, @lr.standarized_coeffs['x'],0.001)
- assert_in_delta(0.778, @lr.coeffs['x'],0.001, "coeff x")
- assert_in_delta(0.132, @lr.constant,0.001,"constant")
- assert_in_delta(0.195, @lr.coeffs_se['x'],0.001,"coeff x se")
- assert_in_delta(0.064, @lr.constant_se,0.001,"constant se")
- end
end
- should "return an error if data is linearly dependent" do
- samples=100
-
- a,b=rand,rand
-
- x1=samples.times.map { rand}.to_scale
- x2=samples.times.map {rand}.to_scale
- x3=samples.times.map {|i| x1[i]*(1+a)+x2[i]*(1+b)}.to_scale
- y=samples.times.map {|i| x1[i]+x2[i]+x3[i]+rand}.to_scale
-
- ds={'x1'=>x1,'x2'=>x2,'x3'=>x3,'y'=>y}.to_dataset
+ should 'return an error if data is linearly dependent' do
+ samples = 100
+
+ a, b = rand, rand
+
+ x1 = Daru::Vector.new(samples.times.map { rand })
+ x2 = Daru::Vector.new(samples.times.map { rand })
+ x3 = Daru::Vector.new(samples.times.map { |i| x1[i] * (1 + a) + x2[i] * (1 + b) })
+ y = Daru::Vector.new(samples.times.map { |i| x1[i] + x2[i] + x3[i] + rand })
+
+ ds = Daru::DataFrame.new({ :x1 => x1, :x2 => x2, :x3 => x3, :y => y })
assert_raise(Statsample::Regression::LinearDependency) {
- Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
+ Statsample::Regression::Multiple::RubyEngine.new(ds, :y)
}
end
def test_parameters
- @x=[13,20,10,33,15].to_vector(:scale)
- @y=[23,18,35,10,27 ].to_vector(:scale)
- reg=Statsample::Regression::Simple.new_from_vectors(@x,@y)
+ @x =Daru::Vector.new([13, 20, 10, 33, 15])
+ @y =Daru::Vector.new([23, 18, 35, 10, 27])
+ reg = Statsample::Regression::Simple.new_from_vectors(@x, @y)
_test_simple_regression(reg)
- ds={'x'=>@x,'y'=>@y}.to_dataset
- reg=Statsample::Regression::Simple.new_from_dataset(ds,'x','y')
+ ds = Daru::DataFrame.new({ :x => @x, :y => @y })
+ reg = Statsample::Regression::Simple.new_from_dataset(ds, :x, :y)
_test_simple_regression(reg)
- reg=Statsample::Regression.simple(@x,@y)
+ reg = Statsample::Regression.simple(@x, @y)
_test_simple_regression(reg)
-
end
+
def _test_simple_regression(reg)
-
- assert_in_delta(40.009, reg.a,0.001)
- assert_in_delta(-0.957, reg.b,0.001)
- assert_in_delta(4.248,reg.standard_error,0.002)
+ assert_in_delta(40.009, reg.a, 0.001)
+ assert_in_delta(-0.957, reg.b, 0.001)
+ assert_in_delta(4.248, reg.standard_error, 0.002)
assert(reg.summary)
end
-
+
def test_summaries
- a=10.times.map{rand(100)}.to_scale
- b=10.times.map{rand(100)}.to_scale
- y=10.times.map{rand(100)}.to_scale
- ds={'a'=>a,'b'=>b,'y'=>y}.to_dataset
- lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
- assert(lr.summary.size>0)
+ a = Daru::Vector.new(10.times.map { rand(100) })
+ b = Daru::Vector.new(10.times.map { rand(100) })
+ y = Daru::Vector.new(10.times.map { rand(100) })
+ ds = Daru::DataFrame.new({ :a => a, :b => b, :y => y })
+ lr = Statsample::Regression::Multiple::RubyEngine.new(ds, :y)
+ assert(lr.summary.size > 0)
end
+
def test_multiple_dependent
- complete=Matrix[
- [1,0.53,0.62,0.19,-0.09,0.08,0.02,-0.12,0.08],
- [0.53,1,0.61,0.23,0.1,0.18,0.02,-0.1,0.15],
- [0.62,0.61,1,0.03,0.1,0.12,0.03,-0.06,0.12],
- [0.19,0.23,0.03,1,-0.02,0.02,0,-0.02,-0.02],
- [-0.09,0.1,0.1,-0.02,1,0.05,0.06,0.18,0.02],
- [0.08,0.18,0.12,0.02,0.05,1,0.22,-0.07,0.36],
- [0.02,0.02,0.03,0,0.06,0.22,1,-0.01,-0.05],
- [-0.12,-0.1,-0.06,-0.02,0.18,-0.07,-0.01,1,-0.03],
- [0.08,0.15,0.12,-0.02,0.02,0.36,-0.05,-0.03,1]]
+ complete = Matrix[
+ [1, 0.53, 0.62, 0.19, -0.09, 0.08, 0.02, -0.12, 0.08],
+ [0.53, 1, 0.61, 0.23, 0.1, 0.18, 0.02, -0.1, 0.15],
+ [0.62, 0.61, 1, 0.03, 0.1, 0.12, 0.03, -0.06, 0.12],
+ [0.19, 0.23, 0.03, 1, -0.02, 0.02, 0, -0.02, -0.02],
+ [-0.09, 0.1, 0.1, -0.02, 1, 0.05, 0.06, 0.18, 0.02],
+ [0.08, 0.18, 0.12, 0.02, 0.05, 1, 0.22, -0.07, 0.36],
+ [0.02, 0.02, 0.03, 0, 0.06, 0.22, 1, -0.01, -0.05],
+ [-0.12, -0.1, -0.06, -0.02, 0.18, -0.07, -0.01, 1, -0.03],
+ [0.08, 0.15, 0.12, -0.02, 0.02, 0.36, -0.05, -0.03, 1]]
complete.extend Statsample::CovariateMatrix
- complete.fields=%w{adhd cd odd sex age monly mwork mage poverty}
+ complete.fields = %w(adhd cd odd sex age monly mwork mage poverty)
- lr=Statsample::Regression::Multiple::MultipleDependent.new(complete, %w{adhd cd odd})
-
-
- assert_in_delta(0.197, lr.r2yx,0.001)
- assert_in_delta(0.197, lr.r2yx_covariance,0.001)
- assert_in_delta(0.07, lr.p2yx,0.001)
+ lr = Statsample::Regression::Multiple::MultipleDependent.new(complete, %w(adhd cd odd))
+ assert_in_delta(0.197, lr.r2yx, 0.001)
+ assert_in_delta(0.197, lr.r2yx_covariance, 0.001)
+ assert_in_delta(0.07, lr.p2yx, 0.001)
end
-
+
def test_multiple_regression_pairwise_2
- @a=[1,3,2,4,3,5,4,6,5,7,3,nil,3,nil,3].to_vector(:scale)
- @b=[3,3,4,4,5,5,6,6,4,4,2,2,nil,6,2].to_vector(:scale)
- @c=[11,22,30,40,50,65,78,79,99,100,nil,3,7,nil,7].to_vector(:scale)
- @y=[3,4,5,6,7,8,9,10,20,30,30,40,nil,50,nil].to_vector(:scale)
- ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
- lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
- assert_in_delta(2407.436,lr.sst,0.001)
- assert_in_delta(0.752,lr.r,0.001, "pairwise r")
- assert_in_delta(0.565,lr.r2,0.001)
- assert_in_delta(1361.130,lr.ssr,0.001)
- assert_in_delta(1046.306,lr.sse,0.001)
- assert_in_delta(3.035,lr.f,0.001)
+ @a =Daru::Vector.new( [1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 3, nil, 3, nil, 3])
+ @b =Daru::Vector.new( [3, 3, 4, 4, 5, 5, 6, 6, 4, 4, 2, 2, nil, 6, 2])
+ @c =Daru::Vector.new( [11, 22, 30, 40, 50, 65, 78, 79, 99, 100, nil, 3, 7, nil, 7])
+ @y =Daru::Vector.new( [3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 30, 40, nil, 50, nil])
+ ds = Daru::DataFrame.new({ :a => @a, :b => @b, :c => @c, :y => @y })
+ lr = Statsample::Regression::Multiple::RubyEngine.new(ds, :y)
+ assert_in_delta(2407.436, lr.sst, 0.001)
+ assert_in_delta(0.752, lr.r, 0.001, 'pairwise r')
+ assert_in_delta(0.565, lr.r2, 0.001)
+ assert_in_delta(1361.130, lr.ssr, 0.001)
+ assert_in_delta(1046.306, lr.sse, 0.001)
+ assert_in_delta(3.035, lr.f, 0.001)
end
-
def test_multiple_regression_gsl
if Statsample.has_gsl?
- @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
- @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
- @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
- @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
- ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
- lr=Statsample::Regression::Multiple::GslEngine.new(ds,'y')
- assert(lr.summary.size>0)
- model_test(lr,'gsl')
- predicted=[1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198]
- c_predicted=lr.predicted
+ @a =Daru::Vector.new( [1, 3, 2, 4, 3, 5, 4, 6, 5, 7])
+ @b =Daru::Vector.new( [3, 3, 4, 4, 5, 5, 6, 6, 4, 4])
+ @c =Daru::Vector.new( [11, 22, 30, 40, 50, 65, 78, 79, 99, 100])
+ @y =Daru::Vector.new( [3, 4, 5, 6, 7, 8, 9, 10, 20, 30])
+ ds = Daru::DataFrame.new({ :a => @a, :b => @b, :c => @c, :y => @y })
+ lr = Statsample::Regression::Multiple::GslEngine.new(ds, :y)
+ assert(lr.summary.size > 0)
+ model_test(lr, 'gsl')
+ predicted = [1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198]
+ c_predicted = lr.predicted
predicted.each_index{|i|
- assert_in_delta(predicted[i],c_predicted[i],0.001)
+ assert_in_delta(predicted[i], c_predicted[i], 0.001)
}
- residuals=[1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801]
- c_residuals=lr.residuals
+ residuals = [1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801]
+ c_residuals = lr.residuals
residuals.each_index{|i|
- assert_in_delta(residuals[i],c_residuals[i],0.001)
+ assert_in_delta(residuals[i], c_residuals[i], 0.001)
}
else
- skip "Regression::Multiple::GslEngine not tested (no Gsl)"
+ skip 'Regression::Multiple::GslEngine not tested (no Gsl)'
end
end
-
-
- def model_test_matrix(lr,name='undefined')
-
- stan_coeffs={'a'=>0.151,'b'=>-0.547,'c'=>0.997}
- unstan_coeffs={'a'=>0.695, 'b'=>-4.286, 'c'=>0.266}
+ def model_test_matrix(lr, name = 'undefined')
+ stan_coeffs = { :a => 0.151, :b => -0.547, :c => 0.997 }
+ unstan_coeffs = { :a => 0.695, :b => -4.286, :c => 0.266 }
unstan_coeffs.each_key{|k|
- assert_in_delta(unstan_coeffs[k], lr.coeffs[k],0.001,"b coeffs - #{name}")
+ assert_in_delta(unstan_coeffs[k], lr.coeffs[k], 0.001, "b coeffs - #{name}")
}
stan_coeffs.each_key{|k|
- assert_in_delta(stan_coeffs[k], lr.standarized_coeffs[k],0.001, "beta coeffs - #{name}")
+ assert_in_delta(stan_coeffs[k], lr.standarized_coeffs[k], 0.001, "beta coeffs - #{name}")
}
- assert_in_delta(11.027,lr.constant,0.001)
+ assert_in_delta(11.027, lr.constant, 0.001)
- assert_in_delta(0.955,lr.r,0.001)
- assert_in_delta(0.913,lr.r2,0.001)
+ assert_in_delta(0.955, lr.r, 0.001)
+ assert_in_delta(0.913, lr.r2, 0.001)
- assert_in_delta(20.908, lr.f,0.001)
+ assert_in_delta(20.908, lr.f, 0.001)
assert_in_delta(0.001, lr.probability, 0.001)
- assert_in_delta(0.226,lr.tolerance("a"),0.001)
-
- coeffs_se={"a"=>1.171,"b"=>1.129,"c"=>0.072}
+ assert_in_delta(0.226, lr.tolerance(:a), 0.001)
+ coeffs_se = { :a => 1.171, :b => 1.129, :c => 0.072 }
-
- ccoeffs_se=lr.coeffs_se
+ ccoeffs_se = lr.coeffs_se
coeffs_se.each_key{|k|
- assert_in_delta(coeffs_se[k],ccoeffs_se[k],0.001)
+ assert_in_delta(coeffs_se[k], ccoeffs_se[k], 0.001)
}
- coeffs_t={"a"=>0.594,"b"=>-3.796,"c"=>3.703}
- ccoeffs_t=lr.coeffs_t
+ coeffs_t = { :a => 0.594, :b => -3.796, :c => 3.703 }
+ ccoeffs_t = lr.coeffs_t
coeffs_t.each_key{|k|
- assert_in_delta(coeffs_t[k], ccoeffs_t[k],0.001)
+ assert_in_delta(coeffs_t[k], ccoeffs_t[k], 0.001)
}
- assert_in_delta(639.6,lr.sst,0.001)
- assert_in_delta(583.76,lr.ssr,0.001)
- assert_in_delta(55.840,lr.sse,0.001)
- assert(lr.summary.size>0, "#{name} without summary")
+ assert_in_delta(639.6, lr.sst, 0.001)
+ assert_in_delta(583.76, lr.ssr, 0.001)
+ assert_in_delta(55.840, lr.sse, 0.001)
+ assert(lr.summary.size > 0, "#{name} without summary")
end
- def model_test(lr,name='undefined')
- model_test_matrix(lr,name)
- assert_in_delta(4.559, lr.constant_se,0.001)
- assert_in_delta(2.419, lr.constant_t,0.001)
- assert_in_delta(1.785,lr.process([1,3,11]),0.001)
+ def model_test(lr, name = 'undefined')
+ model_test_matrix(lr, name)
+ assert_in_delta(4.559, lr.constant_se, 0.001)
+ assert_in_delta(2.419, lr.constant_t, 0.001)
+
+ assert_in_delta(1.785, lr.process([1, 3, 11]), 0.001)
end
+
def test_regression_matrix
- @a=[1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
- @b=[3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
- @c=[11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
- @y=[3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
- ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
- cor=Statsample::Bivariate.correlation_matrix(ds)
-
- lr=Statsample::Regression::Multiple::MatrixEngine.new(cor,'y', :y_mean=>@y.mean, :x_mean=>{'a'=>ds['a'].mean, 'b'=>ds['b'].mean, 'c'=>ds['c'].mean}, :cases=>@a.size, :y_sd=>@y.sd , :x_sd=>{'a' => @a.sd, 'b' => @b.sd, 'c' => @c.sd})
+ @a = Daru::Vector.new([1, 3, 2, 4, 3, 5, 4, 6, 5, 7])
+ @b = Daru::Vector.new([3, 3, 4, 4, 5, 5, 6, 6, 4, 4])
+ @c = Daru::Vector.new([11, 22, 30, 40, 50, 65, 78, 79, 99, 100])
+ @y = Daru::Vector.new([3, 4, 5, 6, 7, 8, 9, 10, 20, 30])
+ ds = Daru::DataFrame.new({ :a => @a, :b => @b, :c => @c, :y => @y })
+ cor = Statsample::Bivariate.correlation_matrix(ds)
+
+ lr = Statsample::Regression::Multiple::MatrixEngine.new(
+ cor, :y, y_mean: @y.mean,
+ x_mean: { :a => ds[:a].mean, :b => ds[:b].mean, :c => ds[:c].mean },
+ cases: @a.size, y_sd: @y.sd, x_sd: { :a => @a.sd, :b => @b.sd, :c => @c.sd })
assert_nil(lr.constant_se)
assert_nil(lr.constant_t)
- model_test_matrix(lr, "correlation matrix")
+ model_test_matrix(lr, 'correlation matrix')
- covariance=Statsample::Bivariate.covariance_matrix(ds)
- lr=Statsample::Regression::Multiple::MatrixEngine.new(covariance,'y', :y_mean=>@y.mean, :x_mean=>{'a'=>ds['a'].mean, 'b'=>ds['b'].mean, 'c'=>ds['c'].mean}, :cases=>@a.size)
- assert(lr.summary.size>0)
+ covariance = Statsample::Bivariate.covariance_matrix(ds)
+ lr = Statsample::Regression::Multiple::MatrixEngine.new(
+ covariance, :y, y_mean: @y.mean,
+ x_mean: { :a => ds[:a].mean, :b => ds[:b].mean, :c => ds[:c].mean }, cases: @a.size)
+ assert(lr.summary.size > 0)
- model_test(lr , "covariance matrix")
+ model_test(lr, 'covariance matrix')
end
+
def test_regression_rubyengine
- @a=[nil,1,3,2,4,3,5,4,6,5,7].to_vector(:scale)
- @b=[nil,3,3,4,4,5,5,6,6,4,4].to_vector(:scale)
- @c=[nil,11,22,30,40,50,65,78,79,99,100].to_vector(:scale)
- @y=[nil,3,4,5,6,7,8,9,10,20,30].to_vector(:scale)
- ds={'a'=>@a,'b'=>@b,'c'=>@c,'y'=>@y}.to_dataset
- lr=Statsample::Regression::Multiple::RubyEngine.new(ds,'y')
+ @a = Daru::Vector.new([nil, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7])
+ @b = Daru::Vector.new([nil, 3, 3, 4, 4, 5, 5, 6, 6, 4, 4])
+ @c = Daru::Vector.new([nil, 11, 22, 30, 40, 50, 65, 78, 79, 99, 100])
+ @y = Daru::Vector.new([nil, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30])
+ ds = Daru::DataFrame.new({ :a => @a, :b => @b, :c => @c, :y => @y })
+ lr = Statsample::Regression::Multiple::RubyEngine.new(ds, :y)
assert_equal(11, lr.total_cases)
assert_equal(10, lr.valid_cases)
model_test(lr, 'rubyengine with missing data')
- predicted=[nil,1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198]
+ predicted = [nil, 1.7857, 6.0989, 3.2433, 7.2908, 4.9667, 10.3428, 8.8158, 10.4717, 23.6639, 25.3198]
c_predicted = lr.predicted
predicted.each_index do |i|
if c_predicted[i].nil?
@@ -218,15 +218,14 @@ def test_regression_rubyengine
assert_in_delta(predicted[i], c_predicted[i], 0.001)
end
end
- residuals=[nil,1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801]
- c_residuals=lr.residuals
+ residuals = [nil, 1.2142, -2.0989, 1.7566, -1.29085, 2.033, -2.3428, 0.18414, -0.47177, -3.66395, 4.6801]
+ c_residuals = lr.residuals
residuals.each_index do |i|
if c_residuals[i].nil?
assert(residuals[i].nil?)
else
- assert_in_delta(residuals[i],c_residuals[i],0.001)
+ assert_in_delta(residuals[i], c_residuals[i], 0.001)
end
end
-
end
end
diff --git a/test/test_reliability.rb b/test/test_reliability.rb
index c7730e6..d0e284d 100644
--- a/test/test_reliability.rb
+++ b/test/test_reliability.rb
@@ -1,229 +1,223 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleReliabilityTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleReliabilityTestCase < Minitest::Test
context Statsample::Reliability do
- should "return correct r according to Spearman-Brown prophecy" do
- r=0.6849
- n=62.quo(15)
- assert_in_delta(0.9, Statsample::Reliability.sbp(r,n), 0.001)
+ should 'return correct r according to Spearman-Brown prophecy' do
+ r = 0.6849
+ n = 62.quo(15)
+ assert_in_delta(0.9, Statsample::Reliability.sbp(r, n), 0.001)
end
- should "return correct n for desired realiability" do
- r=0.6849
- r_d=0.9
- assert_in_delta(62, Statsample::Reliability.n_for_desired_reliability(r, r_d, 15),0.5)
- end
- context "Cronbach's alpha" do
+ should 'return correct n for desired realiability' do
+ r = 0.6849
+ r_d = 0.9
+ assert_in_delta(62, Statsample::Reliability.n_for_desired_reliability(r, r_d, 15), 0.5)
+ end
+ context "Cronbach's alpha" do
setup do
- @samples=40
- @n_variables=rand(10)+2
- @ds=Statsample::Dataset.new()
- base=@samples.times.collect {|a| rand()}.to_scale
+ @samples = 40
+ @n_variables = rand(10) + 2
+ @ds = Daru::DataFrame.new({}, index: @samples)
+ base = Daru::Vector.new(@samples.times.collect { |_a| rand })
@n_variables.times do |i|
- @ds[i]=base.collect {|v| v+rand()}.to_scale
+ @ds[i] = Daru::Vector.new(base.collect { |v| v + rand })
end
-
- @ds.update_valid_data
- @k=@ds.fields.size
- @cm=Statsample::Bivariate.covariance_matrix(@ds)
- @dse=@ds.dup
- @dse.fields.each do |f|
- @dse[f]=@dse[f].standarized
+
+ @k = @ds.ncols
+ @cm = Statsample::Bivariate.covariance_matrix(@ds)
+ @dse = @ds.dup
+ @dse.vectors.each do |f|
+ @dse[f] = @dse[f].standardize
end
- @cme=Statsample::Bivariate.covariance_matrix(@dse)
- @a=Statsample::Reliability.cronbach_alpha(@ds)
- @as=Statsample::Reliability.cronbach_alpha_standarized(@ds)
- end
- should "alpha will be equal to sum of matrix covariance less the individual variances" do
- total_sum=@cm.total_sum
- ind_var=@ds.fields.inject(0) {|ac,v| ac+@ds[v].variance}
- expected = @k.quo(@k-1) * (1-(ind_var.quo(total_sum)))
- assert_in_delta(expected, @a,1e-10)
- end
- should "method cronbach_alpha_from_n_s2_cov return correct values" do
- sa=Statsample::Reliability::ScaleAnalysis.new(@ds)
+ @cme = Statsample::Bivariate.covariance_matrix(@dse)
+ @a = Statsample::Reliability.cronbach_alpha(@ds)
+ @as = Statsample::Reliability.cronbach_alpha_standarized(@ds)
+ end
+ should 'alpha will be equal to sum of matrix covariance less the individual variances' do
+ total_sum = @cm.total_sum
+ ind_var = @ds.vectors.to_a.inject(0) { |ac, v| ac + @ds[v].variance }
+ expected = @k.quo(@k - 1) * (1 - (ind_var.quo(total_sum)))
+ assert_in_delta(expected, @a, 1e-10)
+ end
+ should 'method cronbach_alpha_from_n_s2_cov return correct values' do
+ sa = Statsample::Reliability::ScaleAnalysis.new(@ds)
vm, cm = sa.variances_mean, sa.covariances_mean
- assert_in_delta(sa.alpha, Statsample::Reliability.cronbach_alpha_from_n_s2_cov(@n_variables, vm,cm), 1e-10)
+ assert_in_delta(sa.alpha, Statsample::Reliability.cronbach_alpha_from_n_s2_cov(@n_variables, vm, cm), 1e-10)
end
- should "method cronbach_alpha_from_covariance_matrix returns correct value" do
- cov=Statsample::Bivariate.covariance_matrix(@ds)
- assert_in_delta(@a, Statsample::Reliability.cronbach_alpha_from_covariance_matrix(cov),0.0000001)
+ should 'method cronbach_alpha_from_covariance_matrix returns correct value' do
+ cov = Statsample::Bivariate.covariance_matrix(@ds)
+ assert_in_delta(@a, Statsample::Reliability.cronbach_alpha_from_covariance_matrix(cov), 0.0000001)
end
- should "return correct n for desired alpha, covariance and variance" do
- sa=Statsample::Reliability::ScaleAnalysis.new(@ds)
+ should 'return correct n for desired alpha, covariance and variance' do
+ sa = Statsample::Reliability::ScaleAnalysis.new(@ds)
vm, cm = sa.variances_mean, sa.covariances_mean
- n_obtained=Statsample::Reliability.n_for_desired_alpha(@a, vm,cm)
- #p n_obtained
- assert_in_delta(Statsample::Reliability.cronbach_alpha_from_n_s2_cov(n_obtained, vm,cm) ,@a,0.001)
- end
-
- should "standarized alpha will be equal to sum of matrix covariance less the individual variances on standarized values" do
- total_sum=@cme.total_sum
- ind_var=@dse.fields.inject(0) {|ac,v| ac+@dse[v].variance}
- expected = @k.quo(@k-1) * (1-(ind_var.quo(total_sum)))
+ n_obtained = Statsample::Reliability.n_for_desired_alpha(@a, vm, cm)
+ # p n_obtained
+ assert_in_delta(Statsample::Reliability.cronbach_alpha_from_n_s2_cov(n_obtained, vm, cm), @a, 0.001)
+ end
+
+ should 'standarized alpha will be equal to sum of matrix covariance less the individual variances on standarized values' do
+ total_sum = @cme.total_sum
+ ind_var = @dse.vectors.to_a.inject(0) { |ac, v| ac + @dse[v].variance }
+ expected = @k.quo(@k - 1) * (1 - (ind_var.quo(total_sum)))
assert_in_delta(expected, @as, 1e-10)
end
end
context Statsample::Reliability::ItemCharacteristicCurve do
setup do
- @samples=100
- @points=rand(10)+3
- @max_point=(@points-1)*3
- @x1=@samples.times.map{rand(@points)}.to_scale
- @x2=@samples.times.map{rand(@points)}.to_scale
- @x3=@samples.times.map{rand(@points)}.to_scale
- @ds={'a'=>@x1,'b'=>@x2,'c'=>@x3}.to_dataset
- @icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds)
- end
- should "have a correct automatic vector_total" do
+ @samples = 100
+ @points = rand(10) + 3
+ @max_point = (@points - 1) * 3
+ @x1 = Daru::Vector.new(@samples.times.map { rand(@points) })
+ @x2 = Daru::Vector.new(@samples.times.map { rand(@points) })
+ @x3 = Daru::Vector.new(@samples.times.map { rand(@points) })
+ @ds = Daru::DataFrame.new({ :a => @x1, :b => @x2, :c => @x3 })
+ @icc = Statsample::Reliability::ItemCharacteristicCurve.new(@ds)
+ end
+ should 'have a correct automatic vector_total' do
assert_equal(@ds.vector_sum, @icc.vector_total)
end
- should "have a correct different vector_total" do
- x2=@samples.times.map{rand(10)}.to_scale
- @icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds,x2)
+ should 'have a correct different vector_total' do
+ x2 = Daru::Vector.new(@samples.times.map { rand(10) })
+ @icc = Statsample::Reliability::ItemCharacteristicCurve.new(@ds, x2)
assert_equal(x2, @icc.vector_total)
assert_raises(ArgumentError) do
- inc=(@samples+10).times.map{rand(10)}.to_scale
- @icc=Statsample::Reliability::ItemCharacteristicCurve.new(@ds,inc)
+ inc = Daru::Vector.new((@samples + 10).times.map { rand(10) })
+ @icc = Statsample::Reliability::ItemCharacteristicCurve.new(@ds, inc)
end
end
- should "have 0% for 0 points on maximum value values" do
- max=@icc.curve_field('a',0)[@max_point.to_f]
- max||=0
+ should 'have 0% for 0 points on maximum value values' do
+ max = @icc.curve_field(:a, 0)[@max_point.to_f]
+ max ||= 0
assert_in_delta(0, max)
end
- should "have 0 for max value on minimum value" do
- max=@icc.curve_field('a',@max_point)[0.0]
- max||=0
+ should 'have 0 for max value on minimum value' do
+ max = @icc.curve_field(:a, @max_point)[0.0]
+ max ||= 0
assert_in_delta(0, max)
end
- should "have correct values of % for any value" do
- sum=@icc.vector_total
- total={}
- total_g=sum.frequencies
- index=rand(@points)
- @x1.each_with_index do |v,i|
- total[sum[i]]||=0
- total[sum[i]]+=1 if v==index
+ should 'have correct values of % for any value' do
+ sum = @icc.vector_total
+ total = {}
+ total_g = sum.frequencies
+ index = rand(@points)
+ @x1.each_with_index do |v, i|
+ total[sum[i]] ||= 0
+ total[sum[i]] += 1 if v == index
end
- expected=total.each {|k,v|
- total[k]=v.quo(total_g[k])
+ expected = total.each {|k, v|
+ total[k] = v.quo(total_g[k])
}
- assert_equal(expected, @icc.curve_field('a',index))
-
+ assert_equal(expected, @icc.curve_field(:a, index))
end
-
end
-
+
context Statsample::Reliability::MultiScaleAnalysis do
-
setup do
- size=100
- @scales=3
- @items_per_scale=10
- h={}
+ size = 100
+ @scales = 3
+ @items_per_scale = 10
+ h = {}
@scales.times {|s|
@items_per_scale.times {|i|
- h["#{s}_#{i}"] = (size.times.map {(s*2)+rand}).to_scale
+ h["#{s}_#{i}".to_sym] = Daru::Vector.new((size.times.map { (s * 2) + rand }))
}
}
- @ds=h.to_dataset
- @msa=Statsample::Reliability::MultiScaleAnalysis.new(:name=>'Multiple Analysis') do |m|
- m.scale "complete", @ds
+ @ds = Daru::DataFrame.new(h)
+ @msa = Statsample::Reliability::MultiScaleAnalysis.new(name: 'Multiple Analysis') do |m|
+ m.scale 'complete', @ds
@scales.times {|s|
- m.scale "scale_#{s}", @ds.clone(@items_per_scale.times.map {|i| "#{s}_#{i}"}), {:name=>"Scale #{s}"}
+ m.scale "scale_#{s}", @ds.clone(*@items_per_scale.times.map { |i| "#{s}_#{i}".to_sym }), name: "Scale #{s}"
}
end
end
- should "Retrieve correct ScaleAnalysis for whole scale" do
- sa=Statsample::Reliability::ScaleAnalysis.new(@ds, :name=>"Scale complete")
- assert_equal(sa.variances_mean, @msa.scale("complete").variances_mean)
+
+ should 'Retrieve correct ScaleAnalysis for whole scale' do
+ sa = Statsample::Reliability::ScaleAnalysis.new(@ds, name: 'Scale complete')
+ assert_equal(sa.variances_mean, @msa.scale('complete').variances_mean)
end
- should "Retrieve correct ScaleAnalysis for each scale" do
+ should 'Retrieve correct ScaleAnalysis for each scale' do
@scales.times {|s|
- sa=Statsample::Reliability::ScaleAnalysis.new(@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}), :name=>"Scale #{s}")
- assert_equal(sa.variances_mean,@msa.scale("scale_#{s}").variances_mean)
+ sa = Statsample::Reliability::ScaleAnalysis.new(@ds.dup(@items_per_scale.times.map { |i| "#{s}_#{i}".to_sym }), name: "Scale #{s}")
+ assert_equal(sa.variances_mean, @msa.scale("scale_#{s}").variances_mean)
}
end
- should "retrieve correct correlation matrix for each scale" do
- vectors={'complete' => @ds.vector_sum}
+ should 'retrieve correct correlation matrix for each scale' do
+ vectors = { :complete => @ds.vector_sum }
@scales.times {|s|
- vectors["scale_#{s}"]=@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}).vector_sum
+ vectors["scale_#{s}".to_sym] = @ds.dup(@items_per_scale.times.map { |i| "#{s}_#{i}".to_sym }).vector_sum
}
- ds2=vectors.to_dataset
+ ds2 = Daru::DataFrame.new(vectors)
assert_equal(Statsample::Bivariate.correlation_matrix(ds2), @msa.correlation_matrix)
end
- should "delete scale using delete_scale" do
- @msa.delete_scale("complete")
- assert_equal(@msa.scales.keys.sort, @scales.times.map {|s| "scale_#{s}"})
+ should 'delete scale using delete_scale' do
+ @msa.delete_scale('complete')
+ assert_equal(@msa.scales.keys.sort, @scales.times.map { |s| "scale_#{s}" })
end
- should "retrieve pca for scales" do
- @msa.delete_scale("complete")
- vectors=Hash.new
+ should 'retrieve pca for scales' do
+ @msa.delete_scale('complete')
+ vectors = {}
@scales.times {|s|
- vectors["scale_#{s}"]=@ds.dup(@items_per_scale.times.map {|i| "#{s}_#{i}"}).vector_sum
+ vectors["scale_#{s}".to_sym] = @ds.dup(@items_per_scale.times.map { |i| "#{s}_#{i}".to_sym }).vector_sum
}
- ds2=vectors.to_dataset
- cor_matrix=Statsample::Bivariate.correlation_matrix(ds2)
- m=3
- pca=Statsample::Factor::PCA.new(cor_matrix, :m=>m)
- assert_equal(pca.component_matrix, @msa.pca(:m=>m).component_matrix)
- end
- should "retrieve acceptable summary" do
- @msa.delete_scale("scale_0")
- @msa.delete_scale("scale_1")
- @msa.delete_scale("scale_2")
-
-
- #@msa.summary_correlation_matrix=true
- #@msa.summary_pca=true
-
-
- assert(@msa.summary.size>0)
+ ds2 = Daru::DataFrame.new(vectors)
+ cor_matrix = Statsample::Bivariate.correlation_matrix(ds2)
+ m = 3
+ pca = Statsample::Factor::PCA.new(cor_matrix, m: m)
+ assert_equal(pca.component_matrix, @msa.pca(m: m).component_matrix)
+ end
+ should 'retrieve acceptable summary' do
+ @msa.delete_scale('scale_0')
+ @msa.delete_scale('scale_1')
+ @msa.delete_scale('scale_2')
+
+ # @msa.summary_correlation_matrix=true
+ # @msa.summary_pca=true
+
+ assert(@msa.summary.size > 0)
end
end
context Statsample::Reliability::ScaleAnalysis do
- setup do
- @x1=[1,1,1,1,2,2,2,2,3,3,3,30].to_scale
- @x2=[1,1,1,2,2,3,3,3,3,4,4,50].to_scale
- @x3=[2,2,1,1,1,2,2,2,3,4,5,40].to_scale
- @x4=[1,2,3,4,4,4,4,3,4,4,5,30].to_scale
- @ds={'x1'=>@x1,'x2'=>@x2,'x3'=>@x3,'x4'=>@x4}.to_dataset
- @ia=Statsample::Reliability::ScaleAnalysis.new(@ds)
- @cov_matrix=@ia.cov_m
- end
- should "return correct values for item analysis" do
- assert_in_delta(0.980,@ia.alpha,0.001)
- assert_in_delta(0.999,@ia.alpha_standarized,0.001)
- var_mean=4.times.map{|m| @cov_matrix[m,m]}.to_scale.mean
+ setup do
+ @x1 = Daru::Vector.new([1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 30])
+ @x2 = Daru::Vector.new([1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 50])
+ @x3 = Daru::Vector.new([2, 2, 1, 1, 1, 2, 2, 2, 3, 4, 5, 40])
+ @x4 = Daru::Vector.new([1, 2, 3, 4, 4, 4, 4, 3, 4, 4, 5, 30])
+ @ds = Daru::DataFrame.new({ :x1 => @x1, :x2 => @x2, :x3 => @x3, :x4 => @x4 })
+ @ia = Statsample::Reliability::ScaleAnalysis.new(@ds)
+ @cov_matrix = @ia.cov_m
+ end
+ should 'return correct values for item analysis' do
+ assert_in_delta(0.980, @ia.alpha, 0.001)
+ assert_in_delta(0.999, @ia.alpha_standarized, 0.001)
+ var_mean = Daru::Vector.new(4.times.map { |m| @cov_matrix[m, m] }).mean
assert_in_delta(var_mean, @ia.variances_mean)
- assert_equal(@x1.mean, @ia.item_statistics['x1'][:mean])
- assert_equal(@x4.mean, @ia.item_statistics['x4'][:mean])
- assert_in_delta(@x1.sds, @ia.item_statistics['x1'][:sds],1e-14)
- assert_in_delta(@x4.sds, @ia.item_statistics['x4'][:sds],1e-14)
- ds2=@ds.clone
- ds2.delete_vector('x1')
- vector_sum=ds2.vector_sum
- assert_equal(vector_sum.mean, @ia.stats_if_deleted['x1'][:mean])
- assert_equal(vector_sum.sds, @ia.stats_if_deleted['x1'][:sds])
- assert_in_delta(vector_sum.variance, @ia.stats_if_deleted['x1'][:variance_sample],1e-10)
+ assert_equal(@x1.mean, @ia.item_statistics[:x1][:mean])
+ assert_equal(@x4.mean, @ia.item_statistics[:x4][:mean])
+ assert_in_delta(@x1.sds, @ia.item_statistics[:x1][:sds], 1e-14)
+ assert_in_delta(@x4.sds, @ia.item_statistics[:x4][:sds], 1e-14)
+ ds2 = @ds.clone
+ ds2.delete_vector(:x1)
+ vector_sum = ds2.vector_sum
+ assert_equal(vector_sum.mean, @ia.stats_if_deleted[:x1][:mean])
+ assert_equal(vector_sum.sds, @ia.stats_if_deleted[:x1][:sds])
+ assert_in_delta(vector_sum.variance, @ia.stats_if_deleted[:x1][:variance_sample], 1e-10)
+
+ assert_equal(Statsample::Reliability.cronbach_alpha(ds2), @ia.stats_if_deleted[:x1][:alpha])
- assert_equal(Statsample::Reliability.cronbach_alpha(ds2), @ia.stats_if_deleted['x1'][:alpha])
-
- covariances=[]
+ covariances = []
4.times.each {|i|
4.times.each {|j|
- if i!=j
- covariances.push(@cov_matrix[i,j])
+ if i != j
+ covariances.push(@cov_matrix[i, j])
end
}
}
- assert_in_delta(covariances.to_scale.mean, @ia.covariances_mean)
- assert_in_delta(0.999,@ia.item_total_correlation()['x1'],0.001)
- assert_in_delta(1050.455,@ia.stats_if_deleted()['x1'][:variance_sample],0.001)
+ assert_in_delta(Daru::Vector.new(covariances).mean, @ia.covariances_mean)
+ assert_in_delta(0.999, @ia.item_total_correlation[:x1], 0.001)
+ assert_in_delta(1050.455, @ia.stats_if_deleted[:x1][:variance_sample], 0.001)
end
- should "return a summary" do
- assert(@ia.summary.size>0)
+ should 'return a summary' do
+ assert(@ia.summary.size > 0)
end
-
end
end
end
diff --git a/test/test_reliability_icc.rb b/test/test_reliability_icc.rb
index d413cc9..25f5e2a 100644
--- a/test/test_reliability_icc.rb
+++ b/test/test_reliability_icc.rb
@@ -1,140 +1,138 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-$reliability_icc=nil
+$reliability_icc = nil
-class StatsampleReliabilityIccTestCase < MiniTest::Test
+class StatsampleReliabilityIccTestCase < Minitest::Test
context Statsample::Reliability::ICC do
setup do
- a=[9,6,8,7,10,6].to_scale
- b=[2,1,4,1,5,2].to_scale
- c=[5,3,6,2,6,4].to_scale
- d=[8,2,8,6,9,7].to_scale
- @ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
- @icc=Statsample::Reliability::ICC.new(@ds)
+ a = Daru::Vector.new([9, 6, 8, 7, 10, 6])
+ b = Daru::Vector.new([2, 1, 4, 1, 5, 2])
+ c = Daru::Vector.new([5, 3, 6, 2, 6, 4])
+ d = Daru::Vector.new([8, 2, 8, 6, 9, 7])
+ @ds = Daru::DataFrame.new({ :a => a, :b => b, :c => c, :d => d })
+ @icc = Statsample::Reliability::ICC.new(@ds)
end
- should "basic method be correct" do
- assert_equal(6,@icc.n)
- assert_equal(4,@icc.k)
+ should 'basic method be correct' do
+ assert_equal(6, @icc.n)
+ assert_equal(4, @icc.k)
end
- should "total mean be correct" do
+ should 'total mean be correct' do
assert_in_delta(5.291, @icc.total_mean, 0.001)
end
- should "df methods be correct" do
+ should 'df methods be correct' do
assert_equal(5, @icc.df_bt)
assert_equal(18, @icc.df_wt)
assert_equal(3, @icc.df_bj)
assert_equal(15, @icc.df_residual)
end
- should "ms between targets be correct" do
+ should 'ms between targets be correct' do
assert_in_delta(11.24, @icc.ms_bt, 0.01)
end
- should "ms within targets be correct" do
+ should 'ms within targets be correct' do
assert_in_delta(6.26, @icc.ms_wt, 0.01)
end
- should "ms between judges be correct" do
+ should 'ms between judges be correct' do
assert_in_delta(32.49, @icc.ms_bj, 0.01)
end
- should "ms residual be correct" do
+ should 'ms residual be correct' do
assert_in_delta(1.02, @icc.ms_residual, 0.01)
end
- context "with McGraw and Wong denominations," do
-
+ context 'with McGraw and Wong denominations,' do
end
- context "with Shrout & Fleiss denominations, " do
- should "icc(1,1) method be correct" do
+ context 'with Shrout & Fleiss denominations, ' do
+ should 'icc(1,1) method be correct' do
assert_in_delta(0.17, @icc.icc_1_1, 0.01)
end
# Verified on SPSS and R
- should "icc(2,1) method be correct" do
+ should 'icc(2,1) method be correct' do
assert_in_delta(0.29, @icc.icc_2_1, 0.01)
end
- should "icc(3,1) method be correct" do
+ should 'icc(3,1) method be correct' do
assert_in_delta(0.71, @icc.icc_3_1, 0.01)
end
- should "icc(1,k) method be correct" do
+ should 'icc(1,k) method be correct' do
assert_in_delta(0.44, @icc.icc_1_k, 0.01)
end
# Verified on SPSS and R
- should "icc(2,k) method be correct" do
+ should 'icc(2,k) method be correct' do
assert_in_delta(0.62, @icc.icc_2_k, 0.01)
- end
- should "icc(3,k) method be correct" do
+ end
+ should 'icc(3,k) method be correct' do
assert_in_delta(0.91, @icc.icc_3_k, 0.01)
end
-
- should "icc(1,1) F be correct" do
+
+ should 'icc(1,1) F be correct' do
assert_in_delta(1.795, @icc.icc_1_f.f)
end
- should "icc(1,1) confidence interval should be correct" do
+ should 'icc(1,1) confidence interval should be correct' do
assert_in_delta(-0.133, @icc.icc_1_1_ci[0], 0.001)
assert_in_delta(0.723, @icc.icc_1_1_ci[1], 0.001)
end
- should "icc(1,k) confidence interval should be correct" do
+ should 'icc(1,k) confidence interval should be correct' do
assert_in_delta(-0.884, @icc.icc_1_k_ci[0], 0.001)
assert_in_delta(0.912, @icc.icc_1_k_ci[1], 0.001)
end
-
- should "icc(2,1) F be correct" do
+
+ should 'icc(2,1) F be correct' do
assert_in_delta(11.027, @icc.icc_2_f.f)
end
- should "icc(2,1) confidence interval should be correct" do
- #skip("Not yet operational")
+ should 'icc(2,1) confidence interval should be correct' do
+ # skip("Not yet operational")
assert_in_delta(0.019, @icc.icc_2_1_ci[0], 0.001)
assert_in_delta(0.761, @icc.icc_2_1_ci[1], 0.001)
end
-
- # Verified on SPSS and R
- should "icc(2,k) confidence interval should be correct" do
- #skip("Not yet operational")
- #p @icc.icc_2_k_ci
+
+ # Verified on SPSS and R
+ should 'icc(2,k) confidence interval should be correct' do
+ # skip("Not yet operational")
+ # p @icc.icc_2_k_ci
assert_in_delta(0.039, @icc.icc_2_k_ci[0], 0.001)
assert_in_delta(0.929, @icc.icc_2_k_ci[1], 0.001)
-
end
- #should "Shrout icc(2,k) and McGraw icc(a,k) ci be equal" do
+ # should "Shrout icc(2,k) and McGraw icc(a,k) ci be equal" do
# assert_in_delta(@icc.icc_2_k_ci_shrout[0], @icc.icc_2_k_ci_mcgraw[0], 10e-5)
- #end
-
- should "icc(3,1) F be correct" do
+ # end
+
+ should 'icc(3,1) F be correct' do
assert_in_delta(11.027, @icc.icc_3_f.f)
end
-
- should "icc(3,1) confidence interval should be correct" do
+
+ should 'icc(3,1) confidence interval should be correct' do
assert_in_delta(0.342, @icc.icc_3_1_ci[0], 0.001)
assert_in_delta(0.946, @icc.icc_3_1_ci[1], 0.001)
end
- should "icc(3,k) confidence interval should be correct" do
+ should 'icc(3,k) confidence interval should be correct' do
assert_in_delta(0.676, @icc.icc_3_k_ci[0], 0.001)
assert_in_delta(0.986, @icc.icc_3_k_ci[1], 0.001)
end
- should "incorrect type raises an error" do
- assert_raise(::RuntimeError) do
- @icc.type=:nonexistant_type
+ should 'incorrect type raises an error' do
+ assert_raise(::RuntimeError) do
+ @icc.type = :nonexistant_type
end
end
end
-
+
begin
require 'rserve'
- require 'statsample/rserve_extension'
- context "McGraw and Wong" do
+ require 'daru/extensions/rserve'
+ context 'McGraw and Wong' do
teardown do
- @r=$reliability_icc[:r].close unless $reliability_icc[:r].nil?
+ @r = $reliability_icc[:r].close unless $reliability_icc[:r].nil?
end
setup do
- if($reliability_icc.nil?)
- size=100
- a=size.times.map {rand(10)}.to_scale
- b=a.recode{|i|i+rand(4)-2}
- c=a.recode{|i|i+rand(4)-2}
- d=a.recode{|i|i+rand(4)-2}
- @ds={'a'=>a,'b'=>b,'c'=>c,'d'=>d}.to_dataset
-
- @icc=Statsample::Reliability::ICC.new(@ds)
- @r=Rserve::Connection.new
-
- @r.assign('ds',@ds)
-
+ if $reliability_icc.nil?
+ size = 100
+ a = Daru::Vector.new(size.times.map { rand(10) })
+ b = a.recode { |i| i + rand(4) - 2 }
+ c = a.recode { |i| i + rand(4) - 2 }
+ d = a.recode { |i| i + rand(4) - 2 }
+ @ds = Daru::DataFrame.new({ :a => a, :b => b, :c => c, :d => d })
+
+ @icc = Statsample::Reliability::ICC.new(@ds)
+ @r = Rserve::Connection.new
+
+ @r.assign('ds', @ds)
+
@r.void_eval("library(irr);
iccs=list(
icc_1=icc(ds,'o','c','s'),
@@ -144,59 +142,57 @@ class StatsampleReliabilityIccTestCase < MiniTest::Test
icc_a_1=icc(ds,'t','a','s'),
icc_a_k=icc(ds,'t','a','a'))
")
- @iccs=@r.eval('iccs').to_ruby
- $reliability_icc={ :icc=>@icc, :iccs=>@iccs, :r=>@r
+ @iccs = @r.eval('iccs').to_ruby
+ $reliability_icc = { icc: @icc, iccs: @iccs, r: @r
}
-
- end
- @icc=$reliability_icc[:icc]
- @iccs=$reliability_icc[:iccs]
- @r=$reliability_icc[:r]
+ end
+ @icc = $reliability_icc[:icc]
+ @iccs = $reliability_icc[:iccs]
+ @r = $reliability_icc[:r]
end
[:icc_1, :icc_k, :icc_c_1, :icc_c_k, :icc_a_1, :icc_a_k].each do |t|
context "ICC Type #{t} " do
- should "value be correct" do
- @icc.type=t
- @r_icc=@iccs[t.to_s]
- assert_in_delta(@r_icc['value'],@icc.r)
+ should 'value be correct' do
+ @icc.type = t
+ @r_icc = @iccs[t.to_s]
+ assert_in_delta(@r_icc['value'], @icc.r)
end
- should "fvalue be correct" do
- @icc.type=t
- @r_icc=@iccs[t.to_s]
- assert_in_delta(@r_icc['Fvalue'],@icc.f.f)
+ should 'fvalue be correct' do
+ @icc.type = t
+ @r_icc = @iccs[t.to_s]
+ assert_in_delta(@r_icc['Fvalue'], @icc.f.f)
end
- should "num df be correct" do
- @icc.type=t
- @r_icc=@iccs[t.to_s]
- assert_in_delta(@r_icc['df1'],@icc.f.df_num)
+ should 'num df be correct' do
+ @icc.type = t
+ @r_icc = @iccs[t.to_s]
+ assert_in_delta(@r_icc['df1'], @icc.f.df_num)
end
- should "den df be correct" do
- @icc.type=t
- @r_icc=@iccs[t.to_s]
- assert_in_delta(@r_icc['df2'],@icc.f.df_den)
+ should 'den df be correct' do
+ @icc.type = t
+ @r_icc = @iccs[t.to_s]
+ assert_in_delta(@r_icc['df2'], @icc.f.df_den)
end
- should "f probability be correct" do
- @icc.type=t
- @r_icc=@iccs[t.to_s]
- assert_in_delta(@r_icc['p.value'],@icc.f.probability)
+ should 'f probability be correct' do
+ @icc.type = t
+ @r_icc = @iccs[t.to_s]
+ assert_in_delta(@r_icc['p.value'], @icc.f.probability)
end
- should "bounds be equal" do
- @icc.type=t
- @r_icc=@iccs[t.to_s]
- assert_in_delta(@r_icc['lbound'],@icc.lbound)
- assert_in_delta(@r_icc['ubound'],@icc.ubound)
+ should 'bounds be equal' do
+ @icc.type = t
+ @r_icc = @iccs[t.to_s]
+ assert_in_delta(@r_icc['lbound'], @icc.lbound, 0.1)
+ assert_in_delta(@r_icc['ubound'], @icc.ubound, 0.1)
end
- should "summary generated" do
- assert(@icc.summary.size>0)
+ should 'summary generated' do
+ assert(@icc.summary.size > 0)
end
end
end
end
rescue
- puts "requires rserve"
+ puts 'requires rserve'
end
-
end
end
diff --git a/test/test_reliability_skillscale.rb b/test/test_reliability_skillscale.rb
index 456c808..831740b 100644
--- a/test/test_reliability_skillscale.rb
+++ b/test/test_reliability_skillscale.rb
@@ -1,57 +1,57 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-
-class StatsampleReliabilitySkillScaleTestCase < MiniTest::Unit::TestCase
+class StatsampleReliabilitySkillScaleTestCase < Minitest::Test
context Statsample::Reliability::SkillScaleAnalysis do
setup do
- options=%w{a b c d e}
- cases=20
- @id=cases.times.map {|v| v}.to_scale
- @a=cases.times.map {options[rand(5)]}.to_vector
- @b=cases.times.map {options[rand(5)]}.to_vector
- @c=cases.times.map {options[rand(5)]}.to_vector
- @d=cases.times.map {options[rand(5)]}.to_vector
- @e=cases.times.map {|i|
- i==0 ? options[rand(0)] :
- rand()>0.8 ? nil : options[rand(5)]
- }.to_vector
- @ds={'id'=>@id,'a'=>@a,'b'=>@b,'c'=>@c,'d'=>@d,'e'=>@e}.to_dataset
- @key={'a'=>"a", 'b'=>options[rand(5)], 'c'=>options[rand(5)], 'd'=>options[rand(5)],'e'=>options[rand(5)]}
- @ssa=Statsample::Reliability::SkillScaleAnalysis.new(@ds, @key)
- @ac=@a.map {|v| v==@key['a'] ? 1 : 0}.to_scale
- @bc=@b.map {|v| v==@key['b'] ? 1 : 0}.to_scale
- @cc=@c.map {|v| v==@key['c'] ? 1 : 0}.to_scale
- @dc=@d.map {|v| v==@key['d'] ? 1 : 0}.to_scale
- @ec=@e.map {|v| v.nil? ? nil : (v==@key['e'] ? 1 : 0)}.to_scale
-
+ options = %w(a b c d e)
+ cases = 20
+ @id = Daru::Vector.new(cases.times.map { |v| v })
+ @a = Daru::Vector.new(cases.times.map { options[rand(5)] })
+ @b = Daru::Vector.new(cases.times.map { options[rand(5)] })
+ @c = Daru::Vector.new(cases.times.map { options[rand(5)] })
+ @d = Daru::Vector.new(cases.times.map { options[rand(5)] })
+ @e = Daru::Vector.new(
+ cases.times.map do |i|
+ i == 0 ? options[rand(0)] :
+ rand > 0.8 ? nil : options[rand(5)]
+ end
+ )
+ @ds = Daru::DataFrame.new({ :id => @id, :a => @a, :b => @b, :c => @c, :d => @d, :e => @e })
+ @key = { :a => 'a', :b => options[rand(5)], :c => options[rand(5)], :d => options[rand(5)], :e => options[rand(5)] }
+ @ssa = Statsample::Reliability::SkillScaleAnalysis.new(@ds, @key)
+ @ac = Daru::Vector.new(@a.map { |v| v == @key[:a] ? 1 : 0 })
+ @bc = Daru::Vector.new(@b.map { |v| v == @key[:b] ? 1 : 0 })
+ @cc = Daru::Vector.new(@c.map { |v| v == @key[:c] ? 1 : 0 })
+ @dc = Daru::Vector.new(@d.map { |v| v == @key[:d] ? 1 : 0 })
+ @ec = Daru::Vector.new(@e.map { |v| v.nil? ? nil : (v == @key[:e] ? 1 : 0) })
end
- should "return proper corrected dataset" do
- cds={'id'=>@id, 'a'=>@ac,'b'=>@bc,'c'=>@cc,'d'=>@dc, 'e'=>@ec}.to_dataset
+ should 'return proper corrected dataset' do
+ cds = Daru::DataFrame.new({ :id => @id, :a => @ac, :b => @bc, :c => @cc, :d => @dc, :e => @ec })
assert_equal(cds, @ssa.corrected_dataset)
end
- should "return proper corrected minimal dataset" do
- cdsm={'a'=>@ac,'b'=>@bc,'c'=>@cc,'d'=>@dc, 'e'=>@ec}.to_dataset
+ should 'return proper corrected minimal dataset' do
+ cdsm = Daru::DataFrame.new({ :a => @ac, :b => @bc, :c => @cc, :d => @dc, :e => @ec })
assert_equal(cdsm, @ssa.corrected_dataset_minimal)
end
- should "return correct vector_sum and vector_sum" do
- cdsm=@ssa.corrected_dataset_minimal
+ should 'return correct vector_sum and vector_sum' do
+ cdsm = @ssa.corrected_dataset_minimal
assert_equal(cdsm.vector_sum, @ssa.vector_sum)
assert_equal(cdsm.vector_mean, @ssa.vector_mean)
end
- should "not crash on rare case" do
- a=Statsample::Vector["c","c","a","a","c","a","b","c","c","b","a","d","a","d","a","a","d","e","c","d"]
- b=Statsample::Vector["e","b","e","b","c","d","a","e","e","c","b","e","e","b","d","c","e","b","b","d"]
- c=Statsample::Vector["e","b","e","c","e","c","b","d","e","c","a","a","b","d","e","c","b","a","a","e"]
- d=Statsample::Vector["a","b","d","d","e","b","e","b","d","c","e","a","c","d","c","c","e","d","d","b"]
- e=Statsample::Vector["a","b",nil,"d","c","c","d",nil,"d","d","e","e",nil,nil,nil,"d","c",nil,"e","d"]
- key={"a"=>"a", "b"=>"e", "c"=>"d", "d"=>"c", "e"=>"d"}
- ds=Statsample::Dataset.new("a"=>a,"b"=>b,"c"=>c,"d"=>d,"e"=>e)
- ssa=Statsample::Reliability::SkillScaleAnalysis.new(ds, key)
+ should 'not crash on rare case' do
+ a = Daru::Vector.new(['c', 'c', 'a', 'a', 'c', 'a', 'b', 'c', 'c', 'b', 'a', 'd', 'a', 'd', 'a', 'a', 'd', 'e', 'c', 'd'])
+ b = Daru::Vector.new(['e', 'b', 'e', 'b', 'c', 'd', 'a', 'e', 'e', 'c', 'b', 'e', 'e', 'b', 'd', 'c', 'e', 'b', 'b', 'd'])
+ c = Daru::Vector.new(['e', 'b', 'e', 'c', 'e', 'c', 'b', 'd', 'e', 'c', 'a', 'a', 'b', 'd', 'e', 'c', 'b', 'a', 'a', 'e'])
+ d = Daru::Vector.new(['a', 'b', 'd', 'd', 'e', 'b', 'e', 'b', 'd', 'c', 'e', 'a', 'c', 'd', 'c', 'c', 'e', 'd', 'd', 'b'])
+ e = Daru::Vector.new(['a', 'b', nil, 'd', 'c', 'c', 'd', nil, 'd', 'd', 'e', 'e', nil, nil, nil, 'd', 'c', nil, 'e', 'd'])
+ key = { :a => 'a', :b => 'e', :c => 'd', :d => 'c', :e => 'd' }
+ ds = Daru::DataFrame.new({:a => a, :b => b, :c => c, :d => d, :e => e})
+ ssa = Statsample::Reliability::SkillScaleAnalysis.new(ds, key)
assert(ssa.summary)
end
-
- should "return valid summary" do
- assert(@ssa.summary.size>0)
+
+ should 'return valid summary' do
+ assert(@ssa.summary.size > 0)
end
end
end
diff --git a/test/test_resample.rb b/test/test_resample.rb
index c1821e1..ce8701b 100644
--- a/test/test_resample.rb
+++ b/test/test_resample.rb
@@ -1,22 +1,24 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-class StatsampleResampleTestCase < MiniTest::Unit::TestCase
+class StatsampleResampleTestCase < Minitest::Test
def initialize(*args)
super
end
+
def test_basic
- r=Statsample::Resample.generate(20,1,10)
- assert_equal(20,r.size)
- assert(r.min>=1)
- assert(r.max<=10)
+ r = Statsample::Resample.generate(20, 1, 10)
+ assert_equal(20, r.size)
+ assert(r.min >= 1)
+ assert(r.max <= 10)
end
+
def test_repeat_and_save
- r=Statsample::Resample.repeat_and_save(400) {
- Statsample::Resample.generate(20,1,10).count(1)
+ r = Statsample::Resample.repeat_and_save(400) {
+ Statsample::Resample.generate(20, 1, 10).count(1)
}
- assert_equal(400,r.size)
- v=Statsample::Vector.new(r,:scale)
- a=v.count {|x| x > 3}
- assert(a>=30 && a<=70)
+ assert_equal(400, r.size)
+ v = Daru::Vector.new(r)
+ a = v.count { |x| x > 3 }
+ assert(a >= 30 && a <= 70)
end
end
diff --git a/test/test_rserve_extension.rb b/test/test_rserve_extension.rb
deleted file mode 100644
index e718978..0000000
--- a/test/test_rserve_extension.rb
+++ /dev/null
@@ -1,42 +0,0 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-begin
- require 'rserve'
- require 'statsample/rserve_extension'
-
-class StatsampleRserveExtensionTestCase < MiniTest::Unit::TestCase
- context "Statsample Rserve extensions" do
- setup do
- @r=Rserve::Connection.new
- end
- teardown do
- @r.close
- end
- should "return a valid rexp for numeric vector" do
- a=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
- rexp=a.to_REXP
- assert(rexp.is_a? Rserve::REXP::Double)
- assert_equal(rexp.to_ruby,a.data_with_nils)
- @r.assign 'a',rexp
- assert_equal(a.data_with_nils, @r.eval('a').to_ruby)
- end
- should "return a valid rserve dataframe for statsample datasets" do
- a=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
- b=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
- c=100.times.map {|i| rand()>0.9 ? nil : i+rand() }.to_scale
- ds={'a'=>a,'b'=>b,'c'=>c}.to_dataset
- rexp=ds.to_REXP
- assert(rexp.is_a? Rserve::REXP::GenericVector)
- ret=rexp.to_ruby
- assert_equal(a.data_with_nils, ret['a'])
- @r.assign 'df', rexp
- out_df=@r.eval('df').to_ruby
- assert_equal('data.frame', out_df.attributes['class'])
- assert_equal(['a','b','c'], out_df.attributes['names'])
- assert_equal(a.data_with_nils, out_df['a'])
- end
- end
-end
-
-rescue LoadError
- puts "Require rserve extension"
-end
diff --git a/test/test_srs.rb b/test/test_srs.rb
index 1d18cf9..c9d5abd 100644
--- a/test/test_srs.rb
+++ b/test/test_srs.rb
@@ -1,9 +1,9 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-class StatsampleSrsTestCase < MiniTest::Unit::TestCase
+class StatsampleSrsTestCase < Minitest::Test
def test_std_error
- assert_equal(384,Statsample::SRS.estimation_n0(0.05,0.5,0.95).to_i)
- assert_equal(108,Statsample::SRS.estimation_n(0.05,0.5,150,0.95).to_i)
- assert_in_delta(0.0289,Statsample::SRS.proportion_sd_kp_wor(0.5,100,150),0.001)
+ assert_equal(384, Statsample::SRS.estimation_n0(0.05, 0.5, 0.95).to_i)
+ assert_equal(108, Statsample::SRS.estimation_n(0.05, 0.5, 150, 0.95).to_i)
+ assert_in_delta(0.0289, Statsample::SRS.proportion_sd_kp_wor(0.5, 100, 150), 0.001)
end
end
diff --git a/test/test_statistics.rb b/test/test_statistics.rb
index 7fe47d3..f8b9372 100644
--- a/test/test_statistics.rb
+++ b/test/test_statistics.rb
@@ -1,77 +1,69 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleStatisicsTestCase < MiniTest::Unit::TestCase
-
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleStatisicsTestCase < Minitest::Test
def initialize(*args)
super
end
+
def test_p_using_cdf
- assert_equal(0.25, Statsample::Test.p_using_cdf(0.25, tails=:left))
- assert_equal(0.75, Statsample::Test.p_using_cdf(0.25, tails=:right))
- assert_equal(0.50, Statsample::Test.p_using_cdf(0.25, tails=:both))
- assert_equal(1, Statsample::Test.p_using_cdf(0.50, tails=:both))
- assert_equal(0.05, Statsample::Test.p_using_cdf(0.025, tails=:both))
- assert_in_delta(0.05, Statsample::Test.p_using_cdf(0.975, tails=:both),0.0001)
-
+ assert_equal(0.25, Statsample::Test.p_using_cdf(0.25, tails = :left))
+ assert_equal(0.75, Statsample::Test.p_using_cdf(0.25, tails = :right))
+ assert_equal(0.50, Statsample::Test.p_using_cdf(0.25, tails = :both))
+ assert_equal(1, Statsample::Test.p_using_cdf(0.50, tails = :both))
+ assert_equal(0.05, Statsample::Test.p_using_cdf(0.025, tails = :both))
+ assert_in_delta(0.05, Statsample::Test.p_using_cdf(0.975, tails = :both), 0.0001)
end
+
def test_recode_repeated
- a=%w{a b c c d d d e}
- exp=["a","b","c_1","c_2","d_1","d_2","d_3","e"]
- assert_equal(exp,a.recode_repeated)
+ a = %w(a b c c d d d e)
+ exp = %w(a b c_1 c_2 d_1 d_2 d_3 e)
+ assert_equal(exp, Daru::ArrayHelper.recode_repeated(a))
end
- def test_is_number
- assert("10".is_number?)
- assert("-10".is_number?)
- assert("0.1".is_number?)
- assert("-0.1".is_number?)
- assert("10e3".is_number?)
- assert("10e-3".is_number?)
- assert(!"1212-1212-1".is_number?)
- assert(!"a10".is_number?)
- assert(!"".is_number?)
+ def test_is_number
+ assert('10'.is_number?)
+ assert('-10'.is_number?)
+ assert('0.1'.is_number?)
+ assert('-0.1'.is_number?)
+ assert('10e3'.is_number?)
+ assert('10e-3'.is_number?)
+ assert(!'1212-1212-1'.is_number?)
+ assert(!'a10'.is_number?)
+ assert(!''.is_number?)
end
+
def test_estimation_mean
- v=([42]*23+[41]*4+[36]*1+[32]*1+[29]*1+[27]*2+[23]*1+[19]*1+[16]*2+[15]*2+[14,11,10,9,7]+ [6]*3+[5]*2+[4,3]).to_vector(:scale)
- assert_equal(50,v.size)
- assert_equal(1471,v.sum())
- #limits=Statsample::SRS.mean_confidence_interval_z(v.mean(), v.sds(), v.size,676,0.80)
+ v = Daru::Vector.new([42] * 23 + [41] * 4 + [36] * 1 + [32] * 1 + [29] * 1 + [27] * 2 + [23] * 1 + [19] * 1 + [16] * 2 + [15] * 2 + [14, 11, 10, 9, 7] + [6] * 3 + [5] * 2 + [4, 3])
+ assert_equal(50, v.size)
+ assert_equal(1471, v.sum)
+ # limits=Statsample::SRS.mean_confidence_interval_z(v.mean(), v.sds(), v.size,676,0.80)
end
+
def test_estimation_proportion
# total
- pop=3042
- sam=200
- prop=0.19
+ pop = 3042
+ sam = 200
+ prop = 0.19
assert_in_delta(81.8, Statsample::SRS.proportion_total_sd_ep_wor(prop, sam, pop), 0.1)
# confidence limits
- pop=500
- sam=100
- prop=0.37
- a=0.95
- l= Statsample::SRS.proportion_confidence_interval_z(prop, sam, pop, a)
- assert_in_delta(0.28,l[0],0.01)
- assert_in_delta(0.46,l[1],0.01)
+ pop = 500
+ sam = 100
+ prop = 0.37
+ a = 0.95
+ l = Statsample::SRS.proportion_confidence_interval_z(prop, sam, pop, a)
+ assert_in_delta(0.28, l[0], 0.01)
+ assert_in_delta(0.46, l[1], 0.01)
end
- def test_ml
- if(true)
- #real=[1,1,1,1].to_vector(:scale)
-
- #pred=[0.0001,0.0001,0.0001,0.0001].to_vector(:scale)
- # puts Statsample::Bivariate.maximum_likehood_dichotomic(pred,real)
-
- end
- end
-
def test_simple_linear_regression
- a=[1,2,3,4,5,6].to_vector(:scale)
- b=[6,2,4,10,12,8].to_vector(:scale)
- reg = Statsample::Regression::Simple.new_from_vectors(a,b)
- assert_in_delta((reg.ssr+reg.sse).to_f,reg.sst,0.001)
- assert_in_delta(Statsample::Bivariate.pearson(a,b),reg.r,0.001)
- assert_in_delta(2.4,reg.a,0.01)
- assert_in_delta(1.314,reg.b,0.001)
- assert_in_delta(0.657,reg.r,0.001)
- assert_in_delta(0.432,reg.r2,0.001)
+ a = Daru::Vector.new([1, 2, 3, 4, 5, 6])
+ b = Daru::Vector.new([6, 2, 4, 10, 12, 8])
+ reg = Statsample::Regression::Simple.new_from_vectors(a, b)
+ assert_in_delta((reg.ssr + reg.sse).to_f, reg.sst, 0.001)
+ assert_in_delta(Statsample::Bivariate.pearson(a, b), reg.r, 0.001)
+ assert_in_delta(2.4, reg.a, 0.01)
+ assert_in_delta(1.314, reg.b, 0.001)
+ assert_in_delta(0.657, reg.r, 0.001)
+ assert_in_delta(0.432, reg.r2, 0.001)
end
end
diff --git a/test/test_stest.rb b/test/test_stest.rb
index e13c580..aa375b4 100644
--- a/test/test_stest.rb
+++ b/test/test_stest.rb
@@ -1,56 +1,69 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-class StatsampleTestTestCase < MiniTest::Unit::TestCase
+class StatsampleTestTestCase < Minitest::Test
def test_chi_square_matrix_with_expected
- real=Matrix[[95,95],[45,155]]
- expected=Matrix[[68,122],[72,128]]
+ real = Matrix[[95, 95], [45, 155]]
+ expected = Matrix[[68, 122], [72, 128]]
assert_nothing_raised do
- Statsample::Test.chi_square(real,expected)
+ Statsample::Test.chi_square(real, expected)
end
- chi=Statsample::Test.chi_square(real,expected).chi_square
- assert_in_delta(32.53,chi,0.1)
-
+ chi = Statsample::Test.chi_square(real, expected).chi_square
+ assert_in_delta(32.53, chi, 0.1)
end
+
def test_chi_square_matrix_only_observed
- observed=Matrix[[20,30,40],[30,40,50],[60,70,80],[10,20,40]]
+ observed = Matrix[[20, 30, 40], [30, 40, 50], [60, 70, 80], [10, 20, 40]]
assert_nothing_raised do
Statsample::Test.chi_square(observed)
end
- chi=Statsample::Test.chi_square(observed)
+ chi = Statsample::Test.chi_square(observed)
assert_in_delta(9.5602, chi.chi_square, 0.0001)
assert_in_delta(0.1444, chi.probability, 0.0001)
assert_equal(6, chi.df)
-
end
-
- def test_u_mannwhitney
- a=[1,2,3,4,5,6].to_scale
- b=[0,5,7,9,10,11].to_scale
- assert_equal(7.5, Statsample::Test.u_mannwhitney(a,b).u)
- assert_equal(7.5, Statsample::Test.u_mannwhitney(b,a).u)
- a=[1, 7,8,9,10,11].to_scale
- b=[2,3,4,5,6,12].to_scale
- assert_equal(11, Statsample::Test.u_mannwhitney(a,b).u)
+
+ def test_chi_square_vector
+ observed = Vector[20,30,15]
+ expected = Vector[20,20,20]
+ assert_nothing_raised do
+ Statsample::Test.chi_square(observed, expected)
+ end
+ chi = Statsample::Test.chi_square(observed, expected)
+
+ assert_in_delta(6.25, chi.chi_square, 0.0001)
+ assert_in_delta(0.04393, chi.probability, 0.00001)
+
+ assert_equal(2, chi.df)
end
+ def test_u_mannwhitney
+ a = Daru::Vector.new([1, 2, 3, 4, 5, 6])
+ b = Daru::Vector.new([0, 5, 7, 9, 10, 11])
+ assert_equal(7.5, Statsample::Test.u_mannwhitney(a, b).u)
+ assert_equal(7.5, Statsample::Test.u_mannwhitney(b, a).u)
+ a = Daru::Vector.new([1, 7, 8, 9, 10, 11])
+ b = Daru::Vector.new([2, 3, 4, 5, 6, 12])
+ assert_equal(11, Statsample::Test.u_mannwhitney(a, b).u)
+ end
def test_levene
- a=[1,2,3,4,5,6,7,8,100,10].to_scale
- b=[30,40,50,60,70,80,90,100,110,120].to_scale
- levene=Statsample::Test::Levene.new([a,b])
+ a = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7, 8, 100, 10])
+ b = Daru::Vector.new([30, 40, 50, 60, 70, 80, 90, 100, 110, 120])
+ levene = Statsample::Test::Levene.new([a, b])
assert_levene(levene)
end
+
def test_levene_dataset
- a=[1,2,3,4,5,6,7,8,100,10].to_scale
- b=[30,40,50,60,70,80,90,100,110,120].to_scale
- ds={'a'=>a,'b'=>b}.to_dataset
- levene=Statsample::Test::Levene.new(ds)
+ a = Daru::Vector.new([1, 2, 3, 4, 5, 6, 7, 8, 100, 10])
+ b = Daru::Vector.new([30, 40, 50, 60, 70, 80, 90, 100, 110, 120])
+ ds = Daru::DataFrame.new({ :a => a, :b => b })
+ levene = Statsample::Test::Levene.new(ds)
assert_levene(levene)
end
+
def assert_levene(levene)
assert_in_delta(0.778, levene.f, 0.001)
assert_in_delta(0.389, levene.probability, 0.001)
end
-
end
diff --git a/test/test_stratified.rb b/test/test_stratified.rb
index eb8ef45..3e619fe 100644
--- a/test/test_stratified.rb
+++ b/test/test_stratified.rb
@@ -1,17 +1,17 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-
-class StatsampleStratifiedTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleStratifiedTestCase < Minitest::Test
def initialize(*args)
super
end
+
def test_mean
- a=[10,20,30,40,50]
- b=[110,120,130,140]
- pop=a+b
- av=a.to_vector(:scale)
- bv=b.to_vector(:scale)
- popv=pop.to_vector(:scale)
- assert_equal(popv.mean,Statsample::StratifiedSample.mean(av,bv))
+ a = [10, 20, 30, 40, 50]
+ b = [110, 120, 130, 140]
+ pop = a + b
+ av = Daru::Vector.new(a)
+ bv = Daru::Vector.new(b)
+ popv = Daru::Vector.new(pop)
+ assert_equal(popv.mean, Statsample::StratifiedSample.mean(av, bv))
end
end
diff --git a/test/test_test_f.rb b/test/test_test_f.rb
index b7cc4a8..0ef0650 100644
--- a/test/test_test_f.rb
+++ b/test/test_test_f.rb
@@ -1,32 +1,32 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleTestFTestCase < MiniTest::Unit::TestCase
- context(Statsample::Test::F) do
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleTestFTestCase < Minitest::Test
+ context(Statsample::Test::F) do
setup do
- @ssb=84
- @ssw=68
- @df_num=2
- @df_den=15
- @f=Statsample::Test::F.new(@ssb.quo(@df_num),@ssw.quo(@df_den), @df_num, @df_den)
+ @ssb = 84
+ @ssw = 68
+ @df_num = 2
+ @df_den = 15
+ @f = Statsample::Test::F.new(@ssb.quo(@df_num), @ssw.quo(@df_den), @df_num, @df_den)
end
- should "have #f equal to msb/msw" do
+ should 'have #f equal to msb/msw' do
assert_equal((@ssb.quo(@df_num)).quo(@ssw.quo(@df_den)), @f.f)
end
- should "have df total equal to df_num+df_den" do
+ should 'have df total equal to df_num+df_den' do
assert_equal(@df_num + @df_den, @f.df_total)
end
- should "have probability near 0.002" do
+ should 'have probability near 0.002' do
assert_in_delta(0.002, @f.probability, 0.0005)
end
- should "be coerced into float" do
+ should 'be coerced into float' do
assert_equal(@f.to_f, @f.f)
end
-
- context("method summary") do
+
+ context('method summary') do
setup do
- @summary=@f.summary
+ @summary = @f.summary
end
- should "have size > 0" do
- assert(@summary.size>0)
+ should 'have size > 0' do
+ assert(@summary.size > 0)
end
end
end
diff --git a/test/test_test_kolmogorovsmirnov.rb b/test/test_test_kolmogorovsmirnov.rb
index 409d25d..7b698a1 100644
--- a/test/test_test_kolmogorovsmirnov.rb
+++ b/test/test_test_kolmogorovsmirnov.rb
@@ -1,28 +1,28 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleTestKolmogorovSmirnovTestCase < MiniTest::Unit::TestCase
- context(Statsample::Test::KolmogorovSmirnov) do
- should "calculate correctly D for two given samples" do
- a=[1.1,2.5,5.6,9]
- b=[1,2.3,5.8,10]
- ks=Statsample::Test::KolmogorovSmirnov.new(a,b)
- assert_equal(0.25,ks.d)
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleTestKolmogorovSmirnovTestCase < Minitest::Test
+ context(Statsample::Test::KolmogorovSmirnov) do
+ should 'calculate correctly D for two given samples' do
+ a = [1.1, 2.5, 5.6, 9]
+ b = [1, 2.3, 5.8, 10]
+ ks = Statsample::Test::KolmogorovSmirnov.new(a, b)
+ assert_equal(0.25, ks.d)
end
- should "calculate correctly D for a normal sample and Normal Distribution" do
- a=[0.30022510,-0.36664035,0.08593404,1.29881130,-0.49878633,-0.63056010, 0.28397638, -0.04913700,0.03566644,-1.33414346]
- ks=Statsample::Test::KolmogorovSmirnov.new(a,Distribution::Normal)
- assert_in_delta(0.282, ks.d,0.001)
+ should 'calculate correctly D for a normal sample and Normal Distribution' do
+ a = [0.30022510, -0.36664035, 0.08593404, 1.29881130, -0.49878633, -0.63056010, 0.28397638, -0.04913700, 0.03566644, -1.33414346]
+ ks = Statsample::Test::KolmogorovSmirnov.new(a, Distribution::Normal)
+ assert_in_delta(0.282, ks.d, 0.001)
end
- should "calculate correctly D for a variable normal and Normal Distribution" do
- rng=Distribution::Normal.rng
- a=100.times.map {rng.call}
- ks=Statsample::Test::KolmogorovSmirnov.new(a,Distribution::Normal)
- assert(ks.d<0.15)
+ should 'calculate correctly D for a variable normal and Normal Distribution' do
+ rng = Distribution::Normal.rng
+ a = 100.times.map { rng.call }
+ ks = Statsample::Test::KolmogorovSmirnov.new(a, Distribution::Normal)
+ assert(ks.d < 0.15)
end
-
+
context(Statsample::Test::KolmogorovSmirnov::EmpiricDistribution) do
- should "Create a correct empirical distribution for an array" do
- a=[10,9,8,7,6,5,4,3,2,1]
- ed=Statsample::Test::KolmogorovSmirnov::EmpiricDistribution.new(a)
+ should 'Create a correct empirical distribution for an array' do
+ a = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]
+ ed = Statsample::Test::KolmogorovSmirnov::EmpiricDistribution.new(a)
assert_equal(0, ed.cdf(-2))
assert_equal(0.5, ed.cdf(5))
assert_equal(0.5, ed.cdf(5.5))
diff --git a/test/test_test_t.rb b/test/test_test_t.rb
index 1c39a6b..3b8cce6 100644
--- a/test/test_test_t.rb
+++ b/test/test_test_t.rb
@@ -1,62 +1,62 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleTestTTestCase < MiniTest::Unit::TestCase
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
+class StatsampleTestTTestCase < Minitest::Test
include Statsample::Test
include Math
- context T do
+ context T do
setup do
- @a=[30.02, 29.99, 30.11, 29.97, 30.01, 29.99].to_scale
- @b=[29.89, 29.93, 29.72, 29.98, 30.02, 29.98].to_scale
- @x1=@a.mean
- @x2=@b.mean
- @s1=@a.sd
- @s2=@b.sd
- @n1=@a.n
- @n2=@b.n
- end
- should "calculate correctly standard t" do
- t=Statsample::Test::T.new(@x1, @s1.quo(Math.sqrt(@a.n)), @a.n-1)
- assert_equal((@x1).quo(@s1.quo(Math.sqrt(@a.n))), t.t)
- assert_equal(@a.n-1, t.df)
- assert(t.summary.size>0)
- end
- should "calculate correctly t for one sample" do
- t1=[6, 4, 6, 7, 4,5,5,12,6,1].to_scale
- t2=[9, 6, 5,10,10,8,7,10,6,5].to_scale
- d=t1-t2
- t=Statsample::Test::T::OneSample.new(d)
+ @a = Daru::Vector.new([30.02, 29.99, 30.11, 29.97, 30.01, 29.99])
+ @b = Daru::Vector.new([29.89, 29.93, 29.72, 29.98, 30.02, 29.98])
+ @x1 = @a.mean
+ @x2 = @b.mean
+ @s1 = @a.sd
+ @s2 = @b.sd
+ @n1 = @a.size
+ @n2 = @b.size
+ end
+ should 'calculate correctly standard t' do
+ t = Statsample::Test::T.new(@x1, @s1.quo(Math.sqrt(@a.size)), @a.size - 1)
+ assert_equal((@x1).quo(@s1.quo(Math.sqrt(@a.size))), t.t)
+ assert_equal(@a.size - 1, t.df)
+ assert(t.summary.size > 0)
+ end
+ should 'calculate correctly t for one sample' do
+ t1 = Daru::Vector.new([6, 4, 6, 7, 4, 5, 5, 12, 6, 1])
+ t2 = Daru::Vector.new([9, 6, 5, 10, 10, 8, 7, 10, 6, 5])
+ d = t1 - t2
+ t = Statsample::Test::T::OneSample.new(d)
assert_in_delta(-2.631, t.t, 0.001)
- assert_in_delta( 0.027, t.probability, 0.001)
- assert_in_delta( 0.76012, t.se, 0.0001)
- assert(t.summary.size>0)
- end
- should "calculate correctly t for two samples" do
- assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2),0.001)
- assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2,true),0.001)
- end
- should "calculate correctly df for equal and unequal variance" do
- assert_equal(10, T.df_equal_variance(@n1,@n2))
- assert_in_delta(7.03, T.df_not_equal_variance(@s1,@s2,@n1,@n2),0.001)
- end
- should "calculate all values for T object" do
- t=Statsample::Test.t_two_samples_independent(@a,@b)
- assert(t.summary.size>0)
- assert_in_delta(1.959, t.t_equal_variance,0.001)
- assert_in_delta(1.959, t.t_not_equal_variance,0.001)
- assert_in_delta(10, t.df_equal_variance,0.001)
- assert_in_delta(7.03, t.df_not_equal_variance,0.001)
- assert_in_delta(0.07856, t.probability_equal_variance,0.001)
- assert_in_delta(0.09095, t.probability_not_equal_variance,0.001)
- end
- should "be the same using shorthand" do
- v=100.times.map {rand(100)}.to_scale
+ assert_in_delta(0.027, t.probability, 0.001)
+ assert_in_delta(0.76012, t.se, 0.0001)
+ assert(t.summary.size > 0)
+ end
+ should 'calculate correctly t for two samples' do
+ assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2), 0.001)
+ assert_in_delta(1.959, T.two_sample_independent(@x1, @x2, @s1, @s2, @n1, @n2, true), 0.001)
+ end
+ should 'calculate correctly df for equal and unequal variance' do
+ assert_equal(10, T.df_equal_variance(@n1, @n2))
+ assert_in_delta(7.03, T.df_not_equal_variance(@s1, @s2, @n1, @n2), 0.001)
+ end
+ should 'calculate all values for T object' do
+ t = Statsample::Test.t_two_samples_independent(@a, @b)
+ assert(t.summary.size > 0)
+ assert_in_delta(1.959, t.t_equal_variance, 0.001)
+ assert_in_delta(1.959, t.t_not_equal_variance, 0.001)
+ assert_in_delta(10, t.df_equal_variance, 0.001)
+ assert_in_delta(7.03, t.df_not_equal_variance, 0.001)
+ assert_in_delta(0.07856, t.probability_equal_variance, 0.001)
+ assert_in_delta(0.09095, t.probability_not_equal_variance, 0.001)
+ end
+ should 'be the same using shorthand' do
+ v = Daru::Vector.new(100.times.map { rand(100) })
assert_equal(Statsample::Test.t_one_sample(v).t, T::OneSample.new(v).t)
end
- should "calculate all values for one sample T test" do
- u=@a.mean+(1-rand*2)
- tos=T::OneSample.new(@a,{:u=>u})
- assert_equal((@a.mean-u).quo(@a.sd.quo(sqrt(@a.n))), tos.t)
- assert_equal(@a.n-1, tos.df)
- assert(tos.summary.size>0)
+ should 'calculate all values for one sample T test' do
+ u = @a.mean + (1 - rand * 2)
+ tos = T::OneSample.new(@a, u: u)
+ assert_equal((@a.mean - u).quo(@a.sd.quo(sqrt(@a.size))), tos.t)
+ assert_equal(@a.size - 1, tos.df)
+ assert(tos.summary.size > 0)
end
end
end
diff --git a/test/test_umannwhitney.rb b/test/test_umannwhitney.rb
index 82817af..69a34f9 100644
--- a/test/test_umannwhitney.rb
+++ b/test/test_umannwhitney.rb
@@ -1,27 +1,27 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-class StatsampleUMannWhitneyTestCase < MiniTest::Unit::TestCase
+class StatsampleUMannWhitneyTestCase < Minitest::Test
include Statsample::Test
context Statsample::Test::UMannWhitney do
setup do
- @v1=[1,2,3,4,7,8,9,10,14,15].to_scale
- @v2=[5,6,11,12,13,16,17,18,19].to_scale
- @u=Statsample::Test::UMannWhitney.new(@v1,@v2)
+ @v1 = Daru::Vector.new([1, 2, 3, 4, 7, 8, 9, 10, 14, 15])
+ @v2 = Daru::Vector.new([5, 6, 11, 12, 13, 16, 17, 18, 19])
+ @u = Statsample::Test::UMannWhitney.new(@v1, @v2)
end
- should "have same result using class or Test#u_mannwhitney" do
- assert_equal(Statsample::Test.u_mannwhitney(@v1,@v2).u, @u.u)
+ should 'have same result using class or Test#u_mannwhitney' do
+ assert_equal(Statsample::Test.u_mannwhitney(@v1, @v2).u, @u.u)
end
- should "have correct U values" do
- assert_equal(73,@u.r1)
- assert_equal(117,@u.r2)
- assert_equal(18,@u.u)
+ should 'have correct U values' do
+ assert_equal(73, @u.r1)
+ assert_equal(117, @u.r2)
+ assert_equal(18, @u.u)
end
- should "have correct value for z" do
- assert_in_delta(-2.205,@u.z,0.001)
+ should 'have correct value for z' do
+ assert_in_delta(-2.205, @u.z, 0.001)
end
- should "have correct value for z and exact probability" do
- assert_in_delta(0.027,@u.probability_z,0.001)
- assert_in_delta(0.028,@u.probability_exact,0.001)
+ should 'have correct value for z and exact probability' do
+ assert_in_delta(0.027, @u.probability_z, 0.001)
+ assert_in_delta(0.028, @u.probability_exact, 0.001)
end
end
end
diff --git a/test/test_vector.rb b/test/test_vector.rb
index 2a00252..7685121 100644
--- a/test/test_vector.rb
+++ b/test/test_vector.rb
@@ -1,644 +1,12 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-class StatsampleTestVector < MiniTest::Unit::TestCase
- include Statsample::Shorthand
-
- def setup
- @c = Statsample::Vector.new([5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99], :nominal)
- @c.name="Test Vector"
- @c.missing_values=[-99]
- end
- def assert_counting_tokens(b)
- assert_equal([1,1,0,1,0,nil],b['a'].to_a)
- assert_equal([0,1,0,0,0,nil],b['b'].to_a)
- assert_equal([0,0,1,0,0,nil],b['c'].to_a)
- assert_equal([0,0,1,1,0,nil],b['d'].to_a)
- assert_equal([0,0,0,0,1,nil],b[10].to_a)
- end
- context Statsample do
- setup do
- @sample=100
- @a=@sample.times.map{|i| (i+rand(10)) %10 ==0 ? nil : rand(100)}.to_scale
- @b=@sample.times.map{|i| (i+rand(10)) %10 ==0 ? nil : rand(100)}.to_scale
- @correct_a=Array.new
- @correct_b=Array.new
- @a.each_with_index do |v,i|
- if !@a[i].nil? and !@b[i].nil?
- @correct_a.push(@a[i])
- @correct_b.push(@b[i])
- end
- end
- @correct_a=@correct_a.to_scale
- @correct_b=@correct_b.to_scale
-
- @common=lambda do |av,bv|
- assert_equal(@correct_a, av, "A no es esperado")
- assert_equal(@correct_b, bv, "B no es esperado")
- assert(!av.has_missing_data?, "A tiene datos faltantes")
- assert(!bv.has_missing_data?, "b tiene datos faltantes")
- end
- end
- should "return correct only_valid" do
- av,bv=Statsample.only_valid @a,@b
- av2,bv2=Statsample.only_valid av,bv
- @common.call(av,bv)
- assert_equal(av,av2)
- assert_not_same(av,av2)
- assert_not_same(bv,bv2)
- end
- should "return correct only_valid_clone" do
- av,bv=Statsample.only_valid_clone @a,@b
- @common.call(av,bv)
- av2,bv2=Statsample.only_valid_clone av,bv
- assert_equal(av,av2)
- assert_same(av,av2)
- assert_same(bv,bv2)
- end
- end
- context Statsample::Vector do
- setup do
- @c = Statsample::Vector.new([5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99], :nominal)
- @c.name="Test Vector"
- @c.missing_values=[-99]
- end
- should_with_gsl "be created with GSL::Vector" do
- gsl=GSL::Vector[1,2,3,4,5]
- v=Statsample::Vector.new(gsl)
- assert_equal([1,2,3,4,5], v.to_a)
- refute(v.flawed?)
-
- end
-
- context "using matrix operations" do
- setup do
- @a=[1,2,3,4,5].to_scale
- end
- should "to_matrix returns a matrix with 1 row" do
- mh=Matrix[[1,2,3,4,5]]
- assert_equal(mh,@a.to_matrix)
- end
- should "to_matrix(:vertical) returns a matrix with 1 column" do
- mv=Matrix.columns([[1,2,3,4,5]])
- assert_equal(mv,@a.to_matrix(:vertical))
- end
- should "returns valid submatrixes" do
- # 3*4 + 2*5 = 22
- a=[3,2].to_vector(:scale)
- b=[4,5].to_vector(:scale)
- assert_equal(22,(a.to_matrix*b.to_matrix(:vertical))[0,0])
- end
- end
- context "when initializing" do
- setup do
- @data=(10.times.map{rand(100)})+[nil]
- @original=Statsample::Vector.new(@data, :scale)
- end
- should "be the sample using []" do
- second=Statsample::Vector[*@data]
- assert_equal(@original, second)
- end
- should "[] returns same results as R-c()" do
- reference=[0,4,5,6,10].to_scale
- assert_equal(reference, Statsample::Vector[0,4,5,6,10])
- assert_equal(reference, Statsample::Vector[0,4..6,10])
- assert_equal(reference, Statsample::Vector[[0],[4,5,6],[10]])
- assert_equal(reference, Statsample::Vector[[0],[4,[5,[6]]],[10]])
-
- assert_equal(reference, Statsample::Vector[[0],[4,5,6].to_vector,[10]])
-
- end
- should "be the same usign #to_vector" do
- lazy1=@data.to_vector(:scale)
- assert_equal(@original,lazy1)
- end
- should "be the same using #to_scale" do
- lazy2=@data.to_scale
- assert_equal(@original,lazy2)
- assert_equal(:scale,lazy2.type)
- assert_equal(@data.find_all{|v| !v.nil?},lazy2.valid_data)
- end
- should "could use new_scale with size only" do
- v1=10.times.map {nil}.to_scale
- v2=Statsample::Vector.new_scale(10)
- assert_equal(v1,v2)
-
- end
- should "could use new_scale with size and value" do
- a=rand
- v1=10.times.map {a}.to_scale
- v2=Statsample::Vector.new_scale(10,a)
- assert_equal(v1,v2)
- end
- should "could use new_scale with func" do
- v1=10.times.map {|i| i*2}.to_scale
- v2=Statsample::Vector.new_scale(10) {|i| i*2}
- assert_equal(v1,v2)
- end
-
- end
-
- context "#split_by_separator" do
-
- setup do
- @a = Statsample::Vector.new(["a","a,b","c,d","a,d",10,nil],:nominal)
- @b=@a.split_by_separator(",")
- end
- should "returns a Hash" do
- assert_kind_of(Hash, @b)
- end
- should "return a Hash with keys with different values of @a" do
- expected=['a','b','c','d',10]
- assert_equal(expected, @b.keys)
- end
-
- should "returns a Hash, which values are Statsample::Vector" do
- @b.each_key {|k| assert_instance_of(Statsample::Vector, @b[k])}
- end
- should "hash values are n times the tokens appears" do
- assert_counting_tokens(@b)
- end
- should "#split_by_separator_freq returns the number of ocurrences of tokens" do
- assert_equal({'a'=>3,'b'=>1,'c'=>1,'d'=>2,10=>1}, @a.split_by_separator_freq())
- end
- should "using a different separator give the same values" do
- a = Statsample::Vector.new(["a","a*b","c*d","a*d",10,nil],:nominal)
- b=a.split_by_separator("*")
- assert_counting_tokens(b)
- end
- end
- should "return correct median_absolute_deviation" do
- a=[1, 1, 2, 2, 4, 6, 9].to_scale
- assert_equal(1, a.median_absolute_deviation)
- end
- should "return correct histogram" do
- a=10.times.map {|v| v}.to_scale
- hist=a.histogram(2)
- assert_equal([5,5], hist.bin)
- 3.times do |i|
- assert_in_delta(i*4.5, hist.get_range(i)[0], 1e-9)
- end
-
- end
- should "have a name" do
- @c.name=="Test Vector"
- end
- should "without explicit name, returns vector with succesive numbers" do
- a=10.times.map{rand(100)}.to_scale
- b=10.times.map{rand(100)}.to_scale
- assert_match(/Vector \d+/, a.name)
- a.name=~/Vector (\d+)/
- next_number=$1.to_i+1
- assert_equal("Vector #{next_number}",b.name)
- end
- should "save to a file and load the same Vector" do
- outfile=Tempfile.new("vector.vec")
- @c.save(outfile.path)
- a=Statsample.load(outfile.path)
- assert_equal(@c,a)
- end
- should "#collect returns an array" do
- val=@c.collect {|v| v}
- assert_equal(val,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99])
- end
-
- should "#recode returns a recoded array" do
- a=@c.recode{|v| @c.is_valid?(v) ? 0 : 1 }
- exp=[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1].to_vector
- assert_equal(exp,a)
- exp.recode!{|v| v==0 ? 1:0}
- exp2=(([1]*15)+([0]*3)).to_vector
- assert_equal(exp2,exp)
- end
- should "#product returns the * of all values" do
- a=[1,2,3,4,5].to_vector(:scale)
- assert_equal(120,a.product)
- end
-
- should "missing values" do
- @c.missing_values=[10]
- assert_equal([-99,-99,1,2,3,4,5,5,5,5,5,6,6,7,8,9], @c.valid_data.sort)
- assert_equal([5,5,5,5,5,6,6,7,8,9,nil,1,2,3,4,nil,-99,-99], @c.data_with_nils)
- @c.missing_values=[-99]
- assert_equal(@c.valid_data.sort,[1,2,3,4,5,5,5,5,5,6,6,7,8,9,10])
- assert_equal(@c.data_with_nils,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,nil,nil])
- @c.missing_values=[]
- assert_equal(@c.valid_data.sort,[-99,-99,1,2,3,4,5,5,5,5,5,6,6,7,8,9,10])
- assert_equal(@c.data_with_nils,[5,5,5,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99])
-
- end
- should "correct has_missing_data? with missing data" do
- a=[1,2,3,nil].to_vector
- assert(a.has_missing_data?)
- end
- should "correct has_missing_data? without missing data" do
- a=[1,2,3,4,10].to_vector
- assert(!a.has_missing_data?)
- end
- should "with explicit missing_values, should respond has_missing_data?" do
- a=[1,2,3,4,10].to_vector
- a.missing_values=[10]
- assert(a.has_missing_data?)
- end
- should "label correctly fields" do
- @c.labels={5=>'FIVE'}
- assert_equal(["FIVE","FIVE","FIVE","FIVE","FIVE",6,6,7,8,9,10,1,2,3,4,nil,-99, -99],@c.vector_labeled.to_a)
- end
- should "verify" do
- h=@c.verify{|d| !d.nil? and d>0}
- e={15=>nil,16=>-99,17=>-99}
- assert_equal(e,h)
- end
- should "have a summary with name on it" do
- assert_match(/#{@c.name}/, @c.summary)
- end
-
- should "GSL::Vector based should push correcty" do
- if Statsample.has_gsl?
- v=GSL::Vector[1,2,3,4,5].to_scale
- v.push(nil)
- assert_equal([1,2,3,4,5,nil], v.to_a)
- assert(v.flawed?)
- else
- skip("Requires GSL")
- end
- end
-
-
- should "split correctly" do
- a = Statsample::Vector.new(["a","a,b","c,d","a,d","d",10,nil],:nominal)
- assert_equal([%w{a},%w{a b},%w{c d},%w{a d},%w{d},[10],nil], a.splitted)
- end
- should "multiply correct for scalar" do
- a = [1,2,3].to_scale
- assert_equal([5,10,15].to_scale, a*5)
- end
- should "multiply correct with other vector" do
- a = [1,2,3].to_scale
- b = [2,4,6].to_scale
-
- assert_equal([2,8,18].to_scale, a*b)
- end
- should "sum correct for scalar" do
- a = [1,2,3].to_scale
- assert_equal([11,12,13].to_scale, a+10)
- end
-
- should "raise NoMethodError when method requires ordinal and vector is nominal" do
- @c.type=:nominal
- assert_raise(::NoMethodError) { @c.median }
- end
-
- should "raise NoMethodError when method requires scalar and vector is ordinal" do
- @c.type=:ordinal
- assert_raise(::NoMethodError) { @c.mean }
- end
- should "jacknife correctly with named method" do
- # First example
- a=[1,2,3,4].to_scale
- ds=a.jacknife(:mean)
- assert_equal(a.mean, ds[:mean].mean)
- ds=a.jacknife([:mean,:sd])
- assert_equal(a.mean, ds[:mean].mean)
- assert_equal(a.sd, ds[:mean].sd)
- end
- should "jacknife correctly with custom method" do
- # Second example
- a=[17.23, 18.71,13.93,18.81,15.78,11.29,14.91,13.39, 18.21, 11.57, 14.28, 10.94, 18.83, 15.52,13.45,15.25].to_scale
- ds=a.jacknife(:log_s2=>lambda {|v| Math.log(v.variance) })
- exp=[1.605, 2.972, 1.151, 3.097, 0.998, 3.308, 0.942, 1.393, 2.416, 2.951, 1.043, 3.806, 3.122, 0.958, 1.362, 0.937].to_scale
-
- assert_similar_vector(exp, ds[:log_s2], 0.001)
- assert_in_delta(2.00389, ds[:log_s2].mean, 0.00001)
- assert_in_delta(1.091, ds[:log_s2].variance, 0.001)
- end
- should "jacknife correctly with k>1" do
- a=rnorm(6)
- ds=a.jacknife(:mean,2)
- mean=a.mean
- exp=[3*mean-2*(a[2]+a[3]+a[4]+a[5]) / 4, 3*mean-2*(a[0]+a[1]+a[4]+a[5]) / 4, 3*mean-2*(a[0]+a[1]+a[2]+a[3]) / 4].to_scale
- assert_similar_vector(exp, ds[:mean], 1e-13)
+class StatsampleTestVector < Minitest::Test
+ should 'return correct histogram' do
+ a = Daru::Vector.new(10.times.map { |v| v })
+ hist = a.histogram(2)
+ assert_equal([5, 5], hist.bin)
+ 3.times do |i|
+ assert_in_delta(i * 4.5, hist.get_range(i)[0], 1e-9)
end
- should "bootstrap should return a vector with mean=mu and sd=se" do
- a=rnorm(100)
- ds=a.bootstrap([:mean,:sd],200)
- se=1/Math.sqrt(a.size)
- assert_in_delta(0, ds[:mean].mean, 0.3)
- assert_in_delta(se, ds[:mean].sd, 0.02)
- end
-
-
- end
-
-
-
- def test_nominal
- assert_equal(@c[1],5)
- assert_equal({ 1=>1,2=>1,3=>1,4=>1,5=>5,6=>2,7=>1,8=>1, 9=>1,10=>1},@c.frequencies)
- assert_equal({ 1=>1,2=>1,3=>1,4=>1,5=>5,6=>2,7=>1,8=>1, 9=>1,10=>1},@c._frequencies)
- assert_equal({ 1 => 1.quo(15) ,2=>1.quo(15), 3=>1.quo(15),4=>1.quo(15),5=>5.quo(15),6=>2.quo(15),7=>1.quo(15), 8=>1.quo(15), 9=>1.quo(15),10=>1.quo(15)}, @c.proportions)
- assert_equal(@c.proportion, 1.quo(15))
- assert_equal(@c.proportion(2), 1.quo(15))
- assert_equal([1,2,3,4,5,6,7,8,9,10], @c.factors.sort)
- assert_equal(@c.mode,5)
- assert_equal(@c.n_valid,15)
- end
- def test_equality
- v1=[1,2,3].to_vector
- v2=[1,2,3].to_vector
- assert_equal(v1,v2)
- v1=[1,2,3].to_vector(:nominal)
- v2=[1,2,3].to_vector(:ordinal)
- assert_not_equal(v1,v2)
- v2=[1,2,3]
- assert_not_equal(v1,v2)
- v1=[1,2,3].to_vector()
- v2=[1,2,3].to_vector()
- assert_equal(v1,v2)
- assert_equal(false, v1 == Object.new)
- end
- def test_vector_percentil
- a=[1,2,2,3,4,5,5,5,6,10].to_scale
- expected=[10,25,25,40,50,70,70,70,90,100].to_scale
- assert_equal(expected, a.vector_percentil)
- a=[1,nil,nil,2,2,3,4,nil,nil,5,5,5,6,10].to_scale
- expected=[10,nil,nil,25,25,40,50,nil,nil,70,70,70,90,100].to_scale
- assert_equal(expected, a.vector_percentil)
- end
- def test_ordinal
- @c.type=:ordinal
- assert_equal(5,@c.median)
- assert_equal(4,@c.percentil(25))
- assert_equal(7,@c.percentil(75))
-
- v=[200000, 200000, 210000, 220000, 230000, 250000, 250000, 250000, 270000, 300000, 450000, 130000, 140000, 140000, 140000, 145000, 148000, 165000, 170000, 180000, 180000, 180000, 180000, 180000, 180000 ].to_scale
- assert_equal(180000,v.median)
- a=[7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, 12.0, 12.0, 13.0, 14.0, 14.0, 2.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0].to_scale
- assert_equal(4.5, a.percentil(25))
- assert_equal(6.5, a.percentil(50))
- assert_equal(9.5, a.percentil(75))
- assert_equal(3.0, a.percentil(10))
- end
- def test_linear_percentil_strategy
- values = [102, 104, 105, 107, 108, 109, 110, 112, 115, 116].shuffle.to_scale
- assert_equal 102, values.percentil(0, :linear)
- assert_equal 104.75, values.percentil(25, :linear)
- assert_equal 108.5, values.percentil(50, :linear)
- assert_equal 112.75, values.percentil(75, :linear)
- assert_equal 116, values.percentil(100, :linear)
-
- values = [102, 104, 105, 107, 108, 109, 110, 112, 115, 116, 118].shuffle.to_scale
- assert_equal 102, values.percentil(0, :linear)
- assert_equal 105, values.percentil(25, :linear)
- assert_equal 109, values.percentil(50, :linear)
- assert_equal 115, values.percentil(75, :linear)
- assert_equal 118, values.percentil(100, :linear)
- end
- def test_ranked
- v1=[0.8,1.2,1.2,2.3,18].to_vector(:ordinal)
- expected=[1,2.5,2.5,4,5].to_vector(:ordinal)
- assert_equal(expected,v1.ranked)
- v1=[nil,0.8,1.2,1.2,2.3,18,nil].to_vector(:ordinal)
- expected=[nil,1,2.5,2.5,4,5,nil].to_vector(:ordinal)
- assert_equal(expected,v1.ranked)
- end
- def test_scale
- a=Statsample::Vector.new([1,2,3,4,"STRING"], :scale)
- assert_equal(10, a.sum)
- i=0
- factors=a.factors.sort
- [0,1,2,3,4].each{|v|
- assert(v==factors[i])
- assert(v.class==factors[i].class,"#{v} - #{v.class} != #{factors[i]} - #{factors[i].class}")
- i+=1
- }
- end
- def test_vector_centered
- mean=rand()
- samples=11
- centered=samples.times.map {|i| i-((samples/2).floor).to_i}.to_scale
- not_centered=centered.recode {|v| v+mean}
- obs=not_centered.centered
- centered.each_with_index do |v,i|
- assert_in_delta(v,obs[i],0.0001)
- end
- end
- def test_vector_standarized
- v1=[1,2,3,4,nil].to_vector(:scale)
- sds=v1.sds
- expected=[((1-2.5).quo(sds)),((2-2.5).quo(sds)),((3-2.5).quo(sds)),((4-2.5).quo(sds)), nil].to_vector(:scale)
- vs=v1.vector_standarized
- assert_equal(expected, vs)
- assert_equal(0,vs.mean)
- assert_equal(1,vs.sds)
- end
-
- def test_vector_standarized_with_zero_variance
- v1=100.times.map {|i| 1}.to_scale
- exp=100.times.map {nil}.to_scale
- assert_equal(exp,v1.standarized)
- end
-
- def test_check_type
- v=Statsample::Vector.new
- v.type=:nominal
- assert_raise(NoMethodError) { v.check_type(:scale)}
- assert_raise(NoMethodError) { v.check_type(:ordinal)}
- assert(v.check_type(:nominal).nil?)
-
- v.type=:ordinal
-
- assert_raise(NoMethodError) { v.check_type(:scale)}
-
- assert(v.check_type(:ordinal).nil?)
- assert(v.check_type(:nominal).nil?)
-
-
- v.type=:scale
- assert(v.check_type(:scale).nil?)
- assert(v.check_type(:ordinal).nil?)
- assert(v.check_type(:nominal).nil?)
-
- v.type=:date
- assert_raise(NoMethodError) { v.check_type(:scale)}
- assert_raise(NoMethodError) { v.check_type(:ordinal)}
- assert_raise(NoMethodError) { v.check_type(:nominal)}
- end
-
- def test_add
- a=Statsample::Vector.new([1,2,3,4,5], :scale)
- b=Statsample::Vector.new([11,12,13,14,15], :scale)
- assert_equal([3,4,5,6,7], (a+2).to_a)
- assert_equal([12,14,16,18,20], (a+b).to_a)
- assert_raise ArgumentError do
- a + @c
- end
- assert_raise TypeError do
- a+"string"
- end
- a=Statsample::Vector.new([nil,1, 2 ,3 ,4 ,5], :scale)
- b=Statsample::Vector.new([11, 12,nil,13,14,15], :scale)
- assert_equal([nil,13,nil,16,18,20], (a+b).to_a)
- assert_equal([nil,13,nil,16,18,20], (a+b.to_a).to_a)
- end
- def test_minus
- a=Statsample::Vector.new([1,2,3,4,5], :scale)
- b=Statsample::Vector.new([11,12,13,14,15], :scale)
- assert_equal([-1,0,1,2,3], (a-2).to_a)
- assert_equal([10,10,10,10,10], (b-a).to_a)
- assert_raise ArgumentError do
- a-@c
- end
- assert_raise TypeError do
- a-"string"
- end
- a=Statsample::Vector.new([nil,1, 2 ,3 ,4 ,5], :scale)
- b=Statsample::Vector.new([11, 12,nil,13,14,15], :scale)
- assert_equal([nil,11,nil,10,10,10], (b-a).to_a)
- assert_equal([nil,11,nil,10,10,10], (b-a.to_a).to_a)
- end
- def test_sum_of_squares
- a=[1,2,3,4,5,6].to_vector(:scale)
- assert_equal(17.5, a.sum_of_squared_deviation)
- end
- def test_average_deviation
- a=[1,2,3,4,5,6,7,8,9].to_scale
- assert_equal(20.quo(9), a.average_deviation_population)
- end
- def test_samples
- srand(1)
- assert_equal(100,@c.sample_with_replacement(100).size)
- assert_equal(@c.valid_data.to_a.sort, @c.sample_without_replacement(15).sort)
- assert_raise ArgumentError do
- @c.sample_without_replacement(20)
- end
- @c.type=:scale
- srand(1)
- assert_equal(100, @c.sample_with_replacement(100).size)
- assert_equal(@c.valid_data.to_a.sort, @c.sample_without_replacement(15).sort)
-
- end
- def test_valid_data
- a=Statsample::Vector.new([1,2,3,4,"STRING"])
- a.missing_values=[-99]
- a.add(1,false)
- a.add(2,false)
- a.add(-99,false)
- a.set_valid_data
- exp_valid_data=[1,2,3,4,"STRING",1,2]
- assert_equal(exp_valid_data,a.valid_data)
- a.add(20,false)
- a.add(30,false)
- assert_equal(exp_valid_data,a.valid_data)
- a.set_valid_data
- exp_valid_data_2=[1,2,3,4,"STRING",1,2,20,30]
- assert_equal(exp_valid_data_2,a.valid_data)
- end
- def test_set_value
- @c[2]=10
- expected=[5,5,10,5,5,6,6,7,8,9,10,1,2,3,4,nil,-99,-99].to_vector
- assert_equal(expected.data,@c.data)
- end
- def test_gsl
- if Statsample.has_gsl?
- a=Statsample::Vector.new([1,2,3,4,"STRING"], :scale)
-
- assert_equal(2,a.mean)
- assert_equal(a.variance_sample_ruby,a.variance_sample)
- assert_equal(a.standard_deviation_sample_ruby,a.sds)
- assert_equal(a.variance_population_ruby,a.variance_population)
- assert_equal(a.standard_deviation_population_ruby,a.standard_deviation_population)
- assert_nothing_raised do
- a=[].to_vector(:scale)
- end
- a.add(1,false)
- a.add(2,false)
- a.set_valid_data
- assert_equal(3,a.sum)
- b=[1,2,nil,3,4,5,nil,6].to_vector(:scale)
- assert_equal(21, b.sum)
- assert_equal(3.5, b.mean)
- assert_equal(6,b.gsl.size)
- c=[10,20,30,40,50,100,1000,2000,5000].to_scale
- assert_in_delta(c.skew, c.skew_ruby ,0.0001)
- assert_in_delta(c.kurtosis, c.kurtosis_ruby ,0.0001)
- end
- end
- def test_vector_matrix
- v1=%w{a a a b b b c c}.to_vector
- v2=%w{1 3 4 5 6 4 3 2}.to_vector
- v3=%w{1 0 0 0 1 1 1 0}.to_vector
- ex=Matrix.rows([["a", "1", "1"], ["a", "3", "0"], ["a", "4", "0"], ["b", "5", "0"], ["b", "6", "1"], ["b", "4", "1"], ["c", "3", "1"], ["c", "2", "0"]])
- assert_equal(ex,Statsample.vector_cols_matrix(v1,v2,v3))
- end
- def test_marshalling
- v1=(0..100).to_a.collect{|n| rand(100)}.to_vector(:scale)
- v2=Marshal.load(Marshal.dump(v1))
- assert_equal(v1,v2)
- end
- def test_dup
- v1=%w{a a a b b b c c}.to_vector
- v2=v1.dup
- assert_equal(v1.data,v2.data)
- assert_not_same(v1.data,v2.data)
- assert_equal(v1.type,v2.type)
-
- v1.type=:ordinal
- assert_not_equal(v1.type,v2.type)
- assert_equal(v1.missing_values,v2.missing_values)
- assert_not_same(v1.missing_values,v2.missing_values)
- assert_equal(v1.labels,v2.labels)
- assert_not_same(v1.labels,v2.labels)
-
- v3=v1.dup_empty
- assert_equal([],v3.data)
- assert_not_equal(v1.data,v3.data)
- assert_not_same(v1.data,v3.data)
- assert_equal(v1.type,v3.type)
- v1.type=:ordinal
- v3.type=:nominal
- assert_not_equal(v1.type,v3.type)
- assert_equal(v1.missing_values,v3.missing_values)
- assert_not_same(v1.missing_values,v3.missing_values)
- assert_equal(v1.labels,v3.labels)
- assert_not_same(v1.labels,v3.labels)
- end
- def test_paired_ties
- a=[0,0,0,1,1,2,3,3,4,4,4].to_vector(:ordinal)
- expected=[2,2,2,4.5,4.5,6,7.5,7.5,10,10,10].to_vector(:ordinal)
- assert_equal(expected,a.ranked)
- end
- def test_dichotomize
- a= [0,0,0,1,2,3,nil].to_vector
- exp=[0,0,0,1,1,1,nil].to_scale
- assert_equal(exp,a.dichotomize)
- a= [1,1,1,2,2,2,3].to_vector
- exp=[0,0,0,1,1,1,1].to_scale
- assert_equal(exp,a.dichotomize)
- a= [0,0,0,1,2,3,nil].to_vector
- exp=[0,0,0,0,1,1,nil].to_scale
- assert_equal(exp,a.dichotomize(1))
- a= %w{a a a b c d}.to_vector
- exp=[0,0,0,1,1,1].to_scale
- assert_equal(exp, a.dichotomize)
- end
- def test_can_be_methods
- a= [0,0,0,1,2,3,nil].to_vector
- assert(a.can_be_scale?)
- a=[0,"s",0,1,2,3,nil].to_vector
- assert(!a.can_be_scale?)
- a.missing_values=["s"]
- assert(a.can_be_scale?)
-
- a=[Date.new(2009,10,10), Date.today(), "2009-10-10", "2009-1-1", nil, "NOW"].to_vector
- assert(a.can_be_date?)
- a=[Date.new(2009,10,10), Date.today(),nil,"sss"].to_vector
- assert(!a.can_be_date?)
- end
- def test_date_vector
- a=[Date.new(2009,10,10), :NOW, "2009-10-10", "2009-1-1", nil, "NOW","MISSING"].to_vector(:date, :missing_values=>["MISSING"])
-
- assert(a.type==:date)
- expected=[Date.new(2009,10,10), Date.today(), Date.new(2009,10,10), Date.new(2009,1,1), nil, Date.today(), nil ]
- assert_equal(expected, a.date_data_with_nils)
end
end
diff --git a/test/test_wilcoxonsignedrank.rb b/test/test_wilcoxonsignedrank.rb
index f10b492..c32e341 100644
--- a/test/test_wilcoxonsignedrank.rb
+++ b/test/test_wilcoxonsignedrank.rb
@@ -1,67 +1,64 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
+require(File.expand_path(File.dirname(__FILE__) + '/helpers_tests.rb'))
-class StatsampleUMannWhitneyTestCase < MiniTest::Unit::TestCase
+class StatsampleUMannWhitneyTestCase < Minitest::Test
include Statsample::Test
context Statsample::Test::WilcoxonSignedRank do
- context "Example 1" do
- setup do
- @v1=[110,122,125,120,140,124,123,137,135,145].to_scale
- @v2=[125,115,130,140,140,115,140,125,140,135].to_scale
- @u=Statsample::Test::WilcoxonSignedRank.new(@v1,@v2)
- end
- should "have same result using class or Test#u_mannwhitney" do
- assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1,@v2).w, @u.w)
- end
- should "have correct W values" do
- assert_equal(9,@u.w)
- end
- should "have correct nr values" do
- assert_equal(9,@u.nr)
- end
- should "have correct value for z" do
- assert_in_delta(0.503,@u.z,0.001)
- end
- should "have correct value for probability_z" do
- assert_in_delta(0.614,@u.probability_z,0.001)
- end
- should "have correct value for probability_exact" do
- assert_in_delta(0.652,@u.probability_exact,0.001)
- end
- should "have summary" do
- assert(@u.summary!="")
- end
- end
-
- context "Example 2" do
- setup do
- @v2=[78,24,64,45,64,52,30,50,64,50,78,22,84,40,90,72].to_scale
- @v1=[78,24,62,48,68,56,25,44,56,40,68,36,68,20,58,32].to_scale
- @u=Statsample::Test::WilcoxonSignedRank.new(@v1,@v2)
- end
- should "have same result using class or Test#u_mannwhitney" do
- assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1,@v2).w, @u.w)
- end
- should "have correct W values" do
- assert_equal(67,@u.w)
- end
- should "have correct nr values" do
- assert_equal(14,@u.nr)
- end
- should "have correct value for z" do
- assert_in_delta(2.087,@u.z,0.001)
- end
- should "have correct value for probability_z" do
- assert_in_delta(0.036,@u.probability_z,0.001)
- end
- should "have correct value for probability_exact" do
- assert_in_delta(0.036,@u.probability_exact,0.001)
- end
- should "have summary" do
- assert(@u.summary!="")
- end
- end
-
-
- end
-
+ context 'Example 1' do
+ setup do
+ @v1 = Daru::Vector.new([110, 122, 125, 120, 140, 124, 123, 137, 135, 145])
+ @v2 = Daru::Vector.new([125, 115, 130, 140, 140, 115, 140, 125, 140, 135])
+ @u = Statsample::Test::WilcoxonSignedRank.new(@v1, @v2)
+ end
+ should 'have same result using class or Test#u_mannwhitney' do
+ assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1, @v2).w, @u.w)
+ end
+ should 'have correct W values' do
+ assert_equal(9, @u.w)
+ end
+ should 'have correct nr values' do
+ assert_equal(9, @u.nr)
+ end
+ should 'have correct value for z' do
+ assert_in_delta(0.503, @u.z, 0.001)
+ end
+ should 'have correct value for probability_z' do
+ assert_in_delta(0.614, @u.probability_z, 0.001)
+ end
+ should 'have correct value for probability_exact' do
+ assert_in_delta(0.652, @u.probability_exact, 0.001)
+ end
+ should 'have summary' do
+ assert(@u.summary != '')
+ end
+ end
+
+ context 'Example 2' do
+ setup do
+ @v2 = Daru::Vector.new([78, 24, 64, 45, 64, 52, 30, 50, 64, 50, 78, 22, 84, 40, 90, 72])
+ @v1 = Daru::Vector.new([78, 24, 62, 48, 68, 56, 25, 44, 56, 40, 68, 36, 68, 20, 58, 32])
+ @u = Statsample::Test::WilcoxonSignedRank.new(@v1, @v2)
+ end
+ should 'have same result using class or Test#u_mannwhitney' do
+ assert_equal(Statsample::Test.wilcoxon_signed_rank(@v1, @v2).w, @u.w)
+ end
+ should 'have correct W values' do
+ assert_equal(67, @u.w)
+ end
+ should 'have correct nr values' do
+ assert_equal(14, @u.nr)
+ end
+ should 'have correct value for z' do
+ assert_in_delta(2.087, @u.z, 0.001)
+ end
+ should 'have correct value for probability_z' do
+ assert_in_delta(0.036, @u.probability_z, 0.001)
+ end
+ should 'have correct value for probability_exact' do
+ assert_in_delta(0.036, @u.probability_exact, 0.001)
+ end
+ should 'have summary' do
+ assert(@u.summary != '')
+ end
+ end
+ end
end
diff --git a/test/test_xls.rb b/test/test_xls.rb
deleted file mode 100644
index 0a2584d..0000000
--- a/test/test_xls.rb
+++ /dev/null
@@ -1,52 +0,0 @@
-require(File.expand_path(File.dirname(__FILE__)+'/helpers_tests.rb'))
-class StatsampleExcelTestCase < MiniTest::Unit::TestCase
- context "Excel reader" do
- setup do
- @ds=Statsample::Excel.read(File.dirname(__FILE__)+"/fixtures/test_xls.xls")
- end
- should "set the number of cases" do
- assert_equal(6,@ds.cases)
- end
- should "set correct field names" do
- assert_equal(%w{id name age city a1},@ds.fields)
- end
- should "set a dataset equal to expected" do
- id=[1,2,3,4,5,6].to_vector(:scale)
- name=["Alex","Claude","Peter","Franz","George","Fernand"].to_vector(:nominal)
- age=[20,23,25,nil,5.5,nil].to_vector(:scale)
- city=["New York","London","London","Paris","Tome",nil].to_vector(:nominal)
- a1=["a,b","b,c","a",nil,"a,b,c",nil].to_vector(:nominal)
- ds_exp=Statsample::Dataset.new({'id'=>id,'name'=>name,'age'=>age,'city'=>city,'a1'=>a1}, %w{id name age city a1})
- ds_exp.fields.each{|f|
- assert_equal(ds_exp[f],@ds[f])
- }
- assert_equal(ds_exp,@ds)
- end
- should "set to nil empty cells" do
- assert_equal(nil,@ds['age'][5])
- end
- end
- context "Excel writer" do
- setup do
- a=100.times.map{rand(100)}.to_scale
- b=(["b"]*100).to_vector
- @ds={'b'=>b, 'a'=>a}.to_dataset(%w{b a})
- tempfile=Tempfile.new("test_write.xls")
- Statsample::Excel.write(@ds,tempfile.path)
- @ds2=Statsample::Excel.read(tempfile.path)
- end
- should "return same fields as original" do
- assert_equal(@ds.fields ,@ds2.fields)
- end
- should "return same number of cases as original" do
- assert_equal(@ds.cases, @ds2.cases)
- end
- should "return same cases as original" do
- i=0
- @ds2.each_array do |row|
- assert_equal(@ds.case_as_array(i),row)
- i+=1
- end
- end
- end
-end
diff --git a/web/Rakefile b/web/Rakefile
deleted file mode 100644
index b2f4127..0000000
--- a/web/Rakefile
+++ /dev/null
@@ -1,39 +0,0 @@
-# -*- ruby -*-
-require 'rake'
-require 'fileutils'
-directory "examples"
-
-def get_base(f)
- f.sub(File.dirname(__FILE__)+"/../examples/","").gsub("/","_").gsub(".rb","")
-end
-
-
-EXAMPLES=Dir.glob(File.dirname(__FILE__)+"/../examples/**/*.rb").map {|v| [v, get_base(v)]
-}.find_all{|v| !v[0].include?"_data"}
-
-EXAMPLES_BASE=EXAMPLES.map {|v| v[1]}
-
-
-desc "Build all html, rtf and pdf files"
-task :build_site do
- ruby "build_site.rb"
-end
-
-
-task :clean do
- Dir.glob(File.dirname(__FILE__)+"/examples/*.pdf").each do |t|
- FileUtils.rm t
- end
- Dir.glob(File.dirname(__FILE__)+"/examples/*.html").each do |t|
- FileUtils.rm t
- end
- Dir.glob(File.dirname(__FILE__)+"/examples/*.rtf").each do |t|
- FileUtils.rm t
- end
- Dir.glob(File.dirname(__FILE__)+"/examples/images/*.*").each do |t|
- FileUtils.rm t
- end
-end
-
-
-load 'upload_task.rb' if File.exists? "upload_task.rb"