From 25213e700508961a68c4036de59c803feda36ea5 Mon Sep 17 00:00:00 2001 From: Simon Parten Date: Wed, 15 May 2024 20:06:24 +0200 Subject: [PATCH] Add cosine similarity. --- .scalafmt.conf | 3 +- build.sc | 2 +- vecxt/js/src/package.scala | 9 +- vecxt/src/cosineSimilarity.scala | 3 +- vecxt/test/src/simple.stats.scala | 153 ++++++++++++++++-------------- 5 files changed, 96 insertions(+), 74 deletions(-) diff --git a/.scalafmt.conf b/.scalafmt.conf index db7feb5..f4785ce 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ -10,4 +10,5 @@ runner.dialectOverride.allowSignificantIndentation = true rewrite.scala3.countEndMarkerLines = lastBlockOnly rewrite.scala3.insertEndMarkerMinLines = 1 indent.main = 4 -maxColumn = 120 \ No newline at end of file +maxColumn = 120 +exclude.filters = ["build.sc"] \ No newline at end of file diff --git a/build.sc b/build.sc index 9e40e1b..df99a5b 100644 --- a/build.sc +++ b/build.sc @@ -14,7 +14,7 @@ import mill.scalanativelib._ import mill.api.Result trait Common extends ScalaModule with PublishModule { - def scalaVersion = "3.3.1" + def scalaVersion = "3.3.3" def publishVersion = VcsVersion.vcsState().format() diff --git a/vecxt/js/src/package.scala b/vecxt/js/src/package.scala index a537e68..0358301 100644 --- a/vecxt/js/src/package.scala +++ b/vecxt/js/src/package.scala @@ -206,7 +206,14 @@ object extensions: inline def dot(v1: Float64Array)(using inline boundsCheck: BoundsCheck): Double = dimCheck(vec, v1) - blas.ddot(vec.length, vec, 1, v1, 1) + + var product = 0.0 + var i = 0; + while i < vec.length do + product = product + vec(i) * v1(i) + i = i + 1 + end while + product end dot inline def norm: Double = blas.dnrm2(vec.length, vec, 1) diff --git a/vecxt/src/cosineSimilarity.scala b/vecxt/src/cosineSimilarity.scala index 16eb471..758d5ed 100644 --- a/vecxt/src/cosineSimilarity.scala +++ b/vecxt/src/cosineSimilarity.scala @@ -2,6 +2,7 @@ package vecxt import vecxt.extensions.norm import vecxt.extensions.dot +import narr.NArray /** Compute the cosine similarity between two vectors * @@ -12,7 +13,7 @@ import vecxt.extensions.dot */ object cosineSimilarity: - inline def apply(v1: Array[Double], v2: Array[Double])(using inline boundsCheck: BoundsCheck): Double = + inline def apply(v1: NArray[Double], v2: NArray[Double])(using inline boundsCheck: BoundsCheck): Double = dimCheck(v1, v2) v1.dot(v2) / (v1.norm * v2.norm) end apply diff --git a/vecxt/test/src/simple.stats.scala b/vecxt/test/src/simple.stats.scala index 3704fbe..a2ab866 100644 --- a/vecxt/test/src/simple.stats.scala +++ b/vecxt/test/src/simple.stats.scala @@ -21,76 +21,89 @@ import vecxt.DoBoundsCheck class StatsSuite extends munit.FunSuite: - // import vecxt.BoundsCheck.yes - - test("sample covariance") { - // Sample version - // https://corporatefinanceinstitute.com/resources/data-science/covariance/ - - val vector1 = NArray[Double](1692.0, 1978.0, 1884.0, 2151.0, 2519.0) - val vector2 = NArray[Double](68.0, 102.0, 110.0, 112.0, 154.0) - - val result = vector1.covariance(vector2) - - assertEqualsDouble(result, 9107.3, 0.001) - } - - test("sample variance and std") { - val v = NArray[Double](2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0) - assertEqualsDouble(v.variance, 4.571429, 0.00001) - assertEqualsDouble(v.stdDev, 2.13809, 0.00001) - } - - test("elementRanks") { - - assertVecEquals( - NArray.tabulate[Double](10)((i: Int) => 11.0 - i).elementRanks, - NArray[Double](10, 9, 8, 7, 6, 5, 4, 3, 2, 1) - ) - assertVecEquals( - NArray.fill[Double](5)(42.0).elementRanks, - NArray[Double](3, 3, 3, 3, 3) - ) - assertVecEquals( - NArray[Double](1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5).elementRanks, - NArray[Double](1, 2.5, 2.5, 5, 5, 5, 8.5, 8.5, 8.5, 8.5, 13, 13, 13, 13, 13) - ) - assertVecEquals( - NArray[Double](1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5).elementRanks, - NArray[Double](3, 3, 3, 3, 3, 7.5, 7.5, 7.5, 7.5, 11, 11, 11, 13.5, 13.5, 15) - ) - } - - test("pearson correlation coefficient") { - // https://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/ - val v1 = NArray[Double](43.0, 21.0, 25.0, 42.0, 57.0, 59.0) - val v2 = NArray[Double](99.0, 65.0, 79.0, 75.0, 87.0, 81.0) - assertEqualsDouble(v1.pearsonCorrelationCoefficient(v2)(using DoBoundsCheck.yes), 0.529809, 0.0001) - - } - - test("element rank") { - val v = NArray[Double](1.0, 5.0, 3.0, 6.0, 1.0, 5.0) - /* + // import vecxt.BoundsCheck.yes + + test("sample covariance") { + // Sample version + // https://corporatefinanceinstitute.com/resources/data-science/covariance/ + + val vector1 = NArray[Double](1692.0, 1978.0, 1884.0, 2151.0, 2519.0) + val vector2 = NArray[Double](68.0, 102.0, 110.0, 112.0, 154.0) + + val result = vector1.covariance(vector2) + + assertEqualsDouble(result, 9107.3, 0.001) + } + + test("sample variance and std") { + val v = NArray[Double](2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0) + assertEqualsDouble(v.variance, 4.571429, 0.00001) + assertEqualsDouble(v.stdDev, 2.13809, 0.00001) + } + + test("elementRanks") { + + assertVecEquals( + NArray.tabulate[Double](10)((i: Int) => 11.0 - i).elementRanks, + NArray[Double](10, 9, 8, 7, 6, 5, 4, 3, 2, 1) + ) + assertVecEquals( + NArray.fill[Double](5)(42.0).elementRanks, + NArray[Double](3, 3, 3, 3, 3) + ) + assertVecEquals( + NArray[Double](1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5).elementRanks, + NArray[Double](1, 2.5, 2.5, 5, 5, 5, 8.5, 8.5, 8.5, 8.5, 13, 13, 13, 13, 13) + ) + assertVecEquals( + NArray[Double](1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5).elementRanks, + NArray[Double](3, 3, 3, 3, 3, 7.5, 7.5, 7.5, 7.5, 11, 11, 11, 13.5, 13.5, 15) + ) + } + + test("pearson correlation coefficient") { + // https://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/ + val v1 = NArray[Double](43.0, 21.0, 25.0, 42.0, 57.0, 59.0) + val v2 = NArray[Double](99.0, 65.0, 79.0, 75.0, 87.0, 81.0) + assertEqualsDouble(v1.pearsonCorrelationCoefficient(v2)(using DoBoundsCheck.yes), 0.529809, 0.0001) + + } + + test("element rank") { + val v = NArray[Double](1.0, 5.0, 3.0, 6.0, 1.0, 5.0) + /* 1.0 is the first, but has as tied rank. Take the average - 1.5 - */ - assertVecEquals( - v.elementRanks, - NArray[Double](1.5, 4.5, 3.0, 6.0, 1.5, 4.5) - ) - } - - test("spearmans rank") { - // https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php - val v1 = NArray[Double](56.0, 75.0, 45.0, 71.0, 62.0, 64.0, 58.0, 80.0, 76.0, 61.0) - val v2 = NArray[Double](66.0, 70.0, 40.0, 60.0, 65.0, 56.0, 59.0, 77.0, 67.0, 63.0) - assertEqualsDouble(v1.spearmansRankCorrelation(v2)(using DoBoundsCheck.yes), 0.6727, 0.001) - - // https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient - - val v3 = NArray[Double](86.0, 97.0, 99.0, 100.0, 101.0, 103.0, 106.0, 110.0, 112.0, 113.0) - val v4 = NArray[Double](2, 20.0, 28.0, 27.0, 50.0, 29.0, 7.0, 17.0, 6.0, 12.0) - assertEqualsDouble(-0.1757575, v3.spearmansRankCorrelation(v4)(using DoBoundsCheck.yes), 0.000001) - } + */ + assertVecEquals( + v.elementRanks, + NArray[Double](1.5, 4.5, 3.0, 6.0, 1.5, 4.5) + ) + } + + test("spearmans rank") { + // https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php + val v1 = NArray[Double](56.0, 75.0, 45.0, 71.0, 62.0, 64.0, 58.0, 80.0, 76.0, 61.0) + val v2 = NArray[Double](66.0, 70.0, 40.0, 60.0, 65.0, 56.0, 59.0, 77.0, 67.0, 63.0) + assertEqualsDouble(v1.spearmansRankCorrelation(v2)(using DoBoundsCheck.yes), 0.6727, 0.001) + + // https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient + + val v3 = NArray[Double](86.0, 97.0, 99.0, 100.0, 101.0, 103.0, 106.0, 110.0, 112.0, 113.0) + val v4 = NArray[Double](2, 20.0, 28.0, 27.0, 50.0, 29.0, 7.0, 17.0, 6.0, 12.0) + assertEqualsDouble(-0.1757575, v3.spearmansRankCorrelation(v4)(using DoBoundsCheck.yes), 0.000001) + } + + test("dot product") { + val v1 = NArray[Double](1.0, 2.0, 3.0) + val v2 = NArray[Double](4.0, 5.0, 6.0) + assertEqualsDouble(v1.dot(v2)(using DoBoundsCheck.yes), 32.0, 0.0001) + } + + // https://www.learndatasci.com/glossary/cosine-similarity/ + test("cosine similarity") { + val v1 = NArray[Double](1.0,1,1,1,1,0,0) + val v2 = NArray[Double](0.0,0,1.0,1.0,0,1.0,1.0) + assertEqualsDouble(cosineSimilarity(v1, v2)(using DoBoundsCheck.yes), 0.44721, 0.0001) + } end StatsSuite