Skip to content

Commit

Permalink
Add cosine similarity.
Browse files Browse the repository at this point in the history
  • Loading branch information
Quafadas committed May 15, 2024
1 parent a586c54 commit 25213e7
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 74 deletions.
3 changes: 2 additions & 1 deletion .scalafmt.conf
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ runner.dialectOverride.allowSignificantIndentation = true
rewrite.scala3.countEndMarkerLines = lastBlockOnly
rewrite.scala3.insertEndMarkerMinLines = 1
indent.main = 4
maxColumn = 120
maxColumn = 120
exclude.filters = ["build.sc"]
2 changes: 1 addition & 1 deletion build.sc
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import mill.scalanativelib._
import mill.api.Result

trait Common extends ScalaModule with PublishModule {
def scalaVersion = "3.3.1"
def scalaVersion = "3.3.3"

def publishVersion = VcsVersion.vcsState().format()

Expand Down
9 changes: 8 additions & 1 deletion vecxt/js/src/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,14 @@ object extensions:

inline def dot(v1: Float64Array)(using inline boundsCheck: BoundsCheck): Double =
dimCheck(vec, v1)
blas.ddot(vec.length, vec, 1, v1, 1)

var product = 0.0
var i = 0;
while i < vec.length do
product = product + vec(i) * v1(i)
i = i + 1
end while
product
end dot

inline def norm: Double = blas.dnrm2(vec.length, vec, 1)
Expand Down
3 changes: 2 additions & 1 deletion vecxt/src/cosineSimilarity.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package vecxt

import vecxt.extensions.norm
import vecxt.extensions.dot
import narr.NArray

/** Compute the cosine similarity between two vectors
*
Expand All @@ -12,7 +13,7 @@ import vecxt.extensions.dot
*/
object cosineSimilarity:

inline def apply(v1: Array[Double], v2: Array[Double])(using inline boundsCheck: BoundsCheck): Double =
inline def apply(v1: NArray[Double], v2: NArray[Double])(using inline boundsCheck: BoundsCheck): Double =
dimCheck(v1, v2)
v1.dot(v2) / (v1.norm * v2.norm)
end apply
Expand Down
153 changes: 83 additions & 70 deletions vecxt/test/src/simple.stats.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,76 +21,89 @@ import vecxt.DoBoundsCheck

class StatsSuite extends munit.FunSuite:

// import vecxt.BoundsCheck.yes

test("sample covariance") {
// Sample version
// https://corporatefinanceinstitute.com/resources/data-science/covariance/

val vector1 = NArray[Double](1692.0, 1978.0, 1884.0, 2151.0, 2519.0)
val vector2 = NArray[Double](68.0, 102.0, 110.0, 112.0, 154.0)

val result = vector1.covariance(vector2)

assertEqualsDouble(result, 9107.3, 0.001)
}

test("sample variance and std") {
val v = NArray[Double](2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0)
assertEqualsDouble(v.variance, 4.571429, 0.00001)
assertEqualsDouble(v.stdDev, 2.13809, 0.00001)
}

test("elementRanks") {

assertVecEquals(
NArray.tabulate[Double](10)((i: Int) => 11.0 - i).elementRanks,
NArray[Double](10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
)
assertVecEquals(
NArray.fill[Double](5)(42.0).elementRanks,
NArray[Double](3, 3, 3, 3, 3)
)
assertVecEquals(
NArray[Double](1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5).elementRanks,
NArray[Double](1, 2.5, 2.5, 5, 5, 5, 8.5, 8.5, 8.5, 8.5, 13, 13, 13, 13, 13)
)
assertVecEquals(
NArray[Double](1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5).elementRanks,
NArray[Double](3, 3, 3, 3, 3, 7.5, 7.5, 7.5, 7.5, 11, 11, 11, 13.5, 13.5, 15)
)
}

test("pearson correlation coefficient") {
// https://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/
val v1 = NArray[Double](43.0, 21.0, 25.0, 42.0, 57.0, 59.0)
val v2 = NArray[Double](99.0, 65.0, 79.0, 75.0, 87.0, 81.0)
assertEqualsDouble(v1.pearsonCorrelationCoefficient(v2)(using DoBoundsCheck.yes), 0.529809, 0.0001)

}

test("element rank") {
val v = NArray[Double](1.0, 5.0, 3.0, 6.0, 1.0, 5.0)
/*
// import vecxt.BoundsCheck.yes

test("sample covariance") {
// Sample version
// https://corporatefinanceinstitute.com/resources/data-science/covariance/

val vector1 = NArray[Double](1692.0, 1978.0, 1884.0, 2151.0, 2519.0)
val vector2 = NArray[Double](68.0, 102.0, 110.0, 112.0, 154.0)

val result = vector1.covariance(vector2)

assertEqualsDouble(result, 9107.3, 0.001)
}

test("sample variance and std") {
val v = NArray[Double](2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0)
assertEqualsDouble(v.variance, 4.571429, 0.00001)
assertEqualsDouble(v.stdDev, 2.13809, 0.00001)
}

test("elementRanks") {

assertVecEquals(
NArray.tabulate[Double](10)((i: Int) => 11.0 - i).elementRanks,
NArray[Double](10, 9, 8, 7, 6, 5, 4, 3, 2, 1)
)
assertVecEquals(
NArray.fill[Double](5)(42.0).elementRanks,
NArray[Double](3, 3, 3, 3, 3)
)
assertVecEquals(
NArray[Double](1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5).elementRanks,
NArray[Double](1, 2.5, 2.5, 5, 5, 5, 8.5, 8.5, 8.5, 8.5, 13, 13, 13, 13, 13)
)
assertVecEquals(
NArray[Double](1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5).elementRanks,
NArray[Double](3, 3, 3, 3, 3, 7.5, 7.5, 7.5, 7.5, 11, 11, 11, 13.5, 13.5, 15)
)
}

test("pearson correlation coefficient") {
// https://www.statisticshowto.com/probability-and-statistics/correlation-coefficient-formula/
val v1 = NArray[Double](43.0, 21.0, 25.0, 42.0, 57.0, 59.0)
val v2 = NArray[Double](99.0, 65.0, 79.0, 75.0, 87.0, 81.0)
assertEqualsDouble(v1.pearsonCorrelationCoefficient(v2)(using DoBoundsCheck.yes), 0.529809, 0.0001)

}

test("element rank") {
val v = NArray[Double](1.0, 5.0, 3.0, 6.0, 1.0, 5.0)
/*
1.0 is the first, but has a tied rank. Take the average - 1.5
*/
assertVecEquals(
v.elementRanks,
NArray[Double](1.5, 4.5, 3.0, 6.0, 1.5, 4.5)
)
}

test("spearmans rank") {
// https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
val v1 = NArray[Double](56.0, 75.0, 45.0, 71.0, 62.0, 64.0, 58.0, 80.0, 76.0, 61.0)
val v2 = NArray[Double](66.0, 70.0, 40.0, 60.0, 65.0, 56.0, 59.0, 77.0, 67.0, 63.0)
assertEqualsDouble(v1.spearmansRankCorrelation(v2)(using DoBoundsCheck.yes), 0.6727, 0.001)

// https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient

val v3 = NArray[Double](86.0, 97.0, 99.0, 100.0, 101.0, 103.0, 106.0, 110.0, 112.0, 113.0)
val v4 = NArray[Double](2, 20.0, 28.0, 27.0, 50.0, 29.0, 7.0, 17.0, 6.0, 12.0)
assertEqualsDouble(-0.1757575, v3.spearmansRankCorrelation(v4)(using DoBoundsCheck.yes), 0.000001)
}
*/
assertVecEquals(
v.elementRanks,
NArray[Double](1.5, 4.5, 3.0, 6.0, 1.5, 4.5)
)
}

test("spearmans rank") {
// https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide-2.php
val v1 = NArray[Double](56.0, 75.0, 45.0, 71.0, 62.0, 64.0, 58.0, 80.0, 76.0, 61.0)
val v2 = NArray[Double](66.0, 70.0, 40.0, 60.0, 65.0, 56.0, 59.0, 77.0, 67.0, 63.0)
assertEqualsDouble(v1.spearmansRankCorrelation(v2)(using DoBoundsCheck.yes), 0.6727, 0.001)

// https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient

val v3 = NArray[Double](86.0, 97.0, 99.0, 100.0, 101.0, 103.0, 106.0, 110.0, 112.0, 113.0)
val v4 = NArray[Double](2, 20.0, 28.0, 27.0, 50.0, 29.0, 7.0, 17.0, 6.0, 12.0)
assertEqualsDouble(-0.1757575, v3.spearmansRankCorrelation(v4)(using DoBoundsCheck.yes), 0.000001)
}

test("dot product") {
val v1 = NArray[Double](1.0, 2.0, 3.0)
val v2 = NArray[Double](4.0, 5.0, 6.0)
assertEqualsDouble(v1.dot(v2)(using DoBoundsCheck.yes), 32.0, 0.0001)
}

// https://www.learndatasci.com/glossary/cosine-similarity/
test("cosine similarity") {
val v1 = NArray[Double](1.0,1,1,1,1,0,0)
val v2 = NArray[Double](0.0,0,1.0,1.0,0,1.0,1.0)
assertEqualsDouble(cosineSimilarity(v1, v2)(using DoBoundsCheck.yes), 0.44721, 0.0001)
}

end StatsSuite

0 comments on commit 25213e7

Please sign in to comment.