diff --git a/src/main/scala/util/StatCounter.scala b/src/main/scala/util/StatCounter.scala index 5ce342f..6a8bccb 100644 --- a/src/main/scala/util/StatCounter.scala +++ b/src/main/scala/util/StatCounter.scala @@ -63,7 +63,6 @@ class StatCounter[T : Funnel](size: Long) extends LightStatCounter[T]: override def add(values: Seq[T]): Unit = // the bloom filter is thread-safe values.foreach(bloomFilter.put) - // but the counter is not lightAdd(values.distinct.size) @@ -73,3 +72,18 @@ class StatCounter[T : Funnel](size: Long) extends LightStatCounter[T]: override def result: Result = super.result.copy(uniqueCount = Some(bloomFilter.approximateElementCount)) + +// uses sets instead of bloom filters +class PreciseStatCounter[T] extends LightStatCounter[T]: + private val set: scala.collection.mutable.HashSet[T] = scala.collection.mutable.HashSet.empty + + override def add(values: Seq[T]): Unit = + set ++= values + lightAdd(values.distinct.size) + + override def addUnique(values: Iterable[T]): Unit = + set ++= values + lightAdd(values.size) + + override def result: StatCounter.Result = + super.result.copy(uniqueCount = Some(set.size.toLong)) diff --git a/src/main/scala/util/StatCounterSuite.scala b/src/main/scala/util/StatCounterSuite.scala index 5123dc9..fa33e94 100644 --- a/src/main/scala/util/StatCounterSuite.scala +++ b/src/main/scala/util/StatCounterSuite.scala @@ -13,8 +13,8 @@ import scala.jdk.CollectionConverters.* object StatCounterSuite: case class Result(iris: StatCounter.Result, blankNodes: StatCounter.Result, literals: StatCounter.Result, plainLiterals: StatCounter.Result, dtLiterals: StatCounter.Result, - langLiterals: StatCounter.Result, controlChars: StatCounter.Result, - quotedTriples: StatCounter.Result, + langLiterals: StatCounter.Result, datatypes: StatCounter.Result, + controlChars: StatCounter.Result, quotedTriples: StatCounter.Result, subjects: StatCounter.Result, predicates: StatCounter.Result, objects: StatCounter.Result, graphs: StatCounter.Result, statements: StatCounter.Result): @@ -27,6 +27,7 @@ object StatCounterSuite: "SimpleLiteralCountStatistics" -> plainLiterals, "DatatypeLiteralCountStatistics" -> dtLiterals, "LanguageLiteralCountStatistics" -> langLiterals, + "DatatypeCountStatistics" -> datatypes, "AsciiControlCharacterCountStatistics" -> controlChars, "QuotedTripleCountStatistics" -> quotedTriples, "SubjectCountStatistics" -> subjects, @@ -73,6 +74,7 @@ class StatCounterSuite(val size: Long): private val cPlainLiterals = new StatCounter[String](10 * size) private val cDtLiterals = new StatCounter[String](10 * size) private val cLangLiterals = new StatCounter[String](10 * size) + private val cDatatypes = new PreciseStatCounter[String] private val cAsciiControlChars = LightStatCounter[Char]() private val cBlankNodes = new LightStatCounter[String]() @@ -100,6 +102,7 @@ class StatCounterSuite(val size: Long): val simpleLiterals = mutable.Set[String]() val dtLiterals = mutable.Set[String]() val langLiterals = mutable.Set[String]() + val datatypes = mutable.Set[String]() var controlCharCount = 0 var quotedTripleCount = 0 var stCount = 0 @@ -137,6 +140,7 @@ class StatCounterSuite(val size: Long): simpleLiterals += n.getLiteralLexicalForm else if n.getLiteralDatatypeURI != null then dtLiterals += lit + datatypes += n.getLiteralDatatypeURI else simpleLiterals += n.getLiteralLexicalForm else if n.isNodeTriple then @@ -150,6 +154,7 @@ class StatCounterSuite(val size: Long): cPlainLiterals.addUnique(simpleLiterals) cDtLiterals.addUnique(dtLiterals) cLangLiterals.addUnique(langLiterals) + cDatatypes.addUnique(datatypes) cAsciiControlChars.lightAdd(controlCharCount) cQuotedTriples.lightAdd(quotedTripleCount) @@ -165,5 +170,5 @@ class StatCounterSuite(val size: Long): def result: StatCounterSuite.Result = StatCounterSuite.Result(cIris.result, cBlankNodes.result, cLiterals.result, cPlainLiterals.result, - cDtLiterals.result, cLangLiterals.result, cAsciiControlChars.result, cQuotedTriples.result, cSubjects.result, - cPredicates.result, cObjects.result, cGraphs.result, cStatements.result) + cDtLiterals.result, cLangLiterals.result, cDatatypes.result, cAsciiControlChars.result, cQuotedTriples.result, + cSubjects.result, cPredicates.result, cObjects.result, cGraphs.result, cStatements.result)