Skip to content

Commit

Permalink
CORE-207: optimize stringToTypedAttribute in TSV upload (#1519)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidangb authored Dec 18, 2024
1 parent be0aeb5 commit 36b8583
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 26 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package org.broadinstitute.dsde.firecloud.utils

import org.broadinstitute.dsde.firecloud.model.{EntityUpdateDefinition, FlexibleModelSchema, ModelSchema}
import org.broadinstitute.dsde.firecloud.service.TSVFileSupport
import org.broadinstitute.dsde.firecloud.utils.TsvFileSupportBenchmark.TsvData
import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
import org.openjdk.jmh.infra.Blackhole

object TsvFileSupportBenchmark {
@State(Scope.Thread)
class TsvData {
val entityType: String = "sample"
val memberTypeOpt: Option[String] = None
// representation of the inbound TSV row
val row: Seq[String] = Seq(
"0005",
"foo",
"\"some\tquoted\tvalue\"",
"42",
"true",
"-123.456",
"gs://some-bucket/somefile.ext",
"""{"entityType":"targetType", "entityName":"targetName"}""",
"[1,2,3,4,5]"
)
val colInfo: Seq[(String, Option[String])] = Seq(
("sample_id", None),
("string", None),
("quotedstring", None),
("int", None),
("boolean", None),
("double", None),
("file", None),
("reference", None),
("array", None)
)
val modelSchema: ModelSchema = FlexibleModelSchema
val deleteEmptyValues: Boolean = false
}
}

class TsvFileSupportBenchmark {

@Benchmark
def makeEntityRows(blackHole: Blackhole, tsvData: TsvData): EntityUpdateDefinition = {

val result: EntityUpdateDefinition = TsvFileSupportHarness.setAttributesOnEntity(tsvData.entityType,
tsvData.memberTypeOpt,
tsvData.row,
tsvData.colInfo,
tsvData.modelSchema,
tsvData.deleteEmptyValues
)
blackHole.consume(result)
result
}

}

// helper object to get access to TSVFileSupport trait
object TsvFileSupportHarness extends TSVFileSupport {}
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ import scala.concurrent.{ExecutionContext, Future}
import scala.util.{Failure, Success, Try}

/**
* The different types of tsv import/export formats
*/
* The different types of tsv import/export formats
*/
object TsvTypes {
sealed trait TsvType
case object ENTITY extends TsvType {
Expand Down Expand Up @@ -179,31 +179,64 @@ trait TSVFileSupport {
Creates an AttributeValue whose implementation is more closely tied to the value of the input.
*/
def stringToTypedAttribute(value: String): Attribute =
Try(java.lang.Integer.parseInt(value)) match {
case Success(intValue) => AttributeNumber(intValue)
case Failure(_) =>
Try(java.lang.Double.parseDouble(value)) match {
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN,
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through
// to AttributeString.
case Success(doubleValue)
if !Double.NegativeInfinity.equals(doubleValue)
&& !Double.PositiveInfinity.equals(doubleValue)
&& !Double.NaN.equals(doubleValue)
&& !matchesLiteral(value) =>
AttributeNumber(doubleValue)
case _ =>
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match {
case Success(booleanValue) => AttributeBoolean(booleanValue)
case Failure(_) =>
Try(value.parseJson.convertTo[AttributeEntityReference]) match {
case Success(ref) => ref
case Failure(_) => AttributeString(value)
}
}
}
if (value.length > 1 && value.startsWith("\"") && value.endsWith("\"")) {
// if this value starts and ends with a quote, it should always be treated as a string
AttributeString(value.substring(1, value.length - 1))
} else {
// else, inspect the value to find an appropriate datatype
toAttribute(value)
}

// the `toAttribute` method checks the first non-whitespace character of inbound TSV cells.
private val possibleNumbers = "0123456789+-." // possible first characters for a number
private val possibleBooleans = "tfTF" // possible first characters for a boolean
private val possibleReferences = "{" // possible first characters for an entity reference
private val possibleNonStrings = possibleNumbers + possibleBooleans + possibleReferences

private def toAttribute(value: String): Attribute = {
val trimmed = value.trim
// empty string
if (trimmed.isEmpty) {
AttributeString(value)
} else {
// find the first character of the inbound cell
val firstChar = trimmed.charAt(0)

// using the first character, try to convert the cell to a number, boolean, or reference
val maybeNonString: Option[Attribute] = firstChar match {
// first char doesn't match any known starters; this is a string
case _ if !possibleNonStrings.contains(firstChar) => Some(AttributeString(value))
// could this be a number?
case _ if possibleNumbers.contains(firstChar) =>
Try(java.lang.Integer.parseInt(value)) match {
case Success(intValue) => Some(AttributeNumber(intValue))
case Failure(_) =>
Try(java.lang.Double.parseDouble(value)) match {
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN,
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through
// to AttributeString.
case Success(doubleValue)
if !Double.NegativeInfinity.equals(doubleValue)
&& !Double.PositiveInfinity.equals(doubleValue)
&& !Double.NaN.equals(doubleValue)
&& !matchesLiteral(value) =>
Some(AttributeNumber(doubleValue))
case _ => None
}
}
// could this be a boolean?
case _ if possibleBooleans.contains(firstChar) =>
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")).toOption.map(AttributeBoolean)
// could this be a boolean?
case _ if possibleReferences.contains(firstChar) =>
Try(value.parseJson.convertTo[AttributeEntityReference]).toOption
}

// if we didn't find one of the other attributes, it's a string.
maybeNonString.getOrElse(AttributeString(value))
}
}

def checkForJson(value: String): Attribute =
Try(value.parseJson) match {
case Success(_: JsObject) => AttributeValueRawJson(value)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ class TSVFileSupportSpec extends AnyFreeSpec with TSVFileSupport {
Double.MinPositiveValue.toString -> AttributeNumber(Double.MinPositiveValue),
Double.MaxValue.toString -> AttributeNumber(Double.MaxValue)
)
val stringTestCases = List("", "string", "true525600", ",")
val stringTestCases = List("", "string", "true525600", ",", "\"")
val referenceTestCases = Map(
"""{"entityType":"targetType","entityName":"targetName"}""" -> AttributeEntityReference("targetType",
"targetName"
Expand Down

0 comments on commit 36b8583

Please sign in to comment.