Skip to content

Commit

Permalink
AJ-1730: sunset BDBag support (#1353)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidangb authored May 16, 2024
1 parent 33ca45b commit 875bbb8
Show file tree
Hide file tree
Showing 11 changed files with 1 addition and 369 deletions.
2 changes: 0 additions & 2 deletions project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ object Dependencies {

"org.scalatest" %% "scalatest" % "3.2.18" % "test",
"org.mock-server" % "mockserver-netty" % "5.15.0" % "test",
// jaxb-api needed by WorkspaceApiServiceSpec.bagitService() method
"javax.xml.bind" % "jaxb-api" % "2.3.1" % "test",
// provides testing mocks
"com.google.cloud" % "google-cloud-nio" % "0.127.17" % "test",
"org.scalatestplus" %% "mockito-4-5" % "3.2.12.0" % "test"
Expand Down
1 change: 0 additions & 1 deletion src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ workspace {
workspacesAclPath="/workspaces/%s/%s/acl"
entitiesPath="/workspaces/%s/%s/entities"
entityQueryPath="/workspaces/%s/%s/entityQuery"
entityBagitMaximumSize=200000000
importEntitiesPath="/workspaces/%s/%s/importEntities"
workspacesEntitiesCopyPath="/workspaces/entities/copy"
submissionsPath="/workspaces/%s/%s/submissions"
Expand Down
48 changes: 0 additions & 48 deletions src/main/resources/swagger/api-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3775,39 +3775,6 @@ paths:
description: Internal Error
content: {}
x-passthrough: false
/api/workspaces/{workspaceNamespace}/{workspaceName}/importBagit:
post:
tags:
- Entities
deprecated: true
summary: Import entity TSVs from a zipped [BagIt](https://tools.ietf.org/html/draft-kunze-bagit-14)
directory, whose payload contains two files - participants.tsv and samples.tsv
operationId: importBagit
parameters:
- $ref: '#/components/parameters/workspaceNamespaceParam'
- $ref: '#/components/parameters/workspaceNameParam'
requestBody:
description: JSON object containing bagit URL
content:
'application/json':
schema:
$ref: '#/components/schemas/BagitRequest'
required: true
responses:
200:
description: Successful Request
content: {}
400:
description: Bad Request
content: {}
404:
description: Source Workspace not found
content: {}
500:
description: Internal Error
content: {}
x-passthrough: false
x-codegen-request-body-name: bagitImportRequest
/api/workspaces/{workspaceNamespace}/{workspaceName}/importEntities:
post:
tags:
Expand Down Expand Up @@ -6415,21 +6382,6 @@ components:
service
format: int32
description: Stuff you need to know about calls
BagitRequest:
required:
- bagitURL
- format
type: object
properties:
bagitURL:
type: string
description: link to publically accessible zipped BagIt directory
format:
type: string
description: the type of the files inside the bagit. Currently this must
be the string "TSV".
enum:
- TSV
ConfigurationIngest:
required:
- inputs
Expand Down
130 changes: 0 additions & 130 deletions src/main/scala/org/broadinstitute/dsde/firecloud/EntityService.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import akka.http.scaladsl.model.StatusCodes._
import akka.http.scaladsl.model.{HttpResponse, StatusCodes}
import com.typesafe.scalalogging.LazyLogging
import org.broadinstitute.dsde.firecloud.EntityService._
import org.broadinstitute.dsde.firecloud.FireCloudConfig.Rawls
import org.broadinstitute.dsde.firecloud.dataaccess.ImportServiceFiletypes.FILETYPE_RAWLS
import org.broadinstitute.dsde.firecloud.dataaccess.{CwdsDAO, GoogleServicesDAO, ImportServiceDAO, RawlsDAO}
import org.broadinstitute.dsde.firecloud.model.ModelJsonProtocol._
Expand All @@ -18,16 +17,9 @@ import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GcsObjectN
import org.databiosphere.workspacedata.client.ApiException
import spray.json.DefaultJsonProtocol._

import java.io.{File, FileNotFoundException, FileOutputStream, InputStream}
import java.net.{HttpURLConnection, URL}
import java.nio.channels.Channels
import java.nio.charset.StandardCharsets
import java.text.SimpleDateFormat
import java.util.concurrent.atomic.AtomicLong
import java.util.zip.{ZipEntry, ZipException, ZipFile}
import scala.concurrent.{ExecutionContext, Future}
import scala.io.Source
import scala.jdk.CollectionConverters._
import scala.language.postfixOps
import scala.util.{Failure, Success, Try}

Expand Down Expand Up @@ -56,50 +48,6 @@ object EntityService {
}
}

//returns (contents of participants.tsv, contents of samples.tsv)
def unzipTSVs(bagName: String, zipFile: ZipFile)(op: (Option[String], Option[String]) => Future[PerRequestMessage]): Future[PerRequestMessage] = {
val zipEntries = zipFile.entries.asScala

val rand = java.util.UUID.randomUUID.toString.take(8)
val participantsTmp = File.createTempFile(s"$rand-participants", ".tsv")
val samplesTmp = File.createTempFile(s"$rand-samples", ".tsv")

val unzippedFiles = zipEntries.foldLeft((None: Option[String], None: Option[String])){ (acc: (Option[String], Option[String]), ent: ZipEntry) =>
if(!ent.isDirectory && (ent.getName.contains("/participants.tsv") || ent.getName.equals("participants.tsv"))) {
acc._1 match {
case Some(_) => throw new FireCloudExceptionWithErrorReport(errorReport = ErrorReport(StatusCodes.BadRequest, s"More than one participants.tsv file found in bagit $bagName"))
case None =>
unzipSingleFile(zipFile.getInputStream(ent), participantsTmp)
(Some(participantsTmp.getPath), acc._2)
}
} else if(!ent.isDirectory && (ent.getName.contains("/samples.tsv") || ent.getName.equals("samples.tsv"))) {
acc._2 match {
case Some(_) => throw new FireCloudExceptionWithErrorReport(errorReport = ErrorReport(StatusCodes.BadRequest, s"More than one samples.tsv file found in bagit $bagName"))
case None =>
unzipSingleFile (zipFile.getInputStream (ent), samplesTmp)
(acc._1, Some (samplesTmp.getPath) )
}
} else {
acc
}
}

try {
op(unzippedFiles._1.map(f => Source.fromFile(f).mkString), unzippedFiles._2.map(f => Source.fromFile(f).mkString))
} catch {
case e: Exception => throw e
} finally {
participantsTmp.delete
samplesTmp.delete
}
}

private def unzipSingleFile(zis: InputStream, fileTarget: File): Unit = {
val fout = new FileOutputStream(fileTarget)
val buffer = new Array[Byte](1024)
LazyList.continually(zis.read(buffer)).takeWhile(_ != -1).foreach(fout.write(buffer, 0, _))
}

}

class EntityService(rawlsDAO: RawlsDAO, importServiceDAO: ImportServiceDAO, cwdsDAO: CwdsDAO, googleServicesDAO: GoogleServicesDAO, modelSchema: ModelSchema)(implicit val executionContext: ExecutionContext)
Expand Down Expand Up @@ -320,84 +268,6 @@ class EntityService(rawlsDAO: RawlsDAO, importServiceDAO: ImportServiceDAO, cwds
}
}

def importBagit(workspaceNamespace: String, workspaceName: String, bagitRq: BagitImportRequest, userInfo: UserInfo): Future[PerRequestMessage] = {
if(bagitRq.format != "TSV") {
Future.successful(RequestCompleteWithErrorReport(StatusCodes.BadRequest, "Invalid format; for now, you must place the string \"TSV\" here"))
} else {

//Java URL handles http, https, ftp, file, and jar protocols.
//We're only OK with https to avoid MITM attacks.
val bagitURL = new URL(bagitRq.bagitURL.replace(" ", "%20"))
val acceptableProtocols = Seq("https") //for when we inevitably change our mind and need to support others
if (!acceptableProtocols.contains(bagitURL.getProtocol)) {
Future.successful(RequestCompleteWithErrorReport(StatusCodes.BadRequest, "Invalid bagitURL protocol: must be https only"))
} else {

val rand = java.util.UUID.randomUUID.toString.take(8)
val bagItFile = File.createTempFile(s"$rand-samples", ".tsv")
var bytesDownloaded = new AtomicLong(-1) // careful, this is a var

try {
val conn = bagitURL.openConnection()
val length = conn.getContentLength
conn.asInstanceOf[HttpURLConnection].disconnect()

if (length == 0) {
Future.successful(RequestCompleteWithErrorReport(StatusCodes.BadRequest, s"BDBag has content-length 0"))
} else if (length > Rawls.entityBagitMaximumSize) {
Future.successful(RequestCompleteWithErrorReport(StatusCodes.BadRequest, s"BDBag size is too large."))
} else {
// download the file
val readFromBagit = Channels.newChannel(bagitURL.openStream())
val writeToTemp = new FileOutputStream(bagItFile)
try {
bytesDownloaded.set(writeToTemp.getChannel.transferFrom(readFromBagit, 0, length))
} finally {
readFromBagit.close()
writeToTemp.close()
}

val zipFile = new ZipFile(bagItFile.getAbsolutePath)
if (!zipFile.entries().hasMoreElements) {
Future(RequestCompleteWithErrorReport(StatusCodes.BadRequest, s"BDBag has no entries."))
} else {
//make two big strings containing the participants and samples TSVs
//if i could turn back time this would use streams to save memory, but hopefully this will all go away when entity service comes along
unzipTSVs(bagitRq.bagitURL, zipFile) { (participantsStr, samplesStr) =>
(participantsStr, samplesStr) match {
case (None, None) =>
Future.successful(RequestCompleteWithErrorReport(StatusCodes.BadRequest, "You must have either (or both) participants.tsv and samples.tsv in the zip file"))
case _ =>
for {
// This should vomit back errors from rawls.
participantResult <- participantsStr.map(ps => importEntitiesFromTSV(workspaceNamespace, workspaceName, ps, userInfo)).getOrElse(Future.successful(RequestComplete(OK)))
sampleResult <- samplesStr.map(ss => importEntitiesFromTSV(workspaceNamespace, workspaceName, ss, userInfo)).getOrElse(Future.successful(RequestComplete(OK)))
} yield {
participantResult match {
case RequestComplete((OK, _)) => sampleResult
case _ => participantResult
}
}
}
}
}
}
} catch {
case _:FileNotFoundException =>
Future.successful(RequestCompleteWithErrorReport(StatusCodes.NotFound, s"BDBag ${bagitRq.bagitURL} was not found."))
case ze:ZipException =>
logger.info(s"ZipException: ${ze.getMessage} - ${bagItFile.getAbsolutePath} has length ${bagItFile.length}. " +
s"We originally downloaded $bytesDownloaded bytes.")
Future.successful(RequestCompleteWithErrorReport(StatusCodes.BadRequest, s"Problem with BDBag: ${ze.getMessage}"))
case e: Exception =>
throw e
} finally {
bagItFile.delete()
}
}
}
}

def importJob(workspaceNamespace: String, workspaceName: String, importRequest: AsyncImportRequest, userInfo: UserInfo): Future[PerRequestMessage] = {
// validate that filetype exists in the importRequest
if (importRequest.filetype.isEmpty)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ object FireCloudConfig {
val workspacesUrl = authUrl + workspacesPath
val entitiesPath = workspace.getString("entitiesPath")
val entityQueryPath = workspace.getString("entityQueryPath")
val entityBagitMaximumSize = workspace.getInt("entityBagitMaximumSize")
val workspacesEntitiesCopyPath = workspace.getString("workspacesEntitiesCopyPath")
def workspacesEntitiesCopyUrl(linkExistingEntities: Boolean) = authUrl + workspacesEntitiesCopyPath + "?linkExistingEntities=%s".format(linkExistingEntities)
val submissionsCountPath = workspace.getString("submissionsCountPath")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,6 @@ object ModelJsonProtocol extends WorkspaceJsonSupport with SprayJsonSupport {
implicit val impCurator: RootJsonFormat[Curator] = jsonFormat1(Curator)
implicit val impUserImportPermission: RootJsonFormat[UserImportPermission] = jsonFormat2(UserImportPermission)

implicit val impBagitImportRequest: RootJsonFormat[BagitImportRequest] = jsonFormat2(BagitImportRequest)
implicit val impPFBImportRequest: RootJsonFormat[PFBImportRequest] = jsonFormat1(PFBImportRequest)
implicit val impOptions: RootJsonFormat[ImportOptions] = jsonFormat2(ImportOptions)
implicit val impAsyncImportRequest: RootJsonFormat[AsyncImportRequest] = jsonFormat3(AsyncImportRequest)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ case class EntityCopyWithoutDestinationDefinition(

case class EntityId(entityType: String, entityName: String)

case class BagitImportRequest(bagitURL: String, format: String)

// legacy class specific to PFB import; prefer AsyncImportRequest instead
case class PFBImportRequest(url: String)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package org.broadinstitute.dsde.firecloud.webservice

import akka.http.scaladsl.model.StatusCodes.{Accepted, OK}
import akka.http.scaladsl.model.StatusCodes.OK

import java.text.SimpleDateFormat
import akka.http.scaladsl.model.Uri.Query
Expand All @@ -22,7 +22,6 @@ import org.slf4j.{Logger, LoggerFactory}
import spray.json.DefaultJsonProtocol._

import scala.concurrent.ExecutionContext
import scala.util.Try

trait WorkspaceApiService extends FireCloudRequestBuilding with FireCloudDirectives with StandardUserInfoDirectives {

Expand Down Expand Up @@ -146,15 +145,6 @@ trait WorkspaceApiService extends FireCloudRequestBuilding with FireCloudDirecti
}
}
} ~
path("importBagit"){
post {
requireUserInfo() { userInfo =>
entity(as[BagitImportRequest]) { bagitRq =>
complete { entityServiceConstructor(FirecloudModelSchema).importBagit(workspaceNamespace, workspaceName, bagitRq, userInfo) }
}
}
}
} ~
// POST importPFB will likely be deprecated in the future; use POST importJob instead
path("importPFB") {
post {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import org.scalatest.concurrent.ScalaFutures
import org.scalatestplus.mockito.MockitoSugar.{mock => mockito}

import java.util.UUID
import java.util.zip.ZipFile
import scala.concurrent.ExecutionContext.Implicits._
import scala.concurrent.Future
import scala.reflect.classTag
Expand All @@ -42,84 +41,6 @@ class EntityServiceSpec extends BaseServiceSpec with BeforeAndAfterEach {

private def dummyUserInfo(tokenStr: String) = UserInfo("dummy", OAuth2BearerToken(tokenStr), -1, "dummy")

"EntityClient should extract TSV files out of bagit zips" - {
"with neither participants nor samples in the zip" in {
val zip = new ZipFile("src/test/resources/testfiles/bagit/nothingbag.zip")
EntityService.unzipTSVs("nothingbag", zip) { (participants, samples) =>
participants shouldBe None
samples shouldBe None
Future.successful(RequestComplete(StatusCodes.OK))
}
}

"with both participants and samples in a zip with a flat file structure" in {
val zip = new ZipFile("src/test/resources/testfiles/bagit/flat_testbag.zip")
EntityService.unzipTSVs("flat_testbag", zip) { (participants, samples) =>
participants.map(_.stripLineEnd) shouldEqual Some("imagine this is a participants.tsv in a flat file structure")
samples.map(_.stripLineEnd) shouldEqual Some("imagine this is a samples.tsv in a flat file structure")
Future.successful(RequestComplete(StatusCodes.OK))
}
}

"with only participants in a zip with a flat file structure" in {
val zip = new ZipFile("src/test/resources/testfiles/bagit/participants_only_flat_testbag.zip")
EntityService.unzipTSVs("participants_only_flat_testbag", zip) { (participants, samples) =>
participants.map(_.stripLineEnd) shouldEqual Some("imagine this is a participants.tsv all alone in a flat file structure")
samples.map(_.stripLineEnd) shouldEqual None
Future.successful(RequestComplete(StatusCodes.OK))
}
}

"with only samples in a zip with a flat file structure" in {
val zip = new ZipFile("src/test/resources/testfiles/bagit/samples_only_flat_testbag.zip")
EntityService.unzipTSVs("samples_only_flat_testbag", zip) { (participants, samples) =>
participants.map(_.stripLineEnd) shouldEqual None
samples.map(_.stripLineEnd) shouldEqual Some("imagine this is a samples.tsv all alone in a flat file structure")
Future.successful(RequestComplete(StatusCodes.OK))
}
}

"with both participants and samples in a zip with a nested file structure" in {
val zip = new ZipFile("src/test/resources/testfiles/bagit/nested_testbag.zip")
EntityService.unzipTSVs("nested_testbag", zip) { (participants, samples) =>
participants.map(_.stripLineEnd) shouldEqual Some("imagine this is a participants.tsv in a nested file structure")
samples.map(_.stripLineEnd) shouldEqual Some("imagine this is a samples.tsv in a nested file structure")
Future.successful(RequestComplete(StatusCodes.OK))
}
}

"with both participants and samples and an extra tsv file in a zip with a nested file structure" in {
val zip = new ZipFile("src/test/resources/testfiles/bagit/extra_file_nested_testbag.zip")
EntityService.unzipTSVs("extra_file_nested_testbag", zip) { (participants, samples) =>
participants.map(_.stripLineEnd) shouldEqual Some("imagine this is a participants.tsv in a nested file structure")
samples.map(_.stripLineEnd) shouldEqual Some("imagine this is a samples.tsv in a nested file structure")
Future.successful(RequestComplete(StatusCodes.OK))
}
}

"with multiple participants in a zip with a flat file structure" in {
val zip = new ZipFile("src/test/resources/testfiles/bagit/duplicate_participants_nested_testbag.zip")

val ex = intercept[FireCloudExceptionWithErrorReport] {
EntityService.unzipTSVs("duplicate_participants_nested_testbag", zip) { (_, _) =>
Future.successful(RequestComplete(StatusCodes.OK))
}
}
ex.errorReport.message shouldEqual "More than one participants.tsv file found in bagit duplicate_participants_nested_testbag"
}

"with multiple samples in a zip with a flat file structure" in {
val zip = new ZipFile("src/test/resources/testfiles/bagit/duplicate_samples_nested_testbag.zip")

val ex = intercept[FireCloudExceptionWithErrorReport] {
EntityService.unzipTSVs("duplicate_samples_nested_testbag", zip) { (_, _) =>
Future.successful(RequestComplete(StatusCodes.OK))
}
}
ex.errorReport.message shouldEqual "More than one samples.tsv file found in bagit duplicate_samples_nested_testbag"
}
}

"EntityService.importEntitiesFromTSV()" - {
val tsvParticipants = FileUtils.readAllTextFromResource("testfiles/tsv/ADD_PARTICIPANTS.txt")
val tsvMembership = FileUtils.readAllTextFromResource("testfiles/tsv/MEMBERSHIP_SAMPLE_SET.tsv")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ object MockUtils {
val ontologyServerPort = 8993
val samServerPort = 8994
val cromiamServerPort = 8995
val bagitServerPort = 9393
val importServiceServerPort = 9394

def randomPositiveInt(): Int = {
Expand Down
Loading

0 comments on commit 875bbb8

Please sign in to comment.