From ad3ccdf41afa081d8abd6372f343551cda83384c Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 20 Jan 2024 07:11:22 +1300 Subject: [PATCH 01/26] Correct the dates of recent releases (#14161) --- hail/python/hail/docs/change_log.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/hail/python/hail/docs/change_log.md b/hail/python/hail/docs/change_log.md index c0f232b4637..05479d4eda2 100644 --- a/hail/python/hail/docs/change_log.md +++ b/hail/python/hail/docs/change_log.md @@ -55,7 +55,7 @@ critically depend on experimental functionality.** ## Version 0.2.127 -Released 2023-12-08 +Released 2024-01-12 If you have an Apple M1 laptop, verify that @@ -162,7 +162,7 @@ Released 2023-09-21 ## Version 0.2.123 -Released 2023-09-18 +Released 2023-09-19 ### New Features @@ -189,7 +189,7 @@ Released 2023-09-07 ## Version 0.2.121 -Released 2023-08-31 +Released 2023-09-06 ### New Features @@ -250,7 +250,7 @@ Released 2023-08-31 ## Version 0.2.120 -Released 2023-07-20 +Released 2023-07-27 ### New Features - (hail#13206) The VDS Combiner now works in Query-on-Batch. @@ -300,7 +300,7 @@ Released 2023-06-28 ## Version 0.2.118 -Released 2023-05-30 +Released 2023-06-13 ### New Features @@ -318,7 +318,7 @@ Released 2023-05-30 ## Version 0.2.117 -Released 2023-05-19 +Released 2023-05-22 ### New Features From 2bfa530eafdaf0955108050bb79a8d93963b96c0 Mon Sep 17 00:00:00 2001 From: Dan King Date: Mon, 22 Jan 2024 10:40:38 -0500 Subject: [PATCH 02/26] [hailtop] remove dead code in test (#14181) AFAICT, this does a copy that has no effect. We blow away dest_dir so we can't possibly verify that this copy was correct. Unless there's some side effect that I'm misunderstanding? --- hail/python/test/hailtop/inter_cloud/test_copy.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hail/python/test/hailtop/inter_cloud/test_copy.py b/hail/python/test/hailtop/inter_cloud/test_copy.py index 27c42f9eb87..f9c15e85669 100644 --- a/hail/python/test/hailtop/inter_cloud/test_copy.py +++ b/hail/python/test/hailtop/inter_cloud/test_copy.py @@ -472,10 +472,6 @@ async def test_file_and_directory_error_with_slash_empty_file( for transfer_type in (Transfer.DEST_IS_TARGET, Transfer.DEST_DIR, Transfer.INFER_DEST): dest_base = await fresh_dir(fs, bases, cloud_scheme) - await Copier.copy(fs, sema, Transfer(f'{src_base}', dest_base.rstrip('/'), treat_dest_as=transfer_type)) - - dest_base = await fresh_dir(fs, bases, cloud_scheme) - await Copier.copy(fs, sema, Transfer(f'{src_base}empty/', dest_base.rstrip('/'), treat_dest_as=transfer_type)) await collect_files(await fs.listfiles(f'{dest_base}')) From ae7b87ffd683fab3bc00a8e66388e51f93b2d42f Mon Sep 17 00:00:00 2001 From: jigold Date: Mon, 22 Jan 2024 11:43:38 -0500 Subject: [PATCH 03/26] [hailtop.utils] Add address not available as a retryable error (#14185) --- hail/python/hailtop/utils/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hail/python/hailtop/utils/utils.py b/hail/python/hailtop/utils/utils.py index 583c32afc3d..78a3abd7c9a 100644 --- a/hail/python/hailtop/utils/utils.py +++ b/hail/python/hailtop/utils/utils.py @@ -532,6 +532,7 @@ async def bounded_gather2( RETRYABLE_ERRNOS = { # these should match (where an equivalent exists) nettyRetryableErrorNumbers in # is/hail/services/package.scala + errno.EADDRNOTAVAIL, errno.ETIMEDOUT, errno.ECONNREFUSED, errno.EHOSTUNREACH, From 28582597ca1e93c7e7fb3da2415b27e8de7fdea4 Mon Sep 17 00:00:00 2001 From: Daniel Goldstein Date: Mon, 22 Jan 2024 12:21:33 -0500 Subject: [PATCH 04/26] [batch] Add 
json parsing and severity to GCP Ops Agent config (#14187) Currently the Ops Agent does not do any parsing of the log message, so the log entry in Google Logging looks like: ``` jsonPayload: { message: "{"severity":"INFO","levelname":"INFO","asctime":"2024-01-22 16:10:45,748","filename":"worker.py","funcNameAndLine":":3461","message":"closed","hail_log":1}" } ``` The `parse_json` processor extracts the json fields from the message into fields on the `jsonPayload` so it looks like this ``` jsonPayload: { asctime: "2024-01-22 16:14:06,098" filename: "worker.py" funcNameAndLine: ":180" hail_log: 1 levelname: "INFO" message: "CLOUD gcp" } ``` and only the new `message` field is displayed in the Google Logging row instead of the whole json. This also adds a `severity` field on the log entry so filters such as `SEVERITY!=INFO` work as expected. --- batch/batch/cloud/gcp/driver/create_instance.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/batch/batch/cloud/gcp/driver/create_instance.py b/batch/batch/cloud/gcp/driver/create_instance.py index d800090356f..57d3000b286 100644 --- a/batch/batch/cloud/gcp/driver/create_instance.py +++ b/batch/batch/cloud/gcp/driver/create_instance.py @@ -230,6 +230,8 @@ def scheduling() -> dict: - /batch/jvm-container-logs/jvm-*.log record_log_file_path: true processors: + parse_message: + type: parse_json labels: type: modify_fields fields: @@ -237,11 +239,13 @@ def scheduling() -> dict: static_value: $NAMESPACE labels.instance_id: static_value: $INSTANCE_ID + severity: + move_from: jsonPayload.severity service: log_level: error pipelines: default_pipeline: - processors: [labels] + processors: [parse_message, labels] receivers: [runlog, workerlog, jvmlog] metrics: From 8ae336f6811f4d4435666b0c616693ba37cf2c71 Mon Sep 17 00:00:00 2001 From: Edmund Higham Date: Mon, 22 Jan 2024 13:12:07 -0500 Subject: [PATCH 05/26] [compiler] Emit `Let` Bindings Iteratively (#14163) Previously `Emit(?:Stream)?$` would emit let bindings recursively, regardless of if that binding was used. If a stream is not used, `Emit(?:Stream)?$` would define its missing labels, making emission recursive. This can lead to stack overflows for large numbers of let-bindings (and does so for the benchmark benchmark `matrix-multi-write-nothing`). By not emitting unused streams, we can make let-binding emission iterative. 
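For reference, a condensed sketch of the iterative approach (not the full change; names and signatures follow the `emitLet` helper and `memoizeMaybeStreamValue` added in the diff below, with the use-set passed in explicitly and stream-label cleanup omitted): the bindings are folded left-to-right, and a binding whose name is never referenced is simply skipped, so unused streams are never emitted and no recursion is needed.

```scala
// Sketch: fold over the let bindings instead of recursing per binding.
def emitLet[A](
  emitI: (IR, EmitCodeBuilder, EmitEnv) => IEmitCode,
  emitBody: (IR, EmitCodeBuilder, EmitEnv) => A,
)(let: Let, cb: EmitCodeBuilder, env: EmitEnv, uses: Set[String]): A =
  emitBody(
    let.body,
    cb,
    let.bindings.foldLeft(env) { case (newEnv, (name, ir)) =>
      if (!uses.contains(name)) newEnv // unused binding: emit nothing at all
      else {
        val value = emitI(ir, cb, newEnv)
        newEnv.bind(name, cb.memoizeMaybeStreamValue(value, s"let_$name"))
      }
    },
  )
```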
--- .../src/main/scala/is/hail/expr/ir/Emit.scala | 88 ++++++++++++------- .../is/hail/expr/ir/EmitCodeBuilder.scala | 31 ++++--- hail/src/main/scala/is/hail/expr/ir/Env.scala | 2 +- .../is/hail/expr/ir/streams/EmitStream.scala | 20 ++--- 4 files changed, 81 insertions(+), 60 deletions(-) diff --git a/hail/src/main/scala/is/hail/expr/ir/Emit.scala b/hail/src/main/scala/is/hail/expr/ir/Emit.scala index 6397686596c..2061e685536 100644 --- a/hail/src/main/scala/is/hail/expr/ir/Emit.scala +++ b/hail/src/main/scala/is/hail/expr/ir/Emit.scala @@ -88,6 +88,7 @@ case class EmitEnv(bindings: Env[EmitValue], inputValues: IndexedSeq[EmitValue]) } (paramTypes, params, recreateFromMB) } + } object Emit { @@ -675,11 +676,7 @@ abstract class EstimableEmitter[C] { def estimatedSize: Int } -class Emit[C]( - val ctx: EmitContext, - val cb: EmitClassBuilder[C], -) { - emitSelf => +class Emit[C](val ctx: EmitContext, val cb: EmitClassBuilder[C]) { val methods: mutable.Map[(String, Seq[Type], Seq[SType], SType), EmitMethodBuilder[C]] = mutable.Map() @@ -801,6 +798,7 @@ class Emit[C]( def emitI( ir: IR, + cb: EmitCodeBuilder = cb, region: Value[Region] = region, env: EmitEnv = env, container: Option[AggContainer] = container, @@ -840,19 +838,17 @@ class Emit[C]( emitI(cond).consume(cb, {}, m => cb.if_(m.asBoolean.value, emitVoid(cnsq), emitVoid(altr))) - case Let(bindings, body) => - def go(env: EmitEnv): IndexedSeq[(String, IR)] => Unit = { - case (name, value) +: rest => - val xVal = - if (value.typ.isInstanceOf[TStream]) emitStream(value, region, env = env) - else emit(value, env = env) - - cb.withScopedMaybeStreamValue(xVal, s"let_$name")(ev => go(env.bind(name, ev))(rest)) - case Seq() => - emitVoid(body, env = env) - } - - go(env)(bindings) + case let: Let => + emitLet( + emitI = (ir, cb, env) => + if (ir.typ.isInstanceOf[TStream]) emitStream(ir, region, env = env).toI(cb) + else emitI(ir, cb = cb, env = env), + emitBody = (ir, cb, env) => emitVoid(ir, cb, env = env), + )( + let, + cb, + env, + ) case StreamFor(a, valueName, body) => emitStream(a, region).toI(cb).consume( @@ -1448,7 +1444,7 @@ class Emit[C]( sorter.sort( cb, region, - makeDependentSortingFunction(cb, sct, lessThan, env, emitSelf, Array(left, right)), + makeDependentSortingFunction(cb, sct, lessThan, env, this, Array(left, right)), ) sorter.toRegion(cb, x.typ) } @@ -3559,22 +3555,18 @@ class Emit[C]( val result: EmitCode = (ir: @unchecked) match { - case Let(bindings, body) => + case let: Let => EmitCode.fromI(mb) { cb => - def go(env: EmitEnv): IndexedSeq[(String, IR)] => IEmitCode = { - case (name, value) +: rest => - val xVal = - if (value.typ.isInstanceOf[TStream]) emitStream(value, region, env = env) - else emit(value, env = env) - - cb.withScopedMaybeStreamValue(xVal, s"let_$name") { ev => - go(env.bind(name, ev))(rest) - } - case Seq() => - emitI(body, cb, env = env) - } - - go(env)(bindings) + emitLet( + emitI = (ir, cb, env) => + if (ir.typ.isInstanceOf[TStream]) emitStream(ir, region, env = env).toI(cb) + else emitI(ir, cb = cb, env = env), + emitBody = (ir, cb, env) => emitI(ir, cb, env = env), + )( + let, + cb, + env, + ) } case Ref(name, t) => @@ -3701,6 +3693,34 @@ class Emit[C]( (cb: EmitCodeBuilder, region: Value[Region], l: Value[_], r: Value[_]) => cb.memoize(cb.invokeCode[Boolean](sort, cb.this_, region, l, r)) } + + def emitLet[A]( + emitI: (IR, EmitCodeBuilder, EmitEnv) => IEmitCode, + emitBody: (IR, EmitCodeBuilder, EmitEnv) => A, + )( + let: Let, + cb: EmitCodeBuilder, + env: EmitEnv, + ): A = { + val uses: 
mutable.Set[String] = + ctx.usesAndDefs.uses.get(let) match { + case Some(refs) => refs.map(_.t.name) + case None => mutable.Set.empty + } + + emitBody( + let.body, + cb, + let.bindings.foldLeft(env) { case (newEnv, (name, ir)) => + if (!uses.contains(name)) newEnv + else { + val value = emitI(ir, cb, newEnv) + val memo = cb.memoizeMaybeStreamValue(value, s"let_$name") + newEnv.bind(name, memo) + } + }, + ) + } } object NDArrayEmitter { diff --git a/hail/src/main/scala/is/hail/expr/ir/EmitCodeBuilder.scala b/hail/src/main/scala/is/hail/expr/ir/EmitCodeBuilder.scala index 0d9a1f726ab..b386f374de5 100644 --- a/hail/src/main/scala/is/hail/expr/ir/EmitCodeBuilder.scala +++ b/hail/src/main/scala/is/hail/expr/ir/EmitCodeBuilder.scala @@ -160,24 +160,27 @@ class EmitCodeBuilder(val emb: EmitMethodBuilder[_], var code: Code[Unit]) exten } def withScopedMaybeStreamValue[T](ec: EmitCode, name: String)(f: EmitValue => T): T = { - if (ec.st.isRealizable) { - f(memoizeField(ec, name)) - } else { - assert(ec.st.isInstanceOf[SStream]) - val ev = if (ec.required) - EmitValue(None, ec.toI(this).get(this, "")) + val ev = memoizeMaybeStreamValue(ec.toI(this), name) + val res = f(ev) + ec.pv match { + case ss: SStreamValue => + ss.defineUnusedLabels(emb) + case _ => + } + res + } + + def memoizeMaybeStreamValue(iec: IEmitCode, name: String): EmitValue = + if (iec.st.isRealizable) memoizeField(iec, name) + else { + assert(iec.st.isInstanceOf[SStream]) + if (iec.required) EmitValue(None, iec.get(this, "")) else { val m = emb.genFieldThisRef[Boolean](name + "_missing") - ec.toI(this).consume(this, assign(m, true), _ => assign(m, false)) - EmitValue(Some(m), ec.pv) - } - val res = f(ev) - ec.pv match { - case ss: SStreamValue => ss.defineUnusedLabels(emb) + iec.consume(this, assign(m, true), _ => assign(m, false)) + EmitValue(Some(m), iec.value) } - res } - } def memoizeField(v: IEmitCode, name: String): EmitValue = { require(v.st.isRealizable) diff --git a/hail/src/main/scala/is/hail/expr/ir/Env.scala b/hail/src/main/scala/is/hail/expr/ir/Env.scala index bd2a40384cc..8a6783ec9c1 100644 --- a/hail/src/main/scala/is/hail/expr/ir/Env.scala +++ b/hail/src/main/scala/is/hail/expr/ir/Env.scala @@ -150,7 +150,7 @@ class Env[V] private (val m: Map[Env.K, V]) { def apply(name: String): V = m(name) def lookup(name: String): V = - m.get(name).getOrElse(throw new RuntimeException(s"Cannot find $name in $m")) + m.getOrElse(name, throw new RuntimeException(s"Cannot find $name in $m")) def lookupOption(name: String): Option[V] = m.get(name) diff --git a/hail/src/main/scala/is/hail/expr/ir/streams/EmitStream.scala b/hail/src/main/scala/is/hail/expr/ir/streams/EmitStream.scala index e64f790bda5..710584f2503 100644 --- a/hail/src/main/scala/is/hail/expr/ir/streams/EmitStream.scala +++ b/hail/src/main/scala/is/hail/expr/ir/streams/EmitStream.scala @@ -364,17 +364,15 @@ object EmitStream { SStreamValue(producer) } - case Let(bindings, body) => - def go(env: EmitEnv): IndexedSeq[(String, IR)] => IEmitCode = { - case (name, value) +: rest => - cb.withScopedMaybeStreamValue( - EmitCode.fromI(cb.emb)(cb => emit(value, cb, env = env)), - s"let_$name", - )(ev => go(env.bind(name, ev))(rest)) - case Seq() => - produce(body, cb, env = env) - } - go(env)(bindings) + case let: Let => + emitter.emitLet( + emitI = (ir, cb, env) => emit(ir, cb, env = env), + emitBody = (ir, cb, env) => produce(ir, cb, env = env), + )( + let, + cb, + env, + ) case In(n, _) => // this, Code[Region], ... 
From 42a072ec1aeb662a8dbc02d8c3b78301efa071a1 Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 23 Jan 2024 12:16:49 -0500 Subject: [PATCH 06/26] [prometheus] 90 day retention (#14194) Open question: we're using ~20GiB on /prometheus for 15d. We request 150GiB (and get closer to 146GiB). Should we increase the storage to give ourselves more slack? Assuming linear scaling, 90d would use 120GiB (26GiB of slack). https://hail.zulipchat.com/#narrow/stream/300487-Hail-Batch-Dev/topic/Grafana.20retention.20period ``` /prometheus $ df -h Filesystem Size Used Available Use% Mounted on overlay 94.3G 28.9G 65.3G 31% / tmpfs 64.0M 0 64.0M 0% /dev tmpfs 3.6G 0 3.6G 0% /sys/fs/cgroup /dev/sdf 146.6G 18.9G 127.6G 13% /prometheus /dev/sda1 94.3G 28.9G 65.3G 31% /etc/prometheus /dev/sda1 94.3G 28.9G 65.3G 31% /etc/hosts /dev/sda1 94.3G 28.9G 65.3G 31% /dev/termination-log /dev/sda1 94.3G 28.9G 65.3G 31% /etc/hostname /dev/sda1 94.3G 28.9G 65.3G 31% /etc/resolv.conf shm 64.0M 4.0K 64.0M 0% /dev/shm tmpfs 5.5G 12.0K 5.5G 0% /var/run/secrets/kubernetes.io/serviceaccount tmpfs 3.6G 0 3.6G 0% /proc/acpi tmpfs 64.0M 0 64.0M 0% /proc/kcore tmpfs 64.0M 0 64.0M 0% /proc/keys tmpfs 64.0M 0 64.0M 0% /proc/timer_list tmpfs 3.6G 0 3.6G 0% /proc/scsi tmpfs 3.6G 0 3.6G 0% /sys/firmware ``` --- prometheus/prometheus.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prometheus/prometheus.yaml b/prometheus/prometheus.yaml index ede6863d7d7..a8150f07c01 100644 --- a/prometheus/prometheus.yaml +++ b/prometheus/prometheus.yaml @@ -296,7 +296,7 @@ spec: - "/bin/prometheus" - "--config.file=/etc/prometheus/prometheus.yml" - "--storage.tsdb.path=/prometheus" - - "--storage.tsdb.retention.time=15d" + - "--storage.tsdb.retention.time=90d" - "--web.console.libraries=/usr/share/prometheus/console_libraries" - "--web.console.templates=/usr/share/prometheus/consoles" - "--web.enable-lifecycle" From 0411c8937cba4f721ea642397ca9abce4eb493eb Mon Sep 17 00:00:00 2001 From: Daniel Goldstein Date: Wed, 24 Jan 2024 13:28:28 -0500 Subject: [PATCH 07/26] [gear] Make csrf cookie samesite=strict (#14180) Currently, the `_csrf` cookie is made available to all subdomains of `.hail.is`. This means that if I first visit `batch.hail.is` I get a `_csrf` cookie set for `.hail.is`. That cookie is then reused if I visit `ci.hail.is`. Even more awkward, the same value of the cookie will get reused if I then visit `batch.azure.hail.is`. This isn't that big of a deal, these can all be considered part of the same application that the hail team delivers and secures, but it is very little work to set stricter bounds on where this cookie is sent. By removing the `domain` attribute and using `samesite='strict'`, the cookie's domain will be set by the browser to the domain of the request whose response included the `Set-Cookie` header, e.g. `batch.hail.is` or `internal.hail.is`. `Strict` mode then ensures that the cookie will only be sent to that exact domain, meaning that each application is guaranteed to receive the `_csrf` token that it itself delivered, and a `_csrf` token from CI cannot be used to take actions against Batch. This should not have an adverse impact on existing users' browser sessions. 
In `render_template` we preserve the value of an existing `_csrf` cookie so this change should do the following: - Logged in user visits a page with an existing widely scoped (`.hail.is`) `_csrf` cookie - The server returns a `Set-Cookie` header with a new `_csrf` cookie for strictly the `batch.hail.is` domain but with the same token value as the original `_csrf` cookie - The user now has two cookies and the browser could send either one on a given request, but it does not matter because they have the same value - If the user logs out and back in, their old widely scoped cookie will be cleared and they only get the strict cookie from now on. --- devbin/dev_proxy.py | 2 +- web_common/web_common/web_common.py | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/devbin/dev_proxy.py b/devbin/dev_proxy.py index a277f7f4d34..268a2d33afb 100644 --- a/devbin/dev_proxy.py +++ b/devbin/dev_proxy.py @@ -48,7 +48,7 @@ async def render_html(request: web.Request, context: dict): # Make links point back to the local dev server and not use # the dev namespace path rewrite shenanigans. context['page_context']['base_path'] = '' - return await render_template(SERVICE, request, **context, cookie_domain='localhost:8000') + return await render_template(SERVICE, request, **context) async def on_startup(app: web.Application): diff --git a/web_common/web_common/web_common.py b/web_common/web_common/web_common.py index 511f1eb984f..3aca0472740 100644 --- a/web_common/web_common/web_common.py +++ b/web_common/web_common/web_common.py @@ -79,8 +79,6 @@ async def render_template( userdata: Optional[UserData], file: str, page_context: Dict[str, Any], - *, - cookie_domain: Optional[str] = None, ) -> web.Response: if request.headers.get('x-hail-return-jinja-context'): if userdata and userdata['is_developer']: @@ -98,6 +96,5 @@ async def render_template( context['csrf_token'] = csrf_token response = aiohttp_jinja2.render_template(file, request, context) - domain = cookie_domain or deploy_config._domain - response.set_cookie('_csrf', csrf_token, domain=domain, secure=True, httponly=True) + response.set_cookie('_csrf', csrf_token, secure=True, httponly=True, samesite='strict') return response From b7bde56d5aad1fa8d1c28b46a1f06b00c45bc8bf Mon Sep 17 00:00:00 2001 From: jigold Date: Wed, 24 Jan 2024 14:10:26 -0500 Subject: [PATCH 08/26] [batch] Stop writing to v2 billing tables (#13892) This PR modifies the billing triggers to stop writing to the v2 billing tables as well as remove the check for whether the equivalent v2 rows have been "migrated" when writing to the v3 tables. Stacked on #13891. 
--- batch/sql/estimated-current.sql | 135 +++---------------------- batch/sql/remove-v2-billing-writes.sql | 120 ++++++++++++++++++++++ build.yaml | 3 + 3 files changed, 135 insertions(+), 123 deletions(-) create mode 100644 batch/sql/remove-v2-billing-writes.sql diff --git a/batch/sql/estimated-current.sql b/batch/sql/estimated-current.sql index 74aa7ea114c..fc3d6f99707 100644 --- a/batch/sql/estimated-current.sql +++ b/batch/sql/estimated-current.sql @@ -595,16 +595,6 @@ BEGIN SET cur_billing_date = CAST(UTC_DATE() AS DATE); IF msec_diff_rollup != 0 THEN - INSERT INTO aggregated_billing_project_user_resources_v2 (billing_project, user, resource_id, token, `usage`) - SELECT billing_project, `user`, - resource_id, - rand_token, - msec_diff_rollup * quantity - FROM attempt_resources - JOIN batches ON batches.id = attempt_resources.batch_id - WHERE batch_id = NEW.batch_id AND job_id = NEW.job_id AND attempt_id = NEW.attempt_id - ON DUPLICATE KEY UPDATE `usage` = `usage` + msec_diff_rollup * quantity; - INSERT INTO aggregated_billing_project_user_resources_v3 (billing_project, user, resource_id, token, `usage`) SELECT batches.billing_project, batches.`user`, attempt_resources.deduped_resource_id, @@ -612,68 +602,26 @@ BEGIN msec_diff_rollup * quantity FROM attempt_resources JOIN batches ON batches.id = attempt_resources.batch_id - INNER JOIN aggregated_billing_project_user_resources_v2 ON - aggregated_billing_project_user_resources_v2.billing_project = batches.billing_project AND - aggregated_billing_project_user_resources_v2.user = batches.user AND - aggregated_billing_project_user_resources_v2.resource_id = attempt_resources.resource_id AND - aggregated_billing_project_user_resources_v2.token = rand_token - WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id AND migrated = 1 + WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id ON DUPLICATE KEY UPDATE `usage` = aggregated_billing_project_user_resources_v3.`usage` + msec_diff_rollup * quantity; - INSERT INTO aggregated_job_group_resources_v2 (batch_id, resource_id, token, `usage`) - SELECT batch_id, - resource_id, - rand_token, - msec_diff_rollup * quantity - FROM attempt_resources - WHERE batch_id = NEW.batch_id AND job_id = NEW.job_id AND attempt_id = NEW.attempt_id - ON DUPLICATE KEY UPDATE `usage` = `usage` + msec_diff_rollup * quantity; - INSERT INTO aggregated_job_group_resources_v3 (batch_id, resource_id, token, `usage`) SELECT attempt_resources.batch_id, attempt_resources.deduped_resource_id, rand_token, msec_diff_rollup * quantity FROM attempt_resources - JOIN aggregated_job_group_resources_v2 ON - aggregated_job_group_resources_v2.batch_id = attempt_resources.batch_id AND - aggregated_job_group_resources_v2.resource_id = attempt_resources.resource_id AND - aggregated_job_group_resources_v2.token = rand_token - WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id AND migrated = 1 + WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id ON DUPLICATE KEY UPDATE `usage` = aggregated_job_group_resources_v3.`usage` + msec_diff_rollup * quantity; - INSERT INTO aggregated_job_resources_v2 (batch_id, job_id, resource_id, `usage`) - SELECT batch_id, job_id, - resource_id, - msec_diff_rollup * quantity - FROM attempt_resources - WHERE batch_id = NEW.batch_id AND job_id = 
NEW.job_id AND attempt_id = NEW.attempt_id - ON DUPLICATE KEY UPDATE `usage` = `usage` + msec_diff_rollup * quantity; - INSERT INTO aggregated_job_resources_v3 (batch_id, job_id, resource_id, `usage`) SELECT attempt_resources.batch_id, attempt_resources.job_id, attempt_resources.deduped_resource_id, msec_diff_rollup * quantity FROM attempt_resources - JOIN aggregated_job_resources_v2 ON - aggregated_job_resources_v2.batch_id = attempt_resources.batch_id AND - aggregated_job_resources_v2.job_id = attempt_resources.job_id AND - aggregated_job_resources_v2.resource_id = attempt_resources.resource_id - WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id AND migrated = 1 + WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id ON DUPLICATE KEY UPDATE `usage` = aggregated_job_resources_v3.`usage` + msec_diff_rollup * quantity; - INSERT INTO aggregated_billing_project_user_resources_by_date_v2 (billing_date, billing_project, user, resource_id, token, `usage`) - SELECT cur_billing_date, - billing_project, - `user`, - resource_id, - rand_token, - msec_diff_rollup * quantity - FROM attempt_resources - JOIN batches ON batches.id = attempt_resources.batch_id - WHERE batch_id = NEW.batch_id AND job_id = NEW.job_id AND attempt_id = NEW.attempt_id - ON DUPLICATE KEY UPDATE `usage` = `usage` + msec_diff_rollup * quantity; - INSERT INTO aggregated_billing_project_user_resources_by_date_v3 (billing_date, billing_project, user, resource_id, token, `usage`) SELECT cur_billing_date, batches.billing_project, @@ -683,13 +631,7 @@ BEGIN msec_diff_rollup * quantity FROM attempt_resources JOIN batches ON batches.id = attempt_resources.batch_id - JOIN aggregated_billing_project_user_resources_by_date_v2 ON - aggregated_billing_project_user_resources_by_date_v2.billing_date = cur_billing_date AND - aggregated_billing_project_user_resources_by_date_v2.billing_project = batches.billing_project AND - aggregated_billing_project_user_resources_by_date_v2.user = batches.user AND - aggregated_billing_project_user_resources_by_date_v2.resource_id = attempt_resources.resource_id AND - aggregated_billing_project_user_resources_by_date_v2.token = rand_token - WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id AND migrated = 1 + WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id ON DUPLICATE KEY UPDATE `usage` = aggregated_billing_project_user_resources_by_date_v3.`usage` + msec_diff_rollup * quantity; END IF; END $$ @@ -866,10 +808,6 @@ BEGIN DECLARE cur_n_tokens INT; DECLARE rand_token INT; DECLARE cur_billing_date DATE; - DECLARE bp_user_resources_migrated BOOLEAN DEFAULT FALSE; - DECLARE bp_user_resources_by_date_migrated BOOLEAN DEFAULT FALSE; - DECLARE batch_resources_migrated BOOLEAN DEFAULT FALSE; - DECLARE job_resources_migrated BOOLEAN DEFAULT FALSE; SELECT billing_project, user INTO cur_billing_project, cur_user FROM batches WHERE id = NEW.batch_id; @@ -887,74 +825,25 @@ BEGIN SET cur_billing_date = CAST(UTC_DATE() AS DATE); IF msec_diff_rollup != 0 THEN - INSERT INTO aggregated_billing_project_user_resources_v2 (billing_project, user, resource_id, token, `usage`) - VALUES (cur_billing_project, cur_user, NEW.resource_id, rand_token, NEW.quantity * msec_diff_rollup) + INSERT INTO aggregated_billing_project_user_resources_v3 (billing_project, 
user, resource_id, token, `usage`) + VALUES (cur_billing_project, cur_user, NEW.deduped_resource_id, rand_token, NEW.quantity * msec_diff_rollup) ON DUPLICATE KEY UPDATE `usage` = `usage` + NEW.quantity * msec_diff_rollup; - SELECT migrated INTO bp_user_resources_migrated - FROM aggregated_billing_project_user_resources_v2 - WHERE billing_project = cur_billing_project AND user = cur_user AND resource_id = NEW.resource_id AND token = rand_token - FOR UPDATE; - - IF bp_user_resources_migrated THEN - INSERT INTO aggregated_billing_project_user_resources_v3 (billing_project, user, resource_id, token, `usage`) - VALUES (cur_billing_project, cur_user, NEW.deduped_resource_id, rand_token, NEW.quantity * msec_diff_rollup) - ON DUPLICATE KEY UPDATE - `usage` = `usage` + NEW.quantity * msec_diff_rollup; - END IF; - - INSERT INTO aggregated_job_group_resources_v2 (batch_id, resource_id, token, `usage`) - VALUES (NEW.batch_id, NEW.resource_id, rand_token, NEW.quantity * msec_diff_rollup) + INSERT INTO aggregated_job_group_resources_v3 (batch_id, resource_id, token, `usage`) + VALUES (NEW.batch_id, NEW.deduped_resource_id, rand_token, NEW.quantity * msec_diff_rollup) ON DUPLICATE KEY UPDATE `usage` = `usage` + NEW.quantity * msec_diff_rollup; - SELECT migrated INTO batch_resources_migrated - FROM aggregated_job_group_resources_v2 - WHERE batch_id = NEW.batch_id AND resource_id = NEW.resource_id AND token = rand_token - FOR UPDATE; - - IF batch_resources_migrated THEN - INSERT INTO aggregated_job_group_resources_v3 (batch_id, resource_id, token, `usage`) - VALUES (NEW.batch_id, NEW.deduped_resource_id, rand_token, NEW.quantity * msec_diff_rollup) - ON DUPLICATE KEY UPDATE - `usage` = `usage` + NEW.quantity * msec_diff_rollup; - END IF; - - INSERT INTO aggregated_job_resources_v2 (batch_id, job_id, resource_id, `usage`) - VALUES (NEW.batch_id, NEW.job_id, NEW.resource_id, NEW.quantity * msec_diff_rollup) + INSERT INTO aggregated_job_resources_v3 (batch_id, job_id, resource_id, `usage`) + VALUES (NEW.batch_id, NEW.job_id, NEW.deduped_resource_id, NEW.quantity * msec_diff_rollup) ON DUPLICATE KEY UPDATE `usage` = `usage` + NEW.quantity * msec_diff_rollup; - SELECT migrated INTO job_resources_migrated - FROM aggregated_job_resources_v2 - WHERE batch_id = NEW.batch_id AND job_id = NEW.job_id AND resource_id = NEW.resource_id - FOR UPDATE; - - IF job_resources_migrated THEN - INSERT INTO aggregated_job_resources_v3 (batch_id, job_id, resource_id, `usage`) - VALUES (NEW.batch_id, NEW.job_id, NEW.deduped_resource_id, NEW.quantity * msec_diff_rollup) - ON DUPLICATE KEY UPDATE - `usage` = `usage` + NEW.quantity * msec_diff_rollup; - END IF; - - INSERT INTO aggregated_billing_project_user_resources_by_date_v2 (billing_date, billing_project, user, resource_id, token, `usage`) - VALUES (cur_billing_date, cur_billing_project, cur_user, NEW.resource_id, rand_token, NEW.quantity * msec_diff_rollup) + INSERT INTO aggregated_billing_project_user_resources_by_date_v3 (billing_date, billing_project, user, resource_id, token, `usage`) + VALUES (cur_billing_date, cur_billing_project, cur_user, NEW.deduped_resource_id, rand_token, NEW.quantity * msec_diff_rollup) ON DUPLICATE KEY UPDATE `usage` = `usage` + NEW.quantity * msec_diff_rollup; - - SELECT migrated INTO bp_user_resources_by_date_migrated - FROM aggregated_billing_project_user_resources_by_date_v2 - WHERE billing_date = cur_billing_date AND billing_project = cur_billing_project AND user = cur_user - AND resource_id = NEW.resource_id AND token = rand_token - FOR 
UPDATE; - - IF bp_user_resources_by_date_migrated THEN - INSERT INTO aggregated_billing_project_user_resources_by_date_v3 (billing_date, billing_project, user, resource_id, token, `usage`) - VALUES (cur_billing_date, cur_billing_project, cur_user, NEW.deduped_resource_id, rand_token, NEW.quantity * msec_diff_rollup) - ON DUPLICATE KEY UPDATE - `usage` = `usage` + NEW.quantity * msec_diff_rollup; - END IF; END IF; END $$ diff --git a/batch/sql/remove-v2-billing-writes.sql b/batch/sql/remove-v2-billing-writes.sql new file mode 100644 index 00000000000..fedeea0facb --- /dev/null +++ b/batch/sql/remove-v2-billing-writes.sql @@ -0,0 +1,120 @@ +DELIMITER $$ + +DROP TRIGGER IF EXISTS attempts_after_update $$ +CREATE TRIGGER attempts_after_update AFTER UPDATE ON attempts +FOR EACH ROW +BEGIN + DECLARE job_cores_mcpu INT; + DECLARE cur_billing_project VARCHAR(100); + DECLARE msec_diff_rollup BIGINT; + DECLARE cur_n_tokens INT; + DECLARE rand_token INT; + DECLARE cur_billing_date DATE; + + SELECT n_tokens INTO cur_n_tokens FROM globals LOCK IN SHARE MODE; + SET rand_token = FLOOR(RAND() * cur_n_tokens); + + SELECT cores_mcpu INTO job_cores_mcpu FROM jobs + WHERE batch_id = NEW.batch_id AND job_id = NEW.job_id; + + SELECT billing_project INTO cur_billing_project FROM batches WHERE id = NEW.batch_id; + + SET msec_diff_rollup = (GREATEST(COALESCE(NEW.rollup_time - NEW.start_time, 0), 0) - + GREATEST(COALESCE(OLD.rollup_time - OLD.start_time, 0), 0)); + + SET cur_billing_date = CAST(UTC_DATE() AS DATE); + + IF msec_diff_rollup != 0 THEN + INSERT INTO aggregated_billing_project_user_resources_v3 (billing_project, user, resource_id, token, `usage`) + SELECT batches.billing_project, batches.`user`, + attempt_resources.deduped_resource_id, + rand_token, + msec_diff_rollup * quantity + FROM attempt_resources + JOIN batches ON batches.id = attempt_resources.batch_id + WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id + ON DUPLICATE KEY UPDATE `usage` = aggregated_billing_project_user_resources_v3.`usage` + msec_diff_rollup * quantity; + + INSERT INTO aggregated_job_group_resources_v3 (batch_id, resource_id, token, `usage`) + SELECT attempt_resources.batch_id, + attempt_resources.deduped_resource_id, + rand_token, + msec_diff_rollup * quantity + FROM attempt_resources + WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id + ON DUPLICATE KEY UPDATE `usage` = aggregated_job_group_resources_v3.`usage` + msec_diff_rollup * quantity; + + INSERT INTO aggregated_job_resources_v3 (batch_id, job_id, resource_id, `usage`) + SELECT attempt_resources.batch_id, attempt_resources.job_id, + attempt_resources.deduped_resource_id, + msec_diff_rollup * quantity + FROM attempt_resources + WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = NEW.attempt_id + ON DUPLICATE KEY UPDATE `usage` = aggregated_job_resources_v3.`usage` + msec_diff_rollup * quantity; + + INSERT INTO aggregated_billing_project_user_resources_by_date_v3 (billing_date, billing_project, user, resource_id, token, `usage`) + SELECT cur_billing_date, + batches.billing_project, + batches.`user`, + attempt_resources.deduped_resource_id, + rand_token, + msec_diff_rollup * quantity + FROM attempt_resources + JOIN batches ON batches.id = attempt_resources.batch_id + WHERE attempt_resources.batch_id = NEW.batch_id AND attempt_resources.job_id = NEW.job_id AND attempt_id = 
NEW.attempt_id + ON DUPLICATE KEY UPDATE `usage` = aggregated_billing_project_user_resources_by_date_v3.`usage` + msec_diff_rollup * quantity; + END IF; +END $$ + +DROP TRIGGER IF EXISTS attempt_resources_after_insert $$ +CREATE TRIGGER attempt_resources_after_insert AFTER INSERT ON attempt_resources +FOR EACH ROW +BEGIN + DECLARE cur_start_time BIGINT; + DECLARE cur_rollup_time BIGINT; + DECLARE cur_billing_project VARCHAR(100); + DECLARE cur_user VARCHAR(100); + DECLARE msec_diff_rollup BIGINT; + DECLARE cur_n_tokens INT; + DECLARE rand_token INT; + DECLARE cur_billing_date DATE; + + SELECT billing_project, user INTO cur_billing_project, cur_user + FROM batches WHERE id = NEW.batch_id; + + SELECT n_tokens INTO cur_n_tokens FROM globals LOCK IN SHARE MODE; + SET rand_token = FLOOR(RAND() * cur_n_tokens); + + SELECT start_time, rollup_time INTO cur_start_time, cur_rollup_time + FROM attempts + WHERE batch_id = NEW.batch_id AND job_id = NEW.job_id AND attempt_id = NEW.attempt_id + LOCK IN SHARE MODE; + + SET msec_diff_rollup = GREATEST(COALESCE(cur_rollup_time - cur_start_time, 0), 0); + + SET cur_billing_date = CAST(UTC_DATE() AS DATE); + + IF msec_diff_rollup != 0 THEN + INSERT INTO aggregated_billing_project_user_resources_v3 (billing_project, user, resource_id, token, `usage`) + VALUES (cur_billing_project, cur_user, NEW.deduped_resource_id, rand_token, NEW.quantity * msec_diff_rollup) + ON DUPLICATE KEY UPDATE + `usage` = `usage` + NEW.quantity * msec_diff_rollup; + + INSERT INTO aggregated_job_group_resources_v3 (batch_id, resource_id, token, `usage`) + VALUES (NEW.batch_id, NEW.deduped_resource_id, rand_token, NEW.quantity * msec_diff_rollup) + ON DUPLICATE KEY UPDATE + `usage` = `usage` + NEW.quantity * msec_diff_rollup; + + INSERT INTO aggregated_job_resources_v3 (batch_id, job_id, resource_id, `usage`) + VALUES (NEW.batch_id, NEW.job_id, NEW.deduped_resource_id, NEW.quantity * msec_diff_rollup) + ON DUPLICATE KEY UPDATE + `usage` = `usage` + NEW.quantity * msec_diff_rollup; + + INSERT INTO aggregated_billing_project_user_resources_by_date_v3 (billing_date, billing_project, user, resource_id, token, `usage`) + VALUES (cur_billing_date, cur_billing_project, cur_user, NEW.deduped_resource_id, rand_token, NEW.quantity * msec_diff_rollup) + ON DUPLICATE KEY UPDATE + `usage` = `usage` + NEW.quantity * msec_diff_rollup; + END IF; +END $$ + +DELIMITER ; diff --git a/build.yaml b/build.yaml index 212120b24e9..e4256804708 100644 --- a/build.yaml +++ b/build.yaml @@ -2358,6 +2358,9 @@ steps: - name: rename-job-groups-tables script: /io/sql/rename-job-groups-tables.sql online: false # this must be offline + - name: remove-v2-billing-writes + script: /io/sql/remove-v2-billing-writes.sql + online: true inputs: - from: /repo/batch/sql to: /io/sql From 728f43bab4a474442b61d746e1881fa450f7ade5 Mon Sep 17 00:00:00 2001 From: Patrick Schultz Date: Fri, 26 Jan 2024 07:17:17 -0500 Subject: [PATCH 09/26] [query] fix remaining scala warnings (#14188) --- hail/build.gradle | 3 +- .../scala/is/hail/backend/BackendUtils.scala | 2 + .../is/hail/backend/ExecuteContext.scala | 2 - .../is/hail/backend/local/LocalBackend.scala | 2 +- .../hail/backend/service/ServiceBackend.scala | 18 +++------ .../is/hail/backend/service/Worker.scala | 4 +- .../scala/is/hail/experimental/package.scala | 2 +- .../expr/ir/AbstractMatrixTableSpec.scala | 1 - .../main/scala/is/hail/expr/ir/BinaryOp.scala | 2 - .../scala/is/hail/expr/ir/BinarySearch.scala | 9 ----- .../src/main/scala/is/hail/expr/ir/Emit.scala | 20 ++-------- 
.../is/hail/expr/ir/EmitClassBuilder.scala | 7 ---- .../hail/expr/ir/ExtractIntervalFilters.scala | 19 ---------- .../scala/is/hail/expr/ir/GenericLines.scala | 3 +- .../scala/is/hail/expr/ir/LowerMatrixIR.scala | 1 - .../scala/is/hail/expr/ir/MatrixWriter.scala | 1 - .../is/hail/expr/ir/NativeReaderOptions.scala | 5 +-- .../is/hail/expr/ir/NormalizeNames.scala | 3 ++ .../is/hail/expr/ir/PruneDeadFields.scala | 4 -- .../main/scala/is/hail/expr/ir/Simplify.scala | 19 ---------- .../main/scala/is/hail/expr/ir/TableIR.scala | 3 +- .../expr/ir/agg/CollectAsSetAggregator.scala | 2 - .../expr/ir/agg/StagedBlockLinkedList.scala | 3 -- .../is/hail/expr/ir/functions/Functions.scala | 3 -- .../expr/ir/functions/LocusFunctions.scala | 1 - .../expr/ir/functions/NDArrayFunctions.scala | 2 +- .../ir/lowering/LowerDistributedSort.scala | 1 - .../hail/expr/ir/ndarrays/EmitNDArray.scala | 2 +- .../expr/ir/orderings/IterableOrdering.scala | 2 - .../is/hail/expr/ir/streams/EmitStream.scala | 9 +---- .../main/scala/is/hail/io/IndexBTree.scala | 2 +- .../main/scala/is/hail/io/InputBuffers.scala | 4 +- .../avro/UnsafeAvroTableReaderOptions.scala | 1 - .../is/hail/io/bgen/BgenRDDPartitions.scala | 2 - .../is/hail/io/bgen/StagedBGENReader.scala | 1 - .../hail/io/compress/BGzipOutputStream.scala | 4 +- .../scala/is/hail/io/fs/AzureStorageFS.scala | 5 --- .../main/scala/is/hail/io/fs/HadoopFS.scala | 2 +- .../scala/is/hail/io/index/IndexReader.scala | 1 - .../scala/is/hail/io/index/IndexWriter.scala | 5 +-- .../scala/is/hail/io/plink/LoadPlink.scala | 3 +- .../scala/is/hail/io/tabix/TabixReader.scala | 2 +- .../main/scala/is/hail/io/vcf/LoadVCF.scala | 9 +---- .../scala/is/hail/linalg/BlockMatrix.scala | 8 ++-- .../is/hail/linalg/LinalgCodeUtils.scala | 2 +- hail/src/main/scala/is/hail/lir/PST.scala | 2 +- .../scala/is/hail/methods/LocalLDPrune.scala | 19 ---------- .../is/hail/methods/LogisticRegression.scala | 2 - .../is/hail/methods/PoissonRegression.scala | 2 +- .../src/main/scala/is/hail/methods/Skat.scala | 4 +- .../main/scala/is/hail/misc/BGZipBlocks.scala | 2 +- .../scala/is/hail/rvd/AbstractRVDSpec.scala | 1 - hail/src/main/scala/is/hail/rvd/RVD.scala | 4 -- .../scala/is/hail/services/BatchConfig.scala | 3 -- .../scala/is/hail/services/DeployConfig.scala | 3 -- .../services/batch_client/BatchClient.scala | 2 +- .../is/hail/sparkextras/ContextRDD.scala | 3 -- .../GeneralizedChiSquaredDistribution.scala | 6 +-- .../scala/is/hail/stats/RegressionUtils.scala | 1 - .../main/scala/is/hail/stats/package.scala | 4 +- .../hail/types/physical/PCanonicalLocus.scala | 2 +- .../stypes/interfaces/SBaseStruct.scala | 1 - .../physical/stypes/interfaces/SNDArray.scala | 4 +- .../is/hail/types/virtual/TNDArray.scala | 5 --- .../scala/is/hail/types/virtual/Type.scala | 2 +- .../scala/is/hail/utils/ErrorHandling.scala | 1 - .../is/hail/utils/FlipbookIterator.scala | 2 +- .../is/hail/utils/StringEscapeUtils.scala | 1 - .../is/hail/utils/StringSocketAppender.scala | 5 +-- .../scala/is/hail/variant/HardCallView.scala | 2 +- .../is/hail/variant/ReferenceGenome.scala | 1 - .../is/hail/annotations/UnsafeSuite.scala | 6 --- .../test/scala/is/hail/asm4s/ASM4SSuite.scala | 38 +++++++++---------- .../scala/is/hail/asm4s/{A.java => Foo.java} | 2 +- .../is/hail/expr/ir/Aggregators2Suite.scala | 3 +- .../is/hail/expr/ir/AggregatorsSuite.scala | 2 +- .../is/hail/expr/ir/EmitStreamSuite.scala | 29 -------------- .../test/scala/is/hail/expr/ir/IRSuite.scala | 10 +---- .../scala/is/hail/expr/ir/MatrixIRSuite.scala | 1 - 
.../is/hail/expr/ir/MemoryLeakSuite.scala | 2 +- .../scala/is/hail/expr/ir/OrderingSuite.scala | 8 ---- .../is/hail/expr/ir/StagedBTreeSuite.scala | 1 - .../scala/is/hail/expr/ir/TableIRSuite.scala | 4 +- .../expr/ir/analyses/SemanticHashSuite.scala | 4 +- .../is/hail/expr/ir/table/TableGenSuite.scala | 26 ++++++------- .../scala/is/hail/io/IndexBTreeSuite.scala | 1 - .../test/scala/is/hail/io/IndexSuite.scala | 10 +---- .../test/scala/is/hail/io/fs/FSSuite.scala | 8 ++-- .../src/test/scala/is/hail/io/fs/FakeFS.scala | 4 +- .../is/hail/methods/LocalLDPruneSuite.scala | 2 - .../is/hail/methods/MultiArray2Suite.scala | 20 +++++----- .../is/hail/stats/FisherExactTestSuite.scala | 4 -- .../scala/is/hail/stats/eigSymDSuite.scala | 1 - .../hail/types/physical/PNDArraySuite.scala | 3 +- .../is/hail/utils/FlipbookIteratorSuite.scala | 2 +- .../is/hail/utils/PartitionCountsSuite.scala | 2 +- .../utils/RichDenseMatrixDoubleSuite.scala | 2 +- .../is/hail/utils/TreeTraversalSuite.scala | 6 +-- .../test/scala/is/hail/utils/UtilsSuite.scala | 1 - .../hail/variant/ReferenceGenomeSuite.scala | 7 ---- 100 files changed, 126 insertions(+), 373 deletions(-) rename hail/src/test/scala/is/hail/asm4s/{A.java => Foo.java} (89%) diff --git a/hail/build.gradle b/hail/build.gradle index c111fa6e2ba..32125ba7121 100644 --- a/hail/build.gradle +++ b/hail/build.gradle @@ -70,7 +70,8 @@ tasks.withType(ScalaCompile) { "-deprecation", "-unchecked", "-Ywarn-unused:_,-explicits,-implicits", - "-Wconf:cat=unused-locals:w,cat=unused:info,any:w", +// "-Wconf:cat=unused-locals:w,cat=unused:info,any:w", + "-Wconf:any:e", "-Ypartial-unification", ] diff --git a/hail/src/main/scala/is/hail/backend/BackendUtils.scala b/hail/src/main/scala/is/hail/backend/BackendUtils.scala index fd70d0457e5..78bb30fd0f0 100644 --- a/hail/src/main/scala/is/hail/backend/BackendUtils.scala +++ b/hail/src/main/scala/is/hail/backend/BackendUtils.scala @@ -10,6 +10,7 @@ import is.hail.io.fs._ import is.hail.services._ import is.hail.utils._ +import scala.annotation.nowarn import scala.util.Try object BackendUtils { @@ -93,6 +94,7 @@ class BackendUtils( results case Some(cachedResults) => + @nowarn("cat=unused-pat-vars&msg=pattern var c") val remainingContexts = for { c @ (_, k) <- contexts.zipWithIndex diff --git a/hail/src/main/scala/is/hail/backend/ExecuteContext.scala b/hail/src/main/scala/is/hail/backend/ExecuteContext.scala index 5fbb4d197cf..07a411bb309 100644 --- a/hail/src/main/scala/is/hail/backend/ExecuteContext.scala +++ b/hail/src/main/scala/is/hail/backend/ExecuteContext.scala @@ -136,8 +136,6 @@ class ExecuteContext( private val cleanupFunctions = mutable.ArrayBuffer[() => Unit]() - private[this] val broadcasts = mutable.ArrayBuffer.empty[BroadcastValue[_]] - val memo: mutable.Map[Any, Any] = new mutable.HashMap[Any, Any]() val taskContext: HailTaskContext = new LocalTaskContext(0, 0) diff --git a/hail/src/main/scala/is/hail/backend/local/LocalBackend.scala b/hail/src/main/scala/is/hail/backend/local/LocalBackend.scala index ac4ec419b29..cdb3105b012 100644 --- a/hail/src/main/scala/is/hail/backend/local/LocalBackend.scala +++ b/hail/src/main/scala/is/hail/backend/local/LocalBackend.scala @@ -208,7 +208,7 @@ class LocalBackend( throw new LowererUnsupportedOperation(s"lowered to uncompilable IR: ${Pretty(ctx, ir)}") if (ir.typ == TVoid) { - val (pt, f) = ctx.timer.time("Compile") { + val (_, f) = ctx.timer.time("Compile") { Compile[AsmFunction1RegionUnit]( ctx, FastSeq(), diff --git 
a/hail/src/main/scala/is/hail/backend/service/ServiceBackend.scala b/hail/src/main/scala/is/hail/backend/service/ServiceBackend.scala index 48e91a7b107..f2fc277e3a2 100644 --- a/hail/src/main/scala/is/hail/backend/service/ServiceBackend.scala +++ b/hail/src/main/scala/is/hail/backend/service/ServiceBackend.scala @@ -154,7 +154,6 @@ class ServiceBackend( fs: FS, collection: Array[Array[Byte]], stageIdentifier: String, - dependency: Option[TableStageDependency] = None, f: (Array[Byte], HailTaskContext, HailClassLoader, FS) => Array[Byte], ): (String, String, Int) = { val backendContext = _backendContext.asInstanceOf[ServiceBackendContext] @@ -291,7 +290,7 @@ class ServiceBackend( f: (Array[Byte], HailTaskContext, HailClassLoader, FS) => Array[Byte] ): Array[Array[Byte]] = { val (token, root, n) = - submitAndWaitForBatch(_backendContext, fs, collection, stageIdentifier, dependency, f) + submitAndWaitForBatch(_backendContext, fs, collection, stageIdentifier, f) log.info(s"parallelizeAndComputeWithIndex: $token: reading results") val startTime = System.nanoTime() @@ -321,14 +320,8 @@ class ServiceBackend( )( f: (Array[Byte], HailTaskContext, HailClassLoader, FS) => Array[Byte] ): (Option[Throwable], IndexedSeq[(Array[Byte], Int)]) = { - val (token, root, n) = submitAndWaitForBatch( - _backendContext, - fs, - collection.map(_._1).toArray, - stageIdentifier, - dependency, - f, - ) + val (token, root, _) = + submitAndWaitForBatch(_backendContext, fs, collection.map(_._1).toArray, stageIdentifier, f) log.info(s"parallelizeAndComputeWithIndex: $token: reading results") val startTime = System.nanoTime() val r @ (_, results) = runAllKeepFirstError(executor) { @@ -372,7 +365,6 @@ class ServiceBackend( MakeTuple.ordered(FastSeq(x)), optimize = true, ) - val retPType = pt.asInstanceOf[PBaseStruct] val elementType = pt.fields(0).typ val off = ctx.scopedExecution((hcl, fs, htc, r) => f(hcl, fs, htc, r).apply(r)) val codec = TypedCodecSpec( @@ -455,7 +447,7 @@ object ServiceBackendAPI { assert(argv.length == 7, argv.toFastSeq) val scratchDir = argv(0) - val logFile = argv(1) + // val logFile = argv(1) val jarLocation = argv(2) val kind = argv(3) assert(kind == Main.DRIVER) @@ -473,7 +465,7 @@ object ServiceBackendAPI { val batchClient = new BatchClient(s"$scratchDir/secrets/gsa-key/key.json") log.info("BatchClient allocated.") - var batchId = + val batchId = BatchConfig.fromConfigFile(s"$scratchDir/batch-config/batch-config.json").map(_.batchId) log.info("BatchConfig parsed.") diff --git a/hail/src/main/scala/is/hail/backend/service/Worker.scala b/hail/src/main/scala/is/hail/backend/service/Worker.scala index b4adb68e65b..ad0b2498954 100644 --- a/hail/src/main/scala/is/hail/backend/service/Worker.scala +++ b/hail/src/main/scala/is/hail/backend/service/Worker.scala @@ -104,8 +104,8 @@ object Worker { throw new IllegalArgumentException(s"expected seven arguments, not: ${argv.length}") } val scratchDir = argv(0) - val logFile = argv(1) - var jarLocation = argv(2) + // val logFile = argv(1) + // var jarLocation = argv(2) val kind = argv(3) assert(kind == Main.WORKER) val root = argv(4) diff --git a/hail/src/main/scala/is/hail/experimental/package.scala b/hail/src/main/scala/is/hail/experimental/package.scala index 9ac4ba51d71..623c7e4cb69 100644 --- a/hail/src/main/scala/is/hail/experimental/package.scala +++ b/hail/src/main/scala/is/hail/experimental/package.scala @@ -30,7 +30,7 @@ package object experimental { if (ac <= 1 || an == 0) // FAF should not be calculated on singletons 0.0 else { - var f = (af: 
Double) => ac.toDouble - 1 - qpois(ci, an.toDouble * af) + val f = (af: Double) => ac.toDouble - 1 - qpois(ci, an.toDouble * af) val root = uniroot(f, lower, upper, tol) val rounder = 1d / (precision / 100d) var max_af = math.round(root.getOrElse(0.0) * rounder) / rounder diff --git a/hail/src/main/scala/is/hail/expr/ir/AbstractMatrixTableSpec.scala b/hail/src/main/scala/is/hail/expr/ir/AbstractMatrixTableSpec.scala index 37b584196e8..9b12157d5af 100644 --- a/hail/src/main/scala/is/hail/expr/ir/AbstractMatrixTableSpec.scala +++ b/hail/src/main/scala/is/hail/expr/ir/AbstractMatrixTableSpec.scala @@ -75,7 +75,6 @@ object RelationalSpec { def read(fs: FS, path: String): RelationalSpec = { val jv = readMetadata(fs, path) - val references = readReferences(fs, path, jv) (jv \ "name").extract[String] match { case "TableSpec" => TableSpec.fromJValue(fs, path, jv) diff --git a/hail/src/main/scala/is/hail/expr/ir/BinaryOp.scala b/hail/src/main/scala/is/hail/expr/ir/BinaryOp.scala index 17941317b04..91be6146c88 100644 --- a/hail/src/main/scala/is/hail/expr/ir/BinaryOp.scala +++ b/hail/src/main/scala/is/hail/expr/ir/BinaryOp.scala @@ -118,8 +118,6 @@ object BinaryOp { case _ => incompatible(lt, rt, op) } case (TBoolean, TBoolean) => - val ll = coerce[Boolean](l) - val rr = coerce[Boolean](r) op match { case _ => incompatible(lt, rt, op) } diff --git a/hail/src/main/scala/is/hail/expr/ir/BinarySearch.scala b/hail/src/main/scala/is/hail/expr/ir/BinarySearch.scala index dd61e95f18a..146502af3ee 100644 --- a/hail/src/main/scala/is/hail/expr/ir/BinarySearch.scala +++ b/hail/src/main/scala/is/hail/expr/ir/BinarySearch.scala @@ -286,15 +286,6 @@ object BinarySearch { } } - private def runSearchUnit( - cb: EmitCodeBuilder, - haystack: SIndexableValue, - compare: Comparator, - found: (Value[Int], Value[Int], Value[Int]) => Unit, - notFound: Value[Int] => Unit, - ): Unit = - runSearchBoundedUnit(cb, haystack, compare, 0, haystack.loadLength(), found, notFound) - private def runSearchBounded[T: TypeInfo]( cb: EmitCodeBuilder, haystack: SIndexableValue, diff --git a/hail/src/main/scala/is/hail/expr/ir/Emit.scala b/hail/src/main/scala/is/hail/expr/ir/Emit.scala index 2061e685536..0d3163a1e01 100644 --- a/hail/src/main/scala/is/hail/expr/ir/Emit.scala +++ b/hail/src/main/scala/is/hail/expr/ir/Emit.scala @@ -13,7 +13,7 @@ import is.hail.expr.ir.streams.{EmitStream, StreamProducer, StreamUtils} import is.hail.io.{BufferSpec, InputBuffer, OutputBuffer, TypedCodecSpec} import is.hail.io.fs.FS import is.hail.linalg.{BLAS, LAPACK, LinalgCodeUtils} -import is.hail.types.{tcoerce, TypeWithRequiredness, VirtualTypeWithReq} +import is.hail.types.{TypeWithRequiredness, VirtualTypeWithReq, tcoerce} import is.hail.types.physical._ import is.hail.types.physical.stypes._ import is.hail.types.physical.stypes.concrete._ @@ -25,8 +25,8 @@ import is.hail.variant.ReferenceGenome import scala.collection.mutable import scala.language.existentials - import java.io._ +import scala.annotation.nowarn // class for holding all information computed ahead-of-time that we need in the emitter object EmitContext { @@ -766,6 +766,7 @@ class Emit[C](val ctx: EmitContext, val cb: EmitClassBuilder[C]) { val mb: EmitMethodBuilder[C] = cb.emb.asInstanceOf[EmitMethodBuilder[C]] + @nowarn("cat=unused-locals&msg=local default argument") def emit( ir: IR, mb: EmitMethodBuilder[C] = mb, @@ -2788,7 +2789,7 @@ class Emit[C](val ctx: EmitContext, val cb: EmitClassBuilder[C]) { } case ResultOp(idx, sig) => - val AggContainer(aggs, sc, _) = container.get + val 
AggContainer(_, sc, _) = container.get val rvAgg = agg.Extract.getAgg(sig) rvAgg.result(cb, sc.states(idx), region) @@ -3530,16 +3531,6 @@ class Emit[C](val ctx: EmitContext, val cb: EmitClassBuilder[C]) { ): IEmitCode = this.emitI(ir, cb, region, env, container, loopEnv) - def emitVoid( - ir: IR, - env: EmitEnv = env, - container: Option[AggContainer] = container, - loopEnv: Option[Env[LoopRef]] = loopEnv, - ): Code[Unit] = - EmitCodeBuilder.scopedVoid(mb) { cb => - this.emitVoid(cb, ir, region, env, container, loopEnv) - } - def emitStream(ir: IR, outerRegion: Value[Region], env: EmitEnv = env): EmitCode = EmitCode.fromI(mb)(cb => EmitStream.produce(this, ir, cb, cb.emb, outerRegion, env, container) @@ -3669,7 +3660,6 @@ class Emit[C](val ctx: EmitContext, val cb: EmitClassBuilder[C]) { ) sort.emitWithBuilder[Boolean] { cb => - val region = sort.getCodeParam[Region](1) val leftEC = cb.memoize( EmitCode.present(sort, elemSCT.loadToSValue(cb, sort.getCodeParam(2)(elemSCT.ti))), "sort_leftEC", @@ -3794,8 +3784,6 @@ object NDArrayEmitter { rightShape: IndexedSeq[Value[Long]], errorID: Int, ): IndexedSeq[Value[Long]] = { - val mb = cb.emb - assert(leftShape.nonEmpty) assert(rightShape.nonEmpty) diff --git a/hail/src/main/scala/is/hail/expr/ir/EmitClassBuilder.scala b/hail/src/main/scala/is/hail/expr/ir/EmitClassBuilder.scala index ab84caa8fe3..78391c6e975 100644 --- a/hail/src/main/scala/is/hail/expr/ir/EmitClassBuilder.scala +++ b/hail/src/main/scala/is/hail/expr/ir/EmitClassBuilder.scala @@ -384,18 +384,11 @@ final class EmitClassBuilder[C](val emodb: EmitModuleBuilder, val cb: ClassBuild newPField(name, st), ) - private[this] val typMap: mutable.Map[Type, Value[_ <: Type]] = - mutable.Map() - - private[this] val pTypeMap: mutable.Map[PType, Value[_ <: PType]] = mutable.Map() - private[this] type CompareMapKey = (SType, SType) private[this] val memoizedComparisons: mutable.Map[CompareMapKey, CodeOrdering] = mutable.Map[CompareMapKey, CodeOrdering]() - def numTypes: Int = typMap.size - private[this] val decodedLiteralsField = genFieldThisRef[Array[Long]]("decoded_lits") def literalsArray(): Value[Array[Long]] = decodedLiteralsField diff --git a/hail/src/main/scala/is/hail/expr/ir/ExtractIntervalFilters.scala b/hail/src/main/scala/is/hail/expr/ir/ExtractIntervalFilters.scala index 2f22c69d3ba..77280de352c 100644 --- a/hail/src/main/scala/is/hail/expr/ir/ExtractIntervalFilters.scala +++ b/hail/src/main/scala/is/hail/expr/ir/ExtractIntervalFilters.scala @@ -168,7 +168,6 @@ class KeySetLattice(ctx: ExecuteContext, keyType: TStruct) extends Lattice { if (v.isEmpty) return top val builder = mutable.ArrayBuilder.make[Interval]() - var i = 0 if (v.head.left != IntervalEndpoint(Row(), -1)) { builder += Interval(IntervalEndpoint(Row(), -1), v.head.left) } @@ -751,24 +750,6 @@ class ExtractIntervalFilters(ctx: ExecuteContext, keyType: TStruct) { private def literalSizeOkay(lit: Any): Boolean = lit.asInstanceOf[Iterable[_]].size <= MAX_LITERAL_SIZE - private def wrapInRow(intervals: IndexedSeq[Interval]): IndexedSeq[Interval] = intervals - .map { interval => - Interval( - IntervalEndpoint(Row(interval.left.point), interval.left.sign), - IntervalEndpoint(Row(interval.right.point), interval.right.sign), - ) - } - - private def intervalFromComparison(v: Any, op: ComparisonOp[_]): Interval = { - (op: @unchecked) match { - case _: EQ => Interval(endpoint(v, -1), endpoint(v, 1)) - case GT(_, _) => Interval(negInf, endpoint(v, -1)) // value > key - case GTEQ(_, _) => Interval(negInf, endpoint(v, 1)) // value 
>= key - case LT(_, _) => Interval(endpoint(v, 1), posInf) // value < key - case LTEQ(_, _) => Interval(endpoint(v, -1), posInf) // value <= key - } - } - private def posInf: IntervalEndpoint = IntervalEndpoint(Row(), 1) private def negInf: IntervalEndpoint = IntervalEndpoint(Row(), -1) diff --git a/hail/src/main/scala/is/hail/expr/ir/GenericLines.scala b/hail/src/main/scala/is/hail/expr/ir/GenericLines.scala index cff71fffeed..8adb4fe75eb 100644 --- a/hail/src/main/scala/is/hail/expr/ir/GenericLines.scala +++ b/hail/src/main/scala/is/hail/expr/ir/GenericLines.scala @@ -67,7 +67,7 @@ object GenericLines { private var eof = false private var closed = false - private var buf = new Array[Byte](64 * 1024) + private val buf = new Array[Byte](64 * 1024) private var bufOffset = 0L private var bufMark = 0 private var bufPos = 0 @@ -339,7 +339,6 @@ object GenericLines { } val body: (FS, Any) => CloseableIterator[GenericLine] = { (fs: FS, context: Any) => val contextRow = context.asInstanceOf[Row] - val index = contextRow.getAs[Int](0) val file = contextRow.getAs[String](1) val chrom = contextRow.getAs[String](2) val start = contextRow.getAs[Int](3) diff --git a/hail/src/main/scala/is/hail/expr/ir/LowerMatrixIR.scala b/hail/src/main/scala/is/hail/expr/ir/LowerMatrixIR.scala index 8e68ff1c5ab..07b26c4bb56 100644 --- a/hail/src/main/scala/is/hail/expr/ir/LowerMatrixIR.scala +++ b/hail/src/main/scala/is/hail/expr/ir/LowerMatrixIR.scala @@ -1035,7 +1035,6 @@ object LowerMatrixIR { .aggregate(makeTuple(applyAggOp(Count(), FastSeq(), FastSeq()), 'global(colsField).len)) case MatrixAggregate(child, query) => val lc = lower(ctx, child, ab) - val idx = Symbol(genUID()) TableAggregate( lc, aggExplodeIR( diff --git a/hail/src/main/scala/is/hail/expr/ir/MatrixWriter.scala b/hail/src/main/scala/is/hail/expr/ir/MatrixWriter.scala index e801bc70985..f27faab7cf5 100644 --- a/hail/src/main/scala/is/hail/expr/ir/MatrixWriter.scala +++ b/hail/src/main/scala/is/hail/expr/ir/MatrixWriter.scala @@ -1011,7 +1011,6 @@ case class VCFPartitionWriter( _writeB(cb, v.toBytes(cb).loadBytes(cb)) case v: SCallValue => val ploidy = v.ploidy(cb) - val phased = v.isPhased(cb) cb.if_(ploidy.ceq(0), cb._fatal("VCF spec does not support 0-ploid calls.")) cb.if_(ploidy.ceq(1), cb._fatal("VCF spec does not support phased haploid calls.")) val c = v.canonicalCall(cb) diff --git a/hail/src/main/scala/is/hail/expr/ir/NativeReaderOptions.scala b/hail/src/main/scala/is/hail/expr/ir/NativeReaderOptions.scala index fdbecc51aa9..d0c31e07a69 100644 --- a/hail/src/main/scala/is/hail/expr/ir/NativeReaderOptions.scala +++ b/hail/src/main/scala/is/hail/expr/ir/NativeReaderOptions.scala @@ -21,12 +21,11 @@ class NativeReaderOptionsSerializer() extends CustomSerializer[NativeReaderOptio NativeReaderOptions(intervals, intervalPointType, filterIntervals) }, { case opts: NativeReaderOptions => - implicit val fmt = format val ty = TArray(TInterval(opts.intervalPointType)) - (("name" -> opts.getClass.getSimpleName) ~ + ("name" -> opts.getClass.getSimpleName) ~ ("intervals" -> JSONAnnotationImpex.exportAnnotation(opts.intervals, ty)) ~ ("intervalPointType" -> opts.intervalPointType.parsableString()) ~ - ("filterIntervals" -> opts.filterIntervals)) + ("filterIntervals" -> opts.filterIntervals) }, ) ) diff --git a/hail/src/main/scala/is/hail/expr/ir/NormalizeNames.scala b/hail/src/main/scala/is/hail/expr/ir/NormalizeNames.scala index 93593109cc3..108834eb585 100644 --- a/hail/src/main/scala/is/hail/expr/ir/NormalizeNames.scala +++ 
b/hail/src/main/scala/is/hail/expr/ir/NormalizeNames.scala @@ -3,6 +3,8 @@ package is.hail.expr.ir import is.hail.backend.ExecuteContext import is.hail.utils.StackSafe._ +import scala.annotation.nowarn + class NormalizeNames(normFunction: Int => String, allowFreeVariables: Boolean = false) { var count: Int = 0 @@ -23,6 +25,7 @@ class NormalizeNames(normFunction: Int => String, allowFreeVariables: Boolean = private def normalizeIR(ir: BaseIR, env: BindingEnv[String], context: Array[String] = Array()) : StackFrame[BaseIR] = { + @nowarn("cat=unused-locals&msg=default argument") def normalizeBaseIR(next: BaseIR, env: BindingEnv[String] = env): StackFrame[BaseIR] = call(normalizeIR(next, env, context :+ ir.getClass().getName())) diff --git a/hail/src/main/scala/is/hail/expr/ir/PruneDeadFields.scala b/hail/src/main/scala/is/hail/expr/ir/PruneDeadFields.scala index 9744b0730f4..67a1a2f8d04 100644 --- a/hail/src/main/scala/is/hail/expr/ir/PruneDeadFields.scala +++ b/hail/src/main/scala/is/hail/expr/ir/PruneDeadFields.scala @@ -1109,7 +1109,6 @@ object PruneDeadFields { memoizeMatrixIR(ctx, child, dep, memo) case MatrixColsTail(child, _) => memoizeMatrixIR(ctx, child, requestedType, memo) case CastTableToMatrix(child, entriesFieldName, colsFieldName, _) => - val m = Map(MatrixType.entriesIdentifier -> entriesFieldName) val childDep = child.typ.copy( key = requestedType.rowKey, globalType = unify( @@ -1679,7 +1678,6 @@ object PruneDeadFields { memoizeValueIR(ctx, aggIR, requestedType.asInstanceOf[TDict].valueType, memo), ) case AggArrayPerElement(a, elementName, indexName, aggBody, knownLength, isScan) => - val aType = a.typ.asInstanceOf[TArray] val bodyEnv = memoizeValueIR(ctx, aggBody, TIterable.elementType(requestedType), memo) if (isScan) { val valueType = @@ -1778,7 +1776,6 @@ object PruneDeadFields { val sType = requestedType.asInstanceOf[TStruct] val insFieldNames = fields.map(_._1).toSet val rightDep = sType.filter(f => insFieldNames.contains(f.name))._1 - val rightDepFields = rightDep.fieldNames.toSet val leftDep = TStruct( old.typ.asInstanceOf[TStruct] .fields @@ -1815,7 +1812,6 @@ object PruneDeadFields { } ) case GetTupleElement(o, idx) => - val childTupleType = o.typ.asInstanceOf[TTuple] val tupleDep = TTuple(FastSeq(TupleField(idx, requestedType))) memoizeValueIR(ctx, o, tupleDep, memo) case ConsoleLog(message, result) => diff --git a/hail/src/main/scala/is/hail/expr/ir/Simplify.scala b/hail/src/main/scala/is/hail/expr/ir/Simplify.scala index 6c0da6b3cc9..1da59d653b7 100644 --- a/hail/src/main/scala/is/hail/expr/ir/Simplify.scala +++ b/hail/src/main/scala/is/hail/expr/ir/Simplify.scala @@ -68,24 +68,6 @@ object Simplify { private[this] def rewriteBlockMatrixNode: BlockMatrixIR => Option[BlockMatrixIR] = blockMatrixRules.lift - /** Returns true if 'x' propagates missingness, meaning if any child of 'x' evaluates to missing, - * then 'x' will evaluate to missing. - */ - private[this] def isStrict(x: IR): Boolean = { - x match { - case _: Apply | - _: ApplySeeded | - _: ApplyUnaryPrimOp | - _: ApplyBinaryPrimOp | - _: ArrayRef | - _: ArrayLen | - _: GetField | - _: GetTupleElement => true - case ApplyComparisonOp(op, _, _) => op.strict - case _ => false - } - } - /** Returns true if any strict child of 'x' is NA. A child is strict if 'x' evaluates to missing * whenever the child does. 
*/ @@ -484,7 +466,6 @@ object Simplify { allRefsCanBePassedThrough(Let(after.toFastSeq, body)) } => - val r = Ref(name, x.typ) val fieldNames = newFields.map(_._1).toArray val newFieldMap = newFields.toMap val newFieldRefs = newFieldMap.map { case (k, ir) => diff --git a/hail/src/main/scala/is/hail/expr/ir/TableIR.scala b/hail/src/main/scala/is/hail/expr/ir/TableIR.scala index fc0a39fb4b8..c54afe4635b 100644 --- a/hail/src/main/scala/is/hail/expr/ir/TableIR.scala +++ b/hail/src/main/scala/is/hail/expr/ir/TableIR.scala @@ -3511,7 +3511,7 @@ case class TableExplode(child: TableIR, path: IndexedSeq[String]) extends TableI 0, )) - val (len, l) = Compile[AsmFunction2RegionLongInt]( + val (_, l) = Compile[AsmFunction2RegionLongInt]( ctx, FastSeq(( "row", @@ -3972,7 +3972,6 @@ case class TableAggregateByKey(child: TableIR, expr: IR) extends TableIR { var current: Long = 0 val rowKey: WritableRegionValue = WritableRegionValue(sm, keyType, ctx.freshRegion()) val consumerRegion: Region = ctx.region - val newRV = RegionValue(consumerRegion) def hasNext: Boolean = { if (isEnd || (current == 0 && !it.hasNext)) { diff --git a/hail/src/main/scala/is/hail/expr/ir/agg/CollectAsSetAggregator.scala b/hail/src/main/scala/is/hail/expr/ir/agg/CollectAsSetAggregator.scala index 7f7078ee609..efc71582ee7 100644 --- a/hail/src/main/scala/is/hail/expr/ir/agg/CollectAsSetAggregator.scala +++ b/hail/src/main/scala/is/hail/expr/ir/agg/CollectAsSetAggregator.scala @@ -161,8 +161,6 @@ class AppendOnlySetState(val kb: EmitClassBuilder[_], vt: VirtualTypeWithReq) def deserialize(codec: BufferSpec): (EmitCodeBuilder, Value[InputBuffer]) => Unit = { val kDec = et.buildDecoder(t.virtualType, kb) - val km = kb.genFieldThisRef[Boolean]("km") - val kv = kb.genFieldThisRef("kv")(typeToTypeInfo(t)) { (cb: EmitCodeBuilder, ib: Value[InputBuffer]) => init(cb) diff --git a/hail/src/main/scala/is/hail/expr/ir/agg/StagedBlockLinkedList.scala b/hail/src/main/scala/is/hail/expr/ir/agg/StagedBlockLinkedList.scala index 84fde3a5e01..f2895416721 100644 --- a/hail/src/main/scala/is/hail/expr/ir/agg/StagedBlockLinkedList.scala +++ b/hail/src/main/scala/is/hail/expr/ir/agg/StagedBlockLinkedList.scala @@ -75,9 +75,6 @@ class StagedBlockLinkedList(val elemType: PType, val kb: EmitClassBuilder[_]) { private def next(n: Node): Code[Long] = Region.loadAddress(nodeType.fieldOffset(n, "next")) - private def hasNext(n: Node): Code[Boolean] = - next(n) cne nil - private def setNext(cb: EmitCodeBuilder, n: Node, nNext: Node): Unit = cb += Region.storeAddress(nodeType.fieldOffset(n, "next"), nNext) diff --git a/hail/src/main/scala/is/hail/expr/ir/functions/Functions.scala b/hail/src/main/scala/is/hail/expr/ir/functions/Functions.scala index 4a68c9247c5..357a84685f9 100644 --- a/hail/src/main/scala/is/hail/expr/ir/functions/Functions.scala +++ b/hail/src/main/scala/is/hail/expr/ir/functions/Functions.scala @@ -6,7 +6,6 @@ import is.hail.backend.{ExecuteContext, HailStateManager} import is.hail.experimental.ExperimentalFunctions import is.hail.expr.ir._ import is.hail.io.bgen.BGENFunctions -import is.hail.types._ import is.hail.types.physical._ import is.hail.types.physical.stypes.{EmitType, SType, SValue} import is.hail.types.physical.stypes.concrete._ @@ -308,8 +307,6 @@ abstract class RegistryFunctions { def registerAll(): Unit - private val boxes = mutable.Map[String, Box[Type]]() - def tv(name: String): TVariable = TVariable(name) diff --git a/hail/src/main/scala/is/hail/expr/ir/functions/LocusFunctions.scala 
b/hail/src/main/scala/is/hail/expr/ir/functions/LocusFunctions.scala index f43809bb4b1..234a33d45a1 100644 --- a/hail/src/main/scala/is/hail/expr/ir/functions/LocusFunctions.scala +++ b/hail/src/main/scala/is/hail/expr/ir/functions/LocusFunctions.scala @@ -701,7 +701,6 @@ object LocusFunctions extends RegistryFunctions { val iT = interval.st.asInstanceOf[SInterval] val srcRG = iT.pointType.asInstanceOf[SLocus].rg val destRG = rt.types(0).asInstanceOf[PInterval].pointType.asInstanceOf[PLocus].rg - val er = EmitRegion(cb.emb, r) val intervalObj = Code.checkcast[Interval](svalueToJavaValue(cb, r, interval)) val lifted = cb.newLocal[(Interval, Boolean)]( "liftover_locus_interval_lifted", diff --git a/hail/src/main/scala/is/hail/expr/ir/functions/NDArrayFunctions.scala b/hail/src/main/scala/is/hail/expr/ir/functions/NDArrayFunctions.scala index 29d43a79b67..18adad9f0b3 100644 --- a/hail/src/main/scala/is/hail/expr/ir/functions/NDArrayFunctions.scala +++ b/hail/src/main/scala/is/hail/expr/ir/functions/NDArrayFunctions.scala @@ -413,7 +413,7 @@ object NDArrayFunctions extends RegistryFunctions { SNDArrayPointerValue ] val row = cb.newLocal[Long]("rowIdx") - val IndexedSeq(nRows, nCols) = newBlock.shapes + val IndexedSeq(nRows, _) = newBlock.shapes cb.for_( cb.assign(row, 0L), row < nRows.get, diff --git a/hail/src/main/scala/is/hail/expr/ir/lowering/LowerDistributedSort.scala b/hail/src/main/scala/is/hail/expr/ir/lowering/LowerDistributedSort.scala index c57893459e8..8330fa7c6dc 100644 --- a/hail/src/main/scala/is/hail/expr/ir/lowering/LowerDistributedSort.scala +++ b/hail/src/main/scala/is/hail/expr/ir/lowering/LowerDistributedSort.scala @@ -54,7 +54,6 @@ object LowerDistributedSort { val rowsType = resultPType.fieldType("rows").asInstanceOf[PArray] val rowType = rowsType.elementType.asInstanceOf[PStruct] val rows = rowsAndGlobal.getAs[IndexedSeq[Annotation]](0) - val kType = TStruct(sortFields.map(f => (f.field, rowType.virtualType.fieldType(f.field))): _*) val sortedRows = localAnnotationSort(ctx, rows, sortFields, rowType.virtualType) diff --git a/hail/src/main/scala/is/hail/expr/ir/ndarrays/EmitNDArray.scala b/hail/src/main/scala/is/hail/expr/ir/ndarrays/EmitNDArray.scala index 2001b4de4e3..2f8165c4e7e 100644 --- a/hail/src/main/scala/is/hail/expr/ir/ndarrays/EmitNDArray.scala +++ b/hail/src/main/scala/is/hail/expr/ir/ndarrays/EmitNDArray.scala @@ -593,7 +593,7 @@ object EmitNDArray { shape.indices.map(idx => { (cb: EmitCodeBuilder, outerStep: Value[Long]) => // SlicingIndices is a map from my coordinates to my child's coordinates. 
val whichSlicingAxis = slicingIndices(idx) - val (start, stop, sliceStep) = slicingValueTriples(idx) + val (_, _, sliceStep) = slicingValueTriples(idx) val innerStep = cb.newLocal[Long]( "ndarray_producer_slice_child_step", sliceStep * outerStep, diff --git a/hail/src/main/scala/is/hail/expr/ir/orderings/IterableOrdering.scala b/hail/src/main/scala/is/hail/expr/ir/orderings/IterableOrdering.scala index 62a71a4ade9..af4950b160e 100644 --- a/hail/src/main/scala/is/hail/expr/ir/orderings/IterableOrdering.scala +++ b/hail/src/main/scala/is/hail/expr/ir/orderings/IterableOrdering.scala @@ -123,8 +123,6 @@ object IterableOrdering { val lhs = x.asIndexable val rhs = y.asIndexable - val gt = cb.newLocal("gt", false) - val eq = cb.newLocal("eq", true) loop(cb, lhs, rhs) { (lhs, rhs) => val gt = elemGt(cb, lhs, rhs) diff --git a/hail/src/main/scala/is/hail/expr/ir/streams/EmitStream.scala b/hail/src/main/scala/is/hail/expr/ir/streams/EmitStream.scala index 710584f2503..45734a73949 100644 --- a/hail/src/main/scala/is/hail/expr/ir/streams/EmitStream.scala +++ b/hail/src/main/scala/is/hail/expr/ir/streams/EmitStream.scala @@ -149,7 +149,7 @@ object EmitStream { cb: EmitCodeBuilder, region: Value[Region] = outerRegion, env: EmitEnv = env, - container: Option[AggContainer] = container, + container: Option[AggContainer], ): Unit = emitter.emitVoid(cb, ir, region, env, container, None) @@ -170,8 +170,7 @@ object EmitStream { streamIR: IR, elementPType: PType, cb: EmitCodeBuilder, - outerRegion: Value[Region] = outerRegion, - env: EmitEnv = env, + env: EmitEnv, ): IEmitCode = { val ecb = cb.emb.genEmitClass[NoBoxLongIterator]("stream_to_iter") ecb.cb.addInterface(typeInfo[MissingnessAsMethod].iname) @@ -2917,8 +2916,6 @@ object EmitStream { producers.flatMap(_.length) match { case Seq() => None case ls => - val len = mb.genFieldThisRef[Int]("zip_asl_len") - val lenTemp = mb.genFieldThisRef[Int]("zip_asl_len_temp") Some({ cb: EmitCodeBuilder => val len = cb.newLocal[Int]("zip_len", ls.head(cb)) ls.tail.foreach { compL => @@ -3370,7 +3367,6 @@ object EmitStream { makeProducer, eltType, cb, - outerRegion, env.bind(ctxName, cb.memoize(contextsArray.loadElement(cb, idx))), ) .get(cb, "streams in zipJoinProducers cannot be missing") @@ -3640,7 +3636,6 @@ object EmitStream { .storageType .asInstanceOf[PCanonicalStruct] - val region = mb.genFieldThisRef[Region]("smm_region") val regionArray = mb.genFieldThisRef[Array[Region]]("smm_region_array") val staticMemManagementArray = diff --git a/hail/src/main/scala/is/hail/io/IndexBTree.scala b/hail/src/main/scala/is/hail/io/IndexBTree.scala index a6c2be81e13..11cf0d4fece 100644 --- a/hail/src/main/scala/is/hail/io/IndexBTree.scala +++ b/hail/src/main/scala/is/hail/io/IndexBTree.scala @@ -182,7 +182,7 @@ class IndexBTree(indexFileName: String, fs: FS, branchingFactor: Int = 1024) ext def queryIndex(query: Long): Option[Long] = { require(query >= 0) - val (index, result) = traverseTree(query, 0L, 1) + val (_, result) = traverseTree(query, 0L, 1) if (result != -1L) Option(result) diff --git a/hail/src/main/scala/is/hail/io/InputBuffers.scala b/hail/src/main/scala/is/hail/io/InputBuffers.scala index 25d3c77cc51..97ad9bfc1d3 100644 --- a/hail/src/main/scala/is/hail/io/InputBuffers.scala +++ b/hail/src/main/scala/is/hail/io/InputBuffers.scala @@ -185,7 +185,7 @@ final class MemoryInputBuffer(mb: MemoryBuffer) extends InputBuffer { def readBytes(toRegion: Region, toOff: Long, n: Int): Unit = mb.readBytes(toOff, n) def readBytesArray(n: Int): Array[Byte] = { - var arr = new 
Array[Byte](n) + val arr = new Array[Byte](n) mb.readBytesArray(arr, n) arr } @@ -457,7 +457,7 @@ final class BlockingInputBuffer(blockSize: Int, in: InputBlockBuffer) extends In } def readBytesArray(n: Int): Array[Byte] = { - var arr = new Array[Byte](n) + val arr = new Array[Byte](n) read(arr, 0, n) arr } diff --git a/hail/src/main/scala/is/hail/io/avro/UnsafeAvroTableReaderOptions.scala b/hail/src/main/scala/is/hail/io/avro/UnsafeAvroTableReaderOptions.scala index adb151693d7..14a2c5b8175 100644 --- a/hail/src/main/scala/is/hail/io/avro/UnsafeAvroTableReaderOptions.scala +++ b/hail/src/main/scala/is/hail/io/avro/UnsafeAvroTableReaderOptions.scala @@ -29,7 +29,6 @@ class UnsafeAvroTableReaderOptionsSerializer UnsafeAvroTableReaderOptions(key, intervals, intervalPointType) }, { case UnsafeAvroTableReaderOptions(key, intervals, intervalPointType) => - implicit val fmt: Formats = format val ty = TArray(TInterval(intervalPointType)) ("name" -> UnsafeAvroTableReaderOptions.getClass.getSimpleName) ~ ("key" -> key) ~ diff --git a/hail/src/main/scala/is/hail/io/bgen/BgenRDDPartitions.scala b/hail/src/main/scala/is/hail/io/bgen/BgenRDDPartitions.scala index 7904bbf7457..0241e824747 100644 --- a/hail/src/main/scala/is/hail/io/bgen/BgenRDDPartitions.scala +++ b/hail/src/main/scala/is/hail/io/bgen/BgenRDDPartitions.scala @@ -51,8 +51,6 @@ object BgenRDDPartitions extends Logging { nPartitions: Option[Int], keyType: Type, ): IndexedSeq[FilePartitionInfo] = { - val fs = ctx.fs - val fileRangeBounds = checkFilesDisjoint(ctx, files, keyType) val intervalOrdering = TInterval(keyType).ordering(ctx.stateManager) diff --git a/hail/src/main/scala/is/hail/io/bgen/StagedBGENReader.scala b/hail/src/main/scala/is/hail/io/bgen/StagedBGENReader.scala index bccc68a1c23..f16d9177a49 100644 --- a/hail/src/main/scala/is/hail/io/bgen/StagedBGENReader.scala +++ b/hail/src/main/scala/is/hail/io/bgen/StagedBGENReader.scala @@ -115,7 +115,6 @@ object StagedBGENReader { val nAlleles2 = cb.newLocal[Int]("nAlleles2") val minPloidy = cb.newLocal[Int]("minPloidy") val maxPloidy = cb.newLocal[Int]("maxPloidy") - val longPloidy = cb.newLocal[Long]("longPloidy") val ploidy = cb.newLocal[Int]("ploidy") val phase = cb.newLocal[Int]("phase") val nBitsPerProb = cb.newLocal[Int]("nBitsPerProb") diff --git a/hail/src/main/scala/is/hail/io/compress/BGzipOutputStream.scala b/hail/src/main/scala/is/hail/io/compress/BGzipOutputStream.scala index e84be3825d2..bdea7344437 100644 --- a/hail/src/main/scala/is/hail/io/compress/BGzipOutputStream.scala +++ b/hail/src/main/scala/is/hail/io/compress/BGzipOutputStream.scala @@ -75,7 +75,7 @@ class BGzipOutputStream(out: OutputStream) extends CompressionOutputStream(out) var numBytesRemaining = length while (numBytesRemaining > 0) { - var bytesToWrite = + val bytesToWrite = math.min(uncompressedBuffer.length - numUncompressedBytes, numBytesRemaining) System.arraycopy(bytes, currentPosition, uncompressedBuffer, numUncompressedBytes, bytesToWrite) @@ -111,7 +111,7 @@ class BGzipOutputStream(out: OutputStream) extends CompressionOutputStream(out) crc32.reset() crc32.update(uncompressedBuffer, 0, numUncompressedBytes) - val totalBlockSize: Int = writeGzipBlock(compressedSize, numUncompressedBytes, crc32.getValue) + writeGzipBlock(compressedSize, numUncompressedBytes, crc32.getValue) numUncompressedBytes = 0 // reset variable } diff --git a/hail/src/main/scala/is/hail/io/fs/AzureStorageFS.scala b/hail/src/main/scala/is/hail/io/fs/AzureStorageFS.scala index 4c5ec74e2d1..05613fc3945 100644 --- 
a/hail/src/main/scala/is/hail/io/fs/AzureStorageFS.scala +++ b/hail/src/main/scala/is/hail/io/fs/AzureStorageFS.scala @@ -25,7 +25,6 @@ import java.io.{ByteArrayOutputStream, FileNotFoundException, OutputStream} import java.nio.file.Paths import java.time.Duration -import org.apache.log4j.Logger import org.json4s.Formats import org.json4s.jackson.JsonMethods @@ -88,10 +87,6 @@ object AzureStorageFS { private val AZURE_HTTPS_URI_REGEX = "^https:\\/\\/([a-z0-9_\\-\\.]+)\\.blob\\.core\\.windows\\.net\\/([a-z0-9_\\-\\.]+)(\\/.*)?".r - private val log = Logger.getLogger(getClass.getName) - - val schemes: Array[String] = Array("hail-az", "https") - def parseUrl(filename: String): AzureStorageFSURL = { val scheme = filename.split(":")(0) if (scheme == "hail-az") { diff --git a/hail/src/main/scala/is/hail/io/fs/HadoopFS.scala b/hail/src/main/scala/is/hail/io/fs/HadoopFS.scala index 7f6fbb6f6b4..285cfd578c1 100644 --- a/hail/src/main/scala/is/hail/io/fs/HadoopFS.scala +++ b/hail/src/main/scala/is/hail/io/fs/HadoopFS.scala @@ -126,7 +126,7 @@ class HadoopFS(private[this] var conf: SerializableHadoopConfiguration) extends new hadoop.fs.Path(filename).getFileSystem(conf.value) def listDirectory(url: URL): Array[FileListEntry] = { - var statuses = url.hadoopFs.globStatus(url.hadoopPath) + val statuses = url.hadoopFs.globStatus(url.hadoopPath) if (statuses == null) { throw new FileNotFoundException(url.toString) } else { diff --git a/hail/src/main/scala/is/hail/io/index/IndexReader.scala b/hail/src/main/scala/is/hail/io/index/IndexReader.scala index b75fb68b6c4..4ae410403cd 100644 --- a/hail/src/main/scala/is/hail/io/index/IndexReader.scala +++ b/hail/src/main/scala/is/hail/io/index/IndexReader.scala @@ -172,7 +172,6 @@ class IndexReader( } else { val node = readInternalNode(offset) val children = node.children - val n = children.length val idx = children.upperBound(key, ordering.lt, _.firstKey) upperBound(key, level - 1, children(idx - 1).indexFileOffset) } diff --git a/hail/src/main/scala/is/hail/io/index/IndexWriter.scala b/hail/src/main/scala/is/hail/io/index/IndexWriter.scala index 345e1901601..20a2e974240 100644 --- a/hail/src/main/scala/is/hail/io/index/IndexWriter.scala +++ b/hail/src/main/scala/is/hail/io/index/IndexWriter.scala @@ -1,6 +1,6 @@ package is.hail.io.index -import is.hail.annotations.{Annotation, Region, RegionPool, RegionValueBuilder} +import is.hail.annotations.{Annotation, Region, RegionPool} import is.hail.asm4s.{HailClassLoader, _} import is.hail.backend.{ExecuteContext, HailStateManager, HailTaskContext} import is.hail.expr.ir.{ @@ -110,7 +110,6 @@ class IndexWriter( attributes: Map[String, Any], ) extends AutoCloseable { private val region = Region(pool = pool) - private val rvb = new RegionValueBuilder(sm, region) def appendRow(x: Annotation, offset: Long, annotation: Annotation): Unit = { val koff = keyType.unstagedStoreJavaObject(sm, x, region) @@ -370,7 +369,7 @@ class StagedIndexWriter( ) { require(branchingFactor > 1) - private var elementIdx = cb.genFieldThisRef[Long]() + private val elementIdx = cb.genFieldThisRef[Long]() private val ob = cb.genFieldThisRef[OutputBuffer]() private val utils = new StagedIndexWriterUtils(cb.genFieldThisRef[IndexWriterUtils]()) diff --git a/hail/src/main/scala/is/hail/io/plink/LoadPlink.scala b/hail/src/main/scala/is/hail/io/plink/LoadPlink.scala index e36a67e89e3..9c4cd0d5f43 100644 --- a/hail/src/main/scala/is/hail/io/plink/LoadPlink.scala +++ b/hail/src/main/scala/is/hail/io/plink/LoadPlink.scala @@ -112,7 +112,7 @@ object LoadPlink 
{ val idBuilder = new BoxedArrayBuilder[String] val structBuilder = new BoxedArrayBuilder[Row] - val m = fs.readLines(filename) { + fs.readLines(filename) { _.foreachLine { line => val split = line.split(delimiter) if (split.length != 6) @@ -181,7 +181,6 @@ object LoadPlink { object MatrixPLINKReader { def fromJValue(ctx: ExecuteContext, jv: JValue): MatrixPLINKReader = { - val backend = ctx.backend val fs = ctx.fs implicit val formats: Formats = DefaultFormats diff --git a/hail/src/main/scala/is/hail/io/tabix/TabixReader.scala b/hail/src/main/scala/is/hail/io/tabix/TabixReader.scala index 891eb1130ad..ee1f09d56fd 100644 --- a/hail/src/main/scala/is/hail/io/tabix/TabixReader.scala +++ b/hail/src/main/scala/is/hail/io/tabix/TabixReader.scala @@ -98,7 +98,7 @@ class TabixReader(val filePath: String, fs: FS, idxFilePath: Option[String] = No fatal(s"Hail only supports tabix indexing for VCF, found format code $format") val colSeq = readInt(is) val colBeg = readInt(is) - val colEnd = readInt(is) + readInt(is) // colEnd val meta = readInt(is) // meta char for VCF is '#' if (meta != '#') diff --git a/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala b/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala index 12e5e3ee04a..28d9ce62903 100644 --- a/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala +++ b/hail/src/main/scala/is/hail/io/vcf/LoadVCF.scala @@ -1570,8 +1570,8 @@ object LoadVCF { val prefix = if (excerptStart > 0) "... " else "" val suffix = if (excerptEnd < line.length) " ..." else "" - var caretPad = prefix.length + pos - excerptStart - var pad = " " * caretPad + val caretPad = prefix.length + pos - excerptStart + val pad = " " * caretPad fatal( s"${source.locationString(pos)}: ${e.msg}\n$prefix$excerpt$suffix\n$pad^\noffending line: @1\nsee the Hail log for the full offending line", @@ -1790,8 +1790,6 @@ object MatrixVCFReader { } checkGzipOfGlobbedFiles(params.files, fileListEntries, params.forceGZ, params.gzAsBGZ) - val entryFloatType = LoadVCF.getEntryFloatType(params.entryFloatTypeName) - val headerLines1 = getHeaderLines( fs, params.headerFile.getOrElse(fileListEntries.head.getPath), @@ -1803,10 +1801,7 @@ object MatrixVCFReader { if (params.headerFile.isEmpty) { val header1Bc = backend.broadcast(header1) - val localCallFields = params.callFields - val localFloatType = entryFloatType val files = fileListEntries.map(_.getPath) - val localArrayElementsRequired = params.arrayElementsRequired val localFilterAndReplace = params.filterAndReplace val fsConfigBC = backend.broadcast(fs.getConfiguration()) diff --git a/hail/src/main/scala/is/hail/linalg/BlockMatrix.scala b/hail/src/main/scala/is/hail/linalg/BlockMatrix.scala index 241720184fe..18c08349811 100644 --- a/hail/src/main/scala/is/hail/linalg/BlockMatrix.scala +++ b/hail/src/main/scala/is/hail/linalg/BlockMatrix.scala @@ -295,7 +295,7 @@ object BlockMatrix { val d = digitsNeeded(bms.length) val fsBc = fs.broadcast - val partitionCounts = collectMatrices(bms) + collectMatrices(bms) .mapPartitionsWithIndex { case (i, it) => assert(it.hasNext) val m = it.next() @@ -339,7 +339,7 @@ object BlockMatrix { val compressionExtension = compression.map(x => "." 
+ x).getOrElse("") - val partitionCounts = collectMatrices(bms) + collectMatrices(bms) .mapPartitionsWithIndex { case (i, it) => assert(it.hasNext) val m = it.next() @@ -2375,7 +2375,7 @@ class BlockMatrixReadRowBlockedRDD( ) { import BlockMatrixReadRowBlockedRDD._ - private[this] val BlockMatrixMetadata(blockSize, nRows, nCols, maybeFiltered, partFiles) = + private[this] val BlockMatrixMetadata(blockSize, nRows, nCols, _, partFiles) = metadata private[this] val gp = GridPartitioner(blockSize, nRows, nCols) @@ -2411,7 +2411,6 @@ class BlockMatrixReadRowBlockedRDD( Iterator.single { ctx => val region = ctx.region val rvb = new RegionValueBuilder(HailStateManager(Map.empty), region) - val rv = RegionValue(region) val firstRow = rowsForPartition(0) var blockRow = (firstRow / blockSize).toInt val fs = fsBc.value @@ -2519,7 +2518,6 @@ class BlockMatrixCachedPartFile( ) in.readDoubles(cache, startWritingAt, doublesToRead) cacheEnd = doublesToRead + startWritingAt - var i = 0 fileIndex += doublesToRead assert(doublesToRead > 0) } diff --git a/hail/src/main/scala/is/hail/linalg/LinalgCodeUtils.scala b/hail/src/main/scala/is/hail/linalg/LinalgCodeUtils.scala index 9d499791bbd..d38c9f7b7df 100644 --- a/hail/src/main/scala/is/hail/linalg/LinalgCodeUtils.scala +++ b/hail/src/main/scala/is/hail/linalg/LinalgCodeUtils.scala @@ -51,7 +51,7 @@ object LinalgCodeUtils { PCanonicalNDArray(pndv.st.elementType.storageType().setRequired(true), pndv.st.nDims, false) val strides = pt.makeColumnMajorStrides(shape, cb) - val (dataFirstElementAddress, dataFinisher) = + val (_, dataFinisher) = pt.constructDataFunction(shape, strides, cb, region) // construct an SNDArrayCode with undefined contents val result = dataFinisher(cb) diff --git a/hail/src/main/scala/is/hail/lir/PST.scala b/hail/src/main/scala/is/hail/lir/PST.scala index 0c9d36c8b6d..a8f6b685ff0 100644 --- a/hail/src/main/scala/is/hail/lir/PST.scala +++ b/hail/src/main/scala/is/hail/lir/PST.scala @@ -315,7 +315,7 @@ class PSTBuilder( // find regions in [start, end] // no edges from [0, start) target (start, end] private def findRegions(start: Int, end: Int): Unit = { - var regionStarts = new IntArrayBuilder() + val regionStarts = new IntArrayBuilder() regionStarts += start // find subregions of [start, end] diff --git a/hail/src/main/scala/is/hail/methods/LocalLDPrune.scala b/hail/src/main/scala/is/hail/methods/LocalLDPrune.scala index b23bc42345c..6e22f021523 100644 --- a/hail/src/main/scala/is/hail/methods/LocalLDPrune.scala +++ b/hail/src/main/scala/is/hail/methods/LocalLDPrune.scala @@ -11,8 +11,6 @@ import is.hail.variant._ import java.util -import org.apache.spark.rdd.RDD - object BitPackedVector { final val GENOTYPES_PER_PACK: Int = 32 final val BITS_PER_PACK: Int = 2 * GENOTYPES_PER_PACK @@ -278,21 +276,6 @@ object LocalLDPrune { keepVariant } - private def pruneLocal( - inputRDD: RDD[BitPackedVector], - r2Threshold: Double, - windowSize: Int, - queueSize: Int, - ): RDD[BitPackedVector] = { - inputRDD.mapPartitions( - { it => - val queue = new util.ArrayDeque[BitPackedVector](queueSize) - it.filter(bpvv => pruneLocal(queue, bpvv, r2Threshold, windowSize, queueSize)) - }, - preservesPartitioning = true, - ) - } - def apply( ctx: ExecuteContext, mt: MatrixValue, @@ -337,8 +320,6 @@ case class LocalLDPrune( def execute(ctx: ExecuteContext, mv: MatrixValue): TableValue = { val nSamples = mv.nCols - val fullRowPType = mv.rvRowPType - val localCallField = callField val tableType = typ(mv.typ) val ts = 
TableExecuteIntermediate(mv.toTableValue).asTableStage(ctx).mapPartition(Some( tableType.key diff --git a/hail/src/main/scala/is/hail/methods/LogisticRegression.scala b/hail/src/main/scala/is/hail/methods/LogisticRegression.scala index 66ec3e26fae..d8b79558e98 100644 --- a/hail/src/main/scala/is/hail/methods/LogisticRegression.scala +++ b/hail/src/main/scala/is/hail/methods/LogisticRegression.scala @@ -40,8 +40,6 @@ case class LogisticRegression( val tableType = typ(mv.typ) val newRVDType = tableType.canonicalRVDType - val multiPhenoSchema = TStruct(("logistic_regression", TArray(logRegTest.schema))) - val (yVecs, cov, completeColIdx) = RegressionUtils.getPhenosCovCompleteSamples(mv, yFields.toArray, covFields.toArray) diff --git a/hail/src/main/scala/is/hail/methods/PoissonRegression.scala b/hail/src/main/scala/is/hail/methods/PoissonRegression.scala index 2fbe2315447..b174616d86d 100644 --- a/hail/src/main/scala/is/hail/methods/PoissonRegression.scala +++ b/hail/src/main/scala/is/hail/methods/PoissonRegression.scala @@ -60,7 +60,7 @@ case class PoissonRegression( + s" with input variable x, and $k additional ${plural(k, "covariate")}...") val nullModel = new PoissonRegressionModel(cov, y) - var nullFit = nullModel.fit(None, maxIter = maxIterations, tol = tolerance) + val nullFit = nullModel.fit(None, maxIter = maxIterations, tol = tolerance) if (!nullFit.converged) fatal("Failed to fit poisson regression null model (standard MLE with covariates only): " + ( diff --git a/hail/src/main/scala/is/hail/methods/Skat.scala b/hail/src/main/scala/is/hail/methods/Skat.scala index 12f9b0556c7..f8c3ae4b088 100644 --- a/hail/src/main/scala/is/hail/methods/Skat.scala +++ b/hail/src/main/scala/is/hail/methods/Skat.scala @@ -133,8 +133,6 @@ object Skat { q, dof, evals, noncentrality, s, iterations, accuracy, ) val x = result.value - val nIntegrations = result.nIterations - val converged = result.converged val fault = result.fault val pval = 1 - x @@ -204,7 +202,7 @@ case class Skat( s"sample; found ${badVals.length} ${plural(badVals.length, "violation")} starting with ${badVals(0)}") } - val (keyGsWeightRdd, keyType) = + val (keyGsWeightRdd, _) = computeKeyGsWeightRdd(mv, xField, completeColIdx, keyField, weightField) val backend = HailContext.backend diff --git a/hail/src/main/scala/is/hail/misc/BGZipBlocks.scala b/hail/src/main/scala/is/hail/misc/BGZipBlocks.scala index a7ecd958e3a..e5461cd7c11 100644 --- a/hail/src/main/scala/is/hail/misc/BGZipBlocks.scala +++ b/hail/src/main/scala/is/hail/misc/BGZipBlocks.scala @@ -8,7 +8,7 @@ import java.io.InputStream object BGZipBlocks { // Print block starts of block gzip (bgz) file def apply(fs: FS, file: String): Unit = { - var buf = new Array[Byte](64 * 1024) + val buf = new Array[Byte](64 * 1024) // position of 'buf[0]' in input stream var bufPos = 0L diff --git a/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala b/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala index d7abcd7331e..b1bf4ad3395 100644 --- a/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala +++ b/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala @@ -81,7 +81,6 @@ object AbstractRVDSpec { val (part0Count, bytesWritten) = using(fs.create(partsPath + "/" + filePath)) { os => using(RVDContext.default(execCtx.r.pool)) { ctx => - val rvb = ctx.rvb RichContextRDDRegionValue.writeRowsPartition(codecSpec.buildEncoder(execCtx, rowType))( ctx, rows.iterator.map { a => diff --git a/hail/src/main/scala/is/hail/rvd/RVD.scala b/hail/src/main/scala/is/hail/rvd/RVD.scala index 
0dd3a1c6ec1..1bacb52afe8 100644 --- a/hail/src/main/scala/is/hail/rvd/RVD.scala +++ b/hail/src/main/scala/is/hail/rvd/RVD.scala @@ -20,7 +20,6 @@ import scala.reflect.ClassTag import java.util -import org.apache.commons.lang3.StringUtils import org.apache.spark.{Partitioner, SparkContext, TaskContext} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.sql.Row @@ -1198,7 +1197,6 @@ object RVD { def _coerce(typ: RVDType, crdd: CRDD): RVD = empty(execCtx, typ) } - val numPartitions = keys.getNumPartitions val keyInfo = getKeyInfo(execCtx, fullType, partitionKey, keys) if (keyInfo.isEmpty) @@ -1408,7 +1406,6 @@ object RVD { _makeIndexWriter(_, theHailClassLoaderForSparkWorkers, SparkTaskContext.get(), _) val partDigits = digitsNeeded(nPartitions) - val fileDigits = digitsNeeded(rvds.length) for (i <- 0 until nRVDs) { val path = paths(i) fs.mkDir(path + "/rows/rows/parts") @@ -1456,7 +1453,6 @@ object RVD { .par .foreach { case (partFiles, i) => val fs = fsBc.value - val s = StringUtils.leftPad(i.toString, fileDigits, '0') val basePath = paths(i) RichContextRDDRegionValue.writeSplitSpecs( fs, diff --git a/hail/src/main/scala/is/hail/services/BatchConfig.scala b/hail/src/main/scala/is/hail/services/BatchConfig.scala index ff2d0f753d2..661bc94e638 100644 --- a/hail/src/main/scala/is/hail/services/BatchConfig.scala +++ b/hail/src/main/scala/is/hail/services/BatchConfig.scala @@ -4,13 +4,10 @@ import is.hail.utils._ import java.io.{File, FileInputStream} -import org.apache.log4j.Logger import org.json4s._ import org.json4s.jackson.JsonMethods object BatchConfig { - private[this] val log = Logger.getLogger("BatchConfig") - def fromConfigFile(file: String): Option[BatchConfig] = if (new File(file).exists()) { using(new FileInputStream(file))(in => Some(fromConfig(JsonMethods.parse(in)))) diff --git a/hail/src/main/scala/is/hail/services/DeployConfig.scala b/hail/src/main/scala/is/hail/services/DeployConfig.scala index b4d195ee9df..55ccdf36dfd 100644 --- a/hail/src/main/scala/is/hail/services/DeployConfig.scala +++ b/hail/src/main/scala/is/hail/services/DeployConfig.scala @@ -4,13 +4,10 @@ import is.hail.utils._ import java.io.{File, FileInputStream} -import org.apache.log4j.Logger import org.json4s._ import org.json4s.jackson.JsonMethods object DeployConfig { - private[this] val log = Logger.getLogger("DeployConfig") - private[this] lazy val default: DeployConfig = fromConfigFile() private[this] var _get: DeployConfig = null diff --git a/hail/src/main/scala/is/hail/services/batch_client/BatchClient.scala b/hail/src/main/scala/is/hail/services/batch_client/BatchClient.scala index a8a01b8f98f..cb23ecbf852 100644 --- a/hail/src/main/scala/is/hail/services/batch_client/BatchClient.scala +++ b/hail/src/main/scala/is/hail/services/batch_client/BatchClient.scala @@ -193,7 +193,7 @@ class BatchClient( // at most, 5s val now = System.nanoTime() val elapsed = now - start - var d = math.max( + val d = math.max( math.min( (0.1 * (0.8 + Random.nextFloat() * 0.4) * (elapsed / 1000.0 / 1000)).toInt, 5000, diff --git a/hail/src/main/scala/is/hail/sparkextras/ContextRDD.scala b/hail/src/main/scala/is/hail/sparkextras/ContextRDD.scala index 692d617dc94..fe9c4d4e4ac 100644 --- a/hail/src/main/scala/is/hail/sparkextras/ContextRDD.scala +++ b/hail/src/main/scala/is/hail/sparkextras/ContextRDD.scala @@ -402,9 +402,6 @@ class ContextRDD[T: ClassTag]( def preferredLocations(partition: Partition): Seq[String] = rdd.preferredLocations(partition) - private[this] def clean[U <: AnyRef](value: U): U = - 
ExposedUtils.clean(value) - def partitions: Array[Partition] = rdd.partitions def partitioner: Option[Partitioner] = rdd.partitioner diff --git a/hail/src/main/scala/is/hail/stats/GeneralizedChiSquaredDistribution.scala b/hail/src/main/scala/is/hail/stats/GeneralizedChiSquaredDistribution.scala index b1751238dc2..c9d52cf6431 100644 --- a/hail/src/main/scala/is/hail/stats/GeneralizedChiSquaredDistribution.scala +++ b/hail/src/main/scala/is/hail/stats/GeneralizedChiSquaredDistribution.scala @@ -175,13 +175,13 @@ class DaviesAlgorithm( def truncation(_u: Double, _tausq: Double): Double = { counter() var u = _u - var tausq = _tausq + val tausq = _tausq var sum1 = 0.0 var prod2 = 0.0 var prod3 = 0.0 var s = 0 - var sum2 = (sigsq + tausq) * square(u) + val sum2 = (sigsq + tausq) * square(u) var prod1 = 2.0 * sum2 u = 2.0 * u @@ -587,7 +587,7 @@ object GeneralizedChiSquaredDistribution { assert(lim >= 0) assert(acc >= 0) - val (value, trace, fault) = new DaviesAlgorithm(c, n, lb, nc, lim, sigma).cdf(acc) + val (value, _, fault) = new DaviesAlgorithm(c, n, lb, nc, lim, sigma).cdf(acc) assert(fault >= 0 && fault <= 2, fault) diff --git a/hail/src/main/scala/is/hail/stats/RegressionUtils.scala b/hail/src/main/scala/is/hail/stats/RegressionUtils.scala index 96cfa015f6c..81533606a65 100644 --- a/hail/src/main/scala/is/hail/stats/RegressionUtils.scala +++ b/hail/src/main/scala/is/hail/stats/RegressionUtils.scala @@ -57,7 +57,6 @@ object RegressionUtils { // IndexedSeq indexed by column, Array by field def getColumnVariables(mv: MatrixValue, names: Array[String]) : IndexedSeq[Array[Option[Double]]] = { - val colType = mv.typ.colType assert(names.forall(name => mv.typ.colType.field(name).typ == TFloat64)) val fieldIndices = names.map { name => val field = mv.typ.colType.field(name) diff --git a/hail/src/main/scala/is/hail/stats/package.scala b/hail/src/main/scala/is/hail/stats/package.scala index 207deed677a..2b87f1d320f 100644 --- a/hail/src/main/scala/is/hail/stats/package.scala +++ b/hail/src/main/scala/is/hail/stats/package.scala @@ -203,7 +203,7 @@ package object stats { val hgd = new HypergeometricDistribution(null, popSize, numSuccessPopulation, sampleSize) val epsilon = 2.220446e-16 - def dhyper(k: Int, logProb: Boolean = false): Double = + def dhyper(k: Int, logProb: Boolean): Double = if (logProb) hgd.logProbability(k) else hgd.probability(k) val logdc = support.map(dhyper(_, logProb = true)) @@ -214,7 +214,7 @@ package object stats { d.map(_ / d.sum) } - def phyper(k: Int, lower_tail: Boolean = true): Double = + def phyper(k: Int, lower_tail: Boolean): Double = if (lower_tail) hgd.cumulativeProbability(k) else diff --git a/hail/src/main/scala/is/hail/types/physical/PCanonicalLocus.scala b/hail/src/main/scala/is/hail/types/physical/PCanonicalLocus.scala index 82b2ea5c08b..402d34f3c0d 100644 --- a/hail/src/main/scala/is/hail/types/physical/PCanonicalLocus.scala +++ b/hail/src/main/scala/is/hail/types/physical/PCanonicalLocus.scala @@ -13,7 +13,7 @@ import is.hail.utils.FastSeq import is.hail.variant._ object PCanonicalLocus { - private def representation(required: Boolean = false): PCanonicalStruct = PCanonicalStruct( + private def representation(required: Boolean): PCanonicalStruct = PCanonicalStruct( required, "contig" -> PCanonicalString(required = true), "position" -> PInt32(required = true), diff --git a/hail/src/main/scala/is/hail/types/physical/stypes/interfaces/SBaseStruct.scala b/hail/src/main/scala/is/hail/types/physical/stypes/interfaces/SBaseStruct.scala index 
2a5810d5291..ecbd34cdc61 100644 --- a/hail/src/main/scala/is/hail/types/physical/stypes/interfaces/SBaseStruct.scala +++ b/hail/src/main/scala/is/hail/types/physical/stypes/interfaces/SBaseStruct.scala @@ -17,7 +17,6 @@ object SBaseStruct { val rt = s2.st.virtualType.asInstanceOf[TStruct] val resultVType = TStruct.concat(lt, rt) - val st1 = s1.st val st2 = s2.st (s1, s2) match { diff --git a/hail/src/main/scala/is/hail/types/physical/stypes/interfaces/SNDArray.scala b/hail/src/main/scala/is/hail/types/physical/stypes/interfaces/SNDArray.scala index 3d44aa37b38..53791e5c651 100644 --- a/hail/src/main/scala/is/hail/types/physical/stypes/interfaces/SNDArray.scala +++ b/hail/src/main/scala/is/hail/types/physical/stypes/interfaces/SNDArray.scala @@ -652,7 +652,7 @@ object SNDArray { work: SNDArrayValue, blocksize: Value[Long], ): Unit = { - val Seq(m, n) = A.shapes + val Seq(_, n) = A.shapes SNDArray.geqrt(A, T, work, blocksize, cb) // copy upper triangle of A0 to R SNDArray.copyMatrix(cb, "U", A.slice(cb, (null, n), ColonIndex), R) @@ -803,7 +803,7 @@ object SNDArray { T: SNDArrayValue, work: SNDArrayValue, ): Unit = { - val Seq(m, n) = A.shapes + val Seq(_, n) = A.shapes SNDArray.geqr(cb, A, T, work) // copy upper triangle of A0 to R SNDArray.copyMatrix(cb, "U", A.slice(cb, (null, n), ColonIndex), R) diff --git a/hail/src/main/scala/is/hail/types/virtual/TNDArray.scala b/hail/src/main/scala/is/hail/types/virtual/TNDArray.scala index c5f10636c8e..3743350a5b3 100644 --- a/hail/src/main/scala/is/hail/types/virtual/TNDArray.scala +++ b/hail/src/main/scala/is/hail/types/virtual/TNDArray.scala @@ -122,9 +122,4 @@ final case class TNDArray(elementType: Type, nDimsBase: NatBase) extends Type { override def mkOrdering(sm: HailStateManager, missingEqual: Boolean): ExtendedOrdering = null lazy val shapeType: TTuple = TTuple(Array.fill(nDims)(TInt64): _*) - - private lazy val representation = TStruct( - ("shape", shapeType), - ("data", TArray(elementType)), - ) } diff --git a/hail/src/main/scala/is/hail/types/virtual/Type.scala b/hail/src/main/scala/is/hail/types/virtual/Type.scala index db03335ffa6..1df05838fa3 100644 --- a/hail/src/main/scala/is/hail/types/virtual/Type.scala +++ b/hail/src/main/scala/is/hail/types/virtual/Type.scala @@ -138,7 +138,7 @@ abstract class Type extends BaseType with Serializable { def query(fields: String*): Querier = query(fields.toList) def query(path: List[String]): Querier = { - val (t, q) = queryTyped(path) + val (_, q) = queryTyped(path) q } diff --git a/hail/src/main/scala/is/hail/utils/ErrorHandling.scala b/hail/src/main/scala/is/hail/utils/ErrorHandling.scala index 5718ed0f766..176df006080 100644 --- a/hail/src/main/scala/is/hail/utils/ErrorHandling.scala +++ b/hail/src/main/scala/is/hail/utils/ErrorHandling.scala @@ -59,7 +59,6 @@ trait ErrorHandling { def handleForPython(e: Throwable): (String, String, Int) = { val short = deepestMessage(e) val expanded = expandException(e, false) - val logExpanded = expandException(e, true) def searchForErrorCode(exception: Throwable): Int = { if (exception.isInstanceOf[HailException]) { diff --git a/hail/src/main/scala/is/hail/utils/FlipbookIterator.scala b/hail/src/main/scala/is/hail/utils/FlipbookIterator.scala index 7118dccbaa9..f42616cea09 100644 --- a/hail/src/main/scala/is/hail/utils/FlipbookIterator.scala +++ b/hail/src/main/scala/is/hail/utils/FlipbookIterator.scala @@ -28,7 +28,7 @@ abstract class StateMachine[A] { object StateMachine { def terminal[A]: StateMachine[A] = new StateMachine[A] { val isValid = false - var 
value: A = _ + def value: A = ??? def advance(): Unit = {} } } diff --git a/hail/src/main/scala/is/hail/utils/StringEscapeUtils.scala b/hail/src/main/scala/is/hail/utils/StringEscapeUtils.scala index e4f000b76c5..5c5c452d268 100644 --- a/hail/src/main/scala/is/hail/utils/StringEscapeUtils.scala +++ b/hail/src/main/scala/is/hail/utils/StringEscapeUtils.scala @@ -135,7 +135,6 @@ object StringEscapeUtils { def unescapeString(str: String, sb: StringBuilder): String = { sb.clear() - val sz = str.length() var hadSlash = false var inUnicode = false lazy val unicode = new StringBuilder(capacity = 4) diff --git a/hail/src/main/scala/is/hail/utils/StringSocketAppender.scala b/hail/src/main/scala/is/hail/utils/StringSocketAppender.scala index 44ad28b8232..6f05eea2f93 100644 --- a/hail/src/main/scala/is/hail/utils/StringSocketAppender.scala +++ b/hail/src/main/scala/is/hail/utils/StringSocketAppender.scala @@ -19,13 +19,11 @@ object StringSocketAppender { } class StringSocketAppender() extends AppenderSkeleton { - private var remoteHost: String = _ private var address: InetAddress = _ private var port: Int = _ private var os: OutputStream = _ - private var reconnectionDelay = StringSocketAppender.DEFAULT_RECONNECTION_DELAY + private val reconnectionDelay = StringSocketAppender.DEFAULT_RECONNECTION_DELAY private var connector: SocketConnector = null - private var counter = 0 private var patternLayout: PatternLayout = _ private var initialized: Boolean = false @@ -34,7 +32,6 @@ class StringSocketAppender() extends AppenderSkeleton { def connect(host: String, port: Int, format: String): Unit = { this.port = port this.address = InetAddress.getByName(host) - this.remoteHost = host this.patternLayout = new PatternLayout(format) connect(address, port) initialized = true diff --git a/hail/src/main/scala/is/hail/variant/HardCallView.scala b/hail/src/main/scala/is/hail/variant/HardCallView.scala index f7b812839c4..cc5e715e847 100644 --- a/hail/src/main/scala/is/hail/variant/HardCallView.scala +++ b/hail/src/main/scala/is/hail/variant/HardCallView.scala @@ -22,7 +22,7 @@ final class ArrayGenotypeView(rvType: PStruct) { } } - private val (gtExists, gtIndex, gtType) = lookupField("GT", _ == PCanonicalCall()) + private val (gtExists, gtIndex, _) = lookupField("GT", _ == PCanonicalCall()) private val (gpExists, gpIndex, _gpType) = lookupField( "GP", diff --git a/hail/src/main/scala/is/hail/variant/ReferenceGenome.scala b/hail/src/main/scala/is/hail/variant/ReferenceGenome.scala index 760e7c12d10..7412b32ff9b 100644 --- a/hail/src/main/scala/is/hail/variant/ReferenceGenome.scala +++ b/hail/src/main/scala/is/hail/variant/ReferenceGenome.scala @@ -640,7 +640,6 @@ object ReferenceGenome { mtContigs: Array[String] = Array.empty[String], parInput: Array[String] = Array.empty[String], ): ReferenceGenome = { - val tmpdir = ctx.localTmpdir val fs = ctx.fs if (!fs.isFile(fastaFile)) diff --git a/hail/src/test/scala/is/hail/annotations/UnsafeSuite.scala b/hail/src/test/scala/is/hail/annotations/UnsafeSuite.scala index 7ad2b893463..2730821034a 100644 --- a/hail/src/test/scala/is/hail/annotations/UnsafeSuite.scala +++ b/hail/src/test/scala/is/hail/annotations/UnsafeSuite.scala @@ -71,9 +71,6 @@ class UnsafeSuite extends HailSuite { val region2 = Region(pool = pool) val region3 = Region(pool = pool) val region4 = Region(pool = pool) - val rvb = new RegionValueBuilder(sm, region) - - val path = ctx.createTmpPath("test-codec", "ser") val g = Type.genStruct .flatMap(t => Gen.zip(Gen.const(t), t.genValue(sm))) @@ -83,7 +80,6 @@ class 
UnsafeSuite extends HailSuite { val pt = PType.canonical(t).asInstanceOf[PStruct] val requestedType = subsetType(t).asInstanceOf[TStruct] - val prt = PType.canonical(requestedType).asInstanceOf[PStruct] val a2 = subset(t, requestedType, a) assert(requestedType.typeCheck(a2)) @@ -329,8 +325,6 @@ class UnsafeSuite extends HailSuite { @Test def testUnsafeOrdering(): Unit = { val region = Region(pool = pool) val region2 = Region(pool = pool) - val rvb = new RegionValueBuilder(sm, region) - val rvb2 = new RegionValueBuilder(sm, region2) val g = PType.genStruct .flatMap(t => Gen.zip(Gen.const(t), Gen.zip(t.genValue(sm), t.genValue(sm)))) diff --git a/hail/src/test/scala/is/hail/asm4s/ASM4SSuite.scala b/hail/src/test/scala/is/hail/asm4s/ASM4SSuite.scala index dbfbcb32e45..ceb3bee5bf4 100644 --- a/hail/src/test/scala/is/hail/asm4s/ASM4SSuite.scala +++ b/hail/src/test/scala/is/hail/asm4s/ASM4SSuite.scala @@ -68,44 +68,44 @@ class ASM4SSuite extends HailSuite { } @Test def get(): Unit = { - val fb = FunctionBuilder[A, Int]("F") - fb.emit(fb.getArg[A](1).getField[Int]("i")) + val fb = FunctionBuilder[Foo, Int]("F") + fb.emit(fb.getArg[Foo](1).getField[Int]("i")) val i = fb.result(ctx.shouldWriteIRFiles())(theHailClassLoader) - val a = new A + val a = new Foo assert(i(a) == 5) } @Test def invoke(): Unit = { - val fb = FunctionBuilder[A, Int]("F") - fb.emit(fb.getArg[A](1).invoke[Int]("f")) + val fb = FunctionBuilder[Foo, Int]("F") + fb.emit(fb.getArg[Foo](1).invoke[Int]("f")) val i = fb.result(ctx.shouldWriteIRFiles())(theHailClassLoader) - val a = new A + val a = new Foo assert(i(a) == 6) } @Test def invoke2(): Unit = { - val fb = FunctionBuilder[A, Int]("F") - fb.emit(fb.getArg[A](1).invoke[Int, Int]("g", 6)) + val fb = FunctionBuilder[Foo, Int]("F") + fb.emit(fb.getArg[Foo](1).invoke[Int, Int]("g", 6)) val j = fb.result(ctx.shouldWriteIRFiles())(theHailClassLoader) - val a = new A + val a = new Foo assert(j(a) == 11) } @Test def newInstance(): Unit = { val fb = FunctionBuilder[Int]("F") - fb.emit(Code.newInstance[A]().invoke[Int]("f")) + fb.emit(Code.newInstance[Foo]().invoke[Int]("f")) val f = fb.result(ctx.shouldWriteIRFiles())(theHailClassLoader) assert(f() == 6) } @Test def put(): Unit = { val fb = FunctionBuilder[Int]("F") - val inst = fb.newLocal[A]() + val inst = fb.newLocal[Foo]() fb.emit(Code( - inst.store(Code.newInstance[A]()), + inst.store(Code.newInstance[Foo]()), inst.put("i", -2), inst.getField[Int]("i"), )) @@ -115,11 +115,11 @@ class ASM4SSuite extends HailSuite { @Test def staticPut(): Unit = { val fb = FunctionBuilder[Int]("F") - val inst = fb.newLocal[A]() + val inst = fb.newLocal[Foo]() fb.emit(Code( - inst.store(Code.newInstance[A]()), + inst.store(Code.newInstance[Foo]()), inst.put("j", -2), - Code.getStatic[A, Int]("j"), + Code.getStatic[Foo, Int]("j"), )) val f = fb.result(ctx.shouldWriteIRFiles())(theHailClassLoader) assert(f() == -2) @@ -174,11 +174,11 @@ class ASM4SSuite extends HailSuite { @Test def anewarray(): Unit = { val fb = FunctionBuilder[Int]("F") - val arr = fb.newLocal[Array[A]]() + val arr = fb.newLocal[Array[Foo]]() fb.emit(Code( - arr.store(newArray[A](2)), - arr(0) = Code.newInstance[A](), - arr(1) = Code.newInstance[A](), + arr.store(newArray[Foo](2)), + arr(0) = Code.newInstance[Foo](), + arr(1) = Code.newInstance[Foo](), arr(0).getField[Int]("i") + arr(1).getField[Int]("i"), )) val f = fb.result(ctx.shouldWriteIRFiles())(theHailClassLoader) diff --git a/hail/src/test/scala/is/hail/asm4s/A.java b/hail/src/test/scala/is/hail/asm4s/Foo.java similarity 
index 89% rename from hail/src/test/scala/is/hail/asm4s/A.java rename to hail/src/test/scala/is/hail/asm4s/Foo.java index 91e5ea28600..dc44fbf412a 100644 --- a/hail/src/test/scala/is/hail/asm4s/A.java +++ b/hail/src/test/scala/is/hail/asm4s/Foo.java @@ -1,6 +1,6 @@ package is.hail.asm4s; -public class A { +public class Foo { public static int j = 11; public int i = 5; diff --git a/hail/src/test/scala/is/hail/expr/ir/Aggregators2Suite.scala b/hail/src/test/scala/is/hail/expr/ir/Aggregators2Suite.scala index 540210d8d4c..c657bd9a302 100644 --- a/hail/src/test/scala/is/hail/expr/ir/Aggregators2Suite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/Aggregators2Suite.scala @@ -708,7 +708,6 @@ class Aggregators2Suite extends HailSuite { @Test def testNestedArrayElementsAgg(): Unit = { val alstate1 = ArrayLenAggSig(knownLength = false, FastSeq(sumAggSig)) - val aestate1 = AggElementsAggSig(FastSeq(sumAggSig)) val alstate2 = ArrayLenAggSig(knownLength = false, FastSeq[PhysicalAggSig](alstate1)) val init = InitOp( @@ -899,7 +898,7 @@ class Aggregators2Suite extends HailSuite { val eltsPrimitive = Array.tabulate(rows.length)(i => FastSeq(GetField(ArrayRef(rref, i), "b"))) val expected = Set("abcd", "foo", null) - val expectedPrimitive = Set(5L, -2L, 7L, null) + val expectedPrimitive: Set[Any] = Set(5L, -2L, 7L, null) val aggsig = PhysicalAggSig(CollectAsSet(), CollectAsSetStateSig(VirtualTypeWithReq(PCanonicalString()))) diff --git a/hail/src/test/scala/is/hail/expr/ir/AggregatorsSuite.scala b/hail/src/test/scala/is/hail/expr/ir/AggregatorsSuite.scala index 3c1f6f22991..08e7c9da26b 100644 --- a/hail/src/test/scala/is/hail/expr/ir/AggregatorsSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/AggregatorsSuite.scala @@ -1050,7 +1050,7 @@ class AggregatorsSuite extends HailSuite { val agg = FastSeq(Row("EUR", true, 1), Row("EUR", false, 2), Row("AFR", true, 3), Row("AFR", null, 4)) val aggType = TStruct("k1" -> TString, "k2" -> TBoolean, "x" -> TInt32) - val expected = Map( + val expected: Map[String, Map[Any, Seq[Int]]] = Map( "EUR" -> Map(true -> FastSeq(1), false -> FastSeq(2)), "AFR" -> Map(true -> FastSeq(3), (null, FastSeq(4))), ) diff --git a/hail/src/test/scala/is/hail/expr/ir/EmitStreamSuite.scala b/hail/src/test/scala/is/hail/expr/ir/EmitStreamSuite.scala index 99b7b77b478..d0abf67043a 100644 --- a/hail/src/test/scala/is/hail/expr/ir/EmitStreamSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/EmitStreamSuite.scala @@ -25,35 +25,6 @@ class EmitStreamSuite extends HailSuite { implicit val execStrats = ExecStrategy.compileOnly - private def compile1[T: TypeInfo, R: TypeInfo](f: (EmitMethodBuilder[_], Value[T]) => Code[R]) - : T => R = { - val fb = EmitFunctionBuilder[T, R](ctx, "stream_test") - val mb = fb.apply_method - mb.emit(f(mb, mb.getCodeParam[T](1))) - val asmFn = fb.result()(theHailClassLoader) - asmFn.apply - } - - private def compile2[T: TypeInfo, U: TypeInfo, R: TypeInfo]( - f: (EmitMethodBuilder[_], Code[T], Code[U]) => Code[R] - ): (T, U) => R = { - val fb = EmitFunctionBuilder[T, U, R](ctx, "F") - val mb = fb.apply_method - mb.emit(f(mb, mb.getCodeParam[T](1), mb.getCodeParam[U](2))) - val asmFn = fb.result()(theHailClassLoader) - asmFn.apply - } - - private def compile3[T: TypeInfo, U: TypeInfo, V: TypeInfo, R: TypeInfo]( - f: (EmitMethodBuilder[_], Code[T], Code[U], Code[V]) => Code[R] - ): (T, U, V) => R = { - val fb = EmitFunctionBuilder[T, U, V, R](ctx, "F") - val mb = fb.apply_method - mb.emit(f(mb, mb.getCodeParam[T](1), mb.getCodeParam[U](2), mb.getCodeParam[V](3))) - 
val asmFn = fb.result()(theHailClassLoader) - asmFn.apply - } - def log(str: Code[String], enabled: Boolean = false): Code[Unit] = if (enabled) Code._println(str) else Code._empty diff --git a/hail/src/test/scala/is/hail/expr/ir/IRSuite.scala b/hail/src/test/scala/is/hail/expr/ir/IRSuite.scala index 61d26123181..f10e0ead8f3 100644 --- a/hail/src/test/scala/is/hail/expr/ir/IRSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/IRSuite.scala @@ -1470,7 +1470,7 @@ class IRSuite extends HailSuite { val t = TDict(TInt32, TString) assertEvalsTo(CastToArray(NA(t)), null) - val d = Map(1 -> "a", 2 -> null, (null, "c")) + val d: Map[Any, Any] = Map(1 -> "a", 2 -> null, (null, "c")) assertEvalsTo( CastToArray(In(0, t)), // wtf you can't do null -> ... @@ -1515,7 +1515,7 @@ class IRSuite extends HailSuite { val t = TDict(TInt32, TString) assertEvalsTo(invoke("contains", TBoolean, NA(t), I32(2)), null) - val d = Map(1 -> "a", 2 -> null, (null, "c")) + val d: Map[Any, Any] = Map(1 -> "a", 2 -> null, (null, "c")) assertEvalsTo(invoke("contains", TBoolean, In(0, t), NA(TInt32)), FastSeq((d, t)), true) assertEvalsTo(invoke("contains", TBoolean, In(0, t), I32(2)), FastSeq((d, t)), true) assertEvalsTo(invoke("contains", TBoolean, In(0, t), I32(0)), FastSeq((d, t)), false) @@ -2096,7 +2096,6 @@ class IRSuite extends HailSuite { val data = 0 until 10 val shape = FastSeq(2L, 5L) - val nDim = 2 val positives = makeNDArray(data.map(_.toDouble), shape, True()) val negatives = NDArrayMap(positives, "e", ApplyUnaryPrimOp(Negate, Ref("e", TFloat64))) @@ -3422,10 +3421,7 @@ class IRSuite extends HailSuite { "newChunk" -> TNDArray(TFloat64, Nat(2)), )), ) - val mat = Ref("mat", TNDArray(TFloat64, Nat(2))) - val aa = Ref("aa", TArray(TArray(TInt32))) val sta = Ref("sta", TStream(TArray(TInt32))) - val da = Ref("da", TArray(TTuple(TInt32, TString))) val std = Ref("std", TStream(TTuple(TInt32, TString))) val v = Ref("v", TInt32) val s = Ref("s", TStruct("x" -> TInt32, "y" -> TInt64, "z" -> TFloat64)) @@ -4390,8 +4386,6 @@ class IRSuite extends HailSuite { } @Test def testTailLoopNDMemory(): Unit = { - implicit val execStrats = ExecStrategy.compileOnly - val ndType = TNDArray(TInt32, Nat(2)) val ndSum: IR = TailLoop( diff --git a/hail/src/test/scala/is/hail/expr/ir/MatrixIRSuite.scala b/hail/src/test/scala/is/hail/expr/ir/MatrixIRSuite.scala index bd7e93ef277..9ef28b85a72 100644 --- a/hail/src/test/scala/is/hail/expr/ir/MatrixIRSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/MatrixIRSuite.scala @@ -232,7 +232,6 @@ class MatrixIRSuite extends HailSuite { @Test(dataProvider = "explodeRowsData") def testMatrixExplode(path: IndexedSeq[String], collection: IndexedSeq[Integer]): Unit = { - val tarray = TArray(TInt32) val range = rangeMatrix(5, 2, None) val field = path.init.foldRight(path.last -> toIRArray(collection))(_ -> IRStruct(_)) diff --git a/hail/src/test/scala/is/hail/expr/ir/MemoryLeakSuite.scala b/hail/src/test/scala/is/hail/expr/ir/MemoryLeakSuite.scala index 9ac4874d2cd..07727bdc187 100644 --- a/hail/src/test/scala/is/hail/expr/ir/MemoryLeakSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/MemoryLeakSuite.scala @@ -18,7 +18,7 @@ class MemoryLeakSuite extends HailSuite { val lit = Literal(TSet(TString), (0 until litSize).map(_.toString).toSet) val queries = Literal(TArray(TString), (0 until size).map(_.toString).toFastSeq) ExecuteContext.scoped() { ctx => - val r = eval( + eval( ToArray( mapIR(ToStream(queries))(r => ir.invoke("contains", TBoolean, lit, r)) ), diff --git 
a/hail/src/test/scala/is/hail/expr/ir/OrderingSuite.scala b/hail/src/test/scala/is/hail/expr/ir/OrderingSuite.scala index 85e730489d9..57d38090233 100644 --- a/hail/src/test/scala/is/hail/expr/ir/OrderingSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/OrderingSuite.scala @@ -79,7 +79,6 @@ class OrderingSuite extends HailSuite { val p = Prop.forAll(compareGen) { case (t, a) => pool.scopedRegion { region => val pType = PType.canonical(t).asInstanceOf[PStruct] - val rvb = new RegionValueBuilder(sm, region) val v = pType.unstagedStoreJavaObject(sm, a, region) @@ -236,7 +235,6 @@ class OrderingSuite extends HailSuite { val p = Prop.forAll(compareGen) { case (t, a1, a2) => pool.scopedRegion { region => val pType = PType.canonical(t) - val rvb = new RegionValueBuilder(sm, region) val v1 = pType.unstagedStoreJavaObject(sm, a1, region) @@ -291,7 +289,6 @@ class OrderingSuite extends HailSuite { val p = Prop.forAll(compareGen) { case (t, a1, a2) => pool.scopedRegion { region => val pType = PType.canonical(t) - val rvb = new RegionValueBuilder(sm, region) val v1 = pType.unstagedStoreJavaObject(sm, a1, region) @@ -480,14 +477,11 @@ class OrderingSuite extends HailSuite { val pArray = PCanonicalArray(pt) pool.scopedRegion { region => - val rvb = new RegionValueBuilder(sm, region) - val soff = pset.unstagedStoreJavaObject(sm, set, region) val eoff = pTuple.unstagedStoreJavaObject(sm, Row(elem), region) val fb = EmitFunctionBuilder[Region, Long, Long, Int](ctx, "binary_search") - val cregion = fb.getCodeParam[Region](1).load() val cset = fb.getCodeParam[Long](2) val cetuple = fb.getCodeParam[Long](3) @@ -685,8 +679,6 @@ class OrderingSuite extends HailSuite { def rowDoubleOrderingData(): Array[Array[Any]] = { val xs = Array[Any](null, Double.NegativeInfinity, -0.0, 0.0, 1.0, Double.PositiveInfinity, Double.NaN) - val as = Array(null: IndexedSeq[Any]) ++ - (for (x <- xs) yield FastSeq[Any](x)) val ss = Array[Any](null, "a", "aa") val rs = for { diff --git a/hail/src/test/scala/is/hail/expr/ir/StagedBTreeSuite.scala b/hail/src/test/scala/is/hail/expr/ir/StagedBTreeSuite.scala index 23ea06b0e90..8c923f5bc59 100644 --- a/hail/src/test/scala/is/hail/expr/ir/StagedBTreeSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/StagedBTreeSuite.scala @@ -69,7 +69,6 @@ object BTreeBackedSet { val root = fb.genFieldThisRef[Long]() val r = fb.genFieldThisRef[Region]() val ib = fb.getCodeParam[InputBuffer](2) - val ib2 = fb.genFieldThisRef[InputBuffer]() val km = fb.genFieldThisRef[Boolean]() val kv = fb.genFieldThisRef[Long]() diff --git a/hail/src/test/scala/is/hail/expr/ir/TableIRSuite.scala b/hail/src/test/scala/is/hail/expr/ir/TableIRSuite.scala index 4696c798610..6c1cd4a8bde 100644 --- a/hail/src/test/scala/is/hail/expr/ir/TableIRSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/TableIRSuite.scala @@ -914,7 +914,6 @@ class TableIRSuite extends HailSuite { } @Test def testTableWrite(): Unit = { - implicit val execStrats = ExecStrategy.interpretOnly val table = TableRange(5, 4) val path = ctx.createTmpPath("test-table-write", "ht") Interpret[Unit](ctx, TableWrite(table, TableNativeWriter(path))) @@ -927,7 +926,6 @@ class TableIRSuite extends HailSuite { } @Test def testWriteKeyDistinctness(): Unit = { - implicit val execStrats = ExecStrategy.interpretOnly val rt = TableRange(40, 4) val idxRef = GetField(Ref("row", rt.typ.rowType), "idx") val at = TableMapRows( @@ -1581,7 +1579,7 @@ class TableIRSuite extends HailSuite { ) } - @Test def testRepartitionCostEstimate: Unit = { + @Test def 
testRepartitionCostEstimate(): Unit = { val empty = RVDPartitioner.empty(ctx.stateManager, TStruct(Array.empty[Field])) val some = RVDPartitioner.unkeyed(ctx.stateManager, _) diff --git a/hail/src/test/scala/is/hail/expr/ir/analyses/SemanticHashSuite.scala b/hail/src/test/scala/is/hail/expr/ir/analyses/SemanticHashSuite.scala index 730fbc8b0fb..866cb000cfe 100644 --- a/hail/src/test/scala/is/hail/expr/ir/analyses/SemanticHashSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/analyses/SemanticHashSuite.scala @@ -309,7 +309,7 @@ class SemanticHashSuite extends HailSuite { val fs = new FakeFS { override def eTag(url: FakeURL): Option[String] = - throw new FileNotFoundException(url.getPath()) + throw new FileNotFoundException(url.getPath) } val ir = @@ -345,7 +345,7 @@ class SemanticHashSuite extends HailSuite { override def glob(url: FakeURL): Array[FileListEntry] = Array(new FileListEntry { override def getPath: String = url.getPath - override def getActualUrl(): String = url.getPath + override def getActualUrl: String = url.getPath override def getModificationTime: lang.Long = ??? override def getLen: Long = ??? override def isDirectory: Boolean = ??? diff --git a/hail/src/test/scala/is/hail/expr/ir/table/TableGenSuite.scala b/hail/src/test/scala/is/hail/expr/ir/table/TableGenSuite.scala index 2364dfae8bd..8cc3d8b6cdf 100644 --- a/hail/src/test/scala/is/hail/expr/ir/table/TableGenSuite.scala +++ b/hail/src/test/scala/is/hail/expr/ir/table/TableGenSuite.scala @@ -20,7 +20,7 @@ class TableGenSuite extends HailSuite { implicit val execStrategy = ExecStrategy.lowering @Test(groups = Array("construction", "typecheck")) - def testWithInvalidContextsType: Unit = { + def testWithInvalidContextsType(): Unit = { val ex = intercept[IllegalArgumentException] { mkTableGen(contexts = Some(Str("oh noes :'("))).typecheck() } @@ -31,7 +31,7 @@ class TableGenSuite extends HailSuite { } @Test(groups = Array("construction", "typecheck")) - def testWithInvalidGlobalsType: Unit = { + def testWithInvalidGlobalsType(): Unit = { val ex = intercept[IllegalArgumentException] { mkTableGen( globals = Some(Str("oh noes :'(")), @@ -44,7 +44,7 @@ class TableGenSuite extends HailSuite { } @Test(groups = Array("construction", "typecheck")) - def testWithInvalidBodyType: Unit = { + def testWithInvalidBodyType(): Unit = { val ex = intercept[IllegalArgumentException] { mkTableGen(body = Some(Str("oh noes :'("))).typecheck() } @@ -54,7 +54,7 @@ class TableGenSuite extends HailSuite { } @Test(groups = Array("construction", "typecheck")) - def testWithInvalidBodyElementType: Unit = { + def testWithInvalidBodyElementType(): Unit = { val ex = intercept[IllegalArgumentException] { mkTableGen(body = Some(MakeStream(IndexedSeq(Str("oh noes :'(")), TStream(TString))) @@ -66,7 +66,7 @@ class TableGenSuite extends HailSuite { } @Test(groups = Array("construction", "typecheck")) - def testWithInvalidPartitionerKeyType: Unit = { + def testWithInvalidPartitionerKeyType(): Unit = { val ex = intercept[IllegalArgumentException] { mkTableGen(partitioner = Some(RVDPartitioner.empty(ctx.stateManager, TStruct("does-not-exist" -> TInt32))) @@ -76,7 +76,7 @@ class TableGenSuite extends HailSuite { } @Test(groups = Array("construction", "typecheck")) - def testWithTooLongPartitionerKeyType: Unit = { + def testWithTooLongPartitionerKeyType(): Unit = { val ex = intercept[IllegalArgumentException] { mkTableGen(partitioner = Some(RVDPartitioner.empty(ctx.stateManager, TStruct("does-not-exist" -> TInt32))) @@ -86,7 +86,7 @@ class TableGenSuite extends 
HailSuite { } @Test(groups = Array("requiredness")) - def testRequiredness: Unit = { + def testRequiredness(): Unit = { val table = mkTableGen() val analysis = Requiredness(table, ctx) analysis.lookup(table).required shouldBe true @@ -94,14 +94,14 @@ class TableGenSuite extends HailSuite { } @Test(groups = Array("lowering")) - def testLowering: Unit = { + def testLowering(): Unit = { val table = TestUtils.collect(mkTableGen()) val lowered = LowerTableIR(table, DArrayLowering.All, ctx, LoweringAnalyses(table, ctx)) assertEvalsTo(lowered, Row(FastSeq(0, 0).map(Row(_)), Row(0))) } @Test(groups = Array("lowering")) - def testNumberOfContextsMatchesPartitions: Unit = { + def testNumberOfContextsMatchesPartitions(): Unit = { val errorId = 42 val table = TestUtils.collect(mkTableGen( partitioner = Some(RVDPartitioner.unkeyed(ctx.stateManager, 0)), @@ -116,7 +116,7 @@ class TableGenSuite extends HailSuite { } @Test(groups = Array("lowering")) - def testRowsAreCorrectlyKeyed: Unit = { + def testRowsAreCorrectlyKeyed(): Unit = { val errorId = 56 val table = TestUtils.collect(mkTableGen( partitioner = Some(new RVDPartitioner( @@ -139,14 +139,14 @@ class TableGenSuite extends HailSuite { } @Test(groups = Array("optimization", "prune")) - def testPruneNoUnusedFields: Unit = { + def testPruneNoUnusedFields(): Unit = { val start = mkTableGen() val pruned = PruneDeadFields(ctx, start) pruned.typ shouldBe start.typ } @Test(groups = Array("optimization", "prune")) - def testPruneGlobals: Unit = { + def testPruneGlobals(): Unit = { val cname = "contexts" val start = mkTableGen( cname = Some(cname), @@ -165,7 +165,7 @@ class TableGenSuite extends HailSuite { } @Test(groups = Array("optimization", "prune")) - def testPruneContexts: Unit = { + def testPruneContexts(): Unit = { val start = mkTableGen() val TableGetGlobals(pruned) = PruneDeadFields(ctx, TableGetGlobals(start)) pruned.typ should not be start.typ diff --git a/hail/src/test/scala/is/hail/io/IndexBTreeSuite.scala b/hail/src/test/scala/is/hail/io/IndexBTreeSuite.scala index b69656846a9..4682309ba0b 100644 --- a/hail/src/test/scala/is/hail/io/IndexBTreeSuite.scala +++ b/hail/src/test/scala/is/hail/io/IndexBTreeSuite.scala @@ -34,7 +34,6 @@ class IndexBTreeSuite extends HailSuite { property("query gives same answer as array") = forAll(arraySizeGenerator) { case (depth: Int, arraySize: Int) => val arrayRandomStarts = fillRandomArray(arraySize) - val maxLong = arrayRandomStarts.takeRight(1)(0) val index = ctx.createTmpPath("testBtree", "idx") fs.delete(index, true) diff --git a/hail/src/test/scala/is/hail/io/IndexSuite.scala b/hail/src/test/scala/is/hail/io/IndexSuite.scala index 5ba86181a2c..18e4bbee6be 100644 --- a/hail/src/test/scala/is/hail/io/IndexSuite.scala +++ b/hail/src/test/scala/is/hail/io/IndexSuite.scala @@ -40,8 +40,6 @@ class IndexSuite extends HailSuite { branchingFactor: Int, attributes: Map[String, Any], ): Unit = { - val bufferSpec = BufferSpec.default - val iw = IndexWriter.builder(ctx, keyType, annotationType, branchingFactor, attributes)( file, theHailClassLoader, @@ -100,7 +98,7 @@ class IndexSuite extends HailSuite { @Test(dataProvider = "elements") def writeReadGivesSameAsInput(data: Array[String]): Unit = { val file = ctx.createTmpPath("test", "idx") - val attributes = Map("foo" -> true, "bar" -> 5) + val attributes: Map[String, Any] = Map("foo" -> true, "bar" -> 5) val a: (Int) => Annotation = (i: Int) => Row(i % 2 == 0) @@ -155,9 +153,6 @@ class IndexSuite extends HailSuite { ) val index = indexReader(file, TStruct.empty) - val 
n = stringsWithDups.length - val f = { i: Int => stringsWithDups(i) } - val expectedResult = Array( "aardvark" -> 0, "bear" -> 0, @@ -191,9 +186,6 @@ class IndexSuite extends HailSuite { ) val index = indexReader(file, TStruct.empty) - val n = stringsWithDups.length - val f = { i: Int => stringsWithDups(i) } - val expectedResult = Array( "aardvark" -> 0, "bear" -> 2, diff --git a/hail/src/test/scala/is/hail/io/fs/FSSuite.scala b/hail/src/test/scala/is/hail/io/fs/FSSuite.scala index 506de8078fd..0901c6a2f57 100644 --- a/hail/src/test/scala/is/hail/io/fs/FSSuite.scala +++ b/hail/src/test/scala/is/hail/io/fs/FSSuite.scala @@ -30,7 +30,7 @@ trait FSSuite extends TestNGSuite { def pathsRelRoot(root: String, statuses: Array[FileListEntry]): Set[String] = statuses.map { status => - var p = status.getPath + val p = status.getPath assert(p.startsWith(root), s"$p $root") p.drop(root.length) }.toSet @@ -73,8 +73,8 @@ trait FSSuite extends TestNGSuite { @Test def testFileStatusOnDirIsFailure(): Unit = { val f = r("/dir") - TestUtils.interceptException[FileNotFoundException](r("/dir"))( - fs.fileStatus(r("/dir")) + TestUtils.interceptException[FileNotFoundException](f)( + fs.fileStatus(f) ) } @@ -213,7 +213,7 @@ trait FSSuite extends TestNGSuite { assert(pathsRelRoot(root, statuses) == Set("")) } - @Test def testFileEndingWithPeriod: Unit = { + @Test def testFileEndingWithPeriod(): Unit = { val f = fs.makeQualified(t()) fs.touch(f + "/foo.") val statuses = fs.listDirectory(f) diff --git a/hail/src/test/scala/is/hail/io/fs/FakeFS.scala b/hail/src/test/scala/is/hail/io/fs/FakeFS.scala index 26578742e57..d91a6e57339 100644 --- a/hail/src/test/scala/is/hail/io/fs/FakeFS.scala +++ b/hail/src/test/scala/is/hail/io/fs/FakeFS.scala @@ -1,8 +1,8 @@ package is.hail.io.fs case class FakeURL(path: String) extends FSURL { - def getPath(): String = path - def getActualUrl(): String = path + def getPath: String = path + def getActualUrl: String = path } abstract class FakeFS extends FS { diff --git a/hail/src/test/scala/is/hail/methods/LocalLDPruneSuite.scala b/hail/src/test/scala/is/hail/methods/LocalLDPruneSuite.scala index 232b6ad02d6..ebee4aa797d 100644 --- a/hail/src/test/scala/is/hail/methods/LocalLDPruneSuite.scala +++ b/hail/src/test/scala/is/hail/methods/LocalLDPruneSuite.scala @@ -157,7 +157,6 @@ class LocalLDPruneSuite extends HailSuite { ): Boolean = { val locallyPrunedRDD = getLocallyPrunedRDDWithGT(unprunedMatrixTable, locallyPrunedTable) - val nSamples = unprunedMatrixTable.nCols val r2Matrix = LocalLDPruneSuite.correlationMatrixGT(locallyPrunedRDD.map { case (_, _, gs) => gs @@ -188,7 +187,6 @@ class LocalLDPruneSuite extends HailSuite { ): Boolean = { val locallyPrunedRDD = getLocallyPrunedRDDWithGT(unprunedMatrixTable, locallyPrunedTable) - val nSamples = unprunedMatrixTable.nCols val locallyUncorrelated = { locallyPrunedRDD.mapPartitions( diff --git a/hail/src/test/scala/is/hail/methods/MultiArray2Suite.scala b/hail/src/test/scala/is/hail/methods/MultiArray2Suite.scala index 8ac664a0423..0026478fb9c 100644 --- a/hail/src/test/scala/is/hail/methods/MultiArray2Suite.scala +++ b/hail/src/test/scala/is/hail/methods/MultiArray2Suite.scala @@ -9,7 +9,7 @@ class MultiArray2Suite extends HailSuite { @Test def test() = { // test multiarray of size 0 will be created - val ma0 = MultiArray2.fill[Int](0, 0)(0) + MultiArray2.fill[Int](0, 0)(0) // test multiarray of size 0 that apply nothing out intercept[IllegalArgumentException] { @@ -25,12 +25,12 @@ class MultiArray2Suite extends HailSuite { // bad multiarray 
initiation -- negative number intercept[IllegalArgumentException] { - val a = MultiArray2.fill[Int](-5, 5)(0) + MultiArray2.fill[Int](-5, 5)(0) } // bad multiarray initiation -- negative number intercept[IllegalArgumentException] { - val a = MultiArray2.fill[Int](5, -5)(0) + MultiArray2.fill[Int](5, -5)(0) } val ma1 = MultiArray2.fill[Int](10, 3)(0) @@ -41,7 +41,7 @@ class MultiArray2Suite extends HailSuite { // Catch exception if try to apply value that is not in indices of multiarray intercept[IllegalArgumentException] { - val foo = ma1(100, 100) + ma1(100, 100) } val ma2 = MultiArray2.fill[Int](10, 3)(0) @@ -70,29 +70,29 @@ class MultiArray2Suite extends HailSuite { assert(row(idx) == ((row.i * idx, "foo"))) intercept[IllegalArgumentException] { - val x = ma5.row(100) + ma5.row(100) } intercept[ArrayIndexOutOfBoundsException] { val x = ma5.row(0) - val y = x(100) + x(100) } intercept[IllegalArgumentException] { - val x = ma5.row(-5) + ma5.row(-5) } intercept[IllegalArgumentException] { - val x = ma5.column(100) + ma5.column(100) } intercept[IllegalArgumentException] { - val x = ma5.column(-5) + ma5.column(-5) } intercept[ArrayIndexOutOfBoundsException] { val x = ma5.column(0) - val y = x(100) + x(100) } // Test column slice diff --git a/hail/src/test/scala/is/hail/stats/FisherExactTestSuite.scala b/hail/src/test/scala/is/hail/stats/FisherExactTestSuite.scala index d4a7cef2f1a..bf5d68aa584 100644 --- a/hail/src/test/scala/is/hail/stats/FisherExactTestSuite.scala +++ b/hail/src/test/scala/is/hail/stats/FisherExactTestSuite.scala @@ -7,10 +7,6 @@ import org.testng.annotations.Test class FisherExactTestSuite extends HailSuite { @Test def testPvalue(): Unit = { - val N = 200 - val K = 100 - val k = 10 - val n = 15 val a = 5 val b = 10 val c = 95 diff --git a/hail/src/test/scala/is/hail/stats/eigSymDSuite.scala b/hail/src/test/scala/is/hail/stats/eigSymDSuite.scala index e1e6ab7e4da..73bcb32e9e4 100644 --- a/hail/src/test/scala/is/hail/stats/eigSymDSuite.scala +++ b/hail/src/test/scala/is/hail/stats/eigSymDSuite.scala @@ -24,7 +24,6 @@ class eigSymDSuite extends HailSuite { val svdK = svd(K) val eigSymK = eigSym(K) val eigSymDK = eigSymD(K) - val eigSymRK = eigSymR(K) // eigSymD = svdW for (j <- 0 until n) { diff --git a/hail/src/test/scala/is/hail/types/physical/PNDArraySuite.scala b/hail/src/test/scala/is/hail/types/physical/PNDArraySuite.scala index 6e445ecb14c..c91333fe690 100644 --- a/hail/src/test/scala/is/hail/types/physical/PNDArraySuite.scala +++ b/hail/src/test/scala/is/hail/types/physical/PNDArraySuite.scala @@ -371,7 +371,6 @@ class PNDArraySuite extends PhysicalTestUtils { val fb = EmitFunctionBuilder[Region, Region, Region, Long](ctx, "ref_count_test") val codeRegion1 = fb.getCodeParam[Region](1) val codeRegion2 = fb.getCodeParam[Region](2) - val codeRegion3 = fb.getCodeParam[Region](3) try { fb.emitWithBuilder { cb => @@ -386,7 +385,7 @@ class PNDArraySuite extends PhysicalTestUtils { // Region 2 gets an ndarray at ndaddress2, plus a reference to the one at ndarray 1. 
val (_, snd2Finisher) = nd.constructDataFunction(shapeSeq, shapeSeq, cb, codeRegion2) - val snd2 = snd2Finisher(cb) + snd2Finisher(cb) cb.assign(r2PointerToNDAddress1, nd.store(cb, codeRegion2, snd1, true)) // Return the 1st ndarray diff --git a/hail/src/test/scala/is/hail/utils/FlipbookIteratorSuite.scala b/hail/src/test/scala/is/hail/utils/FlipbookIteratorSuite.scala index 8f463eefbaa..85a33e681db 100644 --- a/hail/src/test/scala/is/hail/utils/FlipbookIteratorSuite.scala +++ b/hail/src/test/scala/is/hail/utils/FlipbookIteratorSuite.scala @@ -329,7 +329,7 @@ class FlipbookIteratorSuite extends HailSuite { val a: Array[Box[Int]] = Array.fill(3)(default) var i = 0; while (i < ar.size) { - var v = ar(i) + val v = ar(i) a(v._2) = v._1 i += 1 } diff --git a/hail/src/test/scala/is/hail/utils/PartitionCountsSuite.scala b/hail/src/test/scala/is/hail/utils/PartitionCountsSuite.scala index 51b1d566041..b64466e3f07 100644 --- a/hail/src/test/scala/is/hail/utils/PartitionCountsSuite.scala +++ b/hail/src/test/scala/is/hail/utils/PartitionCountsSuite.scala @@ -43,7 +43,7 @@ class PartitionCountsSuite extends TestNGSuite { } @Test def testIncrementalPCSubset() = { - var pcs = Array(0L, 0L, 5L, 6L, 4L, 3L, 3L, 3L, 2L, 1L) + val pcs = Array(0L, 0L, 5L, 6L, 4L, 3L, 3L, 3L, 2L, 1L) def headOffset(n: Long) = incrementalPCSubsetOffset(n, 0 until pcs.length)(_.map(pcs)) diff --git a/hail/src/test/scala/is/hail/utils/RichDenseMatrixDoubleSuite.scala b/hail/src/test/scala/is/hail/utils/RichDenseMatrixDoubleSuite.scala index 549025e4d64..a8d05321bf4 100644 --- a/hail/src/test/scala/is/hail/utils/RichDenseMatrixDoubleSuite.scala +++ b/hail/src/test/scala/is/hail/utils/RichDenseMatrixDoubleSuite.scala @@ -31,7 +31,7 @@ class RichDenseMatrixDoubleSuite extends HailSuite { val mT = m.t RichDenseMatrixDouble.exportToDoubles(fs, fileT, mT, forceRowMajor = true) val lmT2 = RichDenseMatrixDouble.importFromDoubles(fs, fileT, 100, 50, rowMajor = true) - assert(mT === mT) + assert(mT === lmT2) TestUtils.interceptFatal("Premature") { RichDenseMatrixDouble.importFromDoubles(fs, fileT, 100, 100, rowMajor = true) diff --git a/hail/src/test/scala/is/hail/utils/TreeTraversalSuite.scala b/hail/src/test/scala/is/hail/utils/TreeTraversalSuite.scala index 32836c3207a..0106d66c23a 100644 --- a/hail/src/test/scala/is/hail/utils/TreeTraversalSuite.scala +++ b/hail/src/test/scala/is/hail/utils/TreeTraversalSuite.scala @@ -8,21 +8,21 @@ class TreeTraversalSuite { def binaryTree(i: Int): Iterator[Int] = (1 to 2).map(2 * i + _).iterator.filter(_ < 7) - @Test def testPostOrder = + @Test def testPostOrder() = Assert.assertEquals( TreeTraversal.postOrder(binaryTree)(0).toArray, Array(3, 4, 1, 5, 6, 2, 0), "", ) - @Test def testPreOrder = + @Test def testPreOrder() = Assert.assertEquals( TreeTraversal.preOrder(binaryTree)(0).toArray, Array(0, 1, 3, 4, 2, 5, 6), "", ) - @Test def levelOrder = + @Test def levelOrder() = Assert.assertEquals( TreeTraversal.levelOrder(binaryTree)(0).toArray, (0 to 6).toArray, diff --git a/hail/src/test/scala/is/hail/utils/UtilsSuite.scala b/hail/src/test/scala/is/hail/utils/UtilsSuite.scala index f79493d1dd2..24a5423ed58 100644 --- a/hail/src/test/scala/is/hail/utils/UtilsSuite.scala +++ b/hail/src/test/scala/is/hail/utils/UtilsSuite.scala @@ -143,7 +143,6 @@ class UtilsSuite extends HailSuite { assert(c2.toSeq == Seq("a", "b", "c", "a_1", "a_2", "c_1", "a_3")) assert(diff.toSeq == Seq("a" -> "a_1", "a" -> "a_2", "c" -> "c_1", "a" -> "a_3")) - val c3 = Array("a", "b", "c", "a", "a", "c", "a") val (c4, diff2) = 
mangle(c1, "D" * _) assert(c4.toSeq == Seq("a", "b", "c", "aD", "aDD", "cD", "aDDD")) assert(diff2.toSeq == Seq("a" -> "aD", "a" -> "aDD", "c" -> "cD", "a" -> "aDDD")) diff --git a/hail/src/test/scala/is/hail/variant/ReferenceGenomeSuite.scala b/hail/src/test/scala/is/hail/variant/ReferenceGenomeSuite.scala index 5b1124aff65..1f7c361f914 100644 --- a/hail/src/test/scala/is/hail/variant/ReferenceGenomeSuite.scala +++ b/hail/src/test/scala/is/hail/variant/ReferenceGenomeSuite.scala @@ -123,11 +123,6 @@ class ReferenceGenomeSuite extends HailSuite { assert(rg.compare("X", "Y") < 0) assert(rg.compare("Y", "X") > 0) assert(rg.compare("Y", "MT") < 0) - - // Test loci - val l1 = Locus("1", 25) - val l2 = Locus("1", 13000) - val l3 = Locus("2", 26) } @Test def testWriteToFile(): Unit = { @@ -230,7 +225,6 @@ class ReferenceGenomeSuite extends HailSuite { withExecuteContext() { ctx => val grch38 = ctx.getReference(ReferenceGenome.GRCh38) val fb = EmitFunctionBuilder[String, Boolean](ctx, "serialize_rg") - val cb = fb.ecb val rgfield = fb.getReferenceGenome(grch38.name) fb.emit(rgfield.invoke[String, Boolean]("isValidContig", fb.getCodeParam[String](1))) @@ -248,7 +242,6 @@ class ReferenceGenomeSuite extends HailSuite { val fb = EmitFunctionBuilder[String, Locus, Double, (Locus, Boolean)](ctx, "serialize_with_liftover") - val cb = fb.ecb val rgfield = fb.getReferenceGenome(grch37.name) fb.emit(rgfield.invoke[String, Locus, Double, (Locus, Boolean)]( "liftoverLocus", From 5fde6f2a8cf0d520769efd6f9ea2db10a033f976 Mon Sep 17 00:00:00 2001 From: Dan King Date: Fri, 26 Jan 2024 15:25:14 -0500 Subject: [PATCH 10/26] [spark_backend] avoid infinite recursion when initialization fails (#14199) --- hail/python/hail/backend/py4j_backend.py | 3 +++ hail/python/hail/utils/java.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/hail/python/hail/backend/py4j_backend.py b/hail/python/hail/backend/py4j_backend.py index 9a4794833fc..9fcb9d61579 100644 --- a/hail/python/hail/backend/py4j_backend.py +++ b/hail/python/hail/backend/py4j_backend.py @@ -62,6 +62,9 @@ def deco(*args, **kwargs): if s.startswith('java.util.NoSuchElementException'): raise + if not Env.is_fully_initialized(): + raise ValueError('Error occurred during Hail initialization.') from e + tpl = Env.jutils().handleForPython(e.java_exception) deepest, full, error_id = tpl._1(), tpl._2(), tpl._3() raise fatal_error_from_java_error_triplet(deepest, full, error_id) from None diff --git a/hail/python/hail/utils/java.py b/hail/python/hail/utils/java.py index 45beeab70d0..fa711fd38c6 100644 --- a/hail/python/hail/utils/java.py +++ b/hail/python/hail/utils/java.py @@ -66,6 +66,10 @@ def hc() -> 'hail.context.HailContext': assert Env._hc is not None return Env._hc + @staticmethod + def is_fully_initialized() -> bool: + return Env._hc is not None + @staticmethod async def _async_hc() -> 'hail.context.HailContext': if not Env._hc: From feed09d221ab753e04c7659a6c895442cee06d70 Mon Sep 17 00:00:00 2001 From: Dan King Date: Mon, 29 Jan 2024 15:32:22 -0500 Subject: [PATCH 11/26] [rotate_keys.py] make deletion non-interactive as well (#14208) Rotation has a non-interactive mode for when you are rotating keys in an expected state. I added the same behavior for the bulk delete mode.
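For illustration only, a rough sketch (not part of the diff that follows, and with hypothetical setup) of how the reworked calls look; `delete_old_keys` and `RotationState` are the names used in the patch, and `service_accounts`/`iam_manager` are assumed to be built the way `main()` builds them:

```python
# Hypothetical driver showing the new keyword-only `interactive` flag.
async def cleanup(service_accounts, iam_manager):
    # 'delete': the bulk mode keeps its confirmation prompts.
    await delete_old_keys(service_accounts, iam_manager, interactive=True)

    # 'delete-ready-only': suitable for automation, no prompts at all.
    await delete_old_keys(
        service_accounts,
        iam_manager,
        focus=RotationState.READY_FOR_DELETE,
        interactive=False,
    )
```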
--- devbin/rotate_keys.py | 57 ++++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/devbin/rotate_keys.py b/devbin/rotate_keys.py index 7ccbb1f5d86..e07e80abbc7 100644 --- a/devbin/rotate_keys.py +++ b/devbin/rotate_keys.py @@ -304,9 +304,10 @@ async def add_new_keys( for sa in service_accounts_under_consideration: sa.list_keys(sys.stdout) - if interactive: - if input('Create new key?\nOnly yes will be accepted: ') != 'yes': - continue + + if interactive and input('Create new key?\nOnly yes will be accepted: ') != 'yes': + print(f'Doing nothing for this key.') + continue new_key, key_data = await iam_manager.create_new_key(sa) sa.add_new_key(new_key) @@ -317,7 +318,11 @@ async def add_new_keys( async def delete_old_keys( - service_accounts: List[ServiceAccount], iam_manager: IAMManager, focus: Optional[RotationState] = None + service_accounts: List[ServiceAccount], + iam_manager: IAMManager, + *, + focus: Optional[RotationState] = None, + interactive: bool, ): async def delete_old_and_refresh(sa: ServiceAccount): to_delete = sa.redundant_user_keys() @@ -333,23 +338,27 @@ async def delete_old_and_refresh(sa: ServiceAccount): if sa.disabled or focus is not None and rotation_state != focus: continue sa.list_keys(sys.stdout) - if input('Delete all but the newest key?\nOnly yes will be accepted: ') == 'yes': - if rotation_state == RotationState.READY_FOR_DELETE: + + if interactive and input('Delete all but the newest key?\nOnly yes will be accepted: ') != 'yes': + print(f'Doing nothing for this key.') + continue + + if rotation_state == RotationState.READY_FOR_DELETE: + await delete_old_and_refresh(sa) + elif rotation_state == RotationState.IN_PROGRESS: + warnings.warn( + 'The most recent key was generated less than ' + 'thirty days ago. Old keys should not be deleted ' + 'as they might still be in use.', + stacklevel=2, + ) + if input('Are you sure you want to delete old keys? ') == 'yes': await delete_old_and_refresh(sa) - elif rotation_state == RotationState.IN_PROGRESS: - warnings.warn( - 'The most recent key was generated less than ' - 'thirty days ago. Old keys should not be deleted ' - 'as they might still be in use.', - stacklevel=2, - ) - if input('Are you sure you want to delete old keys? 
') == 'yes': - await delete_old_and_refresh(sa) - else: - warnings.warn( - f'Cannot delete keys in rotation state: {rotation_state}', - stacklevel=2, - ) + else: + warnings.warn( + f'Cannot delete keys in rotation state: {rotation_state}', + stacklevel=2, + ) async def main(): @@ -421,11 +430,13 @@ async def main(): if action == 'interactive-update': await add_new_keys(service_accounts, iam_manager, k8s_manager, interactive=True) elif action == 'delete': - await delete_old_keys(service_accounts, iam_manager) + await delete_old_keys(service_accounts, iam_manager, interactive=True) elif action == 'delete-ready-only': - await delete_old_keys(service_accounts, iam_manager, focus=RotationState.READY_FOR_DELETE) + await delete_old_keys( + service_accounts, iam_manager, focus=RotationState.READY_FOR_DELETE, interactive=False + ) elif action == 'delete-in-progress-only': - await delete_old_keys(service_accounts, iam_manager, focus=RotationState.IN_PROGRESS) + await delete_old_keys(service_accounts, iam_manager, focus=RotationState.IN_PROGRESS, interactive=False) else: print('Doing nothing') finally: From 4324736acb18256336ff9b7599b9a0751ecfa383 Mon Sep 17 00:00:00 2001 From: Dan King Date: Mon, 29 Jan 2024 16:10:14 -0500 Subject: [PATCH 12/26] [hailtop] allow configuration of default HTTP timeout (#14206) Until we have a mechanism to infer the correct timeout based on network conditions, this provides an escape hatch for users on flaky network connections such as wifi. --- hail/python/hailtop/config/variables.py | 1 + .../hailctl/config/config_variables.py | 12 ++++++++++++ hail/python/hailtop/httpx.py | 19 ++++++++++++++----- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/hail/python/hailtop/config/variables.py b/hail/python/hailtop/config/variables.py index cfd82a3d774..036bae78d26 100644 --- a/hail/python/hailtop/config/variables.py +++ b/hail/python/hailtop/config/variables.py @@ -19,3 +19,4 @@ class ConfigVariable(str, Enum): QUERY_BATCH_WORKER_MEMORY = 'query/batch_worker_memory' QUERY_NAME_PREFIX = 'query/name_prefix' QUERY_DISABLE_PROGRESS_BAR = 'query/disable_progress_bar' + HTTP_TIMEOUT_IN_SECONDS = 'http/timeout_in_seconds' diff --git a/hail/python/hailtop/hailctl/config/config_variables.py b/hail/python/hailtop/hailctl/config/config_variables.py index 72781907dab..50d555ef953 100644 --- a/hail/python/hailtop/hailctl/config/config_variables.py +++ b/hail/python/hailtop/hailctl/config/config_variables.py @@ -9,6 +9,14 @@ ConfigVariableInfo = namedtuple('ConfigVariableInfo', ['help_msg', 'validation']) +def _is_float_str(x: str) -> bool: + try: + float(x) + return True + except ValueError: + return False + + def config_variables(): from hailtop.batch_client.parse import CPU_REGEXPAT, MEMORY_REGEXPAT # pylint: disable=import-outside-toplevel from hailtop.aiotools.router_fs import RouterAsyncFS # pylint: disable=import-outside-toplevel @@ -124,6 +132,10 @@ def config_variables(): help_msg='Disable the progress bar with a value of 1. 
Enable the progress bar with a value of 0', validation=(lambda x: x in ('0', '1'), 'should be a value of 0 or 1'), ), + ConfigVariable.HTTP_TIMEOUT_IN_SECONDS: ConfigVariableInfo( + help_msg='The default timeout for HTTP requests in seconds.', + validation=(_is_float_str, 'should be a float or an int like 42.42 or 42'), + ), } return _config_variables diff --git a/hail/python/hailtop/httpx.py b/hail/python/hailtop/httpx.py index a40a6b5cf8b..17605dcd4c4 100644 --- a/hail/python/hailtop/httpx.py +++ b/hail/python/hailtop/httpx.py @@ -8,6 +8,7 @@ from .tls import internal_client_ssl_context, external_client_ssl_context from .config.deploy_config import get_deploy_config +from .config import ConfigVariable, configuration_of class ClientResponseError(aiohttp.ClientResponseError): @@ -101,15 +102,23 @@ def __init__( assert 'connector' not in kwargs - if timeout is None: - timeout = aiohttp.ClientTimeout(total=5) - if isinstance(timeout, (float, int)): - timeout = aiohttp.ClientTimeout(total=timeout) + configuration_of_timeout = configuration_of(ConfigVariable.HTTP_TIMEOUT_IN_SECONDS, timeout, 5) + del timeout + + if isinstance(configuration_of_timeout, str): + configuration_of_timeout = float(configuration_of_timeout) + if isinstance(configuration_of_timeout, (float, int)): + configuration_of_timeout = aiohttp.ClientTimeout(total=configuration_of_timeout) + assert isinstance(configuration_of_timeout, aiohttp.ClientTimeout) self.loop = asyncio.get_running_loop() self.raise_for_status = raise_for_status self.client_session = aiohttp.ClientSession( - *args, timeout=timeout, raise_for_status=False, connector=aiohttp.TCPConnector(ssl=tls), **kwargs + *args, + timeout=configuration_of_timeout, + raise_for_status=False, + connector=aiohttp.TCPConnector(ssl=tls), + **kwargs, ) def request( From f56f579cf5dd4da1621395d8b5fc5c262cba18ae Mon Sep 17 00:00:00 2001 From: Christopher Vittal Date: Mon, 29 Jan 2024 15:42:59 -0600 Subject: [PATCH 13/26] [vds/combiner] Better calculation of ref_block_max_length (#14178) Use the patch in place function of `store_ref_block_max_length` to compute `ref_block_max_length` rather than computing it on a zip join pipeline, causing that zip join pipeline to be executed twice. 
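As a sketch of the new flow (not part of the diff below; the output path and the `combined_vds` variable are hypothetical), the combiner now writes first and then patches the global into the stored metadata in place, so the reference-block scan runs over the already-written reference data instead of forcing the zip-join pipeline to execute twice:

```python
import hail as hl

output_path = 'gs://my-bucket/combined.vds'  # hypothetical output path

combined_vds.write(output_path)  # `combined_vds` assumed to come from the combiner
if hl.vds.VariantDataset.ref_block_max_length_field not in combined_vds.reference_data.globals:
    # patch ref_block_max_length into the written VDS in place
    hl.vds.store_ref_block_max_length(output_path)

# readers now pick up the global without any recomputation
vds = hl.vds.read_vds(output_path)
```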
--- .../hail/vds/combiner/variant_dataset_combiner.py | 14 +++----------- hail/python/hail/vds/variant_dataset.py | 2 +- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/hail/python/hail/vds/combiner/variant_dataset_combiner.py b/hail/python/hail/vds/combiner/variant_dataset_combiner.py index 8eab7aa765e..27dab5fb9ed 100644 --- a/hail/python/hail/vds/combiner/variant_dataset_combiner.py +++ b/hail/python/hail/vds/combiner/variant_dataset_combiner.py @@ -428,19 +428,11 @@ def step(self): self._job_id += 1 def _write_final(self, vds): - fd = VariantDataset.ref_block_max_length_field + vds.write(self._output_path) - if fd not in vds.reference_data.globals: + if VariantDataset.ref_block_max_length_field not in vds.reference_data.globals: info("VDS combiner: computing reference block max length...") - max_len = vds.reference_data.aggregate_entries( - hl.agg.max(vds.reference_data.END + 1 - vds.reference_data.locus.position) - ) - info(f"VDS combiner: max reference block length is {max_len}") - vds = VariantDataset( - reference_data=vds.reference_data.annotate_globals(**{fd: max_len}), variant_data=vds.variant_data - ) - - vds.write(self._output_path) + hl.vds.store_ref_block_max_length(self._output_path) def _step_vdses(self): current_bin = original_bin = min(self._vdses) diff --git a/hail/python/hail/vds/variant_dataset.py b/hail/python/hail/vds/variant_dataset.py index 1a02f68026c..28d2c2b8be7 100644 --- a/hail/python/hail/vds/variant_dataset.py +++ b/hail/python/hail/vds/variant_dataset.py @@ -83,7 +83,7 @@ def store_ref_block_max_length(vds_path): ---------- vds_path : :obj:`str` """ - vds = hl.vds.read_vds(vds_path) + vds = read_vds(vds_path, _warn_no_ref_block_max_length=False) if VariantDataset.ref_block_max_length_field in vds.reference_data.globals: warning(f"VDS at {vds_path} already contains a global annotation with the max reference block length") From 497f8a90d83d6223fccc3cef91b0c0473a38137f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 31 Jan 2024 10:04:42 -0500 Subject: [PATCH 14/26] Bump jupyterlab from 4.0.9 to 4.0.12 in /hail/python/dev (#14218) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [jupyterlab](https://github.com/jupyterlab/jupyterlab) from 4.0.9 to 4.0.12.
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- hail/python/dev/pinned-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail/python/dev/pinned-requirements.txt b/hail/python/dev/pinned-requirements.txt index 5a44b10439e..a7a0385d54d 100644 --- a/hail/python/dev/pinned-requirements.txt +++ b/hail/python/dev/pinned-requirements.txt @@ -220,7 +220,7 @@ jupyter-server==2.12.1 # notebook-shim jupyter-server-terminals==0.5.0 # via jupyter-server -jupyterlab==4.0.9 +jupyterlab==4.0.12 # via notebook jupyterlab-pygments==0.3.0 # via nbconvert From ef5c352343688b4a2a3ec78e29b28ec0f35599f6 Mon Sep 17 00:00:00 2001 From: Christopher Vittal Date: Wed, 31 Jan 2024 17:56:40 -0600 Subject: [PATCH 15/26] [vds/combiner] Add sanity check on uniqueness of gvcf paths/sample names (#14207) --- .../hail/vds/combiner/variant_dataset_combiner.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/hail/python/hail/vds/combiner/variant_dataset_combiner.py b/hail/python/hail/vds/combiner/variant_dataset_combiner.py index 27dab5fb9ed..4d476031944 100644 --- a/hail/python/hail/vds/combiner/variant_dataset_combiner.py +++ b/hail/python/hail/vds/combiner/variant_dataset_combiner.py @@ -669,6 +669,19 @@ def new_combiner( raise ValueError("at least one of 'gvcf_paths' or 'vds_paths' must be nonempty") if gvcf_paths is None: gvcf_paths = [] + if len(gvcf_paths) > 0: + if len(set(gvcf_paths)) != len(gvcf_paths): + duplicates = [gvcf for gvcf, count in collections.Counter(gvcf_paths).items() if count > 1] + duplicates = '\n '.join(duplicates) + raise ValueError(f'gvcf paths should be unique, the following paths are repeated:{duplicates}') + if gvcf_sample_names is not None and len(set(gvcf_sample_names)) != len(gvcf_sample_names): + duplicates = [gvcf for gvcf, count in collections.Counter(gvcf_sample_names).items() if count > 1] + duplicates = '\n '.join(duplicates) + raise ValueError( + "provided sample names ('gvcf_sample_names') should be unique, " + f'the following names are repeated:{duplicates}' + ) + if vds_paths is None: vds_paths = [] if vds_sample_counts is not None and len(vds_paths) != len(vds_sample_counts): From 534037881530248fc025f5381964a26405338c69 Mon Sep 17 00:00:00 2001 From: Daniel Goldstein Date: Wed, 31 Jan 2024 19:39:33 -0500 Subject: [PATCH 16/26] [batch] Add metadata server to batch jobs in GCP (#14019) Implements a basic GCP metadata server for user jobs as described in https://github.com/hail-is/hail-rfcs/pull/12. It implements only so much as is needed for `hail` and `gcloud` to get access tokens for hail GSAs so they can then make API calls to GCS or Hail Batch. With this in place user jobs should no longer require GSA key files, but removing them is future work and requires a well-communicated deprecation and removal process. 
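As a sketch of what this enables (not part of the diff below): inside a GCP Batch job container, requests that would normally go to the GCE metadata server are answered by the worker with a token for the job's own hail service account. The route and the response fields come from the handlers added in this patch; the use of `requests` inside the job image is an assumption for illustration:

```python
import requests  # assumed to be available in the job image

# metadata.google.internal resolves to 169.254.169.254, which for user jobs is
# served by the worker-hosted metadata server added in this patch
resp = requests.get(
    'http://metadata.google.internal/computeMetadata/v1/instance/service-accounts/default/token',
    headers={'Metadata-Flavor': 'Google'},
    timeout=10,
)
resp.raise_for_status()
access_token = resp.json()['access_token']  # response also carries 'expires_in' and 'token_type'
```

Because the endpoint mimics GCE, `gcloud` and the Google client libraries pick these credentials up automatically, which is what the new `test_gcloud_works_with_hail_metadata_server` test exercises.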
--- batch/batch/cloud/azure/worker/worker_api.py | 4 + .../batch/cloud/gcp/driver/create_instance.py | 6 +- .../batch/cloud/gcp/worker/metadata_server.py | 109 ++++++++++++++++++ batch/batch/cloud/gcp/worker/worker_api.py | 35 +++--- batch/batch/globals.py | 2 +- batch/batch/worker/worker.py | 20 ++++ batch/batch/worker/worker_api.py | 6 + batch/test/test_batch.py | 51 ++++++-- build.yaml | 2 +- ci/Dockerfile.ci-utils | 4 +- .../hailtop/aiocloud/aiogoogle/__init__.py | 10 +- .../aiocloud/aiogoogle/client/__init__.py | 2 + .../client/metadata_server_client.py | 30 +++++ .../hailtop/aiocloud/aiogoogle/credentials.py | 14 ++- 14 files changed, 261 insertions(+), 34 deletions(-) create mode 100644 batch/batch/cloud/gcp/worker/metadata_server.py create mode 100644 hail/python/hailtop/aiocloud/aiogoogle/client/metadata_server_client.py diff --git a/batch/batch/cloud/azure/worker/worker_api.py b/batch/batch/cloud/azure/worker/worker_api.py index c9e3f8f7cfe..779bc13fc8d 100644 --- a/batch/batch/cloud/azure/worker/worker_api.py +++ b/batch/batch/cloud/azure/worker/worker_api.py @@ -6,6 +6,7 @@ import aiohttp import orjson +from aiohttp import web from hailtop import httpx from hailtop.aiocloud import aioazure @@ -60,6 +61,9 @@ async def user_container_registry_credentials(self, credentials: Dict[str, str]) credentials = orjson.loads(base64.b64decode(credentials['key.json']).decode()) return {'username': credentials['appId'], 'password': credentials['password']} + def create_metadata_server_app(self, credentials: Dict[str, str]) -> web.Application: + raise NotImplementedError + def instance_config_from_config_dict(self, config_dict: Dict[str, str]) -> AzureSlimInstanceConfig: return AzureSlimInstanceConfig.from_dict(config_dict) diff --git a/batch/batch/cloud/gcp/driver/create_instance.py b/batch/batch/cloud/gcp/driver/create_instance.py index 57d3000b286..9fc7aaaaa59 100644 --- a/batch/batch/cloud/gcp/driver/create_instance.py +++ b/batch/batch/cloud/gcp/driver/create_instance.py @@ -266,9 +266,9 @@ def scheduling() -> dict: iptables --table nat --append POSTROUTING --source 172.20.0.0/15 --jump MASQUERADE # [public] -# Block public traffic to the metadata server -iptables --append FORWARD --source 172.21.0.0/16 --destination 169.254.169.254 --jump DROP -# But allow the internal gateway +# Send public jobs' metadata server requests to the batch worker itself +iptables --table nat --append PREROUTING --source 172.21.0.0/16 --destination 169.254.169.254 -p tcp -j REDIRECT --to-ports 5555 +# Allow the internal gateway iptables --append FORWARD --destination $INTERNAL_GATEWAY_IP --jump ACCEPT # And this worker iptables --append FORWARD --destination $IP_ADDRESS --jump ACCEPT diff --git a/batch/batch/cloud/gcp/worker/metadata_server.py b/batch/batch/cloud/gcp/worker/metadata_server.py new file mode 100644 index 00000000000..5475c9982a9 --- /dev/null +++ b/batch/batch/cloud/gcp/worker/metadata_server.py @@ -0,0 +1,109 @@ +from aiohttp import web + +from hailtop.aiocloud import aiogoogle + +from ....globals import HTTP_CLIENT_MAX_SIZE + + +class AppKeys: + USER_CREDENTIALS = web.AppKey('credentials', aiogoogle.GoogleServiceAccountCredentials) + GCE_METADATA_SERVER_CLIENT = web.AppKey('ms_client', aiogoogle.GoogleMetadataServerClient) + + +async def root(_): + return web.Response(text='computeMetadata/\n') + + +async def project_id(request: web.Request): + metadata_server_client = request.app[AppKeys.GCE_METADATA_SERVER_CLIENT] + return web.Response(text=await metadata_server_client.project()) + + +async 
def numeric_project_id(request: web.Request): + metadata_server_client = request.app[AppKeys.GCE_METADATA_SERVER_CLIENT] + return web.Response(text=await metadata_server_client.numeric_project_id()) + + +async def service_accounts(request: web.Request): + gsa_email = request.app[AppKeys.USER_CREDENTIALS].email + return web.Response(text=f'default\n{gsa_email}\n') + + +async def user_service_account(request: web.Request): + gsa_email = request.app[AppKeys.USER_CREDENTIALS].email + recursive = request.query.get('recursive') + # https://cloud.google.com/compute/docs/metadata/querying-metadata + # token is not included in the recursive version, presumably as that + # is not simple metadata but requires requesting an access token + if recursive == 'true': + return web.json_response( + { + 'aliases': ['default'], + 'email': gsa_email, + 'scopes': ['https://www.googleapis.com/auth/cloud-platform'], + }, + ) + return web.Response(text='aliases\nemail\nscopes\ntoken\n') + + +async def user_email(request: web.Request): + return web.Response(text=request.app[AppKeys.USER_CREDENTIALS].email) + + +async def user_token(request: web.Request): + access_token = await request.app[AppKeys.USER_CREDENTIALS]._get_access_token() + return web.json_response({ + 'access_token': access_token.token, + 'expires_in': access_token.expires_in, + 'token_type': 'Bearer', + }) + + +@web.middleware +async def middleware(request: web.Request, handler): + credentials = request.app[AppKeys.USER_CREDENTIALS] + gsa = request.match_info.get('gsa') + if gsa and gsa not in (credentials.email, 'default'): + raise web.HTTPBadRequest() + + response = await handler(request) + response.enable_compression() + + # `gcloud` does not properly respect `charset`, which aiohttp automatically + # sets so we have to explicitly erase it + # See https://github.com/googleapis/google-auth-library-python/blob/b935298aaf4ea5867b5778bcbfc42408ba4ec02c/google/auth/compute_engine/_metadata.py#L170 + if 'application/json' in response.headers['Content-Type']: + response.headers['Content-Type'] = 'application/json' + response.headers['Metadata-Flavor'] = 'Google' + response.headers['Server'] = 'Metadata Server for VM' + response.headers['X-XSS-Protection'] = '0' + response.headers['X-Frame-Options'] = 'SAMEORIGIN' + return response + + +def create_app( + credentials: aiogoogle.GoogleServiceAccountCredentials, + metadata_server_client: aiogoogle.GoogleMetadataServerClient, +) -> web.Application: + app = web.Application( + client_max_size=HTTP_CLIENT_MAX_SIZE, + middlewares=[middleware], + ) + app[AppKeys.USER_CREDENTIALS] = credentials + app[AppKeys.GCE_METADATA_SERVER_CLIENT] = metadata_server_client + + app.add_routes([ + web.get('/', root), + web.get('/computeMetadata/v1/project/project-id', project_id), + web.get('/computeMetadata/v1/project/numeric-project-id', numeric_project_id), + web.get('/computeMetadata/v1/instance/service-accounts/', service_accounts), + web.get('/computeMetadata/v1/instance/service-accounts/{gsa}/', user_service_account), + web.get('/computeMetadata/v1/instance/service-accounts/{gsa}/email', user_email), + web.get('/computeMetadata/v1/instance/service-accounts/{gsa}/token', user_token), + ]) + + async def close_credentials(_): + await credentials.close() + + app.on_cleanup.append(close_credentials) + return app diff --git a/batch/batch/cloud/gcp/worker/worker_api.py b/batch/batch/cloud/gcp/worker/worker_api.py index 3865ad67dd3..173bcb99b15 100644 --- a/batch/batch/cloud/gcp/worker/worker_api.py +++ 
b/batch/batch/cloud/gcp/worker/worker_api.py @@ -3,17 +3,18 @@ import tempfile from typing import Dict, List -import aiohttp import orjson +from aiohttp import web from hailtop import httpx from hailtop.aiocloud import aiogoogle from hailtop.auth.auth import IdentityProvider -from hailtop.utils import check_exec_output, retry_transient_errors +from hailtop.utils import check_exec_output from ....worker.worker_api import CloudWorkerAPI, ContainerRegistryCredentials from ..instance_config import GCPSlimInstanceConfig from .disk import GCPDisk +from .metadata_server import create_app class GCPWorkerAPI(CloudWorkerAPI): @@ -24,14 +25,24 @@ class GCPWorkerAPI(CloudWorkerAPI): async def from_env() -> 'GCPWorkerAPI': project = os.environ['PROJECT'] zone = os.environ['ZONE'].rsplit('/', 1)[1] - compute_client = aiogoogle.GoogleComputeClient(project) - return GCPWorkerAPI(project, zone, compute_client) + worker_credentials = aiogoogle.GoogleInstanceMetadataCredentials() + http_session = httpx.ClientSession() + return GCPWorkerAPI(project, zone, worker_credentials, http_session) - def __init__(self, project: str, zone: str, compute_client: aiogoogle.GoogleComputeClient): + def __init__( + self, + project: str, + zone: str, + worker_credentials: aiogoogle.GoogleInstanceMetadataCredentials, + http_session: httpx.ClientSession, + ): self.project = project self.zone = zone - self._compute_client = compute_client + self._http_session = http_session + self._metadata_server_client = aiogoogle.GoogleMetadataServerClient(http_session) + self._compute_client = aiogoogle.GoogleComputeClient(project) self._gcsfuse_credential_files: Dict[str, str] = {} + self._worker_credentials = worker_credentials @property def cloud_specific_env_vars_for_user_jobs(self) -> List[str]: @@ -53,13 +64,7 @@ def create_disk(self, instance_name: str, disk_name: str, size_in_gb: int, mount ) async def worker_container_registry_credentials(self, session: httpx.ClientSession) -> ContainerRegistryCredentials: - token_dict = await retry_transient_errors( - session.post_read_json, - 'http://169.254.169.254/computeMetadata/v1/instance/service-accounts/default/token', - headers={'Metadata-Flavor': 'Google'}, - timeout=aiohttp.ClientTimeout(total=60), # type: ignore - ) - access_token = token_dict['access_token'] + access_token = await self._worker_credentials.access_token() return {'username': 'oauth2accesstoken', 'password': access_token} async def user_container_registry_credentials(self, credentials: Dict[str, str]) -> ContainerRegistryCredentials: @@ -68,6 +73,10 @@ async def user_container_registry_credentials(self, credentials: Dict[str, str]) access_token = await sa_credentials.access_token() return {'username': 'oauth2accesstoken', 'password': access_token} + def create_metadata_server_app(self, credentials: Dict[str, str]) -> web.Application: + key = orjson.loads(base64.b64decode(credentials['key.json']).decode()) + return create_app(aiogoogle.GoogleServiceAccountCredentials(key), self._metadata_server_client) + def instance_config_from_config_dict(self, config_dict: Dict[str, str]) -> GCPSlimInstanceConfig: return GCPSlimInstanceConfig.from_dict(config_dict) diff --git a/batch/batch/globals.py b/batch/batch/globals.py index 134878338d5..316771774f4 100644 --- a/batch/batch/globals.py +++ b/batch/batch/globals.py @@ -23,7 +23,7 @@ BATCH_FORMAT_VERSION = 7 STATUS_FORMAT_VERSION = 5 -INSTANCE_VERSION = 26 +INSTANCE_VERSION = 27 MAX_PERSISTENT_SSD_SIZE_GIB = 64 * 1024 RESERVED_STORAGE_GB_PER_CORE = 5 diff --git 
a/batch/batch/worker/worker.py b/batch/batch/worker/worker.py index b049bc62b33..64bcf90ac30 100644 --- a/batch/batch/worker/worker.py +++ b/batch/batch/worker/worker.py @@ -263,6 +263,8 @@ async def init(self): for service in HAIL_SERVICES: hosts.write(f'{INTERNAL_GATEWAY_IP} {service}.hail\n') hosts.write(f'{INTERNAL_GATEWAY_IP} internal.hail\n') + if CLOUD == 'gcp': + hosts.write('169.254.169.254 metadata metadata.google.internal') # Jobs on the private network should have access to the metadata server # and our vdc. The public network should not so we use google's public @@ -760,6 +762,7 @@ def __init__( command: List[str], cpu_in_mcpu: int, memory_in_bytes: int, + user_credentials: Optional[Dict[str, str]], network: Optional[Union[bool, str]] = None, port: Optional[int] = None, timeout: Optional[int] = None, @@ -777,6 +780,7 @@ def __init__( self.command = command self.cpu_in_mcpu = cpu_in_mcpu self.memory_in_bytes = memory_in_bytes + self.user_credentials = user_credentials self.network = network self.port = port self.timeout = timeout @@ -820,6 +824,8 @@ def __init__( self.monitor: Optional[ResourceUsageMonitor] = None + self.metadata_app_runner: Optional[web.AppRunner] = None + async def create(self): self.state = 'creating' try: @@ -959,6 +965,9 @@ async def _cleanup(self): if self._cleaned_up: return + if self.metadata_app_runner: + await self.metadata_app_runner.cleanup() + assert self._run_fut is None try: if self.overlay_mounted: @@ -1025,6 +1034,14 @@ async def _setup_network_namespace(self): else: assert self.network is None or self.network == 'public' self.netns = await network_allocator.allocate_public() + if self.user_credentials and CLOUD == 'gcp': + assert CLOUD_WORKER_API + self.metadata_app_runner = web.AppRunner( + CLOUD_WORKER_API.create_metadata_server_app(self.user_credentials) + ) + await self.metadata_app_runner.setup() + site = web.TCPSite(self.metadata_app_runner, self.netns.host_ip, 5555) + await site.start() except asyncio.TimeoutError: log.exception(network_allocator.task_manager.tasks) raise @@ -1454,6 +1471,7 @@ def copy_container( cpu_in_mcpu=cpu_in_mcpu, memory_in_bytes=memory_in_bytes, volume_mounts=volume_mounts, + user_credentials=job.credentials, stdin=json.dumps(files), ) @@ -1778,6 +1796,7 @@ def __init__( command=job_spec['process']['command'], cpu_in_mcpu=self.cpu_in_mcpu, memory_in_bytes=self.memory_in_bytes, + user_credentials=self.credentials, network=job_spec.get('network'), port=job_spec.get('port'), timeout=job_spec.get('timeout'), @@ -2536,6 +2555,7 @@ async def create_and_start( command=command, cpu_in_mcpu=n_cores * 1000, memory_in_bytes=total_memory_bytes, + user_credentials=None, env=[f'HAIL_WORKER_OFF_HEAP_MEMORY_PER_CORE_MB={off_heap_memory_per_core_mib}', f'HAIL_CLOUD={CLOUD}'], volume_mounts=volume_mounts, log_path=f'/batch/jvm-container-logs/jvm-{index}.log', diff --git a/batch/batch/worker/worker_api.py b/batch/batch/worker/worker_api.py index d665b61aa6a..016f759b467 100644 --- a/batch/batch/worker/worker_api.py +++ b/batch/batch/worker/worker_api.py @@ -1,6 +1,8 @@ import abc from typing import Dict, List, TypedDict +from aiohttp import web + from hailtop import httpx from hailtop.utils import CalledProcessError, sleep_before_try @@ -33,6 +35,10 @@ async def worker_container_registry_credentials(self, session: httpx.ClientSessi async def user_container_registry_credentials(self, credentials: Dict[str, str]) -> ContainerRegistryCredentials: raise NotImplementedError + @abc.abstractmethod + def create_metadata_server_app(self, 
credentials: Dict[str, str]) -> web.Application: + raise NotImplementedError + @abc.abstractmethod def instance_config_from_config_dict(self, config_dict: Dict[str, str]) -> InstanceConfig: raise NotImplementedError diff --git a/batch/test/test_batch.py b/batch/test/test_batch.py index 212a9e522e9..b5e4288a05e 100644 --- a/batch/test/test_batch.py +++ b/batch/test/test_batch.py @@ -8,7 +8,7 @@ import pytest from hailtop import httpx -from hailtop.auth import hail_credentials +from hailtop.auth import get_userinfo, hail_credentials from hailtop.batch.backend import HAIL_GENETICS_HAILTOP_IMAGE from hailtop.batch_client import BatchNotCreatedError, JobNotSubmittedError from hailtop.batch_client.aioclient import BatchClient as AioBatchClient @@ -1098,24 +1098,57 @@ def test_duplicate_parents(client: BatchClient): @skip_in_azure -def test_verify_no_access_to_google_metadata_server(client: BatchClient): +def test_hail_metadata_server_uses_correct_user_credentials(client: BatchClient): b = create_batch(client) - j = b.create_job(os.environ['HAIL_CURL_IMAGE'], ['curl', '-fsSL', 'metadata.google.internal', '--max-time', '10']) + userinfo = get_userinfo() + assert userinfo + hail_identity = userinfo['hail_identity'] + j = b.create_job( + os.environ['HAIL_CURL_IMAGE'], + ['curl', '-fsSL', 'metadata.google.internal/computeMetadata/v1/instance/service-accounts/', '--max-time', '10'], + ) b.submit() status = j.wait() - assert status['state'] == 'Failed', str((status, b.debug_info())) job_log = j.log() - assert "Could not resolve host" in job_log['main'], str((job_log, b.debug_info())) + service_accounts = set(sa.strip() for sa in job_log['main'].split()) + assert status['state'] == 'Success', str((status, b.debug_info())) + assert service_accounts == set(('default', hail_identity)) -def test_verify_no_access_to_metadata_server(client: BatchClient): +@skip_in_azure +def test_gcloud_works_with_hail_metadata_server(client: BatchClient): + b = create_batch(client) + token = secrets.token_urlsafe(16) + tmpdir = os.environ['HAIL_BATCH_REMOTE_TMPDIR'] + random_dir = f'{tmpdir}/{token}' + script = f""" +set -ex +unset GOOGLE_APPLICATION_CREDENTIALS +gcloud config list account +echo "hello" >hello.txt +gcloud storage cp hello.txt {random_dir}/hello.txt +gcloud storage ls {random_dir} +gcloud storage rm -r {random_dir}/ +""" + j = b.create_job(os.environ['CI_UTILS_IMAGE'], ['/bin/bash', '-c', script]) + b.submit() + status = j.wait() + assert status['state'] == 'Success', str((status, b.debug_info())) + + +def test_hail_metadata_server_available_only_in_gcp(client: BatchClient): + cloud = os.environ['HAIL_CLOUD'] b = create_batch(client) j = b.create_job(os.environ['HAIL_CURL_IMAGE'], ['curl', '-fsSL', '169.254.169.254', '--max-time', '10']) b.submit() status = j.wait() - assert status['state'] == 'Failed', str((status, b.debug_info())) - job_log = j.log() - assert "Connection timeout" in job_log['main'], str((job_log, b.debug_info())) + if cloud == 'gcp': + assert status['state'] == 'Success', str((status, b.debug_info())) + else: + assert cloud == 'azure' + assert status['state'] == 'Failed', str((status, b.debug_info())) + job_log = j.log() + assert "Connection timeout" in job_log['main'], str((job_log, b.debug_info())) def test_submit_batch_in_job(client: BatchClient, remote_tmpdir: str): diff --git a/build.yaml b/build.yaml index e4256804708..123542089d3 100644 --- a/build.yaml +++ b/build.yaml @@ -2730,7 +2730,7 @@ steps: export HAIL_CLOUD="{{ global.cloud }}" export HAIL_PRODUCTION_DOMAIN="{{ global.domain 
}}" export HAIL_GPU_IMAGE="{{ gpu_image.image }}" - hailctl config set batch/remote_tmpdir {{ global.test_storage_uri }}/test_batch/{{ token }}/ + export HAIL_BATCH_REMOTE_TMPDIR="{{ global.test_storage_uri }}/test_batch/{{ token }}/" hail-pip-install -r /io/dev-requirements.txt diff --git a/ci/Dockerfile.ci-utils b/ci/Dockerfile.ci-utils index ab87759cd5b..27e43e9c292 100644 --- a/ci/Dockerfile.ci-utils +++ b/ci/Dockerfile.ci-utils @@ -2,8 +2,8 @@ ARG BASE_IMAGE={{ base_image.image }} FROM $BASE_IMAGE AS base # source: https://cloud.google.com/storage/docs/gsutil_install#linux -RUN curl --remote-name https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-421.0.0-linux-x86_64.tar.gz && \ - tar -xf google-cloud-sdk-421.0.0-linux-x86_64.tar.gz && \ +RUN curl --remote-name https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-447.0.0-linux-x86_64.tar.gz && \ + tar -xf google-cloud-sdk-447.0.0-linux-x86_64.tar.gz && \ curl --remote-name https://dl.k8s.io/release/v1.21.14/bin/linux/amd64/kubectl && \ install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl ENV PATH $PATH:/google-cloud-sdk/bin diff --git a/hail/python/hailtop/aiocloud/aiogoogle/__init__.py b/hail/python/hailtop/aiocloud/aiogoogle/__init__.py index 29c9f980b91..c8e8d21a80d 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/__init__.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/__init__.py @@ -5,12 +5,18 @@ GoogleComputeClient, GoogleIAmClient, GoogleLoggingClient, + GoogleMetadataServerClient, GoogleStorageClient, GCSRequesterPaysConfiguration, GoogleStorageAsyncFS, GoogleStorageAsyncFSFactory, ) -from .credentials import GoogleCredentials, GoogleApplicationDefaultCredentials, GoogleServiceAccountCredentials +from .credentials import ( + GoogleCredentials, + GoogleApplicationDefaultCredentials, + GoogleServiceAccountCredentials, + GoogleInstanceMetadataCredentials, +) from .user_config import get_gcs_requester_pays_configuration @@ -19,12 +25,14 @@ 'GoogleCredentials', 'GoogleApplicationDefaultCredentials', 'GoogleServiceAccountCredentials', + 'GoogleInstanceMetadataCredentials', 'GoogleBigQueryClient', 'GoogleBillingClient', 'GoogleContainerClient', 'GoogleComputeClient', 'GoogleIAmClient', 'GoogleLoggingClient', + 'GoogleMetadataServerClient', 'GoogleStorageClient', 'GoogleStorageAsyncFS', 'GoogleStorageAsyncFSFactory', diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/__init__.py b/hail/python/hailtop/aiocloud/aiogoogle/client/__init__.py index 97862eab671..d31a3b885c9 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/client/__init__.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/__init__.py @@ -4,6 +4,7 @@ from .compute_client import GoogleComputeClient from .iam_client import GoogleIAmClient from .logging_client import GoogleLoggingClient +from .metadata_server_client import GoogleMetadataServerClient from .storage_client import ( GCSRequesterPaysConfiguration, GoogleStorageClient, @@ -18,6 +19,7 @@ 'GoogleComputeClient', 'GoogleIAmClient', 'GoogleLoggingClient', + 'GoogleMetadataServerClient', 'GCSRequesterPaysConfiguration', 'GoogleStorageClient', 'GoogleStorageAsyncFS', diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/metadata_server_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/metadata_server_client.py new file mode 100644 index 00000000000..b716830ae06 --- /dev/null +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/metadata_server_client.py @@ -0,0 +1,30 @@ +from typing import Optional + +import aiohttp + +from hailtop import 
httpx +from hailtop.utils import retry_transient_errors + + +class GoogleMetadataServerClient: + def __init__(self, http_session: httpx.ClientSession): + self._session = http_session + self._project_id: Optional[str] = None + self._numeric_project_id: Optional[str] = None + + async def project(self) -> str: + if self._project_id is None: + self._project_id = await retry_transient_errors(self._get_text, '/project/project-id') + return self._project_id + + async def numeric_project_id(self) -> str: + if self._numeric_project_id is None: + self._numeric_project_id = await retry_transient_errors(self._get_text, '/project/numeric-project-id') + return self._numeric_project_id + + async def _get_text(self, path: str) -> str: + url = f'http://metadata.google.internal/computeMetadata/v1{path}' + headers = {'Metadata-Flavor': 'Google'} + timeout = aiohttp.ClientTimeout(total=60) + res = await self._session.get_read(url, headers=headers, timeout=timeout) + return res.decode('utf-8') diff --git a/hail/python/hailtop/aiocloud/aiogoogle/credentials.py b/hail/python/hailtop/aiocloud/aiogoogle/credentials.py index e2507017fbd..a98f7cbb08e 100644 --- a/hail/python/hailtop/aiocloud/aiogoogle/credentials.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/credentials.py @@ -19,11 +19,13 @@ class GoogleExpiringAccessToken: def from_dict(data: dict) -> 'GoogleExpiringAccessToken': now = time.time() token = data['access_token'] - expiry_time = now + data['expires_in'] // 2 - return GoogleExpiringAccessToken(token, expiry_time) + expires_in = data['expires_in'] + expiry_time = now + expires_in // 2 + return GoogleExpiringAccessToken(token, expires_in, expiry_time) - def __init__(self, token, expiry_time: int): + def __init__(self, token, expires_in: int, expiry_time: int): self.token = token + self.expires_in = expires_in self._expiry_time = expiry_time def expired(self) -> bool: @@ -171,13 +173,17 @@ async def _get_access_token(self) -> GoogleExpiringAccessToken: # https://developers.google.com/identity/protocols/oauth2/service-account # studying `gcloud --log-http print-access-token` was also useful class GoogleServiceAccountCredentials(GoogleCredentials): - def __init__(self, key, **kwargs): + def __init__(self, key: dict, **kwargs): super().__init__(**kwargs) self.key = key def __str__(self): return f'GoogleServiceAccountCredentials for {self.key["client_email"]}' + @property + def email(self) -> str: + return self.key['client_email'] + async def _get_access_token(self) -> GoogleExpiringAccessToken: now = int(time.time()) scope = ' '.join(self._scopes) From f47efb4d4f95c9377cb1d15b4c06a61e4139334d Mon Sep 17 00:00:00 2001 From: Daniel Goldstein Date: Thu, 1 Feb 2024 09:42:43 -0500 Subject: [PATCH 17/26] [qob] Update scala deploy config to use new base_path field (#14195) I forgot to include the changes in #14056 to the scala code as well. This favors using `basePath` in the Scala deploy config over the `defaultNamespace`. 
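As a rough illustration of the intended routing (a sketch only — the function name and example values below are made up for this description, and the authoritative logic is the Scala `DeployConfig` change in the diff):

```python
# Sketch of how base_path is meant to shape service URLs.
from typing import Optional

def base_url(service: str, domain: str, base_path: Optional[str], scheme: str = 'https') -> str:
    if base_path is None:
        # default namespace: every service gets its own subdomain
        return f'{scheme}://{service}.{domain}'
    # non-default namespace: one shared domain, routed by path
    return f'{scheme}://{domain}{base_path}/{service}'

assert base_url('batch', 'hail.is', None) == 'https://batch.hail.is'
assert base_url('batch', 'internal.hail.is', '/my-ns') == 'https://internal.hail.is/my-ns/batch'
```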
--- .../scala/is/hail/services/DeployConfig.scala | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/hail/src/main/scala/is/hail/services/DeployConfig.scala b/hail/src/main/scala/is/hail/services/DeployConfig.scala index 55ccdf36dfd..95ca0e01ece 100644 --- a/hail/src/main/scala/is/hail/services/DeployConfig.scala +++ b/hail/src/main/scala/is/hail/services/DeployConfig.scala @@ -42,7 +42,7 @@ object DeployConfig { if (file != null) { using(new FileInputStream(file))(in => fromConfig(JsonMethods.parse(in))) } else - fromConfig("external", "default", "hail.is") + fromConfig("external", "default", "hail.is", None) } def fromConfig(config: JValue): DeployConfig = { @@ -51,15 +51,28 @@ object DeployConfig { (config \ "location").extract[String], (config \ "default_namespace").extract[String], (config \ "domain").extract[Option[String]].getOrElse("hail.is"), + (config \ "base_path").extract[Option[String]], ) } - def fromConfig(location: String, defaultNamespace: String, domain: String): DeployConfig = - new DeployConfig( - sys.env.getOrElse(toEnvVarName("location"), location), - sys.env.getOrElse(toEnvVarName("default_namespace"), defaultNamespace), - sys.env.getOrElse(toEnvVarName("domain"), domain), - ) + def fromConfig( + locationFromConfig: String, + defaultNamespaceFromConfig: String, + domainFromConfig: String, + basePathFromConfig: Option[String], + ): DeployConfig = { + val location = sys.env.getOrElse(toEnvVarName("location"), locationFromConfig) + val defaultNamespace = + sys.env.getOrElse(toEnvVarName("default_namespace"), defaultNamespaceFromConfig) + val domain = sys.env.getOrElse(toEnvVarName("domain"), domainFromConfig) + val basePath = sys.env.get(toEnvVarName("basePath")).orElse(basePathFromConfig) + + (basePath, defaultNamespace) match { + case (None, ns) if ns != "default" => + new DeployConfig(location, ns, s"internal.$domain", Some(s"/$ns")) + case _ => new DeployConfig(location, defaultNamespace, domain, basePath) + } + } private[this] def toEnvVarName(s: String): String = "HAIL_" + s.toUpperCase @@ -69,6 +82,7 @@ class DeployConfig( val location: String, val defaultNamespace: String, val domain: String, + val basePath: Option[String], ) { def scheme(baseScheme: String = "http"): String = @@ -77,34 +91,28 @@ class DeployConfig( else baseScheme - def getServiceNamespace(service: String): String = - defaultNamespace - def domain(service: String): String = { - val ns = getServiceNamespace(service) location match { case "k8s" => - s"$service.$ns" + s"$service.$defaultNamespace" case "gce" => - if (ns == "default") + if (basePath.isEmpty) s"$service.hail" else "internal.hail" case "external" => - if (ns == "default") + if (basePath.isEmpty) s"$service.$domain" else - s"internal.$domain" + domain } } - def basePath(service: String): String = { - val ns = getServiceNamespace(service) - if (ns == "default") - "" - else - s"/$ns/$service" - } + def basePath(service: String): String = + basePath match { + case Some(base) => s"$base/$service" + case None => "" + } def baseUrl(service: String, baseScheme: String = "http"): String = s"${scheme(baseScheme)}://${domain(service)}${basePath(service)}" From 171b39df2f5e2754ff493e091513174e47cc8a3f Mon Sep 17 00:00:00 2001 From: jigold Date: Thu, 1 Feb 2024 12:32:37 -0500 Subject: [PATCH 18/26] [batch] Update IP Fee pricing for February 2024 price increase (#14190) Fixes #13784 Here's the GCP documentation: https://cloud.google.com/vpc/pricing-announce-external-ips We were previously billing the same 
IP-Fee for both spot and regular instances. I changed it so we're billing for each instance type accordingly. Following #13542, I hard coded the new resource rates. --- batch/batch/cloud/gcp/instance_config.py | 2 +- batch/batch/cloud/gcp/resources.py | 9 ++-- batch/sql/update-ip-fee-resource.py | 52 ++++++++++++++++++++++ build.yaml | 3 ++ hail/python/hailtop/batch/docs/service.rst | 23 ++++++---- 5 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 batch/sql/update-ip-fee-resource.py diff --git a/batch/batch/cloud/gcp/instance_config.py b/batch/batch/cloud/gcp/instance_config.py index 02e96662c11..2789ff6adcc 100644 --- a/batch/batch/cloud/gcp/instance_config.py +++ b/batch/batch/cloud/gcp/instance_config.py @@ -57,7 +57,7 @@ def create( GCPStaticSizedDiskResource.create(product_versions, 'pd-ssd', boot_disk_size_gb, region), data_disk_resource, GCPDynamicSizedDiskResource.create(product_versions, 'pd-ssd', region), - GCPIPFeeResource.create(product_versions, 1024), + GCPIPFeeResource.create(product_versions, 1024, preemptible), GCPServiceFeeResource.create(product_versions), GCPSupportLogsSpecsAndFirewallFees.create(product_versions), ] diff --git a/batch/batch/cloud/gcp/resources.py b/batch/batch/cloud/gcp/resources.py index 5fe098fcebf..e058d9d0cf5 100644 --- a/batch/batch/cloud/gcp/resources.py +++ b/batch/batch/cloud/gcp/resources.py @@ -270,8 +270,9 @@ class GCPIPFeeResource(IPFeeResourceMixin, GCPResource): TYPE = 'gcp_ip_fee' @staticmethod - def product_name(base: int) -> str: - return f'ip-fee/{base}' + def product_name(base: int, preemptible: bool) -> str: + preemptible_str = 'preemptible' if preemptible else 'nonpreemptible' + return f'ip-fee/{preemptible_str}/{base}' @staticmethod def from_dict(data: Dict[str, Any]) -> 'GCPIPFeeResource': @@ -279,8 +280,8 @@ def from_dict(data: Dict[str, Any]) -> 'GCPIPFeeResource': return GCPIPFeeResource(data['name']) @staticmethod - def create(product_versions: ProductVersions, base: int) -> 'GCPIPFeeResource': - product = GCPIPFeeResource.product_name(base) + def create(product_versions: ProductVersions, base: int, preemptible: bool) -> 'GCPIPFeeResource': + product = GCPIPFeeResource.product_name(base, preemptible) name = product_versions.resource_name(product) assert name, product return GCPIPFeeResource(name) diff --git a/batch/sql/update-ip-fee-resource.py b/batch/sql/update-ip-fee-resource.py new file mode 100644 index 00000000000..5edc5893127 --- /dev/null +++ b/batch/sql/update-ip-fee-resource.py @@ -0,0 +1,52 @@ +import os +import asyncio +from gear import Database, transaction, Transaction + + +async def main(): + cloud = os.environ['HAIL_CLOUD'] + if cloud != 'gcp': + return + + db = Database() + await db.async_init() + try: + @transaction(db) + async def insert(tx: Transaction): + await tx.execute_many( + ''' +INSERT INTO latest_product_versions (product, version) +VALUES (%s, %s); +''', + [('ip-fee/preemptible/1024', '1'), + ('ip-fee/nonpreemptible/1024', '1')] + ) + + # https://cloud.google.com/vpc/pricing-announce-external-ips + # from hailtop.utils import rate_instance_hour_to_fraction_msec + # spot_ip_fee = rate_instance_hour_to_fraction_msec(0.0025, 1024) + spot_ip_fee = 6.781684027777778e-13 + # standard_ip_fee = rate_instance_hour_to_fraction_msec(0.005, 1024) + standard_ip_fee = 1.3563368055555557e-12 + + await tx.execute_many( + ''' +INSERT INTO resources (resource, rate) +VALUES (%s, %s); +''', + [('ip-fee/preemptible/1024/1', spot_ip_fee), + ('ip-fee/nonpreemptible/1024/1', standard_ip_fee)] + ) + + 
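+        # Backfill deduped_resource_id for the two rows inserted above so that
+        # each new resource is recorded as its own deduplicated resource.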
await tx.execute_update(''' +UPDATE resources +SET deduped_resource_id = resource_id +WHERE resource = 'ip-fee/preemptible/1024/1' OR resource = 'ip-fee/nonpreemptible/1024/1'; +''') + + await insert() + finally: + await db.async_close() + + +asyncio.run(main()) diff --git a/build.yaml b/build.yaml index 123542089d3..fedb42fc571 100644 --- a/build.yaml +++ b/build.yaml @@ -2361,6 +2361,9 @@ steps: - name: remove-v2-billing-writes script: /io/sql/remove-v2-billing-writes.sql online: true + - name: update-ip-fee-resource + script: /io/sql/update-ip-fee-resource.py + online: true inputs: - from: /repo/batch/sql to: /io/sql diff --git a/hail/python/hailtop/batch/docs/service.rst b/hail/python/hailtop/batch/docs/service.rst index f20beb1ce4c..a0d38c4bf20 100644 --- a/hail/python/hailtop/batch/docs/service.rst +++ b/hail/python/hailtop/batch/docs/service.rst @@ -105,7 +105,7 @@ Billing ------- The cost for executing a job depends on the underlying machine type, the region in which the VM is running in, -and how much CPU and memory is being requested. Currently, Batch runs most jobs on 16 core, preemptible, n1 +and how much CPU and memory is being requested. Currently, Batch runs most jobs on 16 core, spot, n1 machines with 10 GB of persistent SSD boot disk and 375 GB of local SSD. The costs are as follows: - Compute cost @@ -116,11 +116,11 @@ machines with 10 GB of persistent SSD boot disk and 375 GB of local SSD. The cos based on the current spot prices for a given worker type and the region in which the worker is running in. You can use :meth:`.Job.regions` to specify which regions to run a job in. - = $0.01 per core per hour for **preemptible standard** worker types + = $0.01 per core per hour for **spot standard** worker types - = $0.012453 per core per hour for **preemptible highmem** worker types + = $0.012453 per core per hour for **spot highmem** worker types - = $0.0074578 per core per hour for **preemptible highcpu** worker types + = $0.0074578 per core per hour for **spot highcpu** worker types = $0.04749975 per core per hour for **nonpreemptible standard** worker types @@ -163,22 +163,27 @@ machines with 10 GB of persistent SSD boot disk and 375 GB of local SSD. The cos - IP network cost - = $0.00025 per core per hour + = $0.0003125 per core per hour for **nonpreemptible** worker types + + = $0.00015625 per core per hour for **spot** worker types - Service cost = $0.01 per core per hour +- Logs, Specs, and Firewall Fee + = $0.005 per core per hour + -The sum of these costs is **$0.021935** per core/hour for standard workers, **$0.024388** per core/hour -for highmem workers, and **$0.019393** per core/hour for highcpu workers. There is also an additional +The sum of these costs is **$0.02684125** per core/hour for standard spot workers, **$0.02929425** per core/hour +for highmem spot workers, and **$0.02429905** per core/hour for highcpu spot workers. There is also an additional cost of **$0.00023** per GB per hour of extra storage requested. At any given moment as many as four cores of the cluster may come from a 4 core machine if the worker type is standard. If a job is scheduled on this machine, then the cost per core hour is **$0.02774** plus **$0.00023** per GB per hour storage of extra storage requested. -For jobs that run on non-preemptible machines, the costs are **$0.060462** per core/hour for standard workers, **$0.072114** per core/hour -for highmem workers, and **$0.048365** per core/hour for highcpu workers. 
+For jobs that run on non-preemptible machines, the costs are **$0.06449725** per core/hour for standard workers, **$0.076149** per core/hour +for highmem workers, and **$0.0524218** per core/hour for highcpu workers. .. note:: From 7f12473d5477274eb54fb0873efab3b008ad09f9 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 1 Feb 2024 15:10:41 -0500 Subject: [PATCH 19/26] [query] avoid code explosion for trivial upcasts (#14232) --- .../scala/is/hail/expr/ir/PruneDeadFields.scala | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/hail/src/main/scala/is/hail/expr/ir/PruneDeadFields.scala b/hail/src/main/scala/is/hail/expr/ir/PruneDeadFields.scala index 67a1a2f8d04..013b40b584b 100644 --- a/hail/src/main/scala/is/hail/expr/ir/PruneDeadFields.scala +++ b/hail/src/main/scala/is/hail/expr/ir/PruneDeadFields.scala @@ -2853,12 +2853,16 @@ object PruneDeadFields { ir else { val result = ir.typ match { - case _: TStruct => - bindIR(ir) { ref => - val ms = MakeStruct(rType.asInstanceOf[TStruct].fields.map { f => - f.name -> upcast(ctx, GetField(ref, f.name), f.typ) - }) - If(IsNA(ref), NA(ms.typ), ms) + case tstruct: TStruct => + if (rType.asInstanceOf[TStruct].fields.forall(f => tstruct.field(f.name).typ == f.typ)) { + SelectFields(ir, rType.asInstanceOf[TStruct].fields.map(f => f.name)) + } else { + bindIR(ir) { ref => + val ms = MakeStruct(rType.asInstanceOf[TStruct].fields.map { f => + f.name -> upcast(ctx, GetField(ref, f.name), f.typ) + }) + If(IsNA(ref), NA(ms.typ), ms) + } } case ts: TStream => val ra = rType.asInstanceOf[TStream] From dace919d1ab51cb8104aeaac4445555b28d601bc Mon Sep 17 00:00:00 2001 From: Dan King Date: Fri, 2 Feb 2024 10:25:39 -0500 Subject: [PATCH 20/26] [hailtop.batch] add default_regions to hb.Batch, improve docs (#14224) `hb.Batch` now supports `default_regions` which completes the natural hierarchy of: config, envvar, backend, batch, job. I went a little hog wild with examples. I think we should have more examples everywhere! The ServiceBackend doc page also had several basic formatting issues which I addressed. --- hail/python/hailtop/batch/backend.py | 110 ++++++++++++++---- hail/python/hailtop/batch/batch.py | 24 ++-- hail/python/hailtop/batch/docs/service.rst | 83 +++++++++++-- .../batch/test_batch_service_backend.py | 30 ++++- 4 files changed, 203 insertions(+), 44 deletions(-) diff --git a/hail/python/hailtop/batch/backend.py b/hail/python/hailtop/batch/backend.py index 3987ff41a84..c36594fc61e 100644 --- a/hail/python/hailtop/batch/backend.py +++ b/hail/python/hailtop/batch/backend.py @@ -413,42 +413,100 @@ async def _async_close(self): class ServiceBackend(Backend[bc.Batch]): - ANY_REGION: ClassVar[List[str]] = ['any_region'] - """Backend that executes batches on Hail's Batch Service on Google Cloud. Examples -------- - >>> service_backend = ServiceBackend(billing_project='my-billing-account', remote_tmpdir='gs://my-bucket/temporary-files/') # doctest: +SKIP - >>> b = Batch(backend=service_backend) # doctest: +SKIP + Create and use a backend that bills to the Hail Batch billing project named "my-billing-account" + and stores temporary intermediate files in "gs://my-bucket/temporary-files". + + >>> import hailtop.batch as hb + >>> service_backend = hb.ServiceBackend( + ... billing_project='my-billing-account', + ... remote_tmpdir='gs://my-bucket/temporary-files/' + ... 
) # doctest: +SKIP + >>> b = hb.Batch(backend=service_backend) # doctest: +SKIP + >>> j = b.new_job() # doctest: +SKIP + >>> j.command('echo hello world!') # doctest: +SKIP >>> b.run() # doctest: +SKIP - >>> service_backend.close() # doctest: +SKIP - If the Hail configuration parameters batch/billing_project and - batch/remote_tmpdir were previously set with ``hailctl config set``, then - one may elide the `billing_project` and `remote_tmpdir` parameters. + Same as above, but set the billing project and temporary intermediate folders via a + configuration file:: - >>> service_backend = ServiceBackend() - >>> b = Batch(backend=service_backend) - >>> b.run() # doctest: +SKIP - >>> service_backend.close() + cat >my-batch-script.py >>EOF + import hailtop.batch as hb + b = hb.Batch(backend=ServiceBackend()) + j = b.new_job() + j.command('echo hello world!') + b.run() + EOF + hailctl config set batch/billing_project my-billing-account + hailctl config set batch/remote_tmpdir gs://my-bucket/temporary-files/ + python3 my-batch-script.py + + Same as above, but also specify the use of the :class:`.ServiceBackend` via configuration file:: + + cat >my-batch-script.py >>EOF + import hailtop.batch as hb + b = hb.Batch() + j = b.new_job() + j.command('echo hello world!') + b.run() + EOF + hailctl config set batch/billing_project my-billing-account + hailctl config set batch/remote_tmpdir gs://my-bucket/temporary-files/ + hailctl config set batch/backend service + python3 my-batch-script.py + + Create a backend which stores temporary intermediate files in + "https://my-account.blob.core.windows.net/my-container/tempdir". + + >>> service_backend = hb.ServiceBackend( + ... billing_project='my-billing-account', + ... remote_tmpdir='https://my-account.blob.core.windows.net/my-container/tempdir' + ... ) # doctest: +SKIP + + Require all jobs in all batches in this backend to execute in us-central1:: + + >>> b = hb.Batch(backend=hb.ServiceBackend(regions=['us-central1'])) + + Same as above, but using a configuration file:: + + hailctl config set batch/regions us-central1 + python3 my-batch-script.py + Same as above, but using the ``HAIL_BATCH_REGIONS`` environment variable:: + + export HAIL_BATCH_REGIONS=us-central1 + python3 my-batch-script.py + + Permit jobs to execute in *either* us-central1 or us-east1:: + + >>> b = hb.Batch(backend=hb.ServiceBackend(regions=['us-central1', 'us-east1'])) + + Same as above, but using a configuration file:: + + hailctl config set batch/regions us-central1,us-east1 + + Allow reading or writing to buckets even though they are "cold" storage: + + >>> b = hb.Batch( + ... backend=hb.ServiceBackend( + ... gcs_bucket_allow_list=['cold-bucket', 'cold-bucket2'], + ... ), + ... ) Parameters ---------- billing_project: Name of billing project to use. bucket: - Name of bucket to use. Should not include the ``gs://`` prefix. Cannot be used with - `remote_tmpdir`. Temporary data will be stored in the "/batch" folder of this - bucket. This argument is deprecated. Use `remote_tmpdir` instead. + This argument is deprecated. Use `remote_tmpdir` instead. remote_tmpdir: - Temporary data will be stored in this cloud storage folder. Cannot be used with deprecated - argument `bucket`. Paths should match a GCS URI like gs:/// or an ABS - URI of the form https://.blob.core.windows.net//. + Temporary data will be stored in this cloud storage folder. google_project: - DEPRECATED. Please use gcs_requester_pays_configuration. + This argument is deprecated. Use `gcs_requester_pays_configuration` instead. 
gcs_requester_pays_configuration : either :class:`str` or :class:`tuple` of :class:`str` and :class:`list` of :class:`str`, optional If a string is provided, configure the Google Cloud Storage file system to bill usage to the project identified by that string. If a tuple is provided, configure the Google Cloud @@ -458,15 +516,19 @@ class ServiceBackend(Backend[bc.Batch]): The authorization token to pass to the batch client. Should only be set for user delegation purposes. regions: - Cloud region(s) to run jobs in. Use py:staticmethod:`.ServiceBackend.supported_regions` to list the - available regions to choose from. Use py:attribute:`.ServiceBackend.ANY_REGION` to signify the default is jobs - can run in any available region. The default is jobs can run in any region unless a default value has - been set with hailctl. An example invocation is `hailctl config set batch/regions "us-central1,us-east1"`. + Cloud regions in which jobs may run. :attr:`.ServiceBackend.ANY_REGION` indicates jobs may + run in any region. If unspecified or ``None``, the ``batch/regions`` Hail configuration + variable is consulted. See examples above. If none of these variables are set, then jobs may + run in any region. :meth:`.ServiceBackend.supported_regions` lists the available regions. gcs_bucket_allow_list: A list of buckets that the :class:`.ServiceBackend` should be permitted to read from or write to, even if their - default policy is to use "cold" storage. Should look like ``["bucket1", "bucket2"]``. + default policy is to use "cold" storage. + """ + ANY_REGION: ClassVar[List[str]] = ['any_region'] + """A special value that indicates a job may run in any region.""" + @staticmethod def supported_regions(): """ diff --git a/hail/python/hailtop/batch/batch.py b/hail/python/hailtop/batch/batch.py index f2370628db0..5026ebde873 100644 --- a/hail/python/hailtop/batch/batch.py +++ b/hail/python/hailtop/batch/batch.py @@ -24,7 +24,8 @@ class Batch: -------- Create a batch object: - >>> p = Batch() + >>> import hailtop.batch as hb + >>> p = hb.Batch() Create a new job that prints "hello": @@ -35,6 +36,10 @@ class Batch: >>> p.run() + Require all jobs in this batch to execute in us-central1: + + >>> b = hb.Batch(backend=hb.ServiceBackend(), default_regions=['us-central1']) + Notes ----- @@ -77,6 +82,9 @@ class Batch: default_storage: Storage setting to use by default if not specified by a job. Only applicable for the :class:`.ServiceBackend`. See :meth:`.Job.storage`. + default_regions: + Cloud regions in which jobs may run. When unspecified or ``None``, use the regions attribute of + :class:`.ServiceBackend`. See :class:`.ServiceBackend` for details. default_timeout: Maximum time in seconds for a job to run before being killed. Only applicable for the :class:`.ServiceBackend`. 
If `None`, there is no @@ -157,6 +165,7 @@ def __init__( default_memory: Optional[Union[int, str]] = None, default_cpu: Optional[Union[float, int, str]] = None, default_storage: Optional[Union[int, str]] = None, + default_regions: Optional[List[str]] = None, default_timeout: Optional[Union[float, int]] = None, default_shell: Optional[str] = None, default_python_image: Optional[str] = None, @@ -195,6 +204,9 @@ def __init__( self._default_memory = default_memory self._default_cpu = default_cpu self._default_storage = default_storage + self._default_regions = default_regions + if self._default_regions is None and isinstance(self._backend, _backend.ServiceBackend): + self._default_regions = self._backend.regions self._default_timeout = default_timeout self._default_shell = default_shell self._default_python_image = default_python_image @@ -316,14 +328,13 @@ def new_bash_job( j.cpu(self._default_cpu) if self._default_storage is not None: j.storage(self._default_storage) + if self._default_regions is not None: + j.regions(self._default_regions) if self._default_timeout is not None: j.timeout(self._default_timeout) if self._default_spot is not None: j.spot(self._default_spot) - if isinstance(self._backend, _backend.ServiceBackend): - j.regions(self._backend.regions) - self._jobs.append(j) return j @@ -388,14 +399,13 @@ def hello(name): j.cpu(self._default_cpu) if self._default_storage is not None: j.storage(self._default_storage) + if self._default_regions is not None: + j.regions(self._default_regions) if self._default_timeout is not None: j.timeout(self._default_timeout) if self._default_spot is not None: j.spot(self._default_spot) - if isinstance(self._backend, _backend.ServiceBackend): - j.regions(self._backend.regions) - self._jobs.append(j) return j diff --git a/hail/python/hailtop/batch/docs/service.rst b/hail/python/hailtop/batch/docs/service.rst index a0d38c4bf20..9b8eff78491 100644 --- a/hail/python/hailtop/batch/docs/service.rst +++ b/hail/python/hailtop/batch/docs/service.rst @@ -232,22 +232,15 @@ error messages in the terminal window. Submitting a Batch to the Service --------------------------------- +.. warning:: + + To avoid substantial network costs, ensure your jobs and data reside in the same `region`_. + To execute a batch on the Batch service rather than locally, first construct a :class:`.ServiceBackend` object with a billing project and bucket for storing intermediate files. Your service account must have read and write access to the bucket. -.. warning:: - - By default, the Batch Service runs jobs in any region in the US. Make sure you have considered additional `ingress and - egress fees `_ when using regional buckets and container or artifact - registries. Multi-regional buckets also have additional replication fees when writing data. A good rule of thumb is to use - a multi-regional artifact registry for Docker images and regional buckets for data. You can then specify which region(s) - you want your job to run in with :meth:`.Job.regions`. To set the default region(s) for all jobs, you can set the input - regions argument to :class:`.ServiceBackend` or use hailctl to set the default value. An example invocation is - `hailctl config set batch/regions "us-central1,us-east1"`. You can also get the full list of supported regions - with py:staticmethod:`.ServiceBackend.supported_regions`. - Next, pass the :class:`.ServiceBackend` object to the :class:`.Batch` constructor with the parameter name `backend`. @@ -257,7 +250,7 @@ and execute the following batch: .. 
code-block:: python - >>> import hailtop.batch as hb # doctest: +SKIP + >>> import hailtop.batch as hb >>> backend = hb.ServiceBackend('my-billing-project', remote_tmpdir='gs://my-bucket/batch/tmp/') # doctest: +SKIP >>> b = hb.Batch(backend=backend, name='test') # doctest: +SKIP >>> j = b.new_job(name='hello') # doctest: +SKIP @@ -276,6 +269,72 @@ have previously set them with ``hailctl``: A trial billing project is automatically created for you with the name {USERNAME}-trial +.. _region: + +Regions +------- + +Data and compute both reside in a physical location. In Google Cloud Platform, the location of data +is controlled by the location of the containing bucket. ``gcloud`` can determine the location of a +bucket:: + + gcloud storage buckets describe gs://my-bucket + +If your compute resides in a different location from the data it reads or writes, then you will +accrue substantial `network charges `__. + +To avoid network charges ensure all your data is in one region and specify that region in one of the +following five ways. As a running example, we consider data stored in `us-central1`. The options are +listed from highest to lowest precedence. + +1. :meth:`.Job.regions`: + + .. code-block:: python + + >>> b = hb.Batch(backend=hb.ServiceBackend()) + >>> j = b.new_job() + >>> j.regions(['us-central1']) + +2. The ``default_regions`` parameter of :class:`.Batch`: + + .. code-block:: python + + >>> b = hb.Batch(backend=hb.ServiceBackend(), default_regions=['us-central1']) + + +3. The ``regions`` parameter of :class:`.ServiceBackend`: + + .. code-block:: python + + >>> b = hb.Batch(backend=hb.ServiceBackend(regions=['us-central1'])) + +4. The ``HAIL_BATCH_REGIONS`` environment variable: + + .. code-block:: sh + + export HAIL_BATCH_REGIONS=us-central1 + python3 my-batch-script.py + +5. The ``batch/region`` configuration variable: + + .. code-block:: sh + + hailctl config set batch/regions us-central1 + python3 my-batch-script.py + +.. warning:: + + If none of the five options above are specified, your job may run in *any* region! + +In Google Cloud Platform, the location of a multi-region bucket is considered *different* from any +region within that multi-region. For example, if a VM in the `us-central1` region reads data from a +bucket in the `us` multi-region, this incurs network charges becuse `us` is not considered equal to +`us-central1`. + +Container (aka Docker) images are a form of data. In Google Cloud Platform, we recommend storing +your images in a multi-regional artifact registry, which at time of writing, despite being +"multi-regional", does not incur network charges in the manner described above. 
+ Using the UI ------------ diff --git a/hail/python/test/hailtop/batch/test_batch_service_backend.py b/hail/python/test/hailtop/batch/test_batch_service_backend.py index cb8f0291ff3..0932d53d296 100644 --- a/hail/python/test/hailtop/batch/test_batch_service_backend.py +++ b/hail/python/test/hailtop/batch/test_batch_service_backend.py @@ -798,7 +798,7 @@ async def foo(i, j): def test_specify_job_region(backend: ServiceBackend): - b = batch(backend, cancel_after_n_failures=1) + b = batch(backend) j = b.new_job('region') possible_regions = backend.supported_regions() j.regions(possible_regions) @@ -809,6 +809,34 @@ def test_specify_job_region(backend: ServiceBackend): assert res_status['state'] == 'success', str((res_status, res.debug_info())) +def test_job_regions_controls_job_execution_region(backend: ServiceBackend): + the_region = backend.supported_regions()[0] + + b = batch(backend) + j = b.new_job() + j.regions([the_region]) + j.command('true') + res = b.run() + + assert res + job_status = res.get_job(1).status() + assert job_status['status']['region'] == the_region, str((job_status, res.debug_info())) + + +def test_job_regions_overrides_batch_regions(backend: ServiceBackend): + the_region = backend.supported_regions()[0] + + b = batch(backend, default_regions=['some-other-region']) + j = b.new_job() + j.regions([the_region]) + j.command('true') + res = b.run() + + assert res + job_status = res.get_job(1).status() + assert job_status['status']['region'] == the_region, str((job_status, res.debug_info())) + + def test_always_copy_output(backend: ServiceBackend, output_tmpdir: str): output_path = os.path.join(output_tmpdir, 'test_always_copy_output.txt') From 3cb79ec3786d5616db6e84f79d0bc50fcf09c596 Mon Sep 17 00:00:00 2001 From: Dan King Date: Fri, 2 Feb 2024 11:11:18 -0500 Subject: [PATCH 21/26] [batch] silence instance logs (#14243) `oldwarn` is somehow `None` which spams us with instance log errors. We can revisit the warning level in a PR if this is really important. 
https://cloudlogging.app.goo.gl/VmUohrJSNo6EjsK56 --- batch/batch/worker/worker.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/batch/batch/worker/worker.py b/batch/batch/worker/worker.py index 64bcf90ac30..c265e80564f 100644 --- a/batch/batch/worker/worker.py +++ b/batch/batch/worker/worker.py @@ -14,7 +14,6 @@ import tempfile import traceback import uuid -import warnings from collections import defaultdict from contextlib import AsyncExitStack, ExitStack from typing import ( @@ -95,19 +94,6 @@ with open('/subdomains.txt', 'r', encoding='utf-8') as subdomains_file: HAIL_SERVICES = [line.rstrip() for line in subdomains_file.readlines()] -oldwarn = warnings.warn - - -def deeper_stack_level_warn(*args, **kwargs): - if 'stacklevel' in kwargs: - kwargs['stacklevel'] = max(kwargs['stacklevel'], 5) - else: - kwargs['stacklevel'] = 5 - return oldwarn(*args, **kwargs) - - -warnings.warn = deeper_stack_level_warn - class BatchWorkerAccessLogger(AccessLogger): def __init__(self, logger: logging.Logger, log_format: str): From d2615543476bde5d01061499c92f26124b85caf3 Mon Sep 17 00:00:00 2001 From: Dan King Date: Fri, 2 Feb 2024 14:21:47 -0500 Subject: [PATCH 22/26] [dependencies] mass update (#14233) --- batch/pinned-requirements.txt | 20 ++-- benchmark/python/pinned-requirements.txt | 6 +- ci/pinned-requirements.txt | 4 +- gear/pinned-requirements.txt | 16 +-- hail/python/dev/pinned-requirements.txt | 108 ++++++++---------- hail/python/dev/requirements.txt | 2 + .../hailtop/batch/batch_pool_executor.py | 4 +- hail/python/hailtop/pinned-requirements.txt | 34 +++--- hail/python/hailtop/utils/__init__.py | 2 + hail/python/hailtop/utils/utils.py | 6 + hail/python/pinned-requirements.txt | 50 ++++---- web_common/pinned-requirements.txt | 12 +- 12 files changed, 133 insertions(+), 131 deletions(-) diff --git a/batch/pinned-requirements.txt b/batch/pinned-requirements.txt index 6d32427ac40..b678b8b0db0 100644 --- a/batch/pinned-requirements.txt +++ b/batch/pinned-requirements.txt @@ -6,14 +6,14 @@ # aiodocker==0.21.0 # via -r hail/batch/requirements.txt -aiohttp==3.9.1 +aiohttp==3.9.3 # via # -c hail/batch/../gear/pinned-requirements.txt # -c hail/batch/../hail/python/dev/pinned-requirements.txt # -c hail/batch/../hail/python/pinned-requirements.txt # -c hail/batch/../web_common/pinned-requirements.txt # aiodocker -aiorwlock==1.3.0 +aiorwlock==1.4.0 # via -r hail/batch/requirements.txt aiosignal==1.3.1 # via @@ -30,7 +30,7 @@ async-timeout==4.0.3 # -c hail/batch/../web_common/pinned-requirements.txt # -r hail/batch/requirements.txt # aiohttp -attrs==23.1.0 +attrs==23.2.0 # via # -c hail/batch/../gear/pinned-requirements.txt # -c hail/batch/../hail/python/dev/pinned-requirements.txt @@ -39,7 +39,7 @@ attrs==23.1.0 # aiohttp dictdiffer==0.9.0 # via -r hail/batch/requirements.txt -frozenlist==1.4.0 +frozenlist==1.4.1 # via # -c hail/batch/../gear/pinned-requirements.txt # -c hail/batch/../hail/python/dev/pinned-requirements.txt @@ -62,7 +62,7 @@ multidict==6.0.4 # -c hail/batch/../web_common/pinned-requirements.txt # aiohttp # yarl -numpy==1.26.2 +numpy==1.26.3 # via # -c hail/batch/../hail/python/pinned-requirements.txt # pandas @@ -71,7 +71,7 @@ packaging==23.2 # -c hail/batch/../hail/python/dev/pinned-requirements.txt # -c hail/batch/../hail/python/pinned-requirements.txt # plotly -pandas==2.1.4 +pandas==2.2.0 # via # -c hail/batch/../hail/python/pinned-requirements.txt # -r hail/batch/requirements.txt @@ -85,7 +85,7 @@ python-dateutil==2.8.2 # -c 
hail/batch/../hail/python/dev/pinned-requirements.txt # -c hail/batch/../hail/python/pinned-requirements.txt # pandas -pytz==2023.3.post1 +pytz==2023.4 # via # -c hail/batch/../hail/python/pinned-requirements.txt # pandas @@ -99,16 +99,16 @@ tenacity==8.2.3 # via # -c hail/batch/../hail/python/pinned-requirements.txt # plotly -typing-extensions==4.8.0 +typing-extensions==4.9.0 # via # -c hail/batch/../hail/python/dev/pinned-requirements.txt # -c hail/batch/../hail/python/pinned-requirements.txt # aiodocker -tzdata==2023.3 +tzdata==2023.4 # via # -c hail/batch/../hail/python/pinned-requirements.txt # pandas -yarl==1.9.3 +yarl==1.9.4 # via # -c hail/batch/../gear/pinned-requirements.txt # -c hail/batch/../hail/python/dev/pinned-requirements.txt diff --git a/benchmark/python/pinned-requirements.txt b/benchmark/python/pinned-requirements.txt index 95ce5f36afa..6ecbfbec2ca 100644 --- a/benchmark/python/pinned-requirements.txt +++ b/benchmark/python/pinned-requirements.txt @@ -10,7 +10,7 @@ contourpy==1.2.0 # matplotlib cycler==0.12.1 # via matplotlib -fonttools==4.47.0 +fonttools==4.47.2 # via matplotlib importlib-resources==6.1.1 # via matplotlib @@ -18,7 +18,7 @@ kiwisolver==1.4.5 # via matplotlib matplotlib==3.8.2 # via -r hail/benchmark/python/requirements.txt -numpy==1.26.2 +numpy==1.26.3 # via # -c hail/benchmark/python/../../hail/python/pinned-requirements.txt # contourpy @@ -28,7 +28,7 @@ packaging==23.2 # -c hail/benchmark/python/../../hail/python/dev/pinned-requirements.txt # -c hail/benchmark/python/../../hail/python/pinned-requirements.txt # matplotlib -pillow==10.1.0 +pillow==10.2.0 # via # -c hail/benchmark/python/../../hail/python/dev/pinned-requirements.txt # -c hail/benchmark/python/../../hail/python/pinned-requirements.txt diff --git a/ci/pinned-requirements.txt b/ci/pinned-requirements.txt index 9bcfd442c2c..12daad20f0d 100644 --- a/ci/pinned-requirements.txt +++ b/ci/pinned-requirements.txt @@ -26,7 +26,7 @@ click==8.1.7 # -c hail/ci/../hail/python/dev/pinned-requirements.txt # -c hail/ci/../hail/python/pinned-requirements.txt # zulip -cryptography==41.0.7 +cryptography==42.0.2 # via # -c hail/ci/../hail/python/pinned-requirements.txt # pyjwt @@ -56,7 +56,7 @@ requests[security]==2.31.0 # -c hail/ci/../hail/python/dev/pinned-requirements.txt # -c hail/ci/../hail/python/pinned-requirements.txt # zulip -typing-extensions==4.8.0 +typing-extensions==4.9.0 # via # -c hail/ci/../hail/python/dev/pinned-requirements.txt # -c hail/ci/../hail/python/pinned-requirements.txt diff --git a/gear/pinned-requirements.txt b/gear/pinned-requirements.txt index 4189b27d894..244e2bb15a9 100644 --- a/gear/pinned-requirements.txt +++ b/gear/pinned-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=hail/gear/pinned-requirements.txt hail/gear/requirements.txt # -aiohttp==3.9.1 +aiohttp==3.9.3 # via # -c hail/gear/../hail/python/dev/pinned-requirements.txt # -c hail/gear/../hail/python/hailtop/pinned-requirements.txt @@ -27,7 +27,7 @@ async-timeout==4.0.3 # -c hail/gear/../hail/python/hailtop/pinned-requirements.txt # -c hail/gear/../hail/python/pinned-requirements.txt # aiohttp -attrs==23.1.0 +attrs==23.2.0 # via # -c hail/gear/../hail/python/dev/pinned-requirements.txt # -c hail/gear/../hail/python/hailtop/pinned-requirements.txt @@ -51,18 +51,18 @@ charset-normalizer==3.3.2 # -c hail/gear/../hail/python/hailtop/pinned-requirements.txt # -c hail/gear/../hail/python/pinned-requirements.txt # requests -frozenlist==1.4.0 +frozenlist==1.4.1 # via # -c 
hail/gear/../hail/python/dev/pinned-requirements.txt # -c hail/gear/../hail/python/hailtop/pinned-requirements.txt # -c hail/gear/../hail/python/pinned-requirements.txt # aiohttp # aiosignal -google-api-core==2.15.0 +google-api-core==2.16.1 # via google-api-python-client -google-api-python-client==2.111.0 +google-api-python-client==2.116.0 # via google-cloud-profiler -google-auth==2.23.4 +google-auth==2.27.0 # via # -c hail/gear/../hail/python/hailtop/pinned-requirements.txt # -c hail/gear/../hail/python/pinned-requirements.txt @@ -98,7 +98,7 @@ multidict==6.0.4 # -c hail/gear/../hail/python/pinned-requirements.txt # aiohttp # yarl -orjson==3.9.10 +orjson==3.9.12 # via # -c hail/gear/../hail/python/hailtop/pinned-requirements.txt # -c hail/gear/../hail/python/pinned-requirements.txt @@ -183,7 +183,7 @@ wrapt==1.16.0 # -c hail/gear/../hail/python/dev/pinned-requirements.txt # -c hail/gear/../hail/python/pinned-requirements.txt # prometheus-async -yarl==1.9.3 +yarl==1.9.4 # via # -c hail/gear/../hail/python/dev/pinned-requirements.txt # -c hail/gear/../hail/python/hailtop/pinned-requirements.txt diff --git a/hail/python/dev/pinned-requirements.txt b/hail/python/dev/pinned-requirements.txt index a7a0385d54d..96fb9d708b9 100644 --- a/hail/python/dev/pinned-requirements.txt +++ b/hail/python/dev/pinned-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=hail/hail/python/dev/pinned-requirements.txt hail/hail/python/dev/requirements.txt # -aiohttp==3.9.1 +aiohttp==3.9.3 # via # -c hail/hail/python/dev/../pinned-requirements.txt # aiohttp-devtools @@ -14,7 +14,7 @@ aiosignal==1.3.1 # via # -c hail/hail/python/dev/../pinned-requirements.txt # aiohttp -alabaster==0.7.13 +alabaster==0.7.16 # via sphinx anyio==4.2.0 # via @@ -38,7 +38,7 @@ async-timeout==4.0.3 # via # -c hail/hail/python/dev/../pinned-requirements.txt # aiohttp -attrs==23.1.0 +attrs==23.2.0 # via # -c hail/hail/python/dev/../pinned-requirements.txt # aiohttp @@ -49,7 +49,7 @@ babel==2.14.0 # via # jupyterlab-server # sphinx -beautifulsoup4==4.12.2 +beautifulsoup4==4.12.3 # via nbconvert bleach==6.1.0 # via nbconvert @@ -72,9 +72,8 @@ click==8.1.7 # -c hail/hail/python/dev/../pinned-requirements.txt # -r hail/hail/python/dev/requirements.txt # aiohttp-devtools - # black # curlylint -comm==0.2.0 +comm==0.2.1 # via # ipykernel # ipywidgets @@ -90,7 +89,7 @@ defusedxml==0.7.1 # via nbconvert devtools==0.12.2 # via aiohttp-devtools -dill==0.3.7 +dill==0.3.8 # via # -c hail/hail/python/dev/../pinned-requirements.txt # pylint @@ -112,13 +111,13 @@ executing==2.0.1 # via # devtools # stack-data -fastjsonschema==2.19.0 +fastjsonschema==2.19.1 # via nbformat filelock==3.13.1 # via virtualenv fqdn==1.5.1 # via jsonschema -frozenlist==1.4.0 +frozenlist==1.4.1 # via # -c hail/hail/python/dev/../pinned-requirements.txt # aiohttp @@ -136,7 +135,7 @@ idna==3.6 # yarl imagesize==1.4.1 # via sphinx -importlib-metadata==7.0.0 +importlib-metadata==7.0.1 # via # jupyter-client # jupyter-lsp @@ -146,7 +145,7 @@ importlib-metadata==7.0.0 # sphinx iniconfig==2.0.0 # via pytest -ipykernel==6.27.1 +ipykernel==6.29.0 # via # jupyter # jupyter-console @@ -178,12 +177,12 @@ json5==0.9.14 # via jupyterlab-server jsonpointer==2.4 # via jsonschema -jsonschema[format-nongpl]==4.20.0 +jsonschema[format-nongpl]==4.21.1 # via # jupyter-events # jupyterlab-server # nbformat -jsonschema-specifications==2023.11.2 +jsonschema-specifications==2023.12.1 # via jsonschema jupyter==1.0.0 # via -r hail/hail/python/dev/requirements.txt @@ -196,7 +195,7 @@ 
jupyter-client==8.6.0 # qtconsole jupyter-console==6.6.3 # via jupyter -jupyter-core==5.5.1 +jupyter-core==5.7.1 # via # ipykernel # jupyter-client @@ -209,16 +208,16 @@ jupyter-core==5.5.1 # qtconsole jupyter-events==0.9.0 # via jupyter-server -jupyter-lsp==2.2.1 +jupyter-lsp==2.2.2 # via jupyterlab -jupyter-server==2.12.1 +jupyter-server==2.12.5 # via # jupyter-lsp # jupyterlab # jupyterlab-server # notebook # notebook-shim -jupyter-server-terminals==0.5.0 +jupyter-server-terminals==0.5.2 # via jupyter-server jupyterlab==4.0.12 # via notebook @@ -232,7 +231,7 @@ jupyterlab-widgets==3.0.9 # via ipywidgets lazy-object-proxy==1.10.0 # via astroid -markupsafe==2.1.3 +markupsafe==2.1.4 # via # -c hail/hail/python/dev/../pinned-requirements.txt # jinja2 @@ -250,12 +249,11 @@ multidict==6.0.4 # -c hail/hail/python/dev/../pinned-requirements.txt # aiohttp # yarl -mypy-extensions==1.0.0 - # via black nbclient==0.9.0 # via nbconvert -nbconvert==7.13.0 +nbconvert==7.13.1 # via + # -r hail/hail/python/dev/requirements.txt # jupyter # jupyter-server # nbsphinx @@ -267,7 +265,7 @@ nbformat==5.9.2 # nbsphinx nbsphinx==0.9.3 # via -r hail/hail/python/dev/requirements.txt -nest-asyncio==1.5.8 +nest-asyncio==1.6.0 # via # -c hail/hail/python/dev/../pinned-requirements.txt # ipykernel @@ -275,13 +273,13 @@ nodeenv==1.8.0 # via # pre-commit # pyright -notebook==7.0.6 +notebook==7.0.7 # via jupyter notebook-shim==0.2.3 # via # jupyterlab # notebook -overrides==7.4.0 +overrides==7.7.0 # via jupyter-server packaging==23.2 # via @@ -295,29 +293,26 @@ packaging==23.2 # qtconsole # qtpy # sphinx -pandocfilters==1.5.0 +pandocfilters==1.5.1 # via nbconvert parso==0.8.3 # via jedi parsy==1.1.0 # via curlylint pathspec==0.12.1 - # via - # black - # curlylint + # via curlylint pexpect==4.9.0 # via ipython -pillow==10.1.0 +pillow==10.2.0 # via # -c hail/hail/python/dev/../pinned-requirements.txt # -r hail/hail/python/dev/requirements.txt -platformdirs==4.1.0 +platformdirs==4.2.0 # via - # black # jupyter-core # pylint # virtualenv -pluggy==1.3.0 +pluggy==1.4.0 # via pytest pre-commit==3.6.0 # via -r hail/hail/python/dev/requirements.txt @@ -327,7 +322,7 @@ prompt-toolkit==3.0.43 # via # ipython # jupyter-console -psutil==5.9.7 +psutil==5.9.8 # via ipykernel ptyprocess==0.7.0 # via @@ -353,9 +348,9 @@ pygments==2.17.2 # sphinx pylint==2.17.7 # via -r hail/hail/python/dev/requirements.txt -pyright==1.1.341 +pyright==1.1.349 # via -r hail/hail/python/dev/requirements.txt -pytest==7.4.3 +pytest==7.4.4 # via # -r hail/hail/python/dev/requirements.txt # pytest-asyncio @@ -373,7 +368,7 @@ pytest-html==1.22.1 # via -r hail/hail/python/dev/requirements.txt pytest-instafail==0.5.0 # via -r hail/hail/python/dev/requirements.txt -pytest-metadata==3.0.0 +pytest-metadata==3.1.0 # via pytest-html pytest-timeout==2.2.0 # via -r hail/hail/python/dev/requirements.txt @@ -406,7 +401,7 @@ qtconsole==5.5.1 # via jupyter qtpy==2.4.1 # via qtconsole -referencing==0.32.0 +referencing==0.33.0 # via # jsonschema # jsonschema-specifications @@ -424,7 +419,7 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events -rpds-py==0.15.2 +rpds-py==0.17.1 # via # jsonschema # referencing @@ -451,22 +446,17 @@ sphinx==6.2.1 # nbsphinx # sphinx-autodoc-typehints # sphinx-rtd-theme - # sphinxcontrib-applehelp - # sphinxcontrib-devhelp - # sphinxcontrib-htmlhelp # sphinxcontrib-jquery # sphinxcontrib-katex - # sphinxcontrib-qthelp - # sphinxcontrib-serializinghtml sphinx-autodoc-typehints==1.23.0 # via -r hail/hail/python/dev/requirements.txt 
sphinx-rtd-theme==1.3.0 # via -r hail/hail/python/dev/requirements.txt -sphinxcontrib-applehelp==1.0.7 +sphinxcontrib-applehelp==1.0.8 # via sphinx -sphinxcontrib-devhelp==1.0.5 +sphinxcontrib-devhelp==1.0.6 # via sphinx -sphinxcontrib-htmlhelp==2.0.4 +sphinxcontrib-htmlhelp==2.0.5 # via sphinx sphinxcontrib-jquery==4.1 # via sphinx-rtd-theme @@ -474,9 +464,9 @@ sphinxcontrib-jsmath==1.0.1 # via sphinx sphinxcontrib-katex==0.9.9 # via -r hail/hail/python/dev/requirements.txt -sphinxcontrib-qthelp==1.0.6 +sphinxcontrib-qthelp==1.0.7 # via sphinx -sphinxcontrib-serializinghtml==1.1.9 +sphinxcontrib-serializinghtml==1.1.10 # via sphinx stack-data==0.6.3 # via ipython @@ -490,7 +480,6 @@ toml==0.10.2 # via curlylint tomli==2.0.1 # via - # black # jupyterlab # pylint # pytest @@ -505,7 +494,7 @@ tornado==6.4 # jupyterlab # notebook # terminado -traitlets==5.14.0 +traitlets==5.14.1 # via # comm # ipykernel @@ -525,13 +514,13 @@ traitlets==5.14.0 # qtconsole types-chardet==5.0.4.6 # via -r hail/hail/python/dev/requirements.txt -types-decorator==5.1.8.4 +types-decorator==5.1.8.20240106 # via -r hail/hail/python/dev/requirements.txt -types-deprecated==1.2.9.3 +types-deprecated==1.2.9.20240106 # via -r hail/hail/python/dev/requirements.txt types-pymysql==1.1.0.1 # via -r hail/hail/python/dev/requirements.txt -types-python-dateutil==2.8.19.14 +types-python-dateutil==2.8.19.20240106 # via # -r hail/hail/python/dev/requirements.txt # arrow @@ -539,23 +528,22 @@ types-pyyaml==6.0.12.12 # via -r hail/hail/python/dev/requirements.txt types-requests==2.31.0.6 # via -r hail/hail/python/dev/requirements.txt -types-setuptools==69.0.0.0 +types-setuptools==69.0.0.20240125 # via -r hail/hail/python/dev/requirements.txt -types-six==1.16.21.9 +types-six==1.16.21.20240106 # via -r hail/hail/python/dev/requirements.txt -types-tabulate==0.9.0.3 +types-tabulate==0.9.0.20240106 # via -r hail/hail/python/dev/requirements.txt types-urllib3==1.26.25.14 # via # -r hail/hail/python/dev/requirements.txt # types-requests -typing-extensions==4.8.0 +typing-extensions==4.9.0 # via # -c hail/hail/python/dev/../pinned-requirements.txt # anyio # astroid # async-lru - # black # ipython # pylint uri-template==1.3.0 @@ -568,7 +556,7 @@ virtualenv==20.25.0 # via pre-commit watchfiles==0.21.0 # via aiohttp-devtools -wcwidth==0.2.12 +wcwidth==0.2.13 # via prompt-toolkit webcolors==1.13 # via jsonschema @@ -586,7 +574,7 @@ wrapt==1.16.0 # via # -c hail/hail/python/dev/../pinned-requirements.txt # astroid -yarl==1.9.3 +yarl==1.9.4 # via # -c hail/hail/python/dev/../pinned-requirements.txt # aiohttp diff --git a/hail/python/dev/requirements.txt b/hail/python/dev/requirements.txt index 5dc26e18440..51c57b7bb92 100644 --- a/hail/python/dev/requirements.txt +++ b/hail/python/dev/requirements.txt @@ -23,6 +23,8 @@ jupyter>=1.0.0,<2 sphinxcontrib.katex>=0.9.0,<1 fswatch>=0.1.1,<1 wheel>=0.41,<0.42 +# https://github.com/jupyter/nbconvert/issues/2092 +nbconvert<7.14 # library type stubs types-Deprecated diff --git a/hail/python/hailtop/batch/batch_pool_executor.py b/hail/python/hailtop/batch/batch_pool_executor.py index 8e8a5102572..c5b88349478 100644 --- a/hail/python/hailtop/batch/batch_pool_executor.py +++ b/hail/python/hailtop/batch/batch_pool_executor.py @@ -7,7 +7,7 @@ import dill import functools -from hailtop.utils import secret_alnum_string, partition, async_to_blocking +from hailtop.utils import secret_alnum_string, partition, async_to_blocking, the_empty_async_generator import hailtop.batch_client.aioclient as low_level_batch_client 
from hailtop.batch_client.parse import parse_cpu_in_mcpu from hailtop.aiotools.router_fs import RouterAsyncFS @@ -232,7 +232,7 @@ async def async_map( ) -> AsyncGenerator[int, None]: """Aysncio compatible version of :meth:`.map`.""" if not iterables: - return (x for x in range(0)) + return the_empty_async_generator() if chunksize > 1: list_per_argument = [list(x) for x in iterables] diff --git a/hail/python/hailtop/pinned-requirements.txt b/hail/python/hailtop/pinned-requirements.txt index 70adf3f3f6e..b6c845832e8 100644 --- a/hail/python/hailtop/pinned-requirements.txt +++ b/hail/python/hailtop/pinned-requirements.txt @@ -6,17 +6,17 @@ # aiodns==2.0.0 # via -r hail/hail/python/hailtop/requirements.txt -aiohttp==3.9.1 +aiohttp==3.9.3 # via -r hail/hail/python/hailtop/requirements.txt aiosignal==1.3.1 # via aiohttp async-timeout==4.0.3 # via aiohttp -attrs==23.1.0 +attrs==23.2.0 # via aiohttp azure-common==1.1.28 # via azure-mgmt-storage -azure-core==1.29.5 +azure-core==1.29.7 # via # azure-identity # azure-mgmt-core @@ -30,9 +30,9 @@ azure-mgmt-storage==20.1.0 # via -r hail/hail/python/hailtop/requirements.txt azure-storage-blob==12.19.0 # via -r hail/hail/python/hailtop/requirements.txt -boto3==1.33.1 +boto3==1.34.32 # via -r hail/hail/python/hailtop/requirements.txt -botocore==1.33.1 +botocore==1.34.32 # via # -r hail/hail/python/hailtop/requirements.txt # boto3 @@ -53,20 +53,20 @@ click==8.1.7 # via typer commonmark==0.9.1 # via rich -cryptography==41.0.7 +cryptography==42.0.2 # via # azure-identity # azure-storage-blob # msal # pyjwt -dill==0.3.7 +dill==0.3.8 # via -r hail/hail/python/hailtop/requirements.txt -frozenlist==1.4.0 +frozenlist==1.4.1 # via # -r hail/hail/python/hailtop/requirements.txt # aiohttp # aiosignal -google-auth==2.23.4 +google-auth==2.27.0 # via # -r hail/hail/python/hailtop/requirements.txt # google-auth-oauthlib @@ -90,11 +90,11 @@ jmespath==1.0.1 # botocore jproperties==2.1.1 # via -r hail/hail/python/hailtop/requirements.txt -msal==1.25.0 +msal==1.26.0 # via # azure-identity # msal-extensions -msal-extensions==1.0.0 +msal-extensions==1.1.0 # via azure-identity msrest==0.7.1 # via azure-mgmt-storage @@ -102,12 +102,14 @@ multidict==6.0.4 # via # aiohttp # yarl -nest-asyncio==1.5.8 +nest-asyncio==1.6.0 # via -r hail/hail/python/hailtop/requirements.txt oauthlib==3.2.2 # via requests-oauthlib -orjson==3.9.10 +orjson==3.9.12 # via -r hail/hail/python/hailtop/requirements.txt +packaging==23.2 + # via msal-extensions portalocker==2.8.2 # via msal-extensions protobuf==3.20.2 @@ -146,7 +148,7 @@ rich==12.6.0 # via -r hail/hail/python/hailtop/requirements.txt rsa==4.9 # via google-auth -s3transfer==0.8.0 +s3transfer==0.10.0 # via boto3 six==1.16.0 # via @@ -160,7 +162,7 @@ tabulate==0.9.0 # via -r hail/hail/python/hailtop/requirements.txt typer==0.9.0 # via -r hail/hail/python/hailtop/requirements.txt -typing-extensions==4.8.0 +typing-extensions==4.9.0 # via # azure-core # azure-storage-blob @@ -172,5 +174,5 @@ urllib3==1.26.18 # requests uvloop==0.19.0 ; sys_platform != "win32" # via -r hail/hail/python/hailtop/requirements.txt -yarl==1.9.3 +yarl==1.9.4 # via aiohttp diff --git a/hail/python/hailtop/utils/__init__.py b/hail/python/hailtop/utils/__init__.py index c86dd92cae0..3a1e4295c0e 100644 --- a/hail/python/hailtop/utils/__init__.py +++ b/hail/python/hailtop/utils/__init__.py @@ -1,6 +1,7 @@ from .time import time_msecs, time_msecs_str, humanize_timedelta_msecs, parse_timestamp_msecs, time_ns from .utils import ( unzip, + the_empty_async_generator, 
async_to_blocking, blocking_to_async, AsyncWorkerPool, @@ -72,6 +73,7 @@ __all__ = [ 'time_msecs', + 'the_empty_async_generator', 'time_msecs_str', 'humanize_timedelta_msecs', 'unzip', diff --git a/hail/python/hailtop/utils/utils.py b/hail/python/hailtop/utils/utils.py index 78a3abd7c9a..ded19ee8e20 100644 --- a/hail/python/hailtop/utils/utils.py +++ b/hail/python/hailtop/utils/utils.py @@ -13,6 +13,7 @@ AsyncIterator, Iterator, Union, + AsyncGenerator, ) from typing import Literal, Sequence from typing_extensions import ParamSpec @@ -64,6 +65,11 @@ P = ParamSpec("P") +async def the_empty_async_generator() -> AsyncGenerator[T, None]: + if False: # pylint: disable=using-constant-test + yield # The appearance of the keyword `yield` forces Python to make this function into a generator + + def unpack_comma_delimited_inputs(inputs: List[str]) -> List[str]: return [s.strip() for comma_separated_steps in inputs for s in comma_separated_steps.split(',') if s.strip()] diff --git a/hail/python/pinned-requirements.txt b/hail/python/pinned-requirements.txt index 56a6d00cb26..edd6d15b216 100644 --- a/hail/python/pinned-requirements.txt +++ b/hail/python/pinned-requirements.txt @@ -8,7 +8,7 @@ aiodns==2.0.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt -aiohttp==3.9.1 +aiohttp==3.9.3 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt @@ -20,7 +20,7 @@ async-timeout==4.0.3 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # aiohttp -attrs==23.1.0 +attrs==23.2.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # aiohttp @@ -30,7 +30,7 @@ azure-common==1.1.28 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # azure-mgmt-storage -azure-core==1.29.5 +azure-core==1.29.7 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # azure-identity @@ -53,13 +53,13 @@ azure-storage-blob==12.19.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt -bokeh==3.3.2 +bokeh==3.3.4 # via -r hail/hail/python/requirements.txt -boto3==1.33.1 +boto3==1.34.32 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt -botocore==1.33.1 +botocore==1.34.32 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt @@ -93,7 +93,7 @@ commonmark==0.9.1 # rich contourpy==1.2.0 # via bokeh -cryptography==41.0.7 +cryptography==42.0.2 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # azure-identity @@ -104,17 +104,17 @@ decorator==4.4.2 # via -r hail/hail/python/requirements.txt deprecated==1.2.14 # via -r hail/hail/python/requirements.txt -dill==0.3.7 +dill==0.3.8 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt -frozenlist==1.4.0 +frozenlist==1.4.1 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt # aiohttp # aiosignal -google-auth==2.23.4 +google-auth==2.27.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt @@ -152,14 +152,14 @@ jproperties==2.1.1 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt -markupsafe==2.1.3 +markupsafe==2.1.4 # via jinja2 -msal==1.25.0 +msal==1.26.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # azure-identity # msal-extensions -msal-extensions==1.0.0 
+msal-extensions==1.1.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # azure-identity @@ -172,11 +172,11 @@ multidict==6.0.4 # -c hail/hail/python/hailtop/pinned-requirements.txt # aiohttp # yarl -nest-asyncio==1.5.8 +nest-asyncio==1.6.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt -numpy==1.26.2 +numpy==1.26.3 # via # -r hail/hail/python/requirements.txt # bokeh @@ -187,21 +187,23 @@ oauthlib==3.2.2 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # requests-oauthlib -orjson==3.9.10 +orjson==3.9.12 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt packaging==23.2 # via + # -c hail/hail/python/hailtop/pinned-requirements.txt # bokeh + # msal-extensions # plotly -pandas==2.1.4 +pandas==2.2.0 # via # -r hail/hail/python/requirements.txt # bokeh parsimonious==0.10.0 # via -r hail/hail/python/requirements.txt -pillow==10.1.0 +pillow==10.2.0 # via bokeh plotly==5.18.0 # via -r hail/hail/python/requirements.txt @@ -254,14 +256,14 @@ python-json-logger==2.0.7 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt -pytz==2023.3.post1 +pytz==2023.4 # via pandas pyyaml==6.0.1 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt # bokeh -regex==2023.10.3 +regex==2023.12.25 # via parsimonious requests==2.31.0 # via @@ -284,7 +286,7 @@ rsa==4.9 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # google-auth -s3transfer==0.8.0 +s3transfer==0.10.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # boto3 @@ -313,14 +315,14 @@ typer==0.9.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # -r hail/hail/python/hailtop/requirements.txt -typing-extensions==4.8.0 +typing-extensions==4.9.0 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # azure-core # azure-storage-blob # janus # typer -tzdata==2023.3 +tzdata==2023.4 # via pandas urllib3==1.26.18 # via @@ -335,7 +337,7 @@ wrapt==1.16.0 # via deprecated xyzservices==2023.10.1 # via bokeh -yarl==1.9.3 +yarl==1.9.4 # via # -c hail/hail/python/hailtop/pinned-requirements.txt # aiohttp diff --git a/web_common/pinned-requirements.txt b/web_common/pinned-requirements.txt index de64e419ff1..308986c378b 100644 --- a/web_common/pinned-requirements.txt +++ b/web_common/pinned-requirements.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=hail/web_common/pinned-requirements.txt hail/web_common/requirements.txt # -aiohttp==3.9.1 +aiohttp==3.9.3 # via # -c hail/web_common/../gear/pinned-requirements.txt # -c hail/web_common/../hail/python/dev/pinned-requirements.txt @@ -24,13 +24,13 @@ async-timeout==4.0.3 # -c hail/web_common/../hail/python/dev/pinned-requirements.txt # -c hail/web_common/../hail/python/pinned-requirements.txt # aiohttp -attrs==23.1.0 +attrs==23.2.0 # via # -c hail/web_common/../gear/pinned-requirements.txt # -c hail/web_common/../hail/python/dev/pinned-requirements.txt # -c hail/web_common/../hail/python/pinned-requirements.txt # aiohttp -frozenlist==1.4.0 +frozenlist==1.4.1 # via # -c hail/web_common/../gear/pinned-requirements.txt # -c hail/web_common/../hail/python/dev/pinned-requirements.txt @@ -49,9 +49,9 @@ jinja2==3.1.3 # -c hail/web_common/../hail/python/pinned-requirements.txt # -r hail/web_common/requirements.txt # aiohttp-jinja2 -libsass==0.22.0 +libsass==0.23.0 # via -r hail/web_common/requirements.txt -markupsafe==2.1.3 +markupsafe==2.1.4 # via # -c 
hail/web_common/../hail/python/dev/pinned-requirements.txt # -c hail/web_common/../hail/python/pinned-requirements.txt @@ -63,7 +63,7 @@ multidict==6.0.4 # -c hail/web_common/../hail/python/pinned-requirements.txt # aiohttp # yarl -yarl==1.9.3 +yarl==1.9.4 # via # -c hail/web_common/../gear/pinned-requirements.txt # -c hail/web_common/../hail/python/dev/pinned-requirements.txt From d4679ebd8824777826a571edbccb687018670c4d Mon Sep 17 00:00:00 2001 From: Dan King Date: Fri, 2 Feb 2024 18:37:08 -0500 Subject: [PATCH 23/26] [query] Use valid globals reference in MWZJ and TABK (#14246) CHANGELOG: Fix a bug, introduced in 0.2.114, in which `Table.multi_way_zip_join` and `Table.aggregate_by_key` could throw "NoSuchElementException: Ref with name `__iruid_...`" when one or more of the tables had a number of partitions substantially different from the desired number of output partitions. Fixes https://github.com/hail-is/hail/issues/14245. In both MultiWayZipJoin and TableAggregateByKey, we repartition the child but neglect to use the new globals `Ref` from the repartitioned child. As long as `repartitionNoShuffle` does not create a new TableStage with new globals, this is fine, but that is not, in general, true. It seems that recently, in lowered backends, when the repartition cost is deemed "high" we generate a fresh TableStage with a fresh globals ref. --- hail/python/test/hail/table/test_table.py | 17 +++++ .../hail/expr/ir/lowering/LowerTableIR.scala | 65 ++++++++++--------- 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/hail/python/test/hail/table/test_table.py b/hail/python/test/hail/table/test_table.py index ed2c5d6bc09..f76659aa8db 100644 --- a/hail/python/test/hail/table/test_table.py +++ b/hail/python/test/hail/table/test_table.py @@ -668,6 +668,23 @@ def test_multi_way_zip_join_key_downcast2(self): ht = hl.Table.multi_way_zip_join(vcfs, 'data', 'new_globals') assert exp_count == ht._force_count() + def test_multi_way_zip_join_highly_unbalanced_partitions__issue_14245(self): + def import_vcf(file: str, partitions: int): + return ( + hl.import_vcf(file, force_bgz=True, reference_genome='GRCh38', min_partitions=partitions) + .rows() + .select() + ) + + hl.Table.multi_way_zip_join( + [ + import_vcf(resource('gvcfs/HG00096.g.vcf.gz'), 100), + import_vcf(resource('gvcfs/HG00268.g.vcf.gz'), 1), + ], + 'data', + 'new_globals', + ).write(new_temp_file(extension='ht')) + def test_index_maintains_count(self): t1 = hl.Table.parallelize( [{'a': 'foo', 'b': 1}, {'a': 'bar', 'b': 2}, {'a': 'bar', 'b': 2}], diff --git a/hail/src/main/scala/is/hail/expr/ir/lowering/LowerTableIR.scala b/hail/src/main/scala/is/hail/expr/ir/lowering/LowerTableIR.scala index b1e5451131c..60212f0de1d 100644 --- a/hail/src/main/scala/is/hail/expr/ir/lowering/LowerTableIR.scala +++ b/hail/src/main/scala/is/hail/expr/ir/lowering/LowerTableIR.scala @@ -1197,39 +1197,39 @@ object LowerTableIR { case TableAggregateByKey(child, expr) => val loweredChild = lower(child) - - loweredChild.repartitionNoShuffle( + val repartitioned = loweredChild.repartitionNoShuffle( ctx, loweredChild.partitioner.coarsen(child.typ.key.length).strictify(), ) - .mapPartition(Some(child.typ.key)) { partition => - Let( - FastSeq("global" -> loweredChild.globals), - mapIR(StreamGroupByKey(partition, child.typ.key, missingEqual = true)) { groupRef => - StreamAgg( - groupRef, - "row", - bindIRs( - ArrayRef( - ApplyAggOp( - FastSeq(I32(1)), - FastSeq(SelectFields(Ref("row", child.typ.rowType), child.typ.key)), - AggSignature(Take(), 
FastSeq(TInt32), FastSeq(child.typ.keyType)), - ), - I32(0), - ), // FIXME: would prefer a First() agg op - expr, - ) { case Seq(key, value) => - MakeStruct(child.typ.key.map(k => - (k, GetField(key, k)) - ) ++ expr.typ.asInstanceOf[TStruct].fieldNames.map { f => - (f, GetField(value, f)) - }) - }, - ) - }, - ) - } + + repartitioned.mapPartition(Some(child.typ.key)) { partition => + Let( + FastSeq("global" -> repartitioned.globals), + mapIR(StreamGroupByKey(partition, child.typ.key, missingEqual = true)) { groupRef => + StreamAgg( + groupRef, + "row", + bindIRs( + ArrayRef( + ApplyAggOp( + FastSeq(I32(1)), + FastSeq(SelectFields(Ref("row", child.typ.rowType), child.typ.key)), + AggSignature(Take(), FastSeq(TInt32), FastSeq(child.typ.keyType)), + ), + I32(0), + ), // FIXME: would prefer a First() agg op + expr, + ) { case Seq(key, value) => + MakeStruct(child.typ.key.map(k => + (k, GetField(key, k)) + ) ++ expr.typ.asInstanceOf[TStruct].fieldNames.map { f => + (f, GetField(value, f)) + }) + }, + ) + }, + ) + } case TableDistinct(child) => val loweredChild = lower(child) @@ -2155,7 +2155,10 @@ object LowerTableIR { ) val repartitioned = lowered.map(_.repartitionNoShuffle(ctx, newPartitioner)) val newGlobals = MakeStruct(FastSeq( - globalName -> MakeArray(lowered.map(_.globals), TArray(lowered.head.globalType)) + globalName -> MakeArray( + repartitioned.map(_.globals), + TArray(repartitioned.head.globalType), + ) )) val globalsRef = Ref(genUID(), newGlobals.typ) From 0b929233e1eddecd3399ac328330c59279344bf8 Mon Sep 17 00:00:00 2001 From: Dan King Date: Fri, 2 Feb 2024 19:15:48 -0500 Subject: [PATCH 24/26] [query] support importing empty JSON objects (#14202) @patrick-schultz I'm not sure if this makes sense or not, but I observed it while working on something else. It seems weird but acceptable to import an empty dictionary as any struct. Does this seem reasonable to you? How have we avoided this bug for so long? I'm not familiar enough with this code to know how to simply reproduce the bug and add a corresponding test. Thoughts? 
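For what it's worth, the failure mode is just `max` over an empty collection: when the JSON object has no fields and `padNulls` is off, the old code computed the annotation size by taking the maximum matched field index, which throws on an empty object. A small Python sketch of that size computation (hypothetical names, not the actual Hail code path) shows why the empty-object guard is enough:

```python
# Sketch only: mirrors the annotation-size logic being patched, not Hail's implementation.
def annotation_size(jfields: dict, field_index: dict, pad_nulls: bool, n_fields: int) -> int:
    if pad_nulls:
        return n_fields
    if len(jfields) == 0:
        # The new guard: an empty JSON object becomes an empty annotation.
        return 0
    # Old behaviour: max() over the matched indices, which raises on an empty sequence.
    return max(field_index.get(name, -1) for name in jfields) + 1

assert annotation_size({}, {}, pad_nulls=False, n_fields=0) == 0
assert annotation_size({'a': 1}, {'a': 0}, pad_nulls=False, n_fields=1) == 1
```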
--- .../scala/is/hail/expr/AnnotationImpex.scala | 10 +++++--- .../scala/is/hail/methods/ExprSuite.scala | 25 ++++++++++++++++++- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/hail/src/main/scala/is/hail/expr/AnnotationImpex.scala b/hail/src/main/scala/is/hail/expr/AnnotationImpex.scala index 089914e097e..aed45885ccf 100644 --- a/hail/src/main/scala/is/hail/expr/AnnotationImpex.scala +++ b/hail/src/main/scala/is/hail/expr/AnnotationImpex.scala @@ -263,11 +263,15 @@ object JSONAnnotationImpex { if (t.size == 0) Annotation.empty else { - val annotationSize = - if (padNulls) t.size - else jfields.map { case (name, _) => + val annotationSize = if (padNulls) { + t.size + } else if (jfields.size == 0) { + 0 + } else { + jfields.map { case (name, _) => t.selfField(name).map(_.index).getOrElse(-1) }.max + 1 + } val a = Array.fill[Any](annotationSize)(null) for ((name, jv2) <- jfields) { diff --git a/hail/src/test/scala/is/hail/methods/ExprSuite.scala b/hail/src/test/scala/is/hail/methods/ExprSuite.scala index 96a84a94d85..35541ce4a3e 100644 --- a/hail/src/test/scala/is/hail/methods/ExprSuite.scala +++ b/hail/src/test/scala/is/hail/methods/ExprSuite.scala @@ -6,12 +6,13 @@ import is.hail.check.Prop._ import is.hail.check.Properties import is.hail.expr._ import is.hail.expr.ir.IRParser -import is.hail.types.virtual.{TInt32, Type} +import is.hail.types.virtual._ import is.hail.utils.StringEscapeUtils._ import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.testng.annotations.Test +import org.apache.spark.sql.Row class ExprSuite extends HailSuite { @@ -70,6 +71,28 @@ class ExprSuite extends HailSuite { p.check() } + @Test def testImportEmptyJSONObjectAsStruct(): Unit = + assert(JSONAnnotationImpex.importAnnotation(parse("{}"), TStruct()) == Row()) + + @Test def testExportEmptyJSONObjectAsStruct(): Unit = + assert(compact(render(JSONAnnotationImpex.exportAnnotation(Row(), TStruct()))) == "{}") + + @Test def testRoundTripEmptyJSONObject(): Unit = { + val actual = JSONAnnotationImpex.exportAnnotation( + JSONAnnotationImpex.importAnnotation(parse("{}"), TStruct()), + TStruct(), + ) + assert(compact(render(actual)) == "{}") + } + + @Test def testRoundTripEmptyStruct(): Unit = { + val actual = JSONAnnotationImpex.importAnnotation( + JSONAnnotationImpex.exportAnnotation(Row(), TStruct()), + TStruct(), + ) + assert(actual == Row()) + } + @Test def testImpexes(): Unit = { val g = for { From 7a418eb664f08154339a8f9e0a11e418a8f8f125 Mon Sep 17 00:00:00 2001 From: Will Tyler Date: Sat, 3 Feb 2024 00:59:35 +0000 Subject: [PATCH 25/26] Fix an error in the MatrixTable tutorial (#14239) ### Description In this pull request, I fix an error in the MatrixTable tutorial. The tutorial shows some genotype data and erroneously states that all the genotypes that are shown are homozygous reference (0/0). In fact, there are also some heterozygous (0/1) and homozygous alternate (1/1) genotypes in the displayed data. In this pull request, I remove the erroneous statement. ### Testing I ran the notebook to confirm that the notebook displays a mix of genotypes, not just homozygous reference. You can view the erroneous version of the tutorial [here](https://hail.is/docs/0.2/tutorials/07-matrixtable.html#MatrixTable-operations). 
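For anyone re-checking the claim, the quickest way to see the distribution of genotype calls is an entry aggregation. This stand-in uses simulated data rather than the tutorial's MatrixTable, purely for illustration:

```python
import hail as hl

# Simulated stand-in for the tutorial MatrixTable (the tutorial reads its own dataset).
mt = hl.balding_nichols_model(n_populations=3, n_samples=50, n_variants=100)

# Count calls by number of alternate alleles: 0 = hom-ref, 1 = het, 2 = hom-alt.
print(mt.aggregate_entries(hl.agg.counter(mt.GT.n_alt_alleles())))
```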
--- hail/python/hail/docs/tutorials/07-matrixtable.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hail/python/hail/docs/tutorials/07-matrixtable.ipynb b/hail/python/hail/docs/tutorials/07-matrixtable.ipynb index d0d005b24b0..aebc842473f 100644 --- a/hail/python/hail/docs/tutorials/07-matrixtable.ipynb +++ b/hail/python/hail/docs/tutorials/07-matrixtable.ipynb @@ -287,7 +287,7 @@ } }, "source": [ - "All homozygous reference, which is not surprising. Let's look at the distribution of genotype calls:" + "Let's look at the distribution of genotype calls:" ] }, { @@ -443,7 +443,7 @@ "metadata": { "celltoolbar": "Slideshow", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -457,9 +457,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.6" + "version": "3.9.18" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } From 671deef8f156f62227f5061485a40d001b29b61b Mon Sep 17 00:00:00 2001 From: Dan King Date: Tue, 6 Feb 2024 11:24:38 -0500 Subject: [PATCH 26/26] [fs] support hfs.ls on a bucket (#14176) Teaches `hfs.ls('gs://bucket/')` to list the files and directories at the top-level of the bucket. In `main` that command raises because this line of `_ls_no_glob` raises: ```python3 maybe_sb_and_t, maybe_contents = await asyncio.gather( self._size_bytes_and_time_modified_or_none(path), ls_as_dir() ) ``` In particular, `statfile` raises a cloud-specific, esoteric error about a malformed URL or empty object names: ```python3 async def _size_bytes_and_time_modified_or_none(self, path: str) -> Optional[Tuple[int, float]]: try: # Hadoop semantics: creation time is used if the object has no notion of last modification time. file_status = await self.afs.statfile(path) return (await file_status.size(), file_status.time_modified().timestamp()) except FileNotFoundError: return None ``` I decided to add a sub-class of `FileNotFoundError` which is self-describing: `IsABucketError`. I changed most methods to raise that error when given a bucket URL. The two interesting cases: 1. `isdir`. This raises an error but I could also see this returning `True`. A bucket is like a directory whose path/name is empty. 2. `isfile`. This returns False but I could also see this raising an error. This just seems convenient, we know the bucket is not a file so we should say so. --- Apparently `hfs.ls` had no current tests because the globbing system doesn't work with Azure https:// URLs. I fixed it to use `AsyncFSURL.with_new_path_component` which is resilient to Azure https weirdness. However, I had to change `with_new_path_component` to treat an empty path in a special way. 
I wanted this to hold: ``` actual = str(afs.parse_url('gs://bucket').with_new_path_component('bar')) expected = 'gs://bucket/bar' assert actual == expected ``` But `with_new_path_component` interacts badly with `GoogleAsyncFSURL.__str__` to return this: ``` 'gs://bucket//bar' ``` --- hail/python/hail/backend/local_backend.py | 7 +- hail/python/hail/backend/service_backend.py | 10 ++- hail/python/hailtop/aiocloud/aioaws/fs.py | 47 ++++++++---- hail/python/hailtop/aiocloud/aioazure/fs.py | 31 ++++++-- .../aiogoogle/client/storage_client.py | 47 +++++++----- hail/python/hailtop/aiotools/__init__.py | 2 + hail/python/hailtop/aiotools/fs/__init__.py | 3 +- hail/python/hailtop/aiotools/fs/exceptions.py | 4 + hail/python/hailtop/aiotools/fs/fs.py | 26 ++++++- hail/python/hailtop/aiotools/local_fs.py | 8 +- hail/python/hailtop/aiotools/router_fs.py | 4 +- hail/python/hailtop/fs/router_fs.py | 74 +++++++++++-------- .../test/hailtop/inter_cloud/test_fs.py | 52 ++++++++++++- 13 files changed, 230 insertions(+), 85 deletions(-) diff --git a/hail/python/hail/backend/local_backend.py b/hail/python/hail/backend/local_backend.py index 595bd04e42e..4a1131970a1 100644 --- a/hail/python/hail/backend/local_backend.py +++ b/hail/python/hail/backend/local_backend.py @@ -1,4 +1,5 @@ from typing import Optional, Union, Tuple, List +from contextlib import ExitStack import os import sys @@ -31,6 +32,7 @@ def __init__( gcs_requester_pays_project: Optional[str] = None, gcs_requester_pays_buckets: Optional[str] = None, ): + self._exit_stack = ExitStack() assert gcs_requester_pays_project is not None or gcs_requester_pays_buckets is None spark_home = find_spark_home() @@ -59,6 +61,7 @@ def __init__( die_on_exit=True, ) self._gateway = JavaGateway(gateway_parameters=GatewayParameters(port=port, auto_convert=True)) + self._exit_stack.callback(self._gateway.shutdown) hail_package = getattr(self._gateway.jvm, 'is').hail @@ -75,7 +78,7 @@ def __init__( super(LocalBackend, self).__init__(self._gateway.jvm, jbackend, jhc) - self._fs = RouterFS() + self._fs = self._exit_stack.enter_context(RouterFS()) self._logger = None self._initialize_flags({}) @@ -108,7 +111,7 @@ def register_ir_function( def stop(self): super().stop() - self._gateway.shutdown() + self._exit_stack.close() uninstall_exception_handler() @property diff --git a/hail/python/hail/backend/service_backend.py b/hail/python/hail/backend/service_backend.py index 1fb95c30115..d7e86bb0433 100644 --- a/hail/python/hail/backend/service_backend.py +++ b/hail/python/hail/backend/service_backend.py @@ -207,6 +207,7 @@ async def create( gcs_requester_pays_configuration: Optional[GCSRequesterPaysConfiguration] = None, gcs_bucket_allow_list: Optional[List[str]] = None, ): + async_exit_stack = AsyncExitStack() billing_project = configuration_of(ConfigVariable.BATCH_BILLING_PROJECT, billing_project, None) if billing_project is None: raise ValueError( @@ -221,9 +222,11 @@ async def create( gcs_kwargs={'gcs_requester_pays_configuration': gcs_requester_pays_configuration}, gcs_bucket_allow_list=gcs_bucket_allow_list, ) + async_exit_stack.push_async_callback(async_fs.close) sync_fs = RouterFS(async_fs) if batch_client is None: batch_client = await BatchClient.create(billing_project, _token=credentials_token) + async_exit_stack.push_async_callback(batch_client.close) batch_attributes: Dict[str, str] = dict() remote_tmpdir = get_remote_tmpdir('ServiceBackend', remote_tmpdir=remote_tmpdir) @@ -288,6 +291,7 @@ async def create( worker_cores=worker_cores, worker_memory=worker_memory, 
regions=regions, + async_exit_stack=async_exit_stack, ) sb._initialize_flags(flags) return sb @@ -308,6 +312,7 @@ def __init__( worker_cores: Optional[Union[int, str]], worker_memory: Optional[str], regions: List[str], + async_exit_stack: AsyncExitStack, ): super(ServiceBackend, self).__init__() self.billing_project = billing_project @@ -329,6 +334,7 @@ def __init__( self.regions = regions self._batch: Batch = self._create_batch() + self._async_exit_stack = async_exit_stack def _create_batch(self) -> Batch: return self._batch_client.create_batch(attributes=self.batch_attributes) @@ -362,9 +368,7 @@ def stop(self): hail_event_loop().run_until_complete(self._stop()) async def _stop(self): - async with AsyncExitStack() as stack: - stack.push_async_callback(self._async_fs.close) - stack.push_async_callback(self._batch_client.close) + await self._async_exit_stack.aclose() self.functions = [] self._registered_ir_function_names = set() diff --git a/hail/python/hailtop/aiocloud/aioaws/fs.py b/hail/python/hailtop/aiocloud/aioaws/fs.py index f2e369c5cd4..8c22c851692 100644 --- a/hail/python/hailtop/aiocloud/aioaws/fs.py +++ b/hail/python/hailtop/aiocloud/aioaws/fs.py @@ -35,6 +35,7 @@ AsyncFSURL, MultiPartCreate, FileAndDirectoryError, + IsABucketError, ) from hailtop.aiotools.fs.exceptions import UnexpectedEOFError from hailtop.aiotools.fs.stream import ( @@ -325,6 +326,9 @@ def __init__(self, bucket: str, path: str): self._bucket = bucket self._path = path + def __repr__(self): + return f'S3AsyncFSURL({self._bucket}, {self._path})' + @property def bucket_parts(self) -> List[str]: return [self._bucket] @@ -344,6 +348,9 @@ def scheme(self) -> str: def with_path(self, path) -> 'S3AsyncFSURL': return S3AsyncFSURL(self._bucket, path) + def with_root_path(self) -> 'S3AsyncFSURL': + return self.with_path('') + def __str__(self) -> str: return f's3://{self._bucket}/{self._path}' @@ -399,8 +406,11 @@ def valid_url(url: str) -> bool: return url.startswith('s3://') @staticmethod - def parse_url(url: str) -> S3AsyncFSURL: - return S3AsyncFSURL(*S3AsyncFS.get_bucket_and_name(url)) + def parse_url(url: str, *, error_if_bucket: bool = False) -> S3AsyncFSURL: + fsurl = S3AsyncFSURL(*S3AsyncFS.get_bucket_and_name(url)) + if error_if_bucket and fsurl._path == '': + raise IsABucketError + return fsurl @staticmethod def get_bucket_and_name(url: str) -> Tuple[str, str]: @@ -423,22 +433,24 @@ def get_bucket_and_name(url: str) -> Tuple[str, str]: return (bucket, name) async def open(self, url: str) -> ReadableStream: - bucket, name = self.get_bucket_and_name(url) + fsurl = self.parse_url(url, error_if_bucket=True) try: - resp = await blocking_to_async(self._thread_pool, self._s3.get_object, Bucket=bucket, Key=name) + resp = await blocking_to_async( + self._thread_pool, self._s3.get_object, Bucket=fsurl._bucket, Key=fsurl._path + ) return blocking_readable_stream_to_async(self._thread_pool, cast(BinaryIO, resp['Body'])) except self._s3.exceptions.NoSuchKey as e: raise FileNotFoundError(url) from e async def _open_from(self, url: str, start: int, *, length: Optional[int] = None) -> ReadableStream: - bucket, name = self.get_bucket_and_name(url) + fsurl = self.parse_url(url, error_if_bucket=True) range_str = f'bytes={start}-' if length is not None: assert length >= 1 range_str += str(start + length - 1) try: resp = await blocking_to_async( - self._thread_pool, self._s3.get_object, Bucket=bucket, Key=name, Range=range_str + self._thread_pool, self._s3.get_object, Bucket=fsurl._bucket, Key=fsurl._path, Range=range_str ) 
return blocking_readable_stream_to_async(self._thread_pool, cast(BinaryIO, resp['Body'])) except self._s3.exceptions.NoSuchKey as e: @@ -489,12 +501,12 @@ async def create(self, url: str, *, retry_writes: bool = True) -> S3CreateManage # interface. This has the disadvantage that the read must # complete before the write can begin (unlike the current # code, that copies 128MB parts in 256KB chunks). - bucket, name = self.get_bucket_and_name(url) - return S3CreateManager(self, bucket, name) + fsurl = self.parse_url(url, error_if_bucket=True) + return S3CreateManager(self, fsurl._bucket, fsurl._path) async def multi_part_create(self, sema: asyncio.Semaphore, url: str, num_parts: int) -> MultiPartCreate: - bucket, name = self.get_bucket_and_name(url) - return S3MultiPartCreate(sema, self, bucket, name, num_parts) + fsurl = self.parse_url(url, error_if_bucket=True) + return S3MultiPartCreate(sema, self, fsurl._bucket, fsurl._path, num_parts) async def mkdir(self, url: str) -> None: pass @@ -503,9 +515,11 @@ async def makedirs(self, url: str, exist_ok: bool = False) -> None: pass async def statfile(self, url: str) -> FileStatus: - bucket, name = self.get_bucket_and_name(url) + fsurl = self.parse_url(url, error_if_bucket=True) try: - resp = await blocking_to_async(self._thread_pool, self._s3.head_object, Bucket=bucket, Key=name) + resp = await blocking_to_async( + self._thread_pool, self._s3.head_object, Bucket=fsurl._bucket, Key=fsurl._path + ) return S3HeadObjectFileStatus(resp, url) except botocore.exceptions.ClientError as e: if e.response['ResponseMetadata']['HTTPStatusCode'] == 404: @@ -579,8 +593,10 @@ async def staturl(self, url: str) -> str: return await self._staturl_parallel_isfile_isdir(url) async def isfile(self, url: str) -> bool: + bucket, name = self.get_bucket_and_name(url) + if name == '': + return False try: - bucket, name = self.get_bucket_and_name(url) await blocking_to_async(self._thread_pool, self._s3.head_object, Bucket=bucket, Key=name) return True except botocore.exceptions.ClientError as e: @@ -589,6 +605,7 @@ async def isfile(self, url: str) -> bool: raise e async def isdir(self, url: str) -> bool: + self.parse_url(url, error_if_bucket=True) try: async for _ in await self.listfiles(url, recursive=True): return True @@ -597,9 +614,9 @@ async def isdir(self, url: str) -> bool: return False async def remove(self, url: str) -> None: + fsurl = self.parse_url(url, error_if_bucket=True) try: - bucket, name = self.get_bucket_and_name(url) - await blocking_to_async(self._thread_pool, self._s3.delete_object, Bucket=bucket, Key=name) + await blocking_to_async(self._thread_pool, self._s3.delete_object, Bucket=fsurl._bucket, Key=fsurl._path) except self._s3.exceptions.NoSuchKey as e: raise FileNotFoundError(url) from e diff --git a/hail/python/hailtop/aiocloud/aioazure/fs.py b/hail/python/hailtop/aiocloud/aioazure/fs.py index 780d2e73eca..5023e56e096 100644 --- a/hail/python/hailtop/aiocloud/aioazure/fs.py +++ b/hail/python/hailtop/aiocloud/aioazure/fs.py @@ -30,6 +30,7 @@ FileStatus, FileAndDirectoryError, UnexpectedEOFError, + IsABucketError, ) from .credentials import AzureCredentials @@ -298,6 +299,9 @@ def __init__(self, account: str, container: str, path: str, query: Optional[str] self._path = path self._query = query + def __repr__(self): + return f'AzureAsyncFSURL({self._account}, {self._container}, {self._path}, {self._query})' + @property def bucket_parts(self) -> List[str]: return [self._account, self._container] @@ -326,6 +330,9 @@ def base(self) -> str: def 
with_path(self, path) -> 'AzureAsyncFSURL': return self.__class__(self._account, self._container, path, self._query) + def with_root_path(self) -> 'AzureAsyncFSURL': + return self.with_path('') + def __str__(self) -> str: return self.base if not self._query else f'{self.base}?{self._query}' @@ -440,7 +447,14 @@ async def generate_sas_token( return token @staticmethod - def parse_url(url: str) -> AzureAsyncFSURL: + def parse_url(url: str, *, error_if_bucket: bool = False) -> AzureAsyncFSURL: + fsurl = AzureAsyncFS._parse_url(url) + if error_if_bucket and fsurl._path == '': + raise IsABucketError + return fsurl + + @staticmethod + def _parse_url(url: str) -> AzureAsyncFSURL: colon_index = url.find(':') if colon_index == -1: raise ValueError(f'invalid URL: {url}') @@ -513,9 +527,10 @@ def get_container_client(self, url: AzureAsyncFSURL) -> ContainerClient: @handle_public_access_error async def open(self, url: str) -> ReadableStream: + parsed_url = self.parse_url(url, error_if_bucket=True) if not await self.exists(url): raise FileNotFoundError - client = self.get_blob_client(self.parse_url(url)) + client = self.get_blob_client(parsed_url) return AzureReadableStream(client, url) @handle_public_access_error @@ -523,11 +538,12 @@ async def _open_from(self, url: str, start: int, *, length: Optional[int] = None assert length is None or length >= 1 if not await self.exists(url): raise FileNotFoundError - client = self.get_blob_client(self.parse_url(url)) + client = self.get_blob_client(self.parse_url(url, error_if_bucket=True)) return AzureReadableStream(client, url, offset=start, length=length) async def create(self, url: str, *, retry_writes: bool = True) -> AsyncContextManager[WritableStream]: # pylint: disable=unused-argument - return AzureCreateManager(self.get_blob_client(self.parse_url(url))) + parsed_url = self.parse_url(url, error_if_bucket=True) + return AzureCreateManager(self.get_blob_client(parsed_url)) async def multi_part_create(self, sema: asyncio.Semaphore, url: str, num_parts: int) -> MultiPartCreate: client = self.get_blob_client(self.parse_url(url)) @@ -545,7 +561,7 @@ async def isfile(self, url: str) -> bool: @handle_public_access_error async def isdir(self, url: str) -> bool: - fs_url = self.parse_url(url) + fs_url = self.parse_url(url, error_if_bucket=True) assert not fs_url.path or fs_url.path.endswith('/'), fs_url.path client = self.get_container_client(fs_url) async for _ in client.walk_blobs(name_starts_with=fs_url.path, include=['metadata'], delimiter='/'): @@ -560,8 +576,8 @@ async def makedirs(self, url: str, exist_ok: bool = False) -> None: @handle_public_access_error async def statfile(self, url: str) -> FileStatus: + parsed_url = self.parse_url(url, error_if_bucket=True) try: - parsed_url = self.parse_url(url) blob_props = await self.get_blob_client(parsed_url).get_blob_properties() return AzureFileStatus(blob_props, parsed_url) except azure.core.exceptions.ResourceNotFoundError as e: @@ -639,7 +655,8 @@ async def staturl(self, url: str) -> str: async def remove(self, url: str) -> None: try: - await self.get_blob_client(self.parse_url(url)).delete_blob() + parsed_url = self.parse_url(url, error_if_bucket=True) + await self.get_blob_client(parsed_url).delete_blob() except azure.core.exceptions.ResourceNotFoundError as e: raise FileNotFoundError(url) from e diff --git a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py index 8055cc2a71f..b77d2a8c32d 100644 --- 
a/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py +++ b/hail/python/hailtop/aiocloud/aiogoogle/client/storage_client.py @@ -21,6 +21,7 @@ FileAndDirectoryError, MultiPartCreate, UnexpectedEOFError, + IsABucketError, ) from hailtop.aiotools import FeedableAsyncIterable, WriteBuffer @@ -578,6 +579,9 @@ def __init__(self, bucket: str, path: str): self._bucket = bucket self._path = path + def __repr__(self): + return f'GoogleStorageAsyncFSURL({self._bucket}, {self._path})' + @property def bucket_parts(self) -> List[str]: return [self._bucket] @@ -597,6 +601,9 @@ def scheme(self) -> str: def with_path(self, path) -> 'GoogleStorageAsyncFSURL': return GoogleStorageAsyncFSURL(self._bucket, path) + def with_root_path(self) -> 'GoogleStorageAsyncFSURL': + return self.with_path('') + def __str__(self) -> str: return f'gs://{self._bucket}/{self._path}' @@ -645,8 +652,11 @@ def valid_url(url: str) -> bool: return url.startswith('gs://') @staticmethod - def parse_url(url: str) -> GoogleStorageAsyncFSURL: - return GoogleStorageAsyncFSURL(*GoogleStorageAsyncFS.get_bucket_and_name(url)) + def parse_url(url: str, *, error_if_bucket: bool = False) -> GoogleStorageAsyncFSURL: + fsurl = GoogleStorageAsyncFSURL(*GoogleStorageAsyncFS.get_bucket_and_name(url)) + if error_if_bucket and fsurl._path == '': + raise IsABucketError + return fsurl @staticmethod def get_bucket_and_name(url: str) -> Tuple[str, str]: @@ -673,25 +683,26 @@ def get_bucket_and_name(url: str) -> Tuple[str, str]: return (bucket, name) async def open(self, url: str) -> GetObjectStream: - bucket, name = self.get_bucket_and_name(url) - return await self._storage_client.get_object(bucket, name) + fsurl = self.parse_url(url, error_if_bucket=True) + return await self._storage_client.get_object(fsurl._bucket, fsurl._path) async def _open_from(self, url: str, start: int, *, length: Optional[int] = None) -> GetObjectStream: - bucket, name = self.get_bucket_and_name(url) + fsurl = self.parse_url(url, error_if_bucket=True) range_str = f'bytes={start}-' if length is not None: assert length >= 1 range_str += str(start + length - 1) - return await self._storage_client.get_object(bucket, name, headers={'Range': range_str}) + return await self._storage_client.get_object(fsurl._bucket, fsurl._path, headers={'Range': range_str}) async def create(self, url: str, *, retry_writes: bool = True) -> WritableStream: - bucket, name = self.get_bucket_and_name(url) + fsurl = self.parse_url(url, error_if_bucket=True) params = {'uploadType': 'resumable' if retry_writes else 'media'} - return await self._storage_client.insert_object(bucket, name, params=params) + return await self._storage_client.insert_object(fsurl._bucket, fsurl._path, params=params) async def multi_part_create( self, sema: asyncio.Semaphore, url: str, num_parts: int ) -> GoogleStorageMultiPartCreate: + self.parse_url(url, error_if_bucket=True) return GoogleStorageMultiPartCreate(sema, self, url, num_parts) async def staturl(self, url: str) -> str: @@ -705,8 +716,8 @@ async def makedirs(self, url: str, exist_ok: bool = False) -> None: async def statfile(self, url: str) -> GetObjectFileStatus: try: - bucket, name = self.get_bucket_and_name(url) - return GetObjectFileStatus(await self._storage_client.get_object_metadata(bucket, name), url) + fsurl = self.parse_url(url, error_if_bucket=True) + return GetObjectFileStatus(await self._storage_client.get_object_metadata(fsurl._bucket, fsurl._path), url) except aiohttp.ClientResponseError as e: if e.status == 404: raise FileNotFoundError(url) from 
e @@ -784,12 +795,12 @@ async def cons(first_entry, it) -> AsyncIterator[FileListEntry]: async def isfile(self, url: str) -> bool: try: - bucket, name = self.get_bucket_and_name(url) + fsurl = self.parse_url(url) # if name is empty, get_object_metadata behaves like list objects # the urls are the same modulo the object name - if not name: + if not fsurl._path: return False - await self._storage_client.get_object_metadata(bucket, name) + await self._storage_client.get_object_metadata(fsurl._bucket, fsurl._path) return True except aiohttp.ClientResponseError as e: if e.status == 404: @@ -797,10 +808,10 @@ async def isfile(self, url: str) -> bool: raise async def isdir(self, url: str) -> bool: - bucket, name = self.get_bucket_and_name(url) - assert not name or name.endswith('/'), name - params = {'prefix': name, 'delimiter': '/', 'includeTrailingDelimiter': 'true', 'maxResults': 1} - async for page in await self._storage_client.list_objects(bucket, params=params): + fsurl = self.parse_url(url, error_if_bucket=True) + assert not fsurl._path or fsurl.path.endswith('/'), fsurl._path + params = {'prefix': fsurl._path, 'delimiter': '/', 'includeTrailingDelimiter': 'true', 'maxResults': 1} + async for page in await self._storage_client.list_objects(fsurl._bucket, params=params): prefixes = page.get('prefixes') items = page.get('items') return bool(prefixes or items) @@ -808,6 +819,8 @@ async def isdir(self, url: str) -> bool: async def remove(self, url: str) -> None: bucket, name = self.get_bucket_and_name(url) + if name == '': + raise IsABucketError(url) try: await self._storage_client.delete_object(bucket, name) except aiohttp.ClientResponseError as e: diff --git a/hail/python/hailtop/aiotools/__init__.py b/hail/python/hailtop/aiotools/__init__.py index 89ff43b3a0d..f926ee8868d 100644 --- a/hail/python/hailtop/aiotools/__init__.py +++ b/hail/python/hailtop/aiotools/__init__.py @@ -6,6 +6,7 @@ MultiPartCreate, FileAndDirectoryError, UnexpectedEOFError, + IsABucketError, Copier, ReadableStream, WritableStream, @@ -33,6 +34,7 @@ 'FileAndDirectoryError', 'MultiPartCreate', 'UnexpectedEOFError', + 'IsABucketError', 'WeightedSemaphore', 'WriteBuffer', 'Copier', diff --git a/hail/python/hailtop/aiotools/fs/__init__.py b/hail/python/hailtop/aiotools/fs/__init__.py index 5ceadb88136..8c941c1144f 100644 --- a/hail/python/hailtop/aiotools/fs/__init__.py +++ b/hail/python/hailtop/aiotools/fs/__init__.py @@ -1,6 +1,6 @@ from .fs import AsyncFS, AsyncFSURL, AsyncFSFactory, MultiPartCreate, FileListEntry, FileStatus from .copier import Copier, CopyReport, SourceCopier, SourceReport, Transfer, TransferReport -from .exceptions import UnexpectedEOFError, FileAndDirectoryError +from .exceptions import UnexpectedEOFError, FileAndDirectoryError, IsABucketError from .stream import ( ReadableStream, EmptyReadableStream, @@ -29,4 +29,5 @@ 'FileStatus', 'FileAndDirectoryError', 'UnexpectedEOFError', + 'IsABucketError', ] diff --git a/hail/python/hailtop/aiotools/fs/exceptions.py b/hail/python/hailtop/aiotools/fs/exceptions.py index d6c14c6a547..ed4a24b912d 100644 --- a/hail/python/hailtop/aiotools/fs/exceptions.py +++ b/hail/python/hailtop/aiotools/fs/exceptions.py @@ -4,3 +4,7 @@ class UnexpectedEOFError(Exception): class FileAndDirectoryError(Exception): pass + + +class IsABucketError(FileNotFoundError): + pass diff --git a/hail/python/hailtop/aiotools/fs/fs.py b/hail/python/hailtop/aiotools/fs/fs.py index e7b74810b6a..5a0fed643d5 100644 --- a/hail/python/hailtop/aiotools/fs/fs.py +++ 
b/hail/python/hailtop/aiotools/fs/fs.py @@ -218,9 +218,27 @@ def scheme(self) -> str: def with_path(self, path) -> "AsyncFSURL": pass - def with_new_path_component(self, new_path_component) -> "AsyncFSURL": - prefix = self.path if self.path.endswith("/") else self.path + "/" - suffix = new_path_component[1:] if new_path_component.startswith("/") else new_path_component + @abc.abstractmethod + def with_root_path(self) -> "AsyncFSURL": + pass + + def with_new_path_component(self, new_path_component: str) -> "AsyncFSURL": + if new_path_component == '': + raise ValueError('new path component must be non-empty') + return self.with_new_path_components(new_path_component) + + def with_new_path_components(self, *parts: str) -> "AsyncFSURL": + if len(parts) == 0: + return self + + prefix = self.path + if not prefix.endswith("/") and not prefix == '': + prefix += "/" + + suffix = '/'.join(parts) + if suffix[0] == '/': + suffix = suffix[1:] + return self.with_path(prefix + suffix) @abc.abstractmethod @@ -250,7 +268,7 @@ def valid_url(url: str) -> bool: @staticmethod @abc.abstractmethod - def parse_url(url: str) -> AsyncFSURL: + def parse_url(url: str, *, error_if_bucket: bool = False) -> AsyncFSURL: pass @abc.abstractmethod diff --git a/hail/python/hailtop/aiotools/local_fs.py b/hail/python/hailtop/aiotools/local_fs.py index 0a1f5b33104..f1bbd98cfa4 100644 --- a/hail/python/hailtop/aiotools/local_fs.py +++ b/hail/python/hailtop/aiotools/local_fs.py @@ -114,6 +114,9 @@ class LocalAsyncFSURL(AsyncFSURL): def __init__(self, path: str): self._path = path + def __repr__(self) -> str: + return f'LocalAsyncFSURL({self.path})' + @property def bucket_parts(self) -> List[str]: return [] @@ -133,6 +136,9 @@ def scheme(self) -> str: def with_path(self, path) -> 'LocalAsyncFSURL': return LocalAsyncFSURL(path) + def with_root_path(self) -> 'LocalAsyncFSURL': + return self.with_path('/') + def __str__(self) -> str: return self._path @@ -246,7 +252,7 @@ def valid_url(url: str) -> bool: return url.startswith('file://') or '://' not in url @staticmethod - def parse_url(url: str) -> LocalAsyncFSURL: + def parse_url(url: str, *, error_if_bucket: bool = False) -> LocalAsyncFSURL: return LocalAsyncFSURL(LocalAsyncFS._get_path(url)) @staticmethod diff --git a/hail/python/hailtop/aiotools/router_fs.py b/hail/python/hailtop/aiotools/router_fs.py index f82cd8183aa..78311a385a1 100644 --- a/hail/python/hailtop/aiotools/router_fs.py +++ b/hail/python/hailtop/aiotools/router_fs.py @@ -52,9 +52,9 @@ def copy_part_size(url: str) -> int: return klass.copy_part_size(url) @staticmethod - def parse_url(url: str) -> AsyncFSURL: + def parse_url(url: str, *, error_if_bucket: bool = False) -> AsyncFSURL: klass = RouterAsyncFS._fs_class(url) - return klass.parse_url(url) + return klass.parse_url(url, error_if_bucket=error_if_bucket) @staticmethod def _fs_class(url: str) -> Type[AsyncFS]: diff --git a/hail/python/hailtop/fs/router_fs.py b/hail/python/hailtop/fs/router_fs.py index 65ebe387f78..32412ffacd7 100644 --- a/hail/python/hailtop/fs/router_fs.py +++ b/hail/python/hailtop/fs/router_fs.py @@ -1,4 +1,5 @@ -from typing import List, AsyncContextManager, BinaryIO, Optional, Tuple, Dict, Any +from typing import List, AsyncContextManager, BinaryIO, Optional, Tuple, Dict, Any, Type +from types import TracebackType import asyncio import io import os @@ -6,7 +7,14 @@ import glob import fnmatch -from hailtop.aiotools.fs import Copier, Transfer, FileListEntry as AIOFileListEntry, ReadableStream, WritableStream +from hailtop.aiotools.fs import ( + 
Copier, + Transfer, + FileListEntry as AIOFileListEntry, + ReadableStream, + WritableStream, + AsyncFSURL, +) from hailtop.aiotools.router_fs import RouterAsyncFS from hailtop.utils import bounded_gather2, async_to_blocking @@ -184,6 +192,20 @@ def __init__( local_kwargs=local_kwargs, gcs_kwargs=gcs_kwargs, azure_kwargs=azure_kwargs, s3_kwargs=s3_kwargs ) + def __enter__(self): + return self + + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + traceback: Optional[TracebackType], + ): + self.close() + + def close(self): + async_to_blocking(self.afs.close()) + @property def _gcs_kwargs(self) -> Optional[Dict[str, Any]]: return self.afs._gcs_kwargs @@ -291,13 +313,19 @@ async def ls_no_glob(path) -> List[FileListEntry]: except FileNotFoundError: return [] + async def list_within_each_prefix(prefixes: List[AsyncFSURL], parts: List[str]) -> List[List[FileListEntry]]: + pfs = [functools.partial(ls_no_glob, str(prefix.with_new_path_components(*parts))) for prefix in prefixes] + return await bounded_gather2(sema, *pfs, cancel_on_error=True) + url = self.afs.parse_url(path) if any(glob.escape(bucket_part) != bucket_part for bucket_part in url.bucket_parts): raise ValueError(f'glob pattern only allowed in path (e.g. not in bucket): {path}') blobpath = url.path - components = blobpath.split('/') - assert len(components) > 0 + if blobpath == '': + components = [] + else: + components = blobpath.split('/') glob_components = [] running_prefix = [] @@ -311,48 +339,30 @@ async def ls_no_glob(path) -> List[FileListEntry]: running_prefix = [] suffix_components: List[str] = running_prefix - if len(url.bucket_parts) > 0: - first_prefix = [url.scheme + ':', '', *url.bucket_parts] - else: - assert url.scheme == 'file' - if path.startswith('file://'): - first_prefix = ['file:', '', ''] - else: - first_prefix = [] - cached_stats_for_each_cumulative_prefix: Optional[List[FileListEntry]] = None - cumulative_prefixes = [first_prefix] + cumulative_prefixes: List[AsyncFSURL] = [url.with_root_path()] for intervening_components, single_component_glob_pattern in glob_components: - stats_grouped_by_prefix = await bounded_gather2( - sema, - *[ - functools.partial(ls_no_glob, '/'.join([*cumulative_prefix, *intervening_components])) - for cumulative_prefix in cumulative_prefixes - ], - cancel_on_error=True, - ) + stats_grouped_by_prefix = await list_within_each_prefix(cumulative_prefixes, intervening_components) cached_stats_for_each_cumulative_prefix = [ stat for stats_for_one_prefix, cumulative_prefix in zip(stats_grouped_by_prefix, cumulative_prefixes) for stat in stats_for_one_prefix if fnmatch.fnmatch( - stat.path, '/'.join([*cumulative_prefix, *intervening_components, single_component_glob_pattern]) + stat.path, + str( + cumulative_prefix.with_new_path_components( + *intervening_components, single_component_glob_pattern + ) + ), ) ] - cumulative_prefixes = [stat.path.split('/') for stat in cached_stats_for_each_cumulative_prefix] + cumulative_prefixes = [self.afs.parse_url(stat.path) for stat in cached_stats_for_each_cumulative_prefix] if len(suffix_components) == 0 and cached_stats_for_each_cumulative_prefix is not None: found_stats = cached_stats_for_each_cumulative_prefix else: - found_stats_grouped_by_prefix = await bounded_gather2( - sema, - *[ - functools.partial(ls_no_glob, '/'.join([*cumulative_prefix, *suffix_components])) - for cumulative_prefix in cumulative_prefixes - ], - cancel_on_error=True, - ) + found_stats_grouped_by_prefix = await 
list_within_each_prefix(cumulative_prefixes, suffix_components) found_stats = [stat for stats in found_stats_grouped_by_prefix for stat in stats] if len(glob_components) == 0 and len(found_stats) == 0: diff --git a/hail/python/test/hailtop/inter_cloud/test_fs.py b/hail/python/test/hailtop/inter_cloud/test_fs.py index 44d5c09f2d6..00bf2f4cba0 100644 --- a/hail/python/test/hailtop/inter_cloud/test_fs.py +++ b/hail/python/test/hailtop/inter_cloud/test_fs.py @@ -9,7 +9,8 @@ from hailtop.aiotools.fs.fs import AsyncFSURL import pytest from hailtop.utils import secret_alnum_string, retry_transient_errors, bounded_gather2 -from hailtop.aiotools import LocalAsyncFS, UnexpectedEOFError, AsyncFS +from hailtop.fs.router_fs import RouterFS +from hailtop.aiotools import LocalAsyncFS, UnexpectedEOFError, AsyncFS, IsABucketError from hailtop.aiotools.router_fs import RouterAsyncFS from hailtop.aiocloud.aioaws import S3AsyncFS from hailtop.aiocloud.aioazure import AzureAsyncFS @@ -630,3 +631,52 @@ async def test_rmtree_on_symlink_to_directory(): finally: await fs.rmtree(sema, str(base)) assert not await fs.isdir(str(base)) + + +async def test_operations_on_a_bucket_url_is_error(filesystem: Tuple[asyncio.Semaphore, AsyncFS, AsyncFSURL]): + _, fs, base = filesystem + + if base.scheme in ('', 'file'): + return + + bucket_url = str(base.with_path('')) + + with pytest.raises(IsABucketError): + await fs.isdir(bucket_url) + + assert await fs.isfile(bucket_url) is False + + with pytest.raises(IsABucketError): + await fs.statfile(bucket_url) + + with pytest.raises(IsABucketError): + await fs.remove(bucket_url) + + with pytest.raises(IsABucketError): + await fs.create(bucket_url) + + with pytest.raises(IsABucketError): + await fs.open(bucket_url) + + +async def test_hfs_ls_bucket_url_not_an_error(filesystem: Tuple[asyncio.Semaphore, AsyncFS, AsyncFSURL]): + _, fs, base = filesystem + + if base.scheme in ('', 'file'): + return + + await fs.write(str(base.with_new_path_component('abc123')), b'foo') # ensure the bucket is non-empty + + bucket_url = str(base.with_path('')) + with RouterFS() as fs: + fs.ls(bucket_url) + + +async def test_with_new_path_component(filesystem: Tuple[asyncio.Semaphore, AsyncFS, AsyncFSURL]): + _, _, base = filesystem + + assert str(base.with_path('').with_new_path_component('abc')) == str(base.with_path('abc')) + assert str(base.with_path('abc').with_new_path_component('def')) == str(base.with_path('abc/def')) + + actual = base.with_path('abc').with_new_path_component('def').with_new_path_component('ghi') + assert str(actual) == str(base.with_path('abc/def/ghi'))