Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ALS-6330: Fix variant explorer functionality #115

Merged
merged 40 commits into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from 37 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
b0407fc
ALS-6330: Cleanup old genomic unit tests and test data
ramari16 May 9, 2024
eab1e2f
ALS-6330: (WIP) Integration test refactor, variant explorer fixes.
ramari16 Jun 5, 2024
b1ebeeb
ALS-6330: (WIP) Integration test refactor, variant explorer fixes.
ramari16 Jun 6, 2024
faf2cea
ALS-6330: Add dataset finalizer, disable test not compiling, fix some…
ramari16 Jun 7, 2024
a70da0d
ALS-6330: Update genomic processor config for testing
ramari16 Jun 7, 2024
7d85f71
ALS-6330: Move VariantMetadataIndex to genomic processor. Implement m…
ramari16 Jun 10, 2024
a9d8189
ALS-6330: Fix directory reference issues in variant metadata
ramari16 Jun 10, 2024
7f0c5b0
ALS-6330: Fix directory reference issues in variant metadata
ramari16 Jun 10, 2024
dd89834
ALS-6330: Implement missing method
ramari16 Jun 11, 2024
9d17741
ALS-6330: Implement createMaskForPatientSet for patient merging genom…
ramari16 Jun 11, 2024
396606b
ALS-6330: Fix test compile issues. Add variant list processor integra…
ramari16 Jun 14, 2024
a798763
ALS-6330: Add unit tests, some minor refactoring
ramari16 Jun 17, 2024
ce17d75
ALS-6330: Fix compile issue
ramari16 Jun 17, 2024
b1d6559
ALS-6330: Fix issue merging variant metadata
ramari16 Jun 17, 2024
2223f66
ALS-6330: Fix variant metadata merging bug, refactor variant model, a…
ramari16 Jun 18, 2024
cf79b80
ALS-6330: Add config for bch
ramari16 Jun 26, 2024
d25c178
ALS-6330: Fix java config typo
ramari16 Jun 26, 2024
7c1144f
ALS-6330: Fix java config typo
ramari16 Jun 26, 2024
6af0dcb
ALS-6330: Fix java config typo
ramari16 Jun 26, 2024
e7e39de
ALS-6330: Remove redundant directory update
ramari16 Jun 26, 2024
b034ddc
ALS-6330: Remove misleading stack trace log
ramari16 Jul 2, 2024
e9e418b
ALS-6330: Update integration tests to test no-call data
ramari16 Aug 12, 2024
8626102
ALS-6330: Remove integration test directory
ramari16 Aug 12, 2024
5d27e75
Merge branch 'genomic-v2' into ALS-6330
ramari16 Oct 3, 2024
dfab455
Clean test environment
ramari16 Oct 3, 2024
2eca8bf
Logging to fix tests
ramari16 Oct 8, 2024
724764e
Logging to fix tests
ramari16 Oct 8, 2024
0a64930
Logging to fix tests
ramari16 Oct 8, 2024
3c71284
Logging to fix tests
ramari16 Oct 8, 2024
d8946a3
Fix encryption key issue
ramari16 Oct 8, 2024
d992e58
Fix encryption key issue
ramari16 Oct 8, 2024
851b4b8
Cleanup
ramari16 Oct 8, 2024
3ad5a31
Cleanup
ramari16 Oct 9, 2024
c69e161
Clarify confusing code
ramari16 Oct 9, 2024
44e6337
Cleanup
ramari16 Oct 9, 2024
60f34de
Cleanup
ramari16 Oct 9, 2024
0fb5346
Cleanup
ramari16 Oct 9, 2024
f3f64c2
Remove lombok. Add record
Oct 9, 2024
beab7a3
Changes from PR
ramari16 Oct 10, 2024
92695cb
Fix test
ramari16 Oct 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/github-actions-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ jobs:
- name: Test with Maven
run: mvn --update-snapshots test
env:
GITHUB_TOKEN: ${{ github.token }}
GITHUB_TOKEN: ${{ github.token }}
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,6 @@ public BucketIndexBySample(VariantStore variantStore, String storageDir) throws
});

// For each patient set the patientBucketCharMask entry to 0 or 1 if they have a variant in the bucket.

// todo: implement for variant explorer
int indexOfBucket = Collections.binarySearch(bucketList, bucketKey) + 2; //patientBucketCharMasks has bookend bits
for(int x = 0; x < patientIds.size(); x++) {
if(patientMaskForBucket[0].testBit(x)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,6 @@ static VariantMask emptyInstance() {
}

Set<Integer> patientMaskToPatientIdSet(List<String> patientIds);

boolean isEmpty();
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.annotation.JsonSerialize;
import com.fasterxml.jackson.databind.ser.std.ToStringSerializer;
Expand Down Expand Up @@ -75,6 +76,13 @@ public Set<Integer> patientMaskToPatientIdSet(List<String> patientIds) {
return ids;
}

@Override
@JsonIgnore // derived value — keep it out of the serialized JSON form
public boolean isEmpty() {
    // Every bitmask carries two '1' bookend bits at each end (4 set bits total),
    // so a mask whose population count is at most 4 has no patient bits set.
    return this.bitmask.bitCount() <= 4;
}

/**
 * Combines this mask with another bitmask-backed mask.
 * Bitwise OR is commutative, so operand order does not affect the result.
 */
private VariantMask union(VariantMaskBitmaskImpl variantMaskBitmask) {
    BigInteger combined = this.bitmask.or(variantMaskBitmask.bitmask);
    return new VariantMaskBitmaskImpl(combined);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;

import java.math.BigInteger;
Expand Down Expand Up @@ -59,6 +60,12 @@ public Set<Integer> patientMaskToPatientIdSet(List<String> patientIds) {
.collect(Collectors.toSet());
}

@Override
@JsonIgnore // derived value — keep it out of the serialized JSON form
public boolean isEmpty() {
    // A sparse mask with no stored patient indexes matches no patients.
    return patientIndexes.isEmpty();
}

private VariantMask union(VariantMaskSparseImpl variantMask) {
HashSet<Integer> union = new HashSet<>(variantMask.patientIndexes);
union.addAll(this.patientIndexes);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package edu.harvard.hms.dbmi.avillach.hpds.data.genotype;

import java.io.*;
import java.nio.file.Files;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import com.google.common.base.Joiner;
import edu.harvard.hms.dbmi.avillach.hpds.storage.FileBackedJavaIndexedStorage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -21,23 +24,23 @@
* a fast, disk-based backing store.
*/
public class VariantMetadataIndex implements Serializable {
// todo: make this variable
public static String VARIANT_METADATA_BIN_FILE = "/opt/local/hpds/all/VariantMetadata.javabin";
public static final String VARIANT_METADATA_FILENAME = "VariantMetadata.javabin";
public static String VARIANT_METADATA_BIN_FILE = "/opt/local/hpds/all/" + VARIANT_METADATA_FILENAME;

private static final long serialVersionUID = 5917054606643971537L;
private static Logger log = LoggerFactory.getLogger(VariantMetadataIndex.class);

// (String) contig --> (Integer) Bucket --> (String) variant spec --> INFO column data[].
private Map<String, FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> > indexMap = new HashMap<String, FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> >();
private final Map<String, FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> > indexMap = new HashMap<>();

// todo: make this variable
private static String fileStoragePrefix = "/opt/local/hpds/all/VariantMetadataStorage";
public static final String VARIANT_METADATA_STORAGE_FILE_PREFIX = "VariantMetadataStorage";
private static String fileStoragePrefix = "/opt/local/hpds/all/" + VARIANT_METADATA_STORAGE_FILE_PREFIX;

/**
* This map allows us to load millions of variants without re-writing the fbbis each time (which would blow up the disk space).
* We need to remember to flush() between each contig this gets saved to the fbbis array.
*/
private transient Map<String, ConcurrentHashMap<Integer, ConcurrentHashMap<String, String[]>> > loadingMap = new HashMap<String, ConcurrentHashMap<Integer, ConcurrentHashMap<String, String[]>> >();
private transient Map<String, ConcurrentHashMap<Integer, ConcurrentHashMap<String, String[]>> > loadingMap = new HashMap<>();

/**
* This constructor should only be used for testing; we expect the files to be in the default locations in production
Expand All @@ -60,7 +63,7 @@ public VariantMetadataIndex() throws IOException {
* @param variantSpec
* @return
*/
public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder<String[]> bucketCache) {
public Set<String> findBySingleVariantSpec(String variantSpec, VariantBucketHolder<String[]> bucketCache) {
try {
String[] segments = variantSpec.split(",");
if (segments.length < 2) {
Expand All @@ -75,7 +78,7 @@ public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder<
|| chrOffset != bucketCache.lastChunkOffset) {
FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> ContigFbbis = indexMap.get(contig);
if(ContigFbbis == null) {
return new String[0];
return Set.of();
}
bucketCache.lastValue = ContigFbbis.get(chrOffset);
bucketCache.lastContig = contig;
Expand All @@ -85,20 +88,20 @@ public String[] findBySingleVariantSpec(String variantSpec, VariantBucketHolder<
if( bucketCache.lastValue != null) {
if(bucketCache.lastValue.get(variantSpec) == null) {
log.warn("No variant data found for spec " + variantSpec);
return new String[0];
return Set.of();
}
return bucketCache.lastValue.get(variantSpec);
return Set.of(bucketCache.lastValue.get(variantSpec));
}
log.warn("No bucket found for spec " + variantSpec + " in bucket " + chrOffset);
return new String[0];
return Set.of();

} catch (UncheckedIOException e) {
log.warn("IOException caught looking up variantSpec : " + variantSpec, e);
return new String[0];
return Set.of();
}
}

public Map<String, String[]> findByMultipleVariantSpec(Collection<String> varientSpecList) {
public Map<String, Set<String>> findByMultipleVariantSpec(Collection<String> varientSpecList) {
// log.debug("SPEC list " + varientSpecList.size() + " :: " + Arrays.deepToString(varientSpecList.toArray()));

VariantBucketHolder<String[]> bucketCache = new VariantBucketHolder<String[]>();
Expand Down Expand Up @@ -161,7 +164,7 @@ public synchronized void flush() throws IOException {
if(contigFbbis == null) {
log.info("creating new file for " + contig);
String filePath = fileStoragePrefix + "_" + contig + ".bin";
contigFbbis = new FileBackedJavaIndexedStorage(Integer.class, (Class<ConcurrentHashMap<String, String[]>>)(Class<?>) ConcurrentHashMap.class, new File(filePath));
contigFbbis = new FileBackedJavaIndexedStorage(Integer.class, ConcurrentHashMap.class, new File(filePath));
indexMap.put(contig, contigFbbis);
}

Expand Down Expand Up @@ -196,13 +199,57 @@ public void complete() throws IOException {

public static VariantMetadataIndex createInstance(String metadataIndexPath) {
try(ObjectInputStream in = new ObjectInputStream(new GZIPInputStream(
new FileInputStream(metadataIndexPath)))){
return (VariantMetadataIndex) in.readObject();
new FileInputStream(metadataIndexPath + VARIANT_METADATA_FILENAME)))){
VariantMetadataIndex variantMetadataIndex = (VariantMetadataIndex) in.readObject();
variantMetadataIndex.updateStorageDirectory(new File(metadataIndexPath));
return variantMetadataIndex;
} catch(Exception e) {
// todo: handle exceptions better
log.info("No Metadata Index found at " + metadataIndexPath);
log.debug("Error loading metadata index:", e);
return null;
}
}

/**
 * Merges two variant metadata indexes into a new index rooted at outputDirectory,
 * writing one merged storage file per contig plus the serialized index itself.
 *
 * Both inputs must cover exactly the same set of contigs. Within a bucket, entries
 * from index 2 overwrite index 1's entries for the same variant spec (putAll
 * semantics — last write wins). NOTE(review): outputDirectory is concatenated
 * directly with filenames, so it is assumed to end with a path separator — confirm
 * against callers.
 *
 * @param variantMetadataIndex1 first index to merge
 * @param variantMetadataIndex2 second index to merge (wins on duplicate variant specs)
 * @param outputDirectory directory (with trailing separator) receiving the merged files
 * @throws IOException if any storage file or the serialized index cannot be written
 * @throws IllegalStateException if the two indexes have different contig key sets
 */
public static void merge(VariantMetadataIndex variantMetadataIndex1, VariantMetadataIndex variantMetadataIndex2, String outputDirectory) throws IOException {
    VariantMetadataIndex merged = new VariantMetadataIndex(outputDirectory + VARIANT_METADATA_STORAGE_FILE_PREFIX);
    // Refuse to merge indexes built over different contigs; log both key sets for diagnosis.
    if (!variantMetadataIndex1.indexMap.keySet().equals(variantMetadataIndex2.indexMap.keySet())) {
        log.warn("Merging incompatible variant indexes. Index1 keys: " + Joiner.on(",").join(variantMetadataIndex1.indexMap.keySet()) + ". Index 2 keys: " + Joiner.on(",").join(variantMetadataIndex2.indexMap.keySet()));
        throw new IllegalStateException("Cannot merge variant metadata index with different contig keys");
    }
    for (String contig : variantMetadataIndex1.indexMap.keySet()) {
        // One merged storage file per contig, named after the shared prefix.
        String filePath = outputDirectory + VARIANT_METADATA_STORAGE_FILE_PREFIX + "_" + contig + ".bin";
        FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> mergedFbbis = new FileBackedJavaIndexedStorage(Integer.class, ConcurrentHashMap.class, new File(filePath));

        // Store the merged result here because FileBackedByteIndexedStorage must be written all at once
        Map<Integer, ConcurrentHashMap<String, String[]>> mergedStagedFbbis = new HashMap<>();

        FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> fbbis1 = variantMetadataIndex1.indexMap.get(contig);
        FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> fbbis2 = variantMetadataIndex2.indexMap.get(contig);

        // Seed the staging map with every bucket from index 1.
        fbbis1.keys().forEach(key -> {
            mergedStagedFbbis.put(key, fbbis1.get(key));
        });
        // Fold in index 2: new buckets are copied whole; shared buckets are merged
        // entry-by-entry, with index 2's entries overwriting duplicates.
        fbbis2.keys().forEach(key -> {
            ConcurrentHashMap<String, String[]> metadataMap = mergedStagedFbbis.get(key);
            if (metadataMap == null) {
                mergedStagedFbbis.put(key, fbbis2.get(key));
            } else {
                metadataMap.putAll(fbbis2.get(key));
            }
        });

        // Flush the fully-staged buckets to disk in one pass, then seal the store.
        mergedStagedFbbis.forEach(mergedFbbis::put);
        mergedFbbis.complete();
        merged.indexMap.put(contig, mergedFbbis);
    }

    // Serialize the merged index (gzip-compressed) after all contig stores are complete.
    try(ObjectOutputStream out = new ObjectOutputStream(new GZIPOutputStream(Files.newOutputStream(new File(outputDirectory + VARIANT_METADATA_FILENAME).toPath())))){
        out.writeObject(merged);
        out.flush();
    }
}

/**
 * Repoints every contig's file-backed storage at the given genomic data directory.
 *
 * @param genomicDataDirectory local directory containing the storage files
 */
public void updateStorageDirectory(File genomicDataDirectory) {
    for (FileBackedByteIndexedStorage<Integer, ConcurrentHashMap<String, String[]>> storage : indexMap.values()) {
        storage.updateStorageDirectory(genomicDataDirectory);
    }
}
}
Loading
Loading