From ab0ee9b847f1574e4bf2a6866022e5a959f58072 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 14 Nov 2011 15:10:50 -0500 Subject: [PATCH 1/3] Perform only necessary validation in VariantContext modify methods --- .../utils/variantcontext/VariantContext.java | 48 +++++++++++++------ 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index f52a7087b..204b4b841 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -223,7 +223,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param referenceBaseForIndel padded reference base */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false); + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, referenceBaseForIndel, false, true); } /** @@ -240,7 +240,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false); + this(source, contig, start, stop, alleles, genotypes, negLog10PError, filters, attributes, null, false, true); } /** @@ -261,7 +261,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param referenceBaseForIndel padded reference base */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, double negLog10PError, Set filters, Map attributes, Byte referenceBaseForIndel) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true); + this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true, true); } /** @@ -278,7 +278,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes */ public VariantContext(String source, String contig, long start, long stop, Collection alleles, Collection genotypes, double negLog10PError, Set filters, Map attributes) { - this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false); + this(source, contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, negLog10PError, filters, attributes, null, false, true); } /** @@ -291,7 +291,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param alleles alleles */ public VariantContext(String source, String contig, long start, long stop, Collection alleles) { - this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false); + this(source, contig, start, stop, alleles, NO_GENOTYPES, InferredGeneticContext.NO_NEG_LOG_10PERROR, null, null, null, false, true); } /** @@ -314,7 +314,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param other the VariantContext to copy */ public VariantContext(VariantContext other) { - this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false); + this(other.getSource(), other.getChr(), other.getStart(), other.getEnd() , other.getAlleles(), other.getGenotypes(), other.getNegLog10PError(), other.filtersWereApplied() ? other.getFilters() : null, other.getAttributes(), other.REFERENCE_BASE_FOR_INDEL, false, true); } /** @@ -331,11 +331,13 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param attributes attributes * @param referenceBaseForIndel padded reference base * @param genotypesAreUnparsed true if the genotypes have not yet been parsed + * @param performValidation if true, call validate() as the final step in construction */ private VariantContext(String source, String contig, long start, long stop, Collection alleles, Map genotypes, double negLog10PError, Set filters, Map attributes, - Byte referenceBaseForIndel, boolean genotypesAreUnparsed) { + Byte referenceBaseForIndel, boolean genotypesAreUnparsed, + boolean performValidation ) { if ( contig == null ) { throw new IllegalArgumentException("Contig cannot be null"); } this.contig = contig; this.start = start; @@ -371,39 +373,57 @@ public class VariantContext implements Feature { // to enable tribble intergrati } } - validate(); + if ( performValidation ) { + validate(); + } } // --------------------------------------------------------------------------------------------------------- // // Partial-cloning routines (because Variant Context is immutable). + // + // IMPORTANT: These routines assume that the VariantContext on which they're called is already valid. + // Due to this assumption, they explicitly tell the constructor NOT to perform validation by + // calling validate(), and instead perform validation only on the data that's changed. + // // Note that we don't call vc.getGenotypes() because that triggers the lazy loading. // Also note that we need to create a new attributes map because it's unmodifiable and the constructor may try to modify it. // // --------------------------------------------------------------------------------------------------------- public static VariantContext modifyGenotypes(VariantContext vc, Map genotypes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false); + VariantContext modifiedVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), false, false); + modifiedVC.validateGenotypes(); + return modifiedVC; } public static VariantContext modifyLocation(VariantContext vc, String chr, int start, int end) { - return new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); + VariantContext modifiedVC = new VariantContext(vc.getSource(), chr, start, end, vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true, false); + + // Since start and end have changed, we need to call both validateAlleles() and validateReferencePadding(), + // since those validation routines rely on the values of start and end: + modifiedVC.validateAlleles(); + modifiedVC.validateReferencePadding(); + + return modifiedVC; } public static VariantContext modifyFilters(VariantContext vc, Set filters) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd() , vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), filters, new HashMap(vc.getAttributes()), vc.getReferenceBaseForIndel(), true, false); } public static VariantContext modifyAttributes(VariantContext vc, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, attributes, vc.getReferenceBaseForIndel(), true, false); } public static VariantContext modifyReferencePadding(VariantContext vc, Byte b) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), b, true); + VariantContext modifiedVC = new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, vc.getNegLog10PError(), vc.filtersWereApplied() ? vc.getFilters() : null, vc.getAttributes(), b, true, false); + modifiedVC.validateReferencePadding(); + return modifiedVC; } public static VariantContext modifyPErrorFiltersAndAttributes(VariantContext vc, double negLog10PError, Set filters, Map attributes) { - return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true); + return new VariantContext(vc.getSource(), vc.getChr(), vc.getStart(), vc.getEnd(), vc.getAlleles(), vc.genotypes, negLog10PError, filters, attributes, vc.getReferenceBaseForIndel(), true, false); } // --------------------------------------------------------------------------------------------------------- From cde829899dd8c2d65edf0efaa9a9757134925c1a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 14 Nov 2011 18:07:41 -0500 Subject: [PATCH 3/3] compress Reduce Read counts bytes by offset compressed the representation of the reduce reads counts by offset results in 17% average compression in final BAM file size. Example compression --> from : 10, 10, 11, 11, 12, 12, 12, 11, 10 to: 10, 0, 1, 1,2, 2, 2, 1, 0 --- .../org/broadinstitute/sting/utils/sam/GATKSAMRecord.java | 4 +++- .../org/broadinstitute/sting/utils/ReadUtilsUnitTest.java | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 3fe1060dd..6d7c8dad9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -188,7 +188,9 @@ public class GATKSAMRecord extends BAMRecord { } public final byte getReducedCount(final int i) { - return getReducedReadCounts()[i]; + byte firstCount = getReducedReadCounts()[0]; + byte offsetCount = getReducedReadCounts()[i]; + return (i==0) ? firstCount : (byte) Math.min(firstCount + offsetCount, Byte.MAX_VALUE); } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java index 46134cd24..53368c339 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java @@ -16,7 +16,8 @@ public class ReadUtilsUnitTest extends BaseTest { GATKSAMRecord read, reducedRead; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; - final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40}; + final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40}; + final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30}; // just the offsets @BeforeTest public void init() { @@ -29,7 +30,7 @@ public class ReadUtilsUnitTest extends BaseTest { reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); reducedRead.setReadBases(BASES.getBytes()); reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS); + reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); } private void testReadBasesAndQuals(GATKSAMRecord read, int expectedStart, int expectedStop) {