From 480b32e759a5b62a6dcdf9dfa5349d95df263fd9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 1 Jul 2012 14:59:27 -0400 Subject: [PATCH 01/10] BCF2 is now officially zero-based open-interval, and that's how the GATK does it now --- .../broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java | 4 ++-- .../sting/utils/variantcontext/writer/BCF2Writer.java | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index e106e72ac..0503e417a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -239,10 +239,10 @@ public final class BCF2Codec implements FeatureCodec, ReferenceD final String contig = lookupContigName(contigOffset); builder.chr(contig); - this.pos = decoder.decodeInt(BCF2Type.INT32); + this.pos = decoder.decodeInt(BCF2Type.INT32) + 1; // GATK is one based, BCF2 is zero-based final int refLength = decoder.decodeInt(BCF2Type.INT32); builder.start((long)pos); - builder.stop((long)(pos + refLength - 1)); // minus one because of our open intervals + builder.stop((long)(pos + refLength - 1)); // minus one because GATK has closed intervals but BCF2 is open } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index e281475f2..7d9a18d14 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -84,7 +84,6 @@ import java.util.*; */ class BCF2Writer extends IndexingVariantContextWriter { final protected static Logger logger = Logger.getLogger(BCF2Writer.class); - final private static List MISSING_GENOTYPE = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); final private static boolean ALLOW_MISSING_CONTIG_LINES = false; private final OutputStream outputStream; // Note: do not flush until completely done writing, to avoid issues with eventual BGZF support @@ -203,10 +202,11 @@ class BCF2Writer extends IndexingVariantContextWriter { // note use of encodeRawValue to not insert the typing byte encoder.encodeRawValue(contigIndex, BCF2Type.INT32); - // pos - encoder.encodeRawValue(vc.getStart(), BCF2Type.INT32); + // pos. GATK is 1 based, BCF2 is 0 based + encoder.encodeRawValue(vc.getStart() - 1, BCF2Type.INT32); - // ref length + // ref length. GATK is closed, but BCF2 is open so the ref length is GATK end - GATK start + 1 + // for example, a SNP is in GATK at 1:10-10, which has ref length 10 - 10 + 1 = 1 encoder.encodeRawValue(vc.getEnd() - vc.getStart() + 1, BCF2Type.INT32); // qual From 7aff4446d4ecec878894fe5fac58fc54ad4f69f3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 1 Jul 2012 15:38:10 -0400 Subject: [PATCH 02/10] Added unit tests for header repairing capabilities in the GATK engine --- .../variantcontext/writer/VCFWriter.java | 2 +- .../utils/codecs/vcf/VCFIntegrationTest.java | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index 2a4c8cac2..8ccb79744 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -564,6 +564,6 @@ class VCFWriter extends IndexingVariantContextWriter { + " at " + vc.getChr() + ":" + vc.getStart() + " but this key isn't defined in the VCFHeader. The GATK now requires all VCFs to have" + " complete VCF headers by default. This error can be disabled with the engine argument" - + " -U LENIENT_VCF_PROCESSING"); + + " -U LENIENT_VCF_PROCESSING or repair the VCF file header using repairVCFHeader"); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 4d23f0fa5..1eda504db 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.io.File; @@ -66,4 +67,38 @@ public class VCFIntegrationTest extends WalkerTest { WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("63a2e0484ae37b0680514f53e0bf0c94")); executeTest("Test reading samtools WEx BCF example", spec1); } + + // + // + // Tests to ensure that -U LENIENT_VCF_PROCESS and header repairs are working + // + // + + @Test + public void testFailingOnVCFWithoutHeaders() { + runVCFWithoutHeaders("", "", UserException.class, false); + } + + @Test + public void testPassingOnVCFWithoutHeadersWithLenientProcessing() { + runVCFWithoutHeaders("-U LENIENT_VCF_PROCESSING", "6de8cb7457154dd355aa55befb943f88", null, true); + } + + @Test + public void testPassingOnVCFWithoutHeadersRepairingHeaders() { + runVCFWithoutHeaders("-repairVCFHeader " + privateTestDir + "vcfexample2.justHeader.vcf", "ff61e9cad6653c7f93d82d391f7ecdcb", null, false); + } + + private void runVCFWithoutHeaders(final String moreArgs, final String expectedMD5, final Class expectedException, final boolean disableBCF) { + final String testVCF = privateTestDir + "vcfexample2.noHeader.vcf"; + final String baseCommand = "-R " + b37KGReference + + " --no_cmdline_in_header -o %s " + + "-T VariantsToVCF -V " + testVCF + " " + moreArgs; + WalkerTestSpec spec1 = expectedException != null + ? new WalkerTestSpec(baseCommand, 1, expectedException) + : new WalkerTestSpec(baseCommand, 1, Arrays.asList(expectedMD5)); + if ( disableBCF ) + spec1.disableShadowBCF(); + executeTest("Test reading VCF without header lines with additional args " + moreArgs, spec1); + } } From 01e04992f8a94f84776864e747e889ae6462ce31 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 2 Jul 2012 11:38:59 -0400 Subject: [PATCH 04/10] Fixed compatibilities in AbstractVCFCodec that resulted in key=; being parsed as written as key; in VCF output --- .../sting/utils/codecs/vcf/AbstractVCFCodec.java | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 1d5869349..5ad939e76 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -363,10 +363,10 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec int eqI = infoFieldArray[i].indexOf("="); if ( eqI != -1 ) { key = infoFieldArray[i].substring(0, eqI); - String str = infoFieldArray[i].substring(eqI+1); + String valueString = infoFieldArray[i].substring(eqI+1); // split on the INFO field separator - int infoValueSplitSize = ParsingUtils.split(str, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false); + int infoValueSplitSize = ParsingUtils.split(valueString, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false); if ( infoValueSplitSize == 1 ) { value = infoValueArray[0]; final VCFInfoHeaderLine headerLine = header.getInfoHeaderLine(key); @@ -396,6 +396,9 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec } } + // this line ensures that key/value pairs that look like key=; are parsed correctly as MISSING + if ( "".equals(value) ) value = VCFConstants.MISSING_VALUE_v4; + attributes.put(key, value); } } From bcd2e13d8bb1e1d4c9493030ef06c8775dadaa35 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 2 Jul 2012 11:39:34 -0400 Subject: [PATCH 05/10] Adding duplicate header line keys is a logger.debug not logger.warn message now --- .../org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 50d46eab3..296be7873 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -214,7 +214,7 @@ public class VCFHeader { private final void addMetaDataMapBinding(final Map map, T line) { final String key = line.getID(); if ( map.containsKey(key) ) - logger.warn("Found duplicate VCF header lines for " + key + "; keeping the first only" ); + logger.debug("Found duplicate VCF header lines for " + key + "; keeping the first only" ); else map.put(key, line); } From 602729c09d74d6beecbecbf41a96f59a52d13ee1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 2 Jul 2012 11:40:28 -0400 Subject: [PATCH 06/10] Moved parallel tests from SelectVariants to separate SelectVariantsParallelIntegrationTest -- Enabled previous tests -- all now working -- Added modern test against new VCF as well --- .../SelectVariantsIntegrationTest.java | 30 ---------- ...SelectVariantsParallelIntegrationTest.java | 59 +++++++++++++++++++ 2 files changed, 59 insertions(+), 30 deletions(-) create mode 100755 public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 30cdbee36..5e674d6c2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -167,36 +167,6 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testNoGTs--" + testFile, spec); } - @Test(enabled = false) - public void testParallelization2() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - WalkerTestSpec spec; - - spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 2"), - 1, - Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") - ); - spec.disableShadowBCF(); - executeTest("testParallelization (2 threads)--" + testfile, spec); - } - - @Test(enabled = false) - public void testParallelization4() { - String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; - String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; - WalkerTestSpec spec; - spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile + " -nt 4"), - 1, - Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") - ); - spec.disableShadowBCF(); - - executeTest("testParallelization (4 threads)--" + testfile, spec); - } - @Test public void testSelectFromMultiAllelic() { String testfile = privateTestDir + "multi-allelic.bi-allelicInGIH.vcf"; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java new file mode 100755 index 000000000..ff3982d24 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsParallelIntegrationTest.java @@ -0,0 +1,59 @@ +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class SelectVariantsParallelIntegrationTest extends WalkerTest { + + private class ParallelSelectTestProvider extends TestDataProvider { + final String reference; + final String args; + final String md5; + final int nt; + + private ParallelSelectTestProvider(final String reference, final String args, final String md5, final int nt) { + super(ParallelSelectTestProvider.class); + this.reference = reference; + this.args = args; + this.md5 = md5; + this.nt = nt; + } + + public final String getCmdLine() { + return "-T SelectVariants -R " + reference + " -o %s -L 1 --no_cmdline_in_header -nt " + nt + " " + args; + } + + public String toString() { + return String.format("ParallelSelectVariants nt=%d args=%s", nt, args); + } + } + + @DataProvider(name = "ParallelSelectTest") + public Object[][] makeParallelSelectTestProvider() { + for ( int nt : Arrays.asList(1, 2, 4) ) { + { // original MAF test + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + String args = " -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile; + new ParallelSelectTestProvider(b36KGReference, args, "4386fbb258dcef4437495a37f5a83c53", nt); + } + { // new tests on b37 using testdir VCF + final String testfile = privateTestDir + "NA12878.hg19.example1.vcf"; + final String args = "-select 'DP > 30' -V " + testfile; + new ParallelSelectTestProvider(b37KGReference, args, "c64b45a14d41b1e5cddbe036b47e7519", nt); + } + } + + return ParallelSelectTestProvider.getTests(ParallelSelectTestProvider.class); + } + + @Test(dataProvider = "ParallelSelectTest") + public void testParallelSelectTestProvider(final ParallelSelectTestProvider cfg) { + final WalkerTestSpec spec = new WalkerTestSpec( cfg.getCmdLine(), 1, Arrays.asList(cfg.md5) ); + executeTest(cfg.toString(), spec); + } +} From 1b0a7757738f13dfffb98cab27826fe2a1a40680 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 2 Jul 2012 15:55:42 -0400 Subject: [PATCH 07/10] Disabling bcf2 reading from samtools because it's 1 basis; updating select variants integrationtest --- .../walkers/variantutils/SelectVariantsIntegrationTest.java | 2 +- .../sting/utils/codecs/vcf/VCFIntegrationTest.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 5e674d6c2..59eaa177d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -64,7 +64,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), 1, - Arrays.asList("433eccaf1ac6e6be500ef0984a5d8d8b") + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") ); spec.disableShadowBCF(); executeTest("testComplexSelection--" + testfile, spec); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 1eda504db..1e3c799fe 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -59,7 +59,7 @@ public class VCFIntegrationTest extends WalkerTest { executeTest("Test writing samtools WEx BCF example", spec1); } - @Test + @Test(enabled = false) // TODO disabled because current BCF2 is 1 based public void testReadingSamtoolsWExBCFExample() { String testVCF = privateTestDir + "ex2.bcf"; String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; From 9ee58d323aeebe4a331ffd24b61fad8f59036a6a Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 2 Jul 2012 15:37:23 -0400 Subject: [PATCH 08/10] Pass the original GATK unsafe parameter to the VcfGatherFunction --- .../sting/queue/extensions/gatk/VcfGatherFunction.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 7862dec41..739e6cc91 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -52,6 +52,9 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { val sitesOnly = QFunction.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.SITES_ONLY_ARG_NAME) this.sites_only = originalGATK.getFieldValue(sitesOnly).asInstanceOf[Boolean] + // ensure that the gather function receives the same unsafe parameter as the scattered function + this.unsafe = this.originalGATK.unsafe + super.freezeFieldValues() } } From 88a02fa2cb927923519fd354b821ee4cd0fe7913 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 2 Jul 2012 12:28:28 -0400 Subject: [PATCH 09/10] Fixing but for reads with cigars like 9S54H When hard-clipping predict when the read is going to be fully hard clipped to the point where only soft/hard-clips are left in the read and preemptively eliminate the read before the SAMRecord mathematics on malformed cigars kills the GATK. --- .../org/broadinstitute/sting/utils/clipping/ClippingOp.java | 5 ++++- .../src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 2e3978ddb..d12d2c2ec 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -285,7 +285,10 @@ public class ClippingOp { @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1"}) private GATKSAMRecord hardClip(GATKSAMRecord read, int start, int stop) { - if (start == 0 && stop == read.getReadLength() - 1) + final int firstBaseAfterSoftClips = read.getAlignmentStart() - read.getSoftStart(); + final int lastBaseBeforeSoftClips = read.getSoftEnd() - read.getSoftStart(); + + if (start == firstBaseAfterSoftClips && stop == lastBaseBeforeSoftClips) // note that if the read has no soft clips, these constants will be 0 and read length - 1 (beauty of math). return GATKSAMRecord.emptyRead(read); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 78ce81a7c..6b9ba79b4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -520,7 +520,7 @@ public class ReadUtils { if (allowGoalNotReached) { return new Pair(CLIPPING_GOAL_NOT_REACHED, false); } else { - throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Alignment " + alignmentStart + " | " + cigar); } } From 3cea080aa8a0feab210a0c2e00f73cc30a090ef7 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 2 Jul 2012 12:29:31 -0400 Subject: [PATCH 10/10] Cache SoftStart() and SoftEnd() in the GATKSAMRecord these are costly operations when done repeatedly on the same read. --- .../sting/utils/clipping/ClippingOp.java | 1 + .../sting/utils/sam/GATKSAMRecord.java | 66 +++++++++++++------ 2 files changed, 46 insertions(+), 21 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index d12d2c2ec..a4383c3ae 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -312,6 +312,7 @@ public class ClippingOp { throw new ReviewedStingException("Where did the clone go?"); } + hardClippedRead.resetSoftStartAndEnd(); // reset the cached soft start and end because they may have changed now that the read was hard clipped. No need to calculate them now. They'll be lazily calculated on the next call to getSoftStart()/End() hardClippedRead.setBaseQualities(newQuals); hardClippedRead.setReadBases(newBases); hardClippedRead.setCigar(cigarShift.cigar); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 5fbe12eed..a925c7577 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -59,6 +59,8 @@ public class GATKSAMRecord extends BAMRecord { private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; private byte[] reducedReadCounts = null; + private int softStart = -1; + private int softEnd = -1; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; @@ -385,15 +387,17 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped start of the read taking soft clips (but not hard clips) into account */ public int getSoftStart() { - int start = this.getUnclippedStart(); - for (CigarElement cigarElement : this.getCigar().getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) - start += cigarElement.getLength(); - else - break; + if (softStart < 0) { + int start = this.getUnclippedStart(); + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) + start += cigarElement.getLength(); + else + break; + } + softStart = start; } - - return start; + return softStart; } /** @@ -404,23 +408,43 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ public int getSoftEnd() { - int stop = this.getUnclippedStart(); + if (softEnd < 0) { + int stop = this.getUnclippedStart(); - if (ReadUtils.readIsEntirelyInsertion(this)) - return stop; + if (ReadUtils.readIsEntirelyInsertion(this)) + return stop; - int shift = 0; - CigarOperator lastOperator = null; - for (CigarElement cigarElement : this.getCigar().getCigarElements()) { - stop += shift; - lastOperator = cigarElement.getOperator(); - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) - shift = cigarElement.getLength(); - else - shift = 0; + int shift = 0; + CigarOperator lastOperator = null; + for (CigarElement cigarElement : this.getCigar().getCigarElements()) { + stop += shift; + lastOperator = cigarElement.getOperator(); + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) + shift = cigarElement.getLength(); + else + shift = 0; + } + softEnd = (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + return softEnd; + } - return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; + /** + * If the read is hard clipped, the soft start and end will change. You can set manually or just reset the cache + * so that the next call to getSoftStart/End will recalculate it lazily. + */ + public void resetSoftStartAndEnd() { + softStart = -1; + softEnd = -1; + } + + /** + * If the read is hard clipped, the soft start and end will change. You can set manually or just reset the cache + * so that the next call to getSoftStart/End will recalculate it lazily. + */ + public void resetSoftStartAndEnd(int softStart, int softEnd) { + this.softStart = softStart; + this.softEnd = softEnd; } /**