From e366ee18bcc90027f6309fab360aa6b55c63c041 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 29 Sep 2011 07:46:19 -0400 Subject: [PATCH 03/16] Adding ability to read in and make use of kmer quality tables during HMM evaluation --- .../walkers/recalibration/TableRecalibrationWalker.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 174e810c2..e04f5bc4b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -170,9 +170,9 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation - private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); - private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + public static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); + public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); public static final String EOF_MARKER = "EOF"; private long numReadsWithMalformedColorSpace = 0; From 05fba6f23ad7c9bb116a03d3243a0e56c1e08d7b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 30 Sep 2011 15:44:30 -0400 Subject: [PATCH 08/16] Clipping ends inside deletion and before insertion fixed. 
--- .../broadinstitute/sting/utils/sam/ReadUtils.java | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 3c389fd4c..49900f0da 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -879,9 +879,20 @@ public class ReadUtils { if (endsWithinCigar) fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; - // if we end outside the current cigar element, we need to check if the next element is a deletion. + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. else { nextCigarElement = cigarElementIterator.next(); + + // if it's an insertion, we need to clip the whole insertion before looking at the next element + if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { + readBases += nextCigarElement.getLength(); + if (!cigarElementIterator.hasNext()) + throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + + nextCigarElement = cigarElementIterator.next(); + } + + // if it's a deletion, we will pass the information on to be handled downstream. fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; } From c7898a9be78130b865cc8d6a8c3e3429ac71eb00 Mon Sep 17 00:00:00 2001 From: Andrey Sivachenko Date: Fri, 30 Sep 2011 16:40:21 -0400 Subject: [PATCH 09/16] inconsequential change in string constants printed into the vcf which no one uses anyway... 
--- .../gatk/walkers/indels/SomaticIndelDetectorWalker.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index 8bba8eac2..5b10a79c6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -265,7 +265,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker { Set headerInfo = new HashSet(); // first, the basic info - headerInfo.add(new VCFHeaderLine("source", "IndelGenotyperV2")); + headerInfo.add(new VCFHeaderLine("source", "SomaticIndelDetector")); headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); // FORMAT and INFO fields @@ -283,10 +283,10 @@ public class SomaticIndelDetectorWalker extends ReadWalker { args.addAll(getToolkit().getFilters()); Map commandLineArgs = getToolkit().getApproximateCommandLineArguments(args); for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) - headerInfo.add(new VCFHeaderLine(String.format("IGv2_%s", commandLineArg.getKey()), commandLineArg.getValue())); + headerInfo.add(new VCFHeaderLine(String.format("SID_%s", commandLineArg.getKey()), commandLineArg.getValue())); // also, the list of input bams for ( String fileName : getToolkit().getArguments().samFiles ) - headerInfo.add(new VCFHeaderLine("IGv2_bam_file_used", fileName)); + headerInfo.add(new VCFHeaderLine("SID_bam_file_used", fileName)); return headerInfo; } From bf6a3a65320cf5d1fa9aed9f3db5154579921443 Mon Sep 17 00:00:00 2001 From: Roger Zurawicki Date: Sun, 2 Oct 2011 22:33:46 -0400 Subject: [PATCH 10/16] Added framework to do batch CigarClip Testing *NOTE: This commit has not been compiled! 
--- .../sting/utils/clipreads/ReadClipperUnitTest.java | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index 1415379db..38eee762a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -62,6 +62,20 @@ public class ReadClipperUnitTest extends BaseTest { readClipper = new ReadClipper(read); } + private void testHardClipCigarByReadCoordinate( SAMRecord read, String inputCigar, String expectedCigar, int expectedStart, int expectedStop) { + read.setCigar(TextCigarCodec.getSingleton().decode(inputCigar) ); + SAMRecord clipped = readClipper.hardClipByReadCoordinates(expectedStart,expectedStop); + Assert.assertEquals(clipped.getCigarString(), expectedCigar, "Clipped Cigar string is different than expected"); + } +/* + private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) { + SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null); + String expectedBases = BASES.substring(expectedStart, expectedStop); + String expectedQuals = QUALS.substring(expectedStart, expectedStop); + Assert.assertEquals(clipped.getReadBases(), expectedBases.getBytes(), "Clipped bases not those expected"); + Assert.assertEquals(clipped.getBaseQualityString(), expectedQuals, "Clipped quals not those expected"); + } +*/ @Test public void testHardClipBothEndsByReferenceCoordinates() { logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); From c3eff7451abfcb91366f11158d7eac60fd120199 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 3 Oct 2011 14:20:39 -0400 Subject: [PATCH 11/16] Found a small inefficiency while profiling: we were still using String.split instead of ParsingUtils.split to break up array values 
in the INFO field. There was a noticeable (albeit not big) difference in the change when reading sites only files. --- .../utils/codecs/vcf/AbstractVCFCodec.java | 28 +++++++++++-------- .../sting/utils/codecs/vcf/VCFConstants.java | 1 + 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 43b07476d..c86b91b79 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -36,6 +36,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, // for ParsingUtils.split protected String[] GTValueArray = new String[100]; protected String[] genotypeKeyArray = new String[100]; + protected String[] infoFieldArray = new String[1000]; protected String[] infoValueArray = new String[1000]; // for performance testing purposes @@ -351,23 +352,28 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec, if ( infoField.indexOf("\t") != -1 || infoField.indexOf(" ") != -1 ) generateException("The VCF specification does not allow for whitespace in the INFO field"); - int infoValueSplitSize = ParsingUtils.split(infoField, infoValueArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR); - for (int i = 0; i < infoValueSplitSize; i++) { + int infoFieldSplitSize = ParsingUtils.split(infoField, infoFieldArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR, false); + for (int i = 0; i < infoFieldSplitSize; i++) { String key; Object value; - int eqI = infoValueArray[i].indexOf("="); + int eqI = infoFieldArray[i].indexOf("="); if ( eqI != -1 ) { - key = infoValueArray[i].substring(0, eqI); - String str = infoValueArray[i].substring(eqI+1, infoValueArray[i].length()); + key = infoFieldArray[i].substring(0, eqI); + String str = infoFieldArray[i].substring(eqI+1); - // 
lets see if the string contains a , separator - if ( str.contains(",") ) - value = Arrays.asList(str.split(",")); - else - value = str; + // split on the INFO field array separator + int infoValueSplitSize = ParsingUtils.split(str, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false); + if ( infoValueSplitSize == 1 ) { + value = infoValueArray[0]; + } else { + ArrayList valueList = new ArrayList(infoValueSplitSize); + for ( int j = 0; j < infoValueSplitSize; j++ ) + valueList.add(infoValueArray[j]); + value = valueList; + } } else { - key = infoValueArray[i]; + key = infoFieldArray[i]; value = true; } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java index 91cf86c70..8e9d989cc 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java @@ -71,6 +71,7 @@ public final class VCFConstants { public static final char FIELD_SEPARATOR_CHAR = '\t'; public static final String FILTER_CODE_SEPARATOR = ";"; public static final String INFO_FIELD_ARRAY_SEPARATOR = ","; + public static final char INFO_FIELD_ARRAY_SEPARATOR_CHAR = ','; public static final String ID_FIELD_SEPARATOR = ";"; public static final String INFO_FIELD_SEPARATOR = ";"; public static final char INFO_FIELD_SEPARATOR_CHAR = ';'; From 88c2fad64f193a42c8e7a66715b27b036a439389 Mon Sep 17 00:00:00 2001 From: Matt Hanna Date: Tue, 4 Oct 2011 13:14:39 -0400 Subject: [PATCH 13/16] Change vcf jar to use a classfileset to pull all dependencies. Should save Jim Robinson some detective work in the long run. 
--- build.xml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/build.xml b/build.xml index 1f26e7b7a..34de6cee6 100644 --- a/build.xml +++ b/build.xml @@ -545,12 +545,11 @@ - - - - - - + + + + + From 941317167eb1d091603aa03b323de0ab36fa608b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 14:08:00 -0700 Subject: [PATCH 14/16] Updating MD5 for BAMs that I added a read group to --- .../sting/utils/interval/IntervalIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 2fab1f287..178c09fa4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -76,7 +76,7 @@ public class IntervalIntegrationTest extends WalkerTest { // our base file File baseOutputFile = createTempFile("testUnmappedReadInclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("99c266d777e2e167b8153c858c305fda",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("748a38ed5eb0a043dfc7b82f0d1e8063",createTempFileFromBase(baseOutputFile.getAbsolutePath())); spec.addAuxFile("fadcdf88597b9609c5f2a17f4c6eb455", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadInclusion",spec); @@ -97,7 +97,7 @@ public class IntervalIntegrationTest extends WalkerTest { File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); spec.addAuxFile("8236f0b2df5a692e54751b08bc3836fa",createTempFileFromBase(baseOutputFile.getAbsolutePath())); - spec.addAuxFile("651b42456d31ba24e913297b71b32143", 
createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); + spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadExclusion",spec); } From d1d39943d0475af51e438bdbb945c79471142bdd Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 4 Oct 2011 21:00:15 -0700 Subject: [PATCH 15/16] Updating MD5 for BAMs that I added a read group to, part 2 --- .../sting/utils/interval/IntervalIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 178c09fa4..379d79c84 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -96,7 +96,7 @@ public class IntervalIntegrationTest extends WalkerTest { // our base file File baseOutputFile = createTempFile("testUnmappedReadExclusion",".bam"); spec.setOutputFileLocation(baseOutputFile); - spec.addAuxFile("8236f0b2df5a692e54751b08bc3836fa",createTempFileFromBase(baseOutputFile.getAbsolutePath())); + spec.addAuxFile("80887ba488e53dabd9596ff93070ae75",createTempFileFromBase(baseOutputFile.getAbsolutePath())); spec.addAuxFile("b341d808ecc33217f37c0c0cde2a3e2f", createTempFileFromBase(baseOutputFile.getAbsolutePath().substring(0,baseOutputFile.getAbsolutePath().indexOf(".bam"))+".bai")); executeTest("testUnmappedReadExclusion",spec); From b732f740d2e0d2bb7a0bf8c2457f630cb0e61782 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 5 Oct 2011 16:51:30 -0400 Subject: [PATCH 16/16] Revert "Change vcf jar to use a classfileset to pull all dependencies. 
Should save" This reverts commit 441022c4c600624928da46419a6a289200700f3e. --- build.xml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/build.xml b/build.xml index 34de6cee6..1f26e7b7a 100644 --- a/build.xml +++ b/build.xml @@ -545,11 +545,12 @@ - - - - - + + + + + +