From 616ff8ea017888945e4dd5dcdde6dc196e9b6bd2 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 6 Jan 2012 10:36:11 -0500 Subject: [PATCH 01/31] fixed typo in help text --- .../sting/gatk/walkers/variantutils/CombineVariants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 096085330..af05c0dc4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -105,7 +105,7 @@ public class CombineVariants extends RodWalker { * and each named argument will be labeled as such in the output (i.e., set=name rather than * set=variants2). The order of arguments does not matter unless except for the naming, so * if you provide an rod priority list and no explicit names than variants, variants2, etc - * are techincally order dependent. It is strongly recommended to provide explicit names when + * are technically order dependent. It is strongly recommended to provide explicit names when * a rod priority list is provided. */ @Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true) From ae259f81cc46dbb4966b52c1affe73951ea8494d Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 17 Jan 2012 14:39:27 -0500 Subject: [PATCH 02/31] Bug fixing for merging of read fragments when one fragment contained an indel --- .../variantrecalibration/VariantRecalibratorEngine.java | 3 +-- .../java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java index 6d2ac643b..378765051 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibratorEngine.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.List; @@ -126,7 +125,7 @@ public class VariantRecalibratorEngine { iteration++; model.maximizationStep( data ); currentChangeInMixtureCoefficients = model.normalizePMixtureLog10(); - model.expectationStep(data); + model.expectationStep( data ); if( iteration % 5 == 0 ) { // cut down on the number of output lines so that users can read the warning messages logger.info("Finished iteration " + iteration + ". \tCurrent change in mixture coefficients = " + String.format("%.5f", currentChangeInMixtureCoefficients)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index cc0b1ae67..d1e3ce26b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -608,7 +608,7 @@ public class ReadUtils { * Example: Locus => {read1, read2, ..., readN} * * - Second: readToLocusMap -- a HashMap that describes for each read what loci it contributes to the coverage. - * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with true meaning it contributes to the coverage. + * Note: Locus is a boolean array, indexed from 0 (= startLocation) to N (= stopLocation), with value==true meaning it contributes to the coverage. * Example: Read => {true, true, false, ... false} * * @param readList the list of reads to generate the association mappings From 8b0ddf0aaf5462986e405de2b966f146ac00a880 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 17 Jan 2012 16:13:13 -0500 Subject: [PATCH 03/31] Adding notes to CountCovariates docs about using interval lists as database of known variation --- .../gatk/walkers/recalibration/CountCovariatesWalker.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java index 88a9668cc..bdf25419f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java @@ -155,7 +155,9 @@ public class CountCovariatesWalker extends LocusWalker> knownSites = Collections.emptyList(); From ab8f499bc32982d000d5b9e291659d56927d9bfc Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 18 Jan 2012 22:04:51 -0500 Subject: [PATCH 05/31] Annotate with FS even for filtered sites --- .../sting/gatk/walkers/annotator/FisherStrand.java | 2 +- .../walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index c4025a25c..7d728dd5e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -54,7 +54,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat private static final double MIN_PVALUE = 1E-320; public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( ! vc.isVariant() || vc.isFiltered() ) + if ( !vc.isVariant() ) return null; int[][] table; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 32fa8679e..5cdf12f1b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -44,7 +44,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("43e7a17d95b1a0cf72e669657794d802")); + Arrays.asList("1899bdb956c62bbcbf160b18cd3aea60")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -275,7 +275,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("36ce53ae4319718ad9c8ae391deebc8c")); + Arrays.asList("320f61c87253aba77d6dc782242cba8b")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } From cf9b1d350a373fde820b4d97b2df3f321646601a Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 19 Jan 2012 00:20:49 -0500 Subject: [PATCH 06/31] Some minor changes to in-process functions that nobody else uses. CGL now properly ignores no-calls for external VCFs. --- .../sting/queue/library/ipf/vcf/VCFExtractIntervals.scala | 1 + .../sting/queue/library/ipf/vcf/VCFExtractSamples.scala | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala index 3935c2138..e661f1bb5 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala @@ -44,6 +44,7 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e } } } + out.printf("%s%n",cur) out.close } diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala index 54e541142..3179c327f 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractSamples.scala @@ -6,7 +6,7 @@ import collection.JavaConversions._ import org.broadinstitute.sting.commandline._ import java.io.{PrintWriter, PrintStream, File} -class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction { +class VCFExtractSamples(inVCF: File, outVCF: File, samples: List[String]) extends InProcessFunction { def this(in: File, out: File, samples: File) = this(in,out, (new XReadLines(samples)).readLines.toList) @Input(doc="VCF from which to extract samples") var inputVCF : File = inVCF From 9946853039d2370d043d284f61356e9eaf66be1f Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 19 Jan 2012 00:25:22 -0500 Subject: [PATCH 07/31] Remove duplicated line --- .../sting/queue/library/ipf/vcf/VCFExtractIntervals.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala index e661f1bb5..a7b324a2b 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala @@ -39,8 +39,6 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e if ( elems.hasNext ) { prev = cur cur = elems.next - } else { - out.printf("%s%n",cur) } } } From 1e037a0ecf0c40b6ed472dbb5786825e8e9882d2 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 19 Jan 2012 00:33:08 -0500 Subject: [PATCH 08/31] Ensure second-to-last line printed --- .../sting/queue/library/ipf/vcf/VCFExtractIntervals.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala index a7b324a2b..03d31a217 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala @@ -42,6 +42,7 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e } } } + out.printf("%s%n",prev) out.printf("%s%n",cur) out.close From 39e6df5aa9d1008a7318fa900137b0c5efa5546d Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 19 Jan 2012 00:51:28 -0500 Subject: [PATCH 09/31] Fix edge case for very small VCFs --- .../library/ipf/vcf/VCFExtractIntervals.scala | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala index 03d31a217..3c7cd0a2d 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/library/ipf/vcf/VCFExtractIntervals.scala @@ -26,24 +26,24 @@ class VCFExtractIntervals(inVCF: File, outList: File, useFilterSites: Boolean) e var cur : String = null if ( elems.hasNext ) { cur = elems.next + while ( elems.hasNext ) { + out.printf("%s%n",prev) + while ( cur.equals(prev) && elems.hasNext && !cur.equals("") ) { + cur = elems.next + } + + if ( ! cur.equals(prev) ) { + if ( elems.hasNext ) { + prev = cur + cur = elems.next + } + } + } + out.printf("%s%n",prev) + out.printf("%s%n",cur) } else { out.printf("%s%n",prev) } - while ( elems.hasNext ) { - out.printf("%s%n",prev) - while ( cur.equals(prev) && elems.hasNext && !cur.equals("") ) { - cur = elems.next - } - - if ( ! cur.equals(prev) ) { - if ( elems.hasNext ) { - prev = cur - cur = elems.next - } - } - } - out.printf("%s%n",prev) - out.printf("%s%n",cur) out.close } From 7f3ad25b0109e91a82cf41378c33830f169fb705 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Thu, 19 Jan 2012 10:54:48 -0500 Subject: [PATCH 12/31] Adding a mode to VariantFiltration to invalidate previously-applied filters to allow complete re-filtering of a VCF. T2D VQSR: re-calling now done with appropriate quality settings and using BAQ. --- .../gatk/walkers/filters/VariantFiltrationWalker.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 8278dbab7..7eb6fad54 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -139,6 +139,12 @@ public class VariantFiltrationWalker extends RodWalker { @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false) protected Boolean FAIL_MISSING_VALUES = false; + /** + * Invalidate previous filters applied to the VariantContext, applying only the filters here + */ + @Argument(fullName="invalidatePreviousFilters",doc="Remove previous filters applied to the VCF",required=false) + boolean invalidatePrevious = false; + // JEXL expressions for the filters List filterExps; List genotypeFilterExps; @@ -215,6 +221,9 @@ public class VariantFiltrationWalker extends RodWalker { for ( VariantContext vc : VCs ) { + if ( invalidatePrevious ) { + vc = (new VariantContextBuilder(vc)).filters(new HashSet()).make(); + } // filter based on previous mask position if ( previousMaskPosition != null && // we saw a previous mask site previousMaskPosition.getContig().equals(vc.getChr()) && // it's on the same contig From ced6775de3af5b5df967ea96ada5c73300aa3bc1 Mon Sep 17 00:00:00 2001 From: Aaron McKenna Date: Wed, 18 Jan 2012 16:34:42 -0500 Subject: [PATCH 14/31] Changes to allow for external tests Changes to the build script that allow the external directory to have tests. This means groups like CGA don't have to reinvent the wheel on testing, and can instead use the GATKs unit and integration tests. Signed-off-by: David Roazen --- build.xml | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/build.xml b/build.xml index dbdafa3d9..ffb564cd2 100644 --- a/build.xml +++ b/build.xml @@ -281,6 +281,10 @@ + + + + @@ -331,7 +335,7 @@ - + @@ -761,6 +765,7 @@ + @@ -811,7 +816,23 @@ - + + + + + + + + + + + + + + + + + @@ -852,6 +873,7 @@ + @@ -934,6 +956,9 @@ + + + From 066da80a3d9a83ec5e59cd3e545c6c2bf102a625 Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Thu, 19 Jan 2012 18:19:58 -0500 Subject: [PATCH 16/31] Added KEEP_UNCONDTIONAL option which permits even sites with only filtered records to be included as unfiltered sites in the output --- .../sting/utils/variantcontext/VariantContextUtils.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index c9a4965c1..39045ea21 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -464,7 +464,11 @@ public class VariantContextUtils { /** * Requires all records present at site to be unfiltered. VCF files that don't contain the record don't influence this. */ - KEEP_IF_ALL_UNFILTERED + KEEP_IF_ALL_UNFILTERED, + /** + * If any record is present at this site (regardless of possibly being filtered), then all such records are kept and the filters are reset. + */ + KEEP_UNCONDITIONAL } /** @@ -635,7 +639,7 @@ public class VariantContextUtils { } // if at least one record was unfiltered and we want a union, clear all of the filters - if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) + if ( (filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size()) || filteredRecordMergeType == FilteredRecordMergeType.KEEP_UNCONDITIONAL ) filters.clear(); From ace93330688e2a55b1d6d81f2d4e4bba21698cb6 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 19 Jan 2012 22:05:08 -0500 Subject: [PATCH 23/31] Active region walkers can now see the reads in a buffer around thier active reigons. This buffer size is specified as a walker annotation. Intervals are internally extended by this buffer size so that the extra reads make their way through the traversal engine but the walker author only needs to see the original interval. Also, several corner case bug fixes in active region traversal. --- .../sting/gatk/GenomeAnalysisEngine.java | 12 ++- .../gatk/executive/LinearMicroScheduler.java | 2 +- .../traversals/TraverseActiveRegions.java | 89 ++++++++++++------- .../gatk/walkers/ActiveRegionExtension.java | 19 ++++ .../gatk/walkers/ActiveRegionWalker.java | 29 +++++- .../gatk/walkers/annotator/FisherStrand.java | 1 - .../broadinstitute/sting/utils/GenomeLoc.java | 2 +- .../sting/utils/GenomeLocSortedSet.java | 15 ++++ .../utils/activeregion/ActiveRegion.java | 29 +++--- 9 files changed, 149 insertions(+), 49 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index ede8e9340..6140d543a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -443,14 +443,22 @@ public class GenomeAnalysisEngine { if(!readsDataSource.hasIndex() && intervals != null && !argCollection.allowIntervalsWithUnindexedBAM) throw new UserException.CommandLineException("Cannot perform interval processing when reads are present but no index is available."); - if(walker instanceof LocusWalker || walker instanceof ActiveRegionWalker) { + if(walker instanceof LocusWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); if(intervals == null) return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); else return readsDataSource.createShardIteratorOverIntervals(intervals,new LocusShardBalancer()); - } + } + else if(walker instanceof ActiveRegionWalker) { + if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) + throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); + } else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { // Apply special validation to read pair walkers. if(walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 774b532f3..16487054b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -77,7 +77,7 @@ public class LinearMicroScheduler extends MicroScheduler { done = walker.isDone(); } - // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we push this functionality more into the engine + // Special function call to empty out the work queue. Ugly for now but will be cleaned up when we eventually push this functionality more into the engine if( traversalEngine instanceof TraverseActiveRegions ) { final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 384affcb7..b18d1ceb9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -7,10 +7,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -46,13 +45,15 @@ public class TraverseActiveRegions extends TraversalEngine isActiveList = new ArrayList(); //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); @@ -90,9 +91,11 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine activeRegions = integrateActiveList( isActiveList ); logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); workQueue.addAll( activeRegions ); - } - while( workQueue.peek().getLocation().getStop() < minStart ) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + // Since we've sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + if( !workQueue.isEmpty() ) { + while( workQueue.peek().getExtendedLoc().getStop() < minStart || !workQueue.peek().getExtendedLoc().getContig().equals(dataProvider.getLocus().getContig()) ) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion( activeRegion, myReads, workQueue, sum, walker ); + } + } } return sum; @@ -158,16 +173,18 @@ public class TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLocation()); + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); final M x = walker.map( activeRegion, null ); // BUGBUG: tracker needs to be filled in and passed to the walker return walker.reduce( x, sum ); } @@ -178,8 +195,8 @@ public class TraverseActiveRegions extends TraversalEngine walker, LocusShardDataProvider dataProvider ) { - DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + private LocusView getLocusView( final Walker walker, final LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); if( dataSource == DataSource.READS ) return new CoveredLocusView(dataProvider); else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) @@ -193,21 +210,29 @@ public class TraverseActiveRegions extends TraversalEngine integrateActiveList( final ArrayList activeList ) { final ArrayList returnList = new ArrayList(); - ActiveRegion prevLocus = activeList.remove(0); - ActiveRegion startLocus = prevLocus; - for( final ActiveRegion thisLocus : activeList ) { - if( prevLocus.isActive != thisLocus.isActive ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), - prevLocus.isActive, engine.getGenomeLocParser() ) ); - startLocus = thisLocus; + if( activeList.size() == 0 ) { + return returnList; + } else if( activeList.size() == 1 ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(activeList.get(0).getLocation().getContig(), activeList.get(0).getLocation().getStart(), activeList.get(0).getLocation().getStart()), + activeList.get(0).isActive, engine.getGenomeLocParser(), activeList.get(0).getExtension() ) ); + return returnList; + } else { + ActiveRegion prevLocus = activeList.get(0); + ActiveRegion startLocus = prevLocus; + for( final ActiveRegion thisLocus : activeList ) { + if( prevLocus.isActive != thisLocus.isActive || !prevLocus.getLocation().contiguousP( thisLocus.getLocation() ) ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), + prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); + startLocus = thisLocus; + } + prevLocus = thisLocus; } - prevLocus = thisLocus; + // output the last region if necessary + if( startLocus != prevLocus ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), + prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); + } + return returnList; } - // output the last region if necessary - if( startLocus != prevLocus ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), - prevLocus.isActive, engine.getGenomeLocParser() ) ); - } - return returnList; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java new file mode 100644 index 000000000..bb007893c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionExtension.java @@ -0,0 +1,19 @@ +package org.broadinstitute.sting.gatk.walkers; + +import java.lang.annotation.Documented; +import java.lang.annotation.Inherited; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; + +/** + * Describes the size of the buffer region that is added to each active region when pulling in covered reads. + * User: rpoplin + * Date: 1/18/12 + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) + +public @interface ActiveRegionExtension { + public int extension() default 0; +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index d2891c959..8405f762d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -1,13 +1,26 @@ package org.broadinstitute.sting.gatk.walkers; +import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter; +import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter; +import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; +import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; + +import java.util.ArrayList; +import java.util.List; /** - * Created by IntelliJ IDEA. + * Base class for all the Active Region Walkers. * User: rpoplin * Date: 12/7/11 */ @@ -15,7 +28,10 @@ import org.broadinstitute.sting.utils.activeregion.ActiveRegion; @By(DataSource.READS) @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.READ) +@ActiveRegionExtension(extension=50) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) public abstract class ActiveRegionWalker extends Walker { + // Do we actually want to operate on the context? public boolean filter(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context) { return true; // We are keeping all the reads @@ -26,4 +42,15 @@ public abstract class ActiveRegionWalker extends Walker allIntervals = new ArrayList(); + for( final GenomeLoc interval : intervals.toList() ) { + final int start = Math.max( 1, interval.getStart() - activeRegionExtension ); + final int stop = Math.min( reference.getSequenceDictionary().getSequence(interval.getContig()).getSequenceLength(), interval.getStop() + activeRegionExtension ); + allIntervals.add( genomeLocParser.createGenomeLoc(interval.getContig(), start, stop) ); + } + return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, IntervalMergingRule.ALL); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index c4025a25c..cb9bc103f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -73,7 +73,6 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat if ( pvalue == null ) return null; - // use Math.abs to prevent -0's Map map = new HashMap(); map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); return map; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 6941b888b..ad10b61e7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -467,6 +467,6 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome } public long sizeOfOverlap( final GenomeLoc that ) { - return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) : 0L ); + return ( this.overlapsP(that) ? Math.min( getStop(), that.getStop() ) - Math.max( getStart(), that.getStart() ) + 1L : 0L ); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index 26be0e59e..d11adf9e3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -127,6 +127,21 @@ public class GenomeLocSortedSet extends AbstractSet { return mArray.isEmpty(); } + /** + * Determine if the given loc overlaps any loc in the sorted set + * + * @param loc the location to test + * @return + */ + public boolean overlaps(final GenomeLoc loc) { + for(final GenomeLoc e : mArray) { + if(e.overlapsP(loc)) { + return true; + } + } + return false; + } + /** * add a genomeLoc to the collection, simply inserting in order into the set * diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index e8908480c..abf74469f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -18,21 +18,25 @@ public class ActiveRegion implements HasGenomeLocation { private final ArrayList reads = new ArrayList(); private byte[] reference = null; - private final GenomeLoc loc; - private GenomeLoc referenceLoc = null; + private final GenomeLoc activeRegionLoc; + private final GenomeLoc extendedLoc; + private final int extension; + private GenomeLoc fullExtentReferenceLoc = null; private final GenomeLocParser genomeLocParser; public final boolean isActive; - public ActiveRegion( final GenomeLoc loc, final boolean isActive, final GenomeLocParser genomeLocParser ) { - this.loc = loc; + public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { + this.activeRegionLoc = activeRegionLoc; this.isActive = isActive; this.genomeLocParser = genomeLocParser; - referenceLoc = loc; + this.extension = extension; + extendedLoc = genomeLocParser.createGenomeLoc(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); + fullExtentReferenceLoc = extendedLoc; } - // add each read to the bin and extend the reference genome loc if needed + // add each read to the bin and extend the reference genome activeRegionLoc if needed public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) { - referenceLoc = referenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); + fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); reads.add( new ActiveRead(read, isPrimaryRegion) ); } @@ -41,15 +45,18 @@ public class ActiveRegion implements HasGenomeLocation { public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) { // set up the reference if we haven't done so yet if ( reference == null ) { - reference = referenceReader.getSubsequenceAt(referenceLoc.getContig(), referenceLoc.getStart(), referenceLoc.getStop()).getBases(); + reference = referenceReader.getSubsequenceAt(fullExtentReferenceLoc.getContig(), fullExtentReferenceLoc.getStart(), fullExtentReferenceLoc.getStop()).getBases(); } return reference; } - public GenomeLoc getLocation() { return loc; } - - public GenomeLoc getReferenceLocation() { return referenceLoc; } + @Override + public GenomeLoc getLocation() { return activeRegionLoc; } + public GenomeLoc getExtendedLoc() { return extendedLoc; } + public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; } + + public int getExtension() { return extension; } public int size() { return reads.size(); } } \ No newline at end of file From 9b4f6afa212f838504f002469e080ebe274ab59f Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Fri, 20 Jan 2012 23:07:59 -0500 Subject: [PATCH 28/31] Alterations to scripts for better performance. Grid search now expands the sens/spec tradeoff (90 was far too aggressive against hapmap chr20), and 20 max gaussians was too many, and caused errors. For consensus genotypes: remember to gunzip the beagle outputs before converting to VCF. Also, beagle can in fact create 'null' alleles in certain circumstances. I'm not sure what exactly those circumstances are, but those sites should be ignored. When it does, all alleles apear to be set to null, so this should not affect the actual phasing in the output VCF. --- .../sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index f827856be..4f115b46e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -241,6 +241,11 @@ public class BeagleOutputToVCFWalker extends RodWalker { String alleleA = beagleGenotypePairs.get(0); String alleleB = beagleGenotypePairs.get(1); + if ( alleleA.equals("null") || alleleB.equals("null") ) { + logger.warn("Beagle produced 'null' alleles at location "+ref.getLocus().toString()+". Ignoring."); + return 0; + } + // Beagle always produces genotype strings based on the strings we input in the likelihood file. String refString = vc_input.getReference().getDisplayString(); if (refString.length() == 0) // ref was null From 3b1aad4f1731125d67b51e6681a812f4c2a59a6d Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Fri, 20 Jan 2012 23:43:51 -0500 Subject: [PATCH 29/31] After a minor and abject freakout, alter the T2D script to seek out truth sensitivities between 80 and 100, rather than between 0.8 and 1. Also, don't consider a genotype "changed by beagle" if the initial genotype is a no-call. --- .../gatk/walkers/beagle/BeagleOutputToVCFWalker.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 4f115b46e..ec67563dc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -320,8 +320,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { og = a1+"/"+a2; // See if Beagle switched genotypes - if (!((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) || - (bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA))))){ + if (! originalAlleleA.equals(Allele.NO_CALL) && beagleSwitchedGenotypes(bglAlleleA,originalAlleleA,bglAlleleB,originalAlleleB)){ originalAttributes.put("OG",og); numGenotypesChangedByBeagle++; } @@ -364,6 +363,11 @@ public class BeagleOutputToVCFWalker extends RodWalker { return 1; } + private boolean beagleSwitchedGenotypes(Allele bglAlleleA, Allele originalAlleleA, Allele bglAlleleB, Allele originalAlleleB) { + return !((bglAlleleA.equals(originalAlleleA) && bglAlleleB.equals(originalAlleleB) || + (bglAlleleA.equals(originalAlleleB) && bglAlleleB.equals(originalAlleleA)))); + } + public Integer reduceInit() { return 0; // Nothing to do here } From 4d6312d4ea1e22f53649c4c38fd0458753effe0f Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Sun, 22 Jan 2012 14:31:01 -0500 Subject: [PATCH 30/31] HaplotypeCaller is now an ActiveRegionWalker. --- .../traversals/TraverseActiveRegions.java | 24 +++---- .../gatk/walkers/ActiveRegionWalker.java | 4 ++ .../sting/utils/activeregion/ActiveRead.java | 19 ------ .../utils/activeregion/ActiveRegion.java | 24 +++---- .../sting/utils/fragments/FragmentUtils.java | 65 +++++++++++++++++++ 5 files changed, 95 insertions(+), 41 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRead.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index b18d1ceb9..ebfcc0c29 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -46,7 +46,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine integrateActiveList( final ArrayList activeList ) { final ArrayList returnList = new ArrayList(); if( activeList.size() == 0 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 8405f762d..d7e170d73 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -37,6 +37,10 @@ public abstract class ActiveRegionWalker extends Walker reads = new ArrayList(); - private byte[] reference = null; + private final ArrayList reads = new ArrayList(); private final GenomeLoc activeRegionLoc; private final GenomeLoc extendedLoc; private final int extension; @@ -35,28 +34,31 @@ public class ActiveRegion implements HasGenomeLocation { } // add each read to the bin and extend the reference genome activeRegionLoc if needed - public void add( final GATKSAMRecord read, final boolean isPrimaryRegion ) { + public void add( final GATKSAMRecord read ) { fullExtentReferenceLoc = fullExtentReferenceLoc.union( genomeLocParser.createGenomeLoc( read ) ); - reads.add( new ActiveRead(read, isPrimaryRegion) ); + reads.add( read ); } - public ArrayList getReads() { return reads; } + public ArrayList getReads() { return reads; } public byte[] getReference( final IndexedFastaSequenceFile referenceReader ) { - // set up the reference if we haven't done so yet - if ( reference == null ) { - reference = referenceReader.getSubsequenceAt(fullExtentReferenceLoc.getContig(), fullExtentReferenceLoc.getStart(), fullExtentReferenceLoc.getStop()).getBases(); - } + return getReference( referenceReader, 0 ); + } - return reference; + public byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + return referenceReader.getSubsequenceAt( fullExtentReferenceLoc.getContig(), + Math.max(1, fullExtentReferenceLoc.getStart() - padding), + Math.min(referenceReader.getSequenceDictionary().getSequence(fullExtentReferenceLoc.getContig()).getSequenceLength(), fullExtentReferenceLoc.getStop() + padding) ).getBases(); } @Override public GenomeLoc getLocation() { return activeRegionLoc; } - public GenomeLoc getExtendedLoc() { return extendedLoc; } public GenomeLoc getReferenceLoc() { return fullExtentReferenceLoc; } public int getExtension() { return extension; } public int size() { return reads.size(); } + public void clearReads() { reads.clear(); } + public void remove( final GATKSAMRecord read ) { reads.remove( read ); } + public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index e5500ca21..68bf6dce8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -1,9 +1,15 @@ package org.broadinstitute.sting.utils.fragments; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.*; @@ -121,4 +127,63 @@ public class FragmentUtils { return create(reads, reads.size(), SamRecordGetter); } + public final static List mergeOverlappingPairedFragments( List overlappingPair ) { + final byte MIN_QUAL_BAD_OVERLAP = 16; + if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } + + GATKSAMRecord firstRead = overlappingPair.get(0); + GATKSAMRecord secondRead = overlappingPair.get(1); + if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + firstRead = overlappingPair.get(1); + secondRead = overlappingPair.get(0); + } + if( !(secondRead.getUnclippedStart() <= firstRead.getUnclippedEnd() && secondRead.getUnclippedStart() >= firstRead.getUnclippedStart() && secondRead.getUnclippedEnd() >= firstRead.getUnclippedEnd()) ) { + return overlappingPair; // can't merge them, yet: AAAAAAAAAAA-BBBBBBBBBBB-AAAAAAAAAAAAAA, B is contained entirely inside A + } + if( firstRead.getCigarString().contains("I") || firstRead.getCigarString().contains("D") || secondRead.getCigarString().contains("I") || secondRead.getCigarString().contains("D") ) { + return overlappingPair; // fragments contain indels so don't merge them + } + + final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); + + final int firstReadStop = ( pair.getSecond() ? pair.getFirst() + 1 : pair.getFirst() ); + final int numBases = firstReadStop + secondRead.getReadLength(); + final byte[] bases = new byte[numBases]; + final byte[] quals = new byte[numBases]; + final byte[] firstReadBases = firstRead.getReadBases(); + final byte[] firstReadQuals = firstRead.getBaseQualities(); + final byte[] secondReadBases = secondRead.getReadBases(); + final byte[] secondReadQuals = secondRead.getBaseQualities(); + for(int iii = 0; iii < firstReadStop; iii++) { + bases[iii] = firstReadBases[iii]; + quals[iii] = firstReadQuals[iii]; + } + for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { + if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { + return overlappingPair;// high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + } + bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] ); + quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] ); + } + for(int iii = firstRead.getReadLength(); iii < numBases; iii++) { + bases[iii] = secondReadBases[iii-firstReadStop]; + quals[iii] = secondReadQuals[iii-firstReadStop]; + } + + final GATKSAMRecord returnRead = new GATKSAMRecord(firstRead.getHeader()); + returnRead.setAlignmentStart(firstRead.getUnclippedStart()); + returnRead.setReadBases( bases ); + returnRead.setBaseQualities( quals ); + returnRead.setReadGroup( firstRead.getReadGroup() ); + returnRead.setReferenceName( firstRead.getReferenceName() ); + final CigarElement c = new CigarElement(bases.length, CigarOperator.M); + final ArrayList cList = new ArrayList(); + cList.add(c); + returnRead.setCigar( new Cigar( cList )); + returnRead.setMappingQuality( firstRead.getMappingQuality() ); + + final ArrayList returnList = new ArrayList(); + returnList.add(returnRead); + return returnList; + } } From 4a08e8ca6ecaffdd92623be3283faaa4aa881d01 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 23 Jan 2012 08:25:34 -0500 Subject: [PATCH 31/31] Minor tweaks to T2D-related qscripts. Replacing old md5s from the BeagleIntegrationTest. All differences boiled down either to the accounting of genotypes changed (./. --> 0/0 is no longer a "changed" genotype, and original genotypes that were ./. are represented as OG=. rather than OG=./. .) This is somewhat of an arbitrary decision, and is negotiable. I could see treating GT:PL ./.:. differently from GT:PL .:0,3,6 but am not sure the worth of doing so. --- .../sting/gatk/walkers/beagle/BeagleIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 1a01ef8e8..9aae1f0ae 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -41,7 +41,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:BEAGLE " + beagleValidationDataLocation + "inttestbgl.r2 " + "--beagleProbs:BEAGLE " + beagleValidationDataLocation + "inttestbgl.gprobs " + "--beaglePhased:BEAGLE " + beagleValidationDataLocation + "inttestbgl.phased " + - "-o %s -NO_HEADER", 1, Arrays.asList("b445d280fd8fee1eeb4aacb3f5a54847")); + "-o %s -NO_HEADER", 1, Arrays.asList("6d0f213918e3b9ea33bc2f8a51a462f1")); executeTest("test BeagleOutputToVCF", spec); } @@ -72,7 +72,7 @@ public class BeagleIntegrationTest extends WalkerTest { "--beagleR2:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ "--beagleProbs:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ "--beaglePhased:beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ - "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("51a57ea565176edd96d907906914b0ee")); + "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("ddbf490f1d9f37cc79fe414c8d40886f")); executeTest("testBeagleChangesSitesToRef",spec); }