From 730208133ba0fd824287694c5ee24f4e4606683b Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Fri, 27 Apr 2012 14:41:17 -0400 Subject: [PATCH 5/8] Several fixes and improvements to Pool caller with ancillary test functions (not done yet): a) Utility class called Probability Vector that holds a log-probability vector and has the ability to clip ends that deviate largely from max value. b) Used this class to hold site error model, since likelihoods of error model away from peak are so far down that it's not worth computing with them and just wastes time. c) Expand unit tests and add an exhaustive test for ErrorModel class. d) Corrected major math bug in ErrorModel uncovered by exhaustive test: log(e^x) is NOT x if log's base = 10. e) Refactored utility functions that created artificial pileups for testing into separate class ArtificialPileupTestProvider. Right now functionality is limited (one artificial contig of 10 bp; can only specify pileups in one position with a given number of matches and mismatches to ref) but functionality will be expanded in future to cover more test cases. f) Use this utility class for IndelGenotypeLikelihoods unit test and for PoolGenotypeLikelihoods unit test (the latter testing functionality still not done). g) Linearized implementation of biallelic exact model (very simple approach, similar to diploid exact model, just abort if we're past the max value of AC distribution and below a threshold). Still need to add unit tests for this and to expand to multiallelic model. 
h) Update integration test md5's due to minor differences stemming from linearized exact model and better error model math --- .../broadinstitute/sting/utils/MathUtils.java | 2 +- .../ArtificialReadPileupTestProvider.java | 203 ++++++++++++++++++ .../IndelGenotypeLikelihoodsUnitTest.java | 154 ++----------- 3 files changed, 216 insertions(+), 143 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 29d47cf3c..e8b05b525 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -243,7 +243,7 @@ public class MathUtils { double maxValue = arrayMax(log10p, finish); if(maxValue == Double.NEGATIVE_INFINITY) - return sum; + return maxValue; for (int i = start; i < finish; i++) { sum += Math.pow(10.0, log10p[i] - maxValue); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java new file mode 100644 index 000000000..1c372aa82 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; + +import java.util.*; + + +public class ArtificialReadPileupTestProvider { + final int contigStart = 1; + final int contigStop = 10; + final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigStop - contigStart + 1); +// final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic"); + final String artificialContig = "chr1"; + // final int artificialContigIndex = 0; + final String artificialReadName = "synth"; + final int artificialRefStart = 1; + final int artificialMappingQuality = 60; + Map sample2RG = new HashMap(); + List sampleRGs; + + final String refBases = "AGGATACTGT"; + List sampleNames = new ArrayList(); + private String sampleName(int i) { return sampleNames.get(i); } + private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } + public final int offset = 5; + public final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,offset,offset); + public final GenomeLoc window = 
genomeLocParser.createGenomeLoc(artificialContig,artificialRefStart,10); + public final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,this.refBases.getBytes()); + + + public ArtificialReadPileupTestProvider(int numSamples, final String SAMPLE_PREFIX) { + sampleRGs = new ArrayList(); + + for ( int i = 0; i < numSamples; i++ ) { + sampleNames.add(String.format("%s%04d", SAMPLE_PREFIX, i)); + SAMReadGroupRecord rg = createRG(sampleName(i)); + sampleRGs.add(rg); + sample2RG.put(sampleName(i), rg); + } + + } + + public List getSampleNames() { + return sampleNames; + } + public byte getRefByte() { + return refBases.substring(offset,offset+1).getBytes()[0]; + } + + public Map getAlignmentContextFromAlleles(int eventLength, String altBases, int[] numReadsPerAllele) { + // RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext); + + + ArrayList vcAlleles = new ArrayList(); + Allele refAllele, altAllele; + if (eventLength == 0) {// SNP case + refAllele =Allele.create(refBases.substring(offset,offset+1),true); + altAllele = Allele.create(altBases.substring(0,1), false); + + } else if (eventLength>0){ + // insertion + refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); + altAllele = Allele.create(altBases.substring(0,eventLength), false); + } + else { + // deletion + refAllele =Allele.create(refBases.substring(offset,offset+Math.abs(eventLength)),true); + altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + } + int stop = loc.getStart(); + vcAlleles.add(refAllele); + vcAlleles.add(altAllele); + + final VariantContextBuilder builder = new VariantContextBuilder().source(""); + builder.loc(loc.getContig(), loc.getStart(), stop); + builder.alleles(vcAlleles); + builder.referenceBaseForIndel(referenceContext.getBase()); + builder.noGenotypes(); + + final VariantContext vc = builder.make(); + + Map contexts = new HashMap(); + + for (String sample: sampleNames) { + AlignmentContext context = new 
AlignmentContext(loc, generateRBPForVariant(loc,vc, altBases, numReadsPerAllele, sample)); + contexts.put(sample,context); + + } + + return contexts; + } + + private SAMReadGroupRecord createRG(String name) { + SAMReadGroupRecord rg = new SAMReadGroupRecord(name); + rg.setPlatform("ILLUMINA"); + rg.setSample(name); + return rg; + } + private ReadBackedPileup generateRBPForVariant( GenomeLoc loc, VariantContext vc, String altBases, + int[] numReadsPerAllele, String sample) { + List pileupElements = new ArrayList(); + int readStart = contigStart; + int offset = (contigStop-contigStart+1)/2; + int refAlleleLength = 0; + int readCounter = 0; + int alleleCounter = 0; + for (Allele allele: vc.getAlleles()) { + if (allele.isReference()) + refAlleleLength = allele.getBases().length; + + int alleleLength = allele.getBases().length; + + for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) { + byte[] readBases = trueHaplotype(allele, offset, refAlleleLength); + byte[] readQuals = new byte[readBases.length]; + Arrays.fill(readQuals, (byte) 50); + + GATKSAMRecord read = new GATKSAMRecord(header); + read.setBaseQualities(readQuals); + read.setReadBases(readBases); + read.setReadName(artificialReadName+readCounter++); + + boolean isBeforeDeletion = false, isBeforeInsertion = false; + if (allele.isReference()) + read.setCigarString(readBases.length + "M"); + else { + isBeforeDeletion = alleleLengthrefAlleleLength; + if (isBeforeDeletion || isBeforeInsertion) + read.setCigarString(offset+"M"+ alleleLength + (isBeforeDeletion?"D":"I") + + (readBases.length-offset)+"M"); + else // SNP case + read.setCigarString(readBases.length+"M"); + } + + int eventLength = (isBeforeDeletion?refAlleleLength:(isBeforeInsertion?alleleLength:0)); + read.setReadPairedFlag(false); + read.setAlignmentStart(readStart); + read.setMappingQuality(artificialMappingQuality); + read.setReferenceName(loc.getContig()); + read.setReadNegativeStrandFlag(false); + read.setAttribute("RG", 
sampleRG(sample).getReadGroupId()); + + + pileupElements.add(new PileupElement(read,offset,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength),eventLength)); + } + alleleCounter++; + } + + return new ReadBackedPileupImpl(loc,pileupElements); + } + + private byte[] trueHaplotype(Allele allele, int offset, int refAlleleLength) { + // create haplotype based on a particular allele + String prefix = refBases.substring(offset); + String alleleBases = new String(allele.getBases()); + String postfix = refBases.substring(offset+refAlleleLength,refBases.length()); + + return (prefix+alleleBases+postfix).getBytes(); + + + + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java index 5c75a9b29..e4c3b8dae 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsUnitTest.java @@ -45,51 +45,23 @@ import org.testng.annotations.Test; */ public class IndelGenotypeLikelihoodsUnitTest extends BaseTest { - final int contigStart = 1; - final int contigStop = 10; - final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, contigStop-contigStart+1); - final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic"); - final String artificialContig = "chr1"; - final int artificialContigIndex = 0; - final String artificialReadName = "synth"; - final int artificialRefStart = 1; - final int artificialMappingQuality = 60; - Map sample2RG = new HashMap(); - final String refBases = "AGGATACTGT"; - final String SAMPLE_PREFIX = "sample"; - - List sampleNames = new ArrayList(); final int nSamples = 1; - final int numReadsPerAllele = 10; - - List sampleRGs; - - private String sampleName(int 
i) { return sampleNames.get(i); } - private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } + final int[] numReadsPerAllele = new int[]{10,10}; + final String SAMPLE_PREFIX = "sample"; final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); final Logger logger = Logger.getLogger(Walker.class); - final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); final IndelGenotypeLikelihoodsCalculationModel model = new IndelGenotypeLikelihoodsCalculationModel(UAC,logger); - final int offset = 5; - final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,offset,offset); - final GenomeLoc window = genomeLocParser.createGenomeLoc(artificialContig,artificialRefStart,10); - final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,this.refBases.getBytes()); - @BeforeSuite + ArtificialReadPileupTestProvider pileupProvider; + + @BeforeSuite public void before() { - sampleRGs = new ArrayList(); - - for ( int i = 0; i < nSamples; i++ ) { - sampleNames.add(String.format("%s%04d", SAMPLE_PREFIX, i)); - SAMReadGroupRecord rg = createRG(sampleName(i)); - sampleRGs.add(rg); - sample2RG.put(sampleName(i), rg); - } - + pileupProvider = new ArtificialReadPileupTestProvider(nSamples, SAMPLE_PREFIX); } + @Test public void testBasicConsensusCounts() { // 4 inserted bases, min cnt = 10 @@ -107,7 +79,7 @@ public class IndelGenotypeLikelihoodsUnitTest extends BaseTest { eventLength = 3; alleles = getConsensusAlleles(eventLength,false,10,0.1, altBases); Assert.assertEquals(alleles.size(),2); - Assert.assertEquals(alleles.get(0).getBaseString(), refBases.substring(offset,offset+eventLength)); + Assert.assertEquals(alleles.get(0).getBaseString(), refBases.substring(pileupProvider.offset,pileupProvider.offset+eventLength)); // same with min Reads = 11 alleles = getConsensusAlleles(eventLength,false,11,0.1, altBases); @@ -121,112 +93,10 @@ public class 
IndelGenotypeLikelihoodsUnitTest extends BaseTest { } private List getConsensusAlleles(int eventLength, boolean isInsertion, int minCnt, double minFraction, String altBases) { - final ConsensusAlleleCounter counter = new ConsensusAlleleCounter(genomeLocParser, true, minCnt, minFraction); - return counter.computeConsensusAlleles(referenceContext,getContextFromAlleles(eventLength, isInsertion, altBases), AlignmentContextUtils.ReadOrientation.COMPLETE); - - } - private Map getContextFromAlleles(int eventLength, boolean isInsertion, String altBases) { - // RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext); - - - ArrayList vcAlleles = new ArrayList(); - Allele refAllele, altAllele; - if (isInsertion) { - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(altBases.substring(0,eventLength), false); - } - else { - refAllele =Allele.create(refBases.substring(offset,offset+eventLength),true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); - } - - int stop = loc.getStart(); - vcAlleles.add(refAllele); - vcAlleles.add(altAllele); - - final VariantContextBuilder builder = new VariantContextBuilder().source(""); - builder.loc(loc.getContig(), loc.getStart(), stop); - builder.alleles(vcAlleles); - builder.referenceBaseForIndel(referenceContext.getBase()); - builder.noGenotypes(); - - final VariantContext vc = builder.make(); - - Map contexts = new HashMap(); - - for (String sample: sampleNames) { - AlignmentContext context = new AlignmentContext(loc, generateRBPForVariant(loc,vc, altBases, numReadsPerAllele, sample)); - contexts.put(sample,context); - - } - - return contexts; - } - - private SAMReadGroupRecord createRG(String name) { - SAMReadGroupRecord rg = new SAMReadGroupRecord(name); - rg.setPlatform("ILLUMINA"); - rg.setSample(name); - return rg; - } - private ReadBackedPileup generateRBPForVariant( GenomeLoc loc, VariantContext vc, String altBases, - int numReads, String sample) { - List 
pileupElements = new ArrayList(); - int readStart = contigStart; - int offset = (contigStop-contigStart+1)/2; - int refAlleleLength = 0; - int readCounter = 0; - for (Allele allele: vc.getAlleles()) { - if (allele.isReference()) - refAlleleLength = allele.getBases().length; - - int alleleLength = allele.getBases().length; - - for ( int d = 0; d < numReads; d++ ) { - byte[] readBases = trueHaplotype(allele, offset, refAlleleLength); - byte[] readQuals = new byte[readBases.length]; - Arrays.fill(readQuals,(byte)50); - - GATKSAMRecord read = new GATKSAMRecord(header); - read.setBaseQualities(readQuals); - read.setReadBases(readBases); - read.setReadName(artificialReadName+readCounter++); - - boolean isBeforeDeletion = false, isBeforeInsertion = false; - if (allele.isReference()) - read.setCigarString(readBases.length + "M"); - else { - isBeforeDeletion = alleleLengthrefAlleleLength; - read.setCigarString(offset+"M"+ alleleLength + (isBeforeDeletion?"D":"I") + - (readBases.length-offset)+"M"); - } - - int eventLength = (isBeforeDeletion?refAlleleLength:(isBeforeInsertion?alleleLength:0)); - read.setReadPairedFlag(false); - read.setAlignmentStart(readStart); - read.setMappingQuality(artificialMappingQuality); - read.setReferenceName(loc.getContig()); - read.setReadNegativeStrandFlag(false); - read.setAttribute("RG", sampleRG(sample).getReadGroupId()); - - - pileupElements.add(new PileupElement(read,offset,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength),eventLength)); - } - } - - return new ReadBackedPileupImpl(loc,pileupElements); - } - - byte[] trueHaplotype(Allele allele, int offset, int refAlleleLength) { - // create haplotype based on a particular allele - String prefix = refBases.substring(offset); - String alleleBases = new String(allele.getBases()); - String postfix = refBases.substring(offset+refAlleleLength,refBases.length()); - - return (prefix+alleleBases+postfix).getBytes(); - - + final ConsensusAlleleCounter 
counter = new ConsensusAlleleCounter(pileupProvider.genomeLocParser, true, minCnt, minFraction); + return counter.computeConsensusAlleles(pileupProvider.referenceContext, + pileupProvider.getAlignmentContextFromAlleles(isInsertion?eventLength:-eventLength,altBases,numReadsPerAllele), + AlignmentContextUtils.ReadOrientation.COMPLETE); } } From 08dbd756f3c42430e12b0472500dd2494941578e Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 25 Apr 2012 17:09:36 -0400 Subject: [PATCH 6/8] Quick QC walkers to look at the error profile of indels in the read --- .../walkers/qc/CountReadEventsWalker.java | 94 +++++++++++++++++++ .../walkers/qc/CountTerminusEventWalker.java | 70 ++++++++++++++ .../sting/utils/sam/ReadUtils.java | 39 ++++++++ 3 files changed, 203 insertions(+) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java new file mode 100755 index 000000000..c5ab0426d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEventsWalker.java @@ -0,0 +1,94 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; + +import java.io.PrintStream; +import 
java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +/** + * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of reads ending in each category. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ReadEndIndels \
+ *   -o output.grp \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ */ + + +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountReadEventsWalker extends ReadWalker> , Map>> { + @Output (doc = "GATKReport table output") + PrintStream out; + + public Map> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + return ReadUtils.getCigarOperatorForAllBases(read); + } + + public Map> reduceInit() { + return new HashMap>(); + } + + public Map> reduce(Map> value, Map> sum) { + for (Map.Entry> entry : value.entrySet()) { + CigarOperator op = entry.getKey(); + ArrayList positions = entry.getValue(); + + for (int p : positions) { + Map operatorCount = sum.get(p); + if (operatorCount == null) { + operatorCount = new HashMap(); + sum.put(p, operatorCount); + } + + Long count = operatorCount.get(op); + if (count == null) + count = 0L; + count++; + operatorCount.put(op, count); + } + } + return sum; + } + + @Override + public void onTraversalDone(Map> result) { + GATKReport report = GATKReport.newSimpleReport("Events", "Position", "Event", "Observations"); + for (Map.Entry> entry : result.entrySet()) { + int position = entry.getKey(); + Map operatorCount = entry.getValue(); + + for (Map.Entry subEntry: operatorCount.entrySet()) { + String operator = subEntry.getKey().name(); + Long observations = subEntry.getValue(); + report.addRow(position, operator, observations); + } + } + report.print(out); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java new file mode 100755 index 000000000..9208cbae8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEventWalker.java @@ -0,0 +1,70 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import 
org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.List; + +/** + * Walks over the input data set, counting the number of reads ending in insertions/deletions or soft-clips + * + *

Input

+ *

+ * One or more BAM files. + *

+ * + *

Output

+ *

+ * Number of reads ending in each category. + *

+ * + *

Examples

+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ *   -R ref.fasta \
+ *   -T ReadEndIndels \
+ *   -o output.txt \
+ *   -I input.bam \
+ *   [-L input.intervals]
+ * 
+ */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountTerminusEventWalker extends ReadWalker, Pair> { + public Pair map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + List cigarElements = read.getCigar().getCigarElements(); + + CigarElement lastElement = null; + for (CigarElement element : cigarElements) { + if (element.getOperator() != CigarOperator.HARD_CLIP) + lastElement = element; + } + + if (lastElement == null) + throw new UserException.MalformedBAM(read, "read does not have any bases, it's all hard clips"); + + long endsInIndel = lastElement.getOperator() == CigarOperator.INSERTION || lastElement.getOperator() == CigarOperator.DELETION? 1 : 0; + long endsInSC = lastElement.getOperator() == CigarOperator.SOFT_CLIP ? 1 : 0; + + return new Pair(endsInIndel, endsInSC); + } + + public Pair reduceInit() { return new Pair(0L, 0L); } + + public Pair reduce(Pair value, Pair sum) { + sum.set(sum.getFirst() + value.getFirst(), sum.getSecond() + value.getSecond()); + return sum; + } + + @Override + public void onTraversalDone(Pair result) { + System.out.println(String.format("\tReads ending in indels : %d\n\tReads ending in soft-clips: %d\n", result.getFirst(), result.getSecond())); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index c2f7117f8..4e2fd1446 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -783,4 +783,43 @@ public class ReadUtils { return location; } + /** + * Creates a map with each event in the read (cigar operator) and the read coordinate where it happened. + * + * Example: + * D -> 2, 34, 75 + * I -> 55 + * S -> 0, 101 + * H -> 101 + * + * @param read the read + * @return a map with the properties described above. 
See example + */ + public static Map> getCigarOperatorForAllBases (GATKSAMRecord read) { + Map> events = new HashMap>(); + + int position = 0; + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + CigarOperator op = cigarElement.getOperator(); + if (op.consumesReadBases()) { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + for (int i = position; i < cigarElement.getLength(); i++) + list.add(position++); + } + else { + ArrayList list = events.get(op); + if (list == null) { + list = new ArrayList(); + events.put(op, list); + } + list.add(position); + } + } + return events; + } + } From 1db2d1ba82dc84ec8e5c435ddeb89a6ca7af795f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 26 Apr 2012 13:32:38 -0400 Subject: [PATCH 7/8] Do not add the first and last 4 cycles to the recalibration tables. --- .../sting/gatk/walkers/bqsr/CycleCovariate.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java index 54a90a959..50e9b0447 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java @@ -78,8 +78,10 @@ public class CycleCovariate implements StandardCovariate { increment = readOrderFactor; } - for (int i = 0; i < read.getReadLength(); i++) { - cycles[i] = BitSetUtils.bitSetFrom(cycle); + final int CUSHION = 4; + final int MAX_CYCLE = read.getReadLength() - CUSHION - 1; + for (int i = 0; i < MAX_CYCLE; i++) { + cycles[i] = (iMAX_CYCLE) ? null : BitSetUtils.bitSetFrom(cycle); cycle += increment; } }