From 5b9a1af7febf01463a409e15c8d9844b26e3ce4f Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 30 Jul 2012 09:56:10 -0400 Subject: [PATCH 01/43] Intermediate fix for pool GL unit test: fix up artificial read pileup provider to give consistent data. b) Increase downsampling in pool integration tests with reference sample, and shorten MT tests so they don't last too long --- .../gatk/walkers/genotyper/ErrorModel.java | 11 ++-- .../genotyper/PoolSNPGenotypeLikelihoods.java | 8 ++- .../genotyper/PoolCallerIntegrationTest.java | 20 +++---- .../PoolGenotypeLikelihoodsUnitTest.java | 21 +++++-- .../ArtificialReadPileupTestProvider.java | 56 +++++++++++-------- 5 files changed, 70 insertions(+), 46 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index 864414de9..8e4ca9595 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -68,10 +68,10 @@ public class ErrorModel { break; } } + haplotypeMap = new LinkedHashMap(); if (refSampleVC.isIndel()) { pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - haplotypeMap = new LinkedHashMap(); indelLikelihoodMap = new HashMap>(); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements } @@ -96,7 +96,8 @@ public class ErrorModel { final int readCounts[] = new int[refSamplePileup.getNumberOfElements()]; //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()]; final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles()); - perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts); + if (!haplotypeMap.isEmpty()) + perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts); } int idx = 0; for (PileupElement refPileupElement : refSamplePileup) { @@ -108,7 +109,7 @@ public class ErrorModel { if (DEBUG) System.out.println(m); isMatch |= m; } - if (refSampleVC.isIndel()) { + if (refSampleVC.isIndel() && !haplotypeMap.isEmpty()) { // ignore match/mismatch if reads, as determined by their likelihood, are not informative double[] perAlleleLikelihoods = perReadLikelihoods[idx++]; if (!isInformativeElement(perAlleleLikelihoods)) @@ -173,10 +174,10 @@ public class ErrorModel { // if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch if (allele.isReference()) { // for a ref allele, any base mismatch or new indel is a mismatch. - if(allele.getBases().length>0 ) + if(allele.getBases().length>0) // todo - can't check vs. allele because allele is not padded so it doesn't include the reference base at this location // could clean up/simplify this when unpadding is removed - return (pileupElement.getBase() == refBase); + return (pileupElement.getBase() == refBase && !pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart()); else // either null allele to compare, or ref/alt lengths are different (indel by definition). // if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java index 1e445270f..f763392ae 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java @@ -5,6 +5,7 @@ import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -48,7 +49,12 @@ public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implem myAlleles = new ArrayList(alleles); - refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref! + Allele refAllele = alleles.get(0); + //sanity check: by construction, first allele should ALWAYS be the reference alleles + if (!refAllele.isReference()) + throw new ReviewedStingException("BUG: First allele in list passed to PoolSNPGenotypeLikelihoods should be reference!"); + + refByte = refAllele.getBases()[0]; // by construction, first allele in list is always ref! if (myAlleles.size() < BaseUtils.BASES.length) { // likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones, diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java index c6b1a8b7f..a2b478b0e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java @@ -18,27 +18,27 @@ public class PoolCallerIntegrationTest extends WalkerTest { final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam"; final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf"; final String REFSAMPLE_NAME = "NA12878"; - final String MTINTERVALS = "MT"; + final String MTINTERVALS = "MT:1-3000"; final String LSVINTERVALS = "20:40,000,000-41,000,000"; final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf"; final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf"; final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf"; private void PC_MT_Test(String bam, String args, String name, String md5) { - final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL", + final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL", REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } private void PC_LSV_Test(String args, String name, String model, String md5) { - final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL", + final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL", REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) { - final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL", + final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL", REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); @@ -46,33 +46,33 @@ public class PoolCallerIntegrationTest extends WalkerTest { @Test public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","36b8db57f65be1cc3d2d9d7f9f3f26e4"); + PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","d8cba4ec4267d7d766081fcead845d08"); } @Test public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","d1339990291648495bfcf4404f051478"); + PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","8e9b7e89c439b430e95b146a7540c72e"); } @Test public void testINDEL_maxAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","POOLINDEL","b66e7150603310fd57ee7bf9fc590706"); + PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","POOLINDEL","96087fe9240e3656cc2a4e0ff0174d5b"); } @Test public void testINDEL_maxAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","POOLINDEL","ccdae3fc4d2c922f956a186aaad51c29"); + PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","POOLINDEL","6fdae7093831ecfc82a06dd707d62fe9"); } @Test public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","fa5ee7c957c473a80f3a7f3c35dc80b5"); + PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","6b27634214530d379db70391a9cfc2d7"); } @Test public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "6907c8617d49bb57b33f8704ce7f0323"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "e74d4c73ece45d7fb676b99364df4f1a"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java index c97c4ed28..9a785acda 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java @@ -392,8 +392,6 @@ public class PoolGenotypeLikelihoodsUnitTest { final byte refByte = readPileupTestProvider.getRefByte(); final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T'; - final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte); - final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte); final List allAlleles = new ArrayList(); // this contains only ref Allele up to now final Set laneIDs = new TreeSet(); @@ -411,17 +409,28 @@ public class PoolGenotypeLikelihoodsUnitTest { for (String laneID : laneIDs) noisyErrorModels.put(laneID, Q30ErrorModel); + // all first ref allele + allAlleles.add(Allele.create(refByte,true)); for (byte b: BaseUtils.BASES) { - if (refByte == b) - allAlleles.add(Allele.create(b,true)); - else + if (refByte != b) allAlleles.add(Allele.create(b, false)); } + final int refIdx = 0; + int altIdx = -1; + + for (int k=0; k < allAlleles.size(); k++) + if (altByte == allAlleles.get(k).getBases()[0]) { + altIdx = k; + break; + } + + + PrintStream out = null; if (SIMULATE_NOISY_PILEUP) { try { - out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table")); + out = new PrintStream(new File("GLUnitTest.table")); // out = new PrintStream(new File("/Users/delangel/GATK/Sting_unstable/GLUnitTest.table")); } catch (Exception e) {} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java index 77769a5fe..8011ea66b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java @@ -62,9 +62,9 @@ public class ArtificialReadPileupTestProvider { List sampleNames = new ArrayList(); private String sampleName(int i) { return sampleNames.get(i); } private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } - public final int offset = 5; + public final int locStart = 5; // 1-based public final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,offset,offset); + public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,locStart,locStart); //1-based public final GenomeLoc window = genomeLocParser.createGenomeLoc(artificialContig,artificialRefStart,10); public final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,this.refBases.getBytes()); @@ -103,22 +103,22 @@ public class ArtificialReadPileupTestProvider { boolean addBaseErrors, int phredScaledBaseErrorRate) { // RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext); - + String refStr = new String(new byte[]{referenceContext.getBase()}); ArrayList vcAlleles = new ArrayList(); Allele refAllele, altAllele; if (eventLength == 0) {// SNP case - refAllele =Allele.create(referenceContext.getBase(),true); + refAllele =Allele.create(refStr,true); altAllele = Allele.create(altBases.substring(0,1), false); } else if (eventLength>0){ // insertion - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(altBases.substring(0,eventLength), false); + refAllele = Allele.create(refStr, true); + altAllele = Allele.create(refStr+altBases.substring(0,eventLength), false); } else { // deletion - refAllele =Allele.create(refBases.substring(offset,offset+Math.abs(eventLength)),true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + refAllele =Allele.create(refBases.substring(locStart-1,locStart+Math.abs(eventLength)-1),true); + altAllele = Allele.create(refBases.substring(locStart-1,locStart), false); } int stop = loc.getStart(); vcAlleles.add(refAllele); @@ -153,18 +153,15 @@ public class ArtificialReadPileupTestProvider { int[] numReadsPerAllele, String sample, boolean addErrors, int phredScaledErrorRate) { List pileupElements = new ArrayList(); int readStart = contigStart; - int offset = (contigStop-contigStart+1)/2; - int refAlleleLength = 0; + + int refAlleleLength = vc.getReference().getBases().length; int readCounter = 0; int alleleCounter = 0; for (Allele allele: vc.getAlleles()) { - if (allele.isReference()) - refAlleleLength = allele.getBases().length; - int alleleLength = allele.getBases().length; for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) { - byte[] readBases = trueHaplotype(allele, offset, refAlleleLength); + byte[] readBases = trueHaplotype(allele, locStart, vc.getReference()); if (addErrors) addBaseErrors(readBases, phredScaledErrorRate); @@ -176,20 +173,20 @@ public class ArtificialReadPileupTestProvider { read.setReadBases(readBases); read.setReadName(artificialReadName+readCounter++); - boolean isBeforeDeletion = false, isBeforeInsertion = false; + boolean isBeforeDeletion = alleleLengthrefAlleleLength; + + int eventLength = alleleLength - refAlleleLength; if (allele.isReference()) read.setCigarString(readBases.length + "M"); else { - isBeforeDeletion = alleleLengthrefAlleleLength; if (isBeforeDeletion || isBeforeInsertion) - read.setCigarString(offset+"M"+ alleleLength + (isBeforeDeletion?"D":"I") + - (readBases.length-offset)+"M"); + read.setCigarString(locStart+"M"+ eventLength + (isBeforeDeletion?"D":"I") + + (readBases.length-locStart)+"M"); else // SNP case read.setCigarString(readBases.length+"M"); } - int eventLength = (isBeforeDeletion?refAlleleLength:(isBeforeInsertion?alleleLength:0)); read.setReadPairedFlag(false); read.setAlignmentStart(readStart); read.setMappingQuality(artificialMappingQuality); @@ -198,7 +195,7 @@ public class ArtificialReadPileupTestProvider { read.setAttribute("RG", sampleRG(sample).getReadGroupId()); - pileupElements.add(new PileupElement(read,offset,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength),eventLength)); + pileupElements.add(new PileupElement(read,locStart-1,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength-1),eventLength)); } alleleCounter++; } @@ -206,11 +203,22 @@ public class ArtificialReadPileupTestProvider { return new ReadBackedPileupImpl(loc,pileupElements); } - private byte[] trueHaplotype(Allele allele, int offset, int refAlleleLength) { + /** + * create haplotype based on a particular allele + * @param allele Allele of interest. ASSUMED TO INCLUDE REF BASE AT startPosition! + * @param startPosition 1-based start position of allele + * @param refAllele REF allele + * @return + */ + private byte[] trueHaplotype(Allele allele, int startPosition, Allele refAllele) { + // create haplotype based on a particular allele - String prefix = refBases.substring(offset); + // startPosition is 1-based. + // so, if startPosition == 5, we need to include positions 1 to 4 , or indeces 0 to 3 of string + String prefix = refBases.substring(0,startPosition-1); String alleleBases = new String(allele.getBases()); - String postfix = refBases.substring(offset+refAlleleLength,refBases.length()); + // where to start postfix? We have (startPosition-1) prefix bases + refAllele.length bases before postfix + String postfix = refBases.substring(startPosition -1 + refAllele.getBases().length,refBases.length()); return (prefix+alleleBases+postfix).getBytes(); From 874dbf5b58669dfe2d0b22aa954598ca89a77b41 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 26 Jul 2012 09:46:56 -0400 Subject: [PATCH 02/43] Maximum wait for GATK run report upload reduced to 10 seconds --- .../org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index f190cbcfd..a42c73212 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -86,7 +86,7 @@ public class GATKRunReport { private static File REPORT_SENTINEL = new File(REPORT_DIR.getAbsolutePath() + "/ENABLE"); // number of milliseconds before the S3 put operation is timed-out: - private static final long S3PutTimeOut = 30 * 1000; + private static final long S3PutTimeOut = 10 * 1000; /** From 8db4e787b1f6ac57d15378893240e2ca203d369d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 26 Jul 2012 13:56:06 -0400 Subject: [PATCH 03/43] V1 of tool to visualize the quality score information in the context covariates -- Upgraded jgrapht to latest version (0.8.3) --- ivy.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ivy.xml b/ivy.xml index b197d0714..6dd5be7a8 100644 --- a/ivy.xml +++ b/ivy.xml @@ -52,7 +52,7 @@ - + From 06703162882253b63a5a3b23cbc681ae8e5fa97d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 26 Jul 2012 16:51:17 -0400 Subject: [PATCH 04/43] Be clearer that dcov 50 is good for 4x, should use 200 for >30x --- .../sting/gatk/walkers/genotyper/UnifiedGenotyper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index cd1815d82..6b279fd95 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -82,7 +82,7 @@ import java.util.*; * -o snps.raw.vcf \ * -stand_call_conf [50.0] \ * -stand_emit_conf 10.0 \ - * -dcov [50] \ + * -dcov [50 for 4x, 200 for >30x WGS or Whole exome] \ * [-L targets.interval_list] * * From 191294eedc9ddb7033f1e3fac304ef5b2bb6eee5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 27 Jul 2012 09:39:16 -0400 Subject: [PATCH 05/43] Initial cleanup of RecalDatum for move and further refactoring -- Moved Datum, the now unnecessary superclass, into RecalDatum -- Fixed some obviously dangerous synchronization errors in RecalDatum, though these may not have caused problems because they may not have been called in parallel mode --- .../sting/gatk/walkers/bqsr/Datum.java | 109 ------------------ .../sting/gatk/walkers/bqsr/RecalDatum.java | 72 +++++++++--- 2 files changed, 54 insertions(+), 127 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java deleted file mode 100644 index d7e8e16b5..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Datum.java +++ /dev/null @@ -1,109 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -import org.broadinstitute.sting.utils.QualityUtils; - -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: Jan 6, 2010 - * - * An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed. - * Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. - */ - -public class Datum { - - long numObservations; // number of bases seen in total - long numMismatches; // number of bases seen that didn't match the reference - - private static final int SMOOTHING_CONSTANT = 1; // used when calculating empirical qualities to avoid division by zero - - //--------------------------------------------------------------------------------------------------------------- - // - // constructors - // - //--------------------------------------------------------------------------------------------------------------- - - public Datum() { - numObservations = 0L; - numMismatches = 0L; - } - - public Datum(long numObservations, long numMismatches) { - this.numObservations = numObservations; - this.numMismatches = numMismatches; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // increment methods - // - //--------------------------------------------------------------------------------------------------------------- - - synchronized void increment(final long incObservations, final long incMismatches) { - numObservations += incObservations; - numMismatches += incMismatches; - } - - synchronized void increment(final boolean isError) { - numObservations++; - numMismatches += isError ? 1:0; - } - - //--------------------------------------------------------------------------------------------------------------- - // - // methods to derive empirical quality score - // - //--------------------------------------------------------------------------------------------------------------- - - double empiricalQualDouble() { - final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); - final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); // smoothing is one error and one non-error observation, for example - final double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - return Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); - } - - byte empiricalQualByte() { - final double doubleMismatches = (double) (numMismatches); - final double doubleObservations = (double) (numObservations); - return QualityUtils.probToQual(1.0 - doubleMismatches / doubleObservations); // This is capped at Q40 - } - - @Override - public String toString() { - return String.format("%d,%d,%d", numObservations, numMismatches, (int) empiricalQualByte()); - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof Datum)) - return false; - Datum other = (Datum) o; - return numMismatches == other.numMismatches && numObservations == other.numObservations; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java index 9b00b1876..ed4e769b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java @@ -33,20 +33,40 @@ import org.broadinstitute.sting.utils.QualityUtils; import java.util.Random; /** + * An individual piece of recalibration data. Each bin counts up the number of observations and the number + * of reference mismatches seen for that combination of covariates. + * * Created by IntelliJ IDEA. * User: rpoplin * Date: Nov 3, 2009 - * - * An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates. */ - -public class RecalDatum extends Datum { - +public class RecalDatum { private static final double UNINITIALIZED = -1.0; - private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations - private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) + /** + * estimated reported quality score based on combined data's individual q-reporteds and number of observations + */ + private double estimatedQReported; + /** + * the empirical quality for datums that have been collapsed together (by read group and reported quality, for example) + */ + private double empiricalQuality; + + /** + * number of bases seen in total + */ + long numObservations; + + /** + * number of bases seen that didn't match the reference + */ + long numMismatches; + + /** + * used when calculating empirical qualities to avoid division by zero + */ + private static final int SMOOTHING_CONSTANT = 1; //--------------------------------------------------------------------------------------------------------------- // @@ -68,26 +88,24 @@ public class RecalDatum extends Datum { this.empiricalQuality = copy.empiricalQuality; } - public void combine(final RecalDatum other) { + public synchronized void combine(final RecalDatum other) { final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); increment(other.numObservations, other.numMismatches); estimatedQReported = -10 * Math.log10(sumErrors / this.numObservations); empiricalQuality = UNINITIALIZED; } - @Override - public void increment(final boolean isError) { - super.increment(isError); - empiricalQuality = UNINITIALIZED; - } - @Requires("empiricalQuality == UNINITIALIZED") @Ensures("empiricalQuality != UNINITIALIZED") - protected final void calcEmpiricalQuality() { - empiricalQuality = empiricalQualDouble(); // cache the value so we don't call log over and over again + private synchronized final void calcEmpiricalQuality() { + // cache the value so we don't call log over and over again + final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); + final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT); + final double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } - public void setEstimatedQReported(final double estimatedQReported) { + public synchronized void setEstimatedQReported(final double estimatedQReported) { this.estimatedQReported = estimatedQReported; } @@ -95,7 +113,7 @@ public class RecalDatum extends Datum { return estimatedQReported; } - public void setEmpiricalQuality(final double empiricalQuality) { + public synchronized void setEmpiricalQuality(final double empiricalQuality) { this.empiricalQuality = empiricalQuality; } @@ -145,4 +163,22 @@ public class RecalDatum extends Datum { return super.equals(o) && MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0; } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + + synchronized void increment(final long incObservations, final long incMismatches) { + numObservations += incObservations; + numMismatches += incMismatches; + empiricalQuality = UNINITIALIZED; + } + + synchronized void increment(final boolean isError) { + numObservations++; + numMismatches += isError ? 1:0; + empiricalQuality = UNINITIALIZED; + } } From e00ed8bc5e2e2c9bf7376c4926fe1359de1a738c Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 27 Jul 2012 12:25:06 -0400 Subject: [PATCH 06/43] Cleanup BQSR classes -- Moved most of BQSR classes (which are used throughout the codebase) to utils.recalibration. It's better in my opinion to keep commonly used code in utils, and only specialized code in walkers. As code becomes embedded throughout GATK its should be refactored to live in utils -- Removed unncessary imports of BQSR in VQSR v3 -- Now ready to refactor QualQuantizer and unit test into a subclass of RecalDatum, refactor unit tests into RecalDatum unit tests, and generalize into hierarchical recal datum that can be used in QualQuantizer and the analysis of adaptive context covariate -- Update PluginManager to sort the plugins and interfaces. This allows us to have a deterministic order in which the plugin classes come back, which caused BQSR integration tests to temporarily change because I moved my classes around a bit. --- .../bqsr/AdvancedRecalibrationEngine.java | 4 + .../reducereads/SyntheticRead.java | 2 +- .../sting/gatk/walkers/bqsr/BQSRGatherer.java | 6 +- .../gatk/walkers/bqsr/BaseRecalibrator.java | 26 +- .../bqsr/RecalibrationArgumentCollection.java | 41 ++-- .../walkers/bqsr/RecalibrationEngine.java | 1 + .../bqsr/StandardRecalibrationEngine.java | 4 + .../utils/classloader/PluginManager.java | 22 ++ .../sting/utils/clipping/ClippingOp.java | 2 +- .../sting/utils/clipping/ReadClipper.java | 2 +- .../sting/utils/fragments/FragmentUtils.java | 2 +- .../recalibration/BaseRecalibration.java | 4 +- .../recalibration}/EventType.java | 2 +- .../recalibration}/QuantizationInfo.java | 20 +- .../recalibration}/ReadCovariates.java | 2 +- .../recalibration}/RecalDatum.java | 60 +++-- .../recalibration/RecalUtils.java} | 226 +++++++++--------- .../recalibration}/RecalibrationReport.java | 63 ++--- .../recalibration/RecalibrationTables.java | 4 +- .../covariates}/BinaryTagCovariate.java | 4 +- .../covariates}/ContextCovariate.java | 4 +- .../recalibration/covariates}/Covariate.java | 9 +- .../covariates}/CycleCovariate.java | 4 +- .../covariates/ExperimentalCovariate.java | 30 +++ .../covariates}/QualityScoreCovariate.java | 4 +- .../covariates}/ReadGroupCovariate.java | 4 +- .../covariates/RequiredCovariate.java | 30 +++ .../covariates/StandardCovariate.java | 30 +++ .../sting/utils/sam/GATKSAMRecord.java | 2 +- .../walkers/bqsr/BQSRGathererUnitTest.java | 13 +- .../ContextCovariateUnitTest.java | 5 +- .../CycleCovariateUnitTest.java | 4 +- .../ReadCovariatesUnitTest.java | 6 +- .../ReadGroupCovariateUnitTest.java | 4 +- .../RecalibrationReportUnitTest.java | 7 +- 35 files changed, 413 insertions(+), 240 deletions(-) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/EventType.java (96%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/QuantizationInfo.java (78%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/ReadCovariates.java (97%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/RecalDatum.java (77%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr/RecalDataManager.java => utils/recalibration/RecalUtils.java} (96%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/RecalibrationReport.java (82%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration/covariates}/BinaryTagCovariate.java (89%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration/covariates}/ContextCovariate.java (98%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration/covariates}/Covariate.java (94%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration/covariates}/CycleCovariate.java (97%) create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration/covariates}/QualityScoreCovariate.java (92%) rename public/java/src/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration/covariates}/ReadGroupCovariate.java (94%) create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java rename public/java/test/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/ContextCovariateUnitTest.java (89%) rename public/java/test/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/CycleCovariateUnitTest.java (90%) rename public/java/test/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/ReadCovariatesUnitTest.java (92%) rename public/java/test/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/ReadGroupCovariateUnitTest.java (88%) rename public/java/test/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/RecalibrationReportUnitTest.java (95%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index 9eca81852..d714ca185 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -25,10 +25,14 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. */ +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java index 9ee1a4634..3b9fc0390 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java @@ -5,7 +5,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index 122958ac2..a6d82d5b3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -28,6 +28,8 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.commandline.Gatherer; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.RecalUtils; +import org.broadinstitute.sting.utils.recalibration.RecalibrationReport; import java.io.File; import java.io.FileNotFoundException; @@ -71,11 +73,11 @@ public class BQSRGatherer extends Gatherer { if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) { final File recal_out = new File(output.getName() + ".original"); final RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); - RecalDataManager.generateRecalibrationPlot(recal_out, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(recal_out, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); } else if (!RAC.NO_PLOTS) { final File recal_out = new File(output.getName() + ".recal"); - RecalDataManager.generateRecalibrationPlot(recal_out, generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(recal_out, generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); } generalReport.output(outputFile); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index b2400a49d..e95a0e6c1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; import org.broadinstitute.sting.utils.collections.Pair; @@ -41,6 +42,9 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.recalibration.QuantizationInfo; +import org.broadinstitute.sting.utils.recalibration.RecalUtils; +import org.broadinstitute.sting.utils.recalibration.RecalibrationReport; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -109,7 +113,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed @ArgumentCollection private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates - private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization + private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization private RecalibrationTables recalibrationTables; @@ -143,12 +147,12 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION); if (RAC.LIST_ONLY) { - RecalDataManager.listAvailableCovariates(logger); + RecalUtils.listAvailableCovariates(logger); System.exit(0); } RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table - Pair, ArrayList> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates + Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates ArrayList requiredCovariates = covariates.getFirst(); ArrayList optionalCovariates = covariates.getSecond(); @@ -222,17 +226,17 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed if (readNotSeen(read)) { read.setTemporaryAttribute(SEEN_ATTRIBUTE, true); - RecalDataManager.parsePlatformForRead(read, RAC); - if (RecalDataManager.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { + RecalUtils.parsePlatformForRead(read, RAC); + if (RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { read.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true); continue; } - read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(read, requestedCovariates)); + read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates)); } if (!ReadUtils.isSOLiDRead(read) || // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING || - RecalDataManager.isColorSpaceConsistent(read, offset)) + RAC.SOLID_RECAL_MODE == RecalUtils.SOLID_RECAL_MODE.DO_NOTHING || + RecalUtils.isColorSpaceConsistent(read, offset)) recalibrationEngine.updateDataForPileupElement(p, ref.getBase()); // This base finally passed all the checks for a good base, so add it to the big data hashmap } countedSites++; @@ -285,10 +289,10 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE; if (recalFile != null) { RecalibrationReport report = new RecalibrationReport(recalFile); - RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, report.getRecalibrationTables(), recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(RAC.RECAL_FILE, report.getRecalibrationTables(), recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); } else - RecalDataManager.generateRecalibrationPlot(RAC.RECAL_FILE, recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(RAC.RECAL_FILE, recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); } @@ -309,7 +313,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_FILE, "could not be created"); } - RecalDataManager.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates, output); + RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates, output); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index 2a94426a7..f04e4a1b3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -29,6 +29,7 @@ import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.recalibration.RecalUtils; import java.io.File; import java.util.Collections; @@ -100,7 +101,7 @@ public class RecalibrationArgumentCollection { * reads which have had the reference inserted because of color space inconsistencies. */ @Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS") - public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO; + public RecalUtils.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.SET_Q_ZERO; /** * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles @@ -108,7 +109,7 @@ public class RecalibrationArgumentCollection { * their color space tag can not be recalibrated. */ @Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false) - public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; + public RecalUtils.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION; /** * The context covariate will use a context of this size to calculate it's covariate value for base mismatches @@ -177,41 +178,41 @@ public class RecalibrationArgumentCollection { public GATKReportTable generateReportTable() { GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); argumentsTable.addColumn("Argument"); - argumentsTable.addColumn(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME); + argumentsTable.addColumn(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); argumentsTable.addRowID("covariate", true); - argumentsTable.set("covariate", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES)); + argumentsTable.set("covariate", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, (COVARIATES == null) ? "null" : Utils.join(",", COVARIATES)); argumentsTable.addRowID("no_standard_covs", true); - argumentsTable.set("no_standard_covs", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES); + argumentsTable.set("no_standard_covs", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DO_NOT_USE_STANDARD_COVARIATES); argumentsTable.addRowID("run_without_dbsnp", true); - argumentsTable.set("run_without_dbsnp", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP); + argumentsTable.set("run_without_dbsnp", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RUN_WITHOUT_DBSNP); argumentsTable.addRowID("solid_recal_mode", true); - argumentsTable.set("solid_recal_mode", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE); + argumentsTable.set("solid_recal_mode", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_RECAL_MODE); argumentsTable.addRowID("solid_nocall_strategy", true); - argumentsTable.set("solid_nocall_strategy", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); + argumentsTable.set("solid_nocall_strategy", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, SOLID_NOCALL_STRATEGY); argumentsTable.addRowID("mismatches_context_size", true); - argumentsTable.set("mismatches_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); + argumentsTable.set("mismatches_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_CONTEXT_SIZE); argumentsTable.addRowID("indels_context_size", true); - argumentsTable.set("indels_context_size", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE); + argumentsTable.set("indels_context_size", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INDELS_CONTEXT_SIZE); argumentsTable.addRowID("mismatches_default_quality", true); - argumentsTable.set("mismatches_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); + argumentsTable.set("mismatches_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, MISMATCHES_DEFAULT_QUALITY); argumentsTable.addRowID("insertions_default_quality", true); - argumentsTable.set("insertions_default_quality", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY); + argumentsTable.set("insertions_default_quality", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, INSERTIONS_DEFAULT_QUALITY); argumentsTable.addRowID("low_quality_tail", true); - argumentsTable.set("low_quality_tail", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL); + argumentsTable.set("low_quality_tail", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, LOW_QUAL_TAIL); argumentsTable.addRowID("default_platform", true); - argumentsTable.set("default_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); + argumentsTable.set("default_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, DEFAULT_PLATFORM); argumentsTable.addRowID("force_platform", true); - argumentsTable.set("force_platform", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); + argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); argumentsTable.addRowID("quantizing_levels", true); - argumentsTable.set("quantizing_levels", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); + argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); argumentsTable.addRowID("keep_intermediate_files", true); - argumentsTable.set("keep_intermediate_files", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES); + argumentsTable.set("keep_intermediate_files", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES); argumentsTable.addRowID("no_plots", true); - argumentsTable.set("no_plots", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS); + argumentsTable.set("no_plots", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS); argumentsTable.addRowID("recalibration_report", true); - argumentsTable.set("recalibration_report", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath()); + argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? "null" : recalibrationReport.getAbsolutePath()); argumentsTable.addRowID("binary_tag_name", true); - argumentsTable.set("binary_tag_name", RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); + argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); return argumentsTable; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index aa62a18bc..38e306939 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index a24506d07..08c7da754 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -25,10 +25,14 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. */ +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.classloader.PublicPackageSource; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index f24bbb636..9a2cb68db 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -168,6 +168,28 @@ public class PluginManager { String pluginName = getName(pluginClass); pluginsByName.put(pluginName, pluginClass); } + + // sort the plugins so the order of elements is deterministic + sortPlugins(plugins); + sortPlugins(interfaces); + } + + /** + * Sorts, in place, the list of plugins according to getName() on each element + * + * @param unsortedPlugins + */ + private final void sortPlugins(final List> unsortedPlugins) { + Collections.sort(unsortedPlugins, new ComparePluginsByName()); + } + + private final class ComparePluginsByName implements Comparator> { + @Override + public int compare(final Class aClass, final Class aClass1) { + String pluginName1 = getName(aClass); + String pluginName2 = getName(aClass1); + return pluginName1.compareTo(pluginName2); + } } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index a4383c3ae..554188bc1 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -4,7 +4,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index ba9267222..6392ce4ce 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -3,7 +3,7 @@ package org.broadinstitute.sting.utils.clipping; import com.google.java.contract.Requires; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index c6eec24f1..851272673 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -4,7 +4,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index b5f7ad046..c09eb0063 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.utils.recalibration; import net.sf.samtools.SAMTag; import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.gatk.walkers.bqsr.*; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; @@ -103,7 +103,7 @@ public class BaseRecalibration { } } - RecalDataManager.computeCovariates(read, requestedCovariates, readCovariates); // compute all covariates for the read + RecalUtils.computeCovariates(read, requestedCovariates, readCovariates); // compute all covariates for the read for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings if (disableIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) { read.setBaseQualities(null, errorModel); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java similarity index 96% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java index 2650f0f8d..1c84518eb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/EventType.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java similarity index 78% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java index fb3aef949..2b67d12a9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java @@ -1,11 +1,9 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.QualQuantizer; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import java.util.Arrays; import java.util.List; @@ -41,7 +39,7 @@ public class QuantizationInfo { for (final RecalDatum value : qualTable.getAllValues()) { final RecalDatum datum = value; final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) - qualHistogram[empiricalQual] += datum.numObservations; // add the number of observations for every key + qualHistogram[empiricalQual] += datum.getNumObservations(); // add the number of observations for every key } empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities quantizeQualityScores(quantizationLevels); @@ -70,15 +68,15 @@ public class QuantizationInfo { } public GATKReportTable generateReportTable() { - GATKReportTable quantizedTable = new GATKReportTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); - quantizedTable.addColumn(RecalDataManager.QUALITY_SCORE_COLUMN_NAME); - quantizedTable.addColumn(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME); - quantizedTable.addColumn(RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME); + GATKReportTable quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); + quantizedTable.addColumn(RecalUtils.QUALITY_SCORE_COLUMN_NAME); + quantizedTable.addColumn(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); + quantizedTable.addColumn(RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); for (int qual = 0; qual <= QualityUtils.MAX_QUAL_SCORE; qual++) { - quantizedTable.set(qual, RecalDataManager.QUALITY_SCORE_COLUMN_NAME, qual); - quantizedTable.set(qual, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual)); - quantizedTable.set(qual, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual)); + quantizedTable.set(qual, RecalUtils.QUALITY_SCORE_COLUMN_NAME, qual); + quantizedTable.set(qual, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME, empiricalQualCounts.get(qual)); + quantizedTable.set(qual, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME, quantizedQuals.get(qual)); } return quantizedTable; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java index 5e907237d..c86bd4deb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; /** * The object temporarily held by a read that describes all of it's covariates. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java similarity index 77% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index ed4e769b1..43ff54378 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -1,4 +1,4 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; /* * Copyright (c) 2009 The Broad Institute @@ -56,12 +56,12 @@ public class RecalDatum { /** * number of bases seen in total */ - long numObservations; + private long numObservations; /** * number of bases seen that didn't match the reference */ - long numMismatches; + private long numMismatches; /** * used when calculating empirical qualities to avoid division by zero @@ -82,16 +82,16 @@ public class RecalDatum { } public RecalDatum(final RecalDatum copy) { - this.numObservations = copy.numObservations; - this.numMismatches = copy.numMismatches; + this.numObservations = copy.getNumObservations(); + this.numMismatches = copy.getNumMismatches(); this.estimatedQReported = copy.estimatedQReported; this.empiricalQuality = copy.empiricalQuality; } public synchronized void combine(final RecalDatum other) { final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); - increment(other.numObservations, other.numMismatches); - estimatedQReported = -10 * Math.log10(sumErrors / this.numObservations); + increment(other.getNumObservations(), other.getNumMismatches()); + estimatedQReported = -10 * Math.log10(sumErrors / getNumObservations()); empiricalQuality = UNINITIALIZED; } @@ -100,7 +100,8 @@ public class RecalDatum { private synchronized final void calcEmpiricalQuality() { // cache the value so we don't call log over and over again final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); - final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT); + // smoothing is one error and one non-error observation, for example + final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); final double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } @@ -125,7 +126,7 @@ public class RecalDatum { @Override public String toString() { - return String.format("%d,%d,%d", numObservations, numMismatches, (byte) Math.floor(getEmpiricalQuality())); + return String.format("%d,%d,%d", getNumObservations(), getNumMismatches(), (byte) Math.floor(getEmpiricalQuality())); } public String stringForCSV() { @@ -133,7 +134,7 @@ public class RecalDatum { } private double calcExpectedErrors() { - return (double) this.numObservations * qualToErrorProb(estimatedQReported); + return (double) getNumObservations() * qualToErrorProb(estimatedQReported); } private double qualToErrorProb(final double qual) { @@ -170,15 +171,42 @@ public class RecalDatum { // //--------------------------------------------------------------------------------------------------------------- - synchronized void increment(final long incObservations, final long incMismatches) { - numObservations += incObservations; - numMismatches += incMismatches; + public long getNumObservations() { + return numObservations; + } + + public synchronized void setNumObservations(final long numObservations) { + this.numObservations = numObservations; empiricalQuality = UNINITIALIZED; } - synchronized void increment(final boolean isError) { - numObservations++; - numMismatches += isError ? 1:0; + public long getNumMismatches() { + return numMismatches; + } + + public synchronized void setNumMismatches(final long numMismatches) { + this.numMismatches = numMismatches; empiricalQuality = UNINITIALIZED; } + + public synchronized void incrementNumObservations(final long by) { + numObservations += by; + empiricalQuality = UNINITIALIZED; + } + + public synchronized void incrementNumMismatches(final long by) { + numMismatches += by; + empiricalQuality = UNINITIALIZED; + } + + public synchronized void increment(final long incObservations, final long incMismatches) { + incrementNumObservations(incObservations); + incrementNumMismatches(incMismatches); + } + + public synchronized void increment(final boolean isError) { + incrementNumObservations(1); + if ( isError ) + incrementNumMismatches(1); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java similarity index 96% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index f40a62d53..fe6ef7018 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -23,11 +23,13 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; @@ -39,7 +41,6 @@ import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.io.Resource; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -59,7 +60,7 @@ import java.util.*; * This class holds the parsing methods that are shared between CountCovariates and TableRecalibration. */ -public class RecalDataManager { +public class RecalUtils { public final static String ARGUMENT_REPORT_TABLE_TITLE = "Arguments"; public final static String QUANTIZED_REPORT_TABLE_TITLE = "Quantized"; public final static String READGROUP_REPORT_TABLE_TITLE = "RecalTable0"; @@ -85,13 +86,108 @@ public class RecalDataManager { private static final String SCRIPT_FILE = "BQSR.R"; - private static final Pair covariateValue = new Pair(RecalDataManager.COVARIATE_VALUE_COLUMN_NAME, "%s"); - private static final Pair covariateName = new Pair(RecalDataManager.COVARIATE_NAME_COLUMN_NAME, "%s"); - private static final Pair eventType = new Pair(RecalDataManager.EVENT_TYPE_COLUMN_NAME, "%s"); - private static final Pair empiricalQuality = new Pair(RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); - private static final Pair estimatedQReported = new Pair(RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); - private static final Pair nObservations = new Pair(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); - private static final Pair nErrors = new Pair(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME, "%d"); + private static final Pair covariateValue = new Pair(RecalUtils.COVARIATE_VALUE_COLUMN_NAME, "%s"); + private static final Pair covariateName = new Pair(RecalUtils.COVARIATE_NAME_COLUMN_NAME, "%s"); + private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); + private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); + private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); + private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); + private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%d"); + + /** + * Generates two lists : required covariates and optional covariates based on the user's requests. + * + * Performs the following tasks in order: + * 1. Adds all requierd covariates in order + * 2. Check if the user asked to use the standard covariates and adds them all if that's the case + * 3. Adds all covariates requested by the user that were not already added by the two previous steps + * + * @param argumentCollection the argument collection object for the recalibration walker + * @return a pair of ordered lists : required covariates (first) and optional covariates (second) + */ + public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) { + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); + final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); + + final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates + ArrayList optionalCovariates = new ArrayList(); + if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) + optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user + + if (argumentCollection.COVARIATES != null) { // parse the -cov arguments that were provided, skipping over the ones already specified + for (String requestedCovariateString : argumentCollection.COVARIATES) { + boolean foundClass = false; + for (Class covClass : covariateClasses) { + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class + foundClass = true; + if (!requiredClasses.contains(covClass) && + (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { + try { + final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it + optionalCovariates.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + } + } + + if (!foundClass) { + throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates."); + } + } + } + return new Pair, ArrayList>(requiredCovariates, optionalCovariates); + } + + /** + * Adds the required covariates to a covariate list + * + * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand. + * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addRequiredCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + if (classes.size() != 2) + throw new ReviewedStingException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); + + dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. + dest.add(new QualityScoreCovariate()); + return dest; + } + + /** + * Adds the standard covariates to a covariate list + * + * @param classes list of classes to add to the covariate list + * @return the covariate list + */ + private static ArrayList addStandardCovariatesToList(List> classes) { + ArrayList dest = new ArrayList(classes.size()); + for (Class covClass : classes) { + try { + final Covariate covariate = (Covariate) covClass.newInstance(); + dest.add(covariate); + } catch (Exception e) { + throw new DynamicClassResolutionException(covClass, e); + } + } + return dest; + } + + public static void listAvailableCovariates(Logger logger) { + // Get a list of all available covariates + final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); + + // Print and exit if that's what was requested + logger.info("Available covariates:"); + for (Class covClass : covariateClasses) + logger.info(covClass.getSimpleName()); + logger.info(""); + } public enum SOLID_RECAL_MODE { @@ -152,64 +248,6 @@ public class RecalDataManager { } } - /** - * Generates two lists : required covariates and optional covariates based on the user's requests. - * - * Performs the following tasks in order: - * 1. Adds all requierd covariates in order - * 2. Check if the user asked to use the standard covariates and adds them all if that's the case - * 3. Adds all covariates requested by the user that were not already added by the two previous steps - * - * @param argumentCollection the argument collection object for the recalibration walker - * @return a pair of ordered lists : required covariates (first) and optional covariates (second) - */ - public static Pair, ArrayList> initializeCovariates(RecalibrationArgumentCollection argumentCollection) { - final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); - final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); - final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); - - final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates - ArrayList optionalCovariates = new ArrayList(); - if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) - optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user - - if (argumentCollection.COVARIATES != null) { // parse the -cov arguments that were provided, skipping over the ones already specified - for (String requestedCovariateString : argumentCollection.COVARIATES) { - boolean foundClass = false; - for (Class covClass : covariateClasses) { - if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class - foundClass = true; - if (!requiredClasses.contains(covClass) && - (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { - try { - final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it - optionalCovariates.add(covariate); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - } - } - - if (!foundClass) { - throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates."); - } - } - } - return new Pair, ArrayList>(requiredCovariates, optionalCovariates); - } - - public static void listAvailableCovariates(Logger logger) { - // Get a list of all available covariates - final List> covariateClasses = new PluginManager(Covariate.class).getPlugins(); - - // Print and exit if that's what was requested - logger.info("Available covariates:"); - for (Class covClass : covariateClasses) - logger.info(covClass.getSimpleName()); - logger.info(""); - } - private static List generateReportTables(final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates) { List result = new LinkedList(); int reportTableIndex = 0; @@ -272,8 +310,8 @@ public class RecalDataManager { reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.index) reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.numObservations); - reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.numMismatches); + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); + reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); rowIndex++; } @@ -320,7 +358,7 @@ public class RecalDataManager { files.getFirst().close(); final RScriptExecutor executor = new RScriptExecutor(); - executor.addScript(new Resource(SCRIPT_FILE, RecalDataManager.class)); + executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); executor.addArgs(csvFileName.getAbsolutePath()); executor.addArgs(plotFileName.getAbsolutePath()); executor.exec(); @@ -480,14 +518,14 @@ public class RecalDataManager { */ public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { if (ReadUtils.isSOLiDRead(read)) { // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base - if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG); + if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read + final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG); if (attr != null) { byte[] colorSpace; if (attr instanceof String) colorSpace = ((String) attr).getBytes(); else - throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); + throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName())); byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read if (read.getReadNegativeStrandFlag()) @@ -501,7 +539,7 @@ public class RecalDataManager { inconsistency[i] = (byte) (thisBase == readBases[i] ? 0 : 1); prevBase = readBases[i]; } - read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); + read.setAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); } else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); @@ -545,7 +583,7 @@ public class RecalDataManager { * @return Returns true if the base was inconsistent with the color space */ public static boolean isColorSpaceConsistent(final GATKSAMRecord read, final int offset) { - final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG); + final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG); if (attr != null) { final byte[] inconsistency = (byte[]) attr; // NOTE: The inconsistency array is in the direction of the read, not aligned to the reference! @@ -691,40 +729,4 @@ public class RecalDataManager { } - /** - * Adds the required covariates to a covariate list - * - * Note: this method really only checks if the classes object has the expected number of required covariates, then add them by hand. - * - * @param classes list of classes to add to the covariate list - * @return the covariate list - */ - private static ArrayList addRequiredCovariatesToList(List> classes) { - ArrayList dest = new ArrayList(classes.size()); - if (classes.size() != 2) - throw new ReviewedStingException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); - - dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. - dest.add(new QualityScoreCovariate()); - return dest; - } - - /** - * Adds the standard covariates to a covariate list - * - * @param classes list of classes to add to the covariate list - * @return the covariate list - */ - private static ArrayList addStandardCovariatesToList(List> classes) { - ArrayList dest = new ArrayList(classes.size()); - for (Class covClass : classes) { - try { - final Covariate covariate = (Covariate) covClass.newInstance(); - dest.add(covariate); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - } - return dest; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java similarity index 82% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index e69cf4d69..e6ab9e38b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -1,11 +1,12 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.bqsr.*; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import java.io.File; import java.io.PrintStream; @@ -33,13 +34,13 @@ public class RecalibrationReport { public RecalibrationReport(final File RECAL_FILE) { final GATKReport report = new GATKReport(RECAL_FILE); - argumentTable = report.getTable(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE); + argumentTable = report.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); RAC = initializeArgumentCollectionTable(argumentTable); - GATKReportTable quantizedTable = report.getTable(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE); + GATKReportTable quantizedTable = report.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE); quantizationInfo = initializeQuantizationTable(quantizedTable); - Pair, ArrayList> covariates = RecalDataManager.initializeCovariates(RAC); // initialize the required and optional covariates + Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates ArrayList requiredCovariates = covariates.getFirst(); ArrayList optionalCovariates = covariates.getSecond(); requestedCovariates = new Covariate[requiredCovariates.size() + optionalCovariates.size()]; @@ -57,13 +58,13 @@ public class RecalibrationReport { for (Covariate cov : requestedCovariates) cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection - recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE))); + recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE))); - parseReadGroupTable(report.getTable(RecalDataManager.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE)); + parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE)); - parseQualityScoreTable(report.getTable(RecalDataManager.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE)); + parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE)); - parseAllCovariatesTable(report.getTable(RecalDataManager.ALL_COVARIATES_REPORT_TABLE_TITLE), recalibrationTables); + parseAllCovariatesTable(report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE), recalibrationTables); } @@ -85,7 +86,7 @@ public class RecalibrationReport { private int countReadGroups(final GATKReportTable reportTable) { Set readGroups = new HashSet(); for ( int i = 0; i < reportTable.getNumRows(); i++ ) - readGroups.add(reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME).toString()); + readGroups.add(reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME).toString()); return readGroups.size(); } @@ -139,17 +140,17 @@ public class RecalibrationReport { \ */ private void parseAllCovariatesTable(final GATKReportTable reportTable, final RecalibrationTables recalibrationTables) { for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final Object rg = reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME); + final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); tempCOVarray[0] = requestedCovariates[0].keyFromValue(rg); - final Object qual = reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME); + final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); tempCOVarray[1] = requestedCovariates[1].keyFromValue(qual); - final String covName = (String)reportTable.get(i, RecalDataManager.COVARIATE_NAME_COLUMN_NAME); + final String covName = (String)reportTable.get(i, RecalUtils.COVARIATE_NAME_COLUMN_NAME); final int covIndex = optionalCovariateIndexes.get(covName); - final Object covValue = reportTable.get(i, RecalDataManager.COVARIATE_VALUE_COLUMN_NAME); + final Object covValue = reportTable.get(i, RecalUtils.COVARIATE_VALUE_COLUMN_NAME); tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex].keyFromValue(covValue); - final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalDataManager.EVENT_TYPE_COLUMN_NAME)); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); tempCOVarray[3] = event.index; recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray); @@ -164,11 +165,11 @@ public class RecalibrationReport { */ private void parseQualityScoreTable(final GATKReportTable reportTable, final NestedIntegerArray qualTable) { for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final Object rg = reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME); + final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); tempQUALarray[0] = requestedCovariates[0].keyFromValue(rg); - final Object qual = reportTable.get(i, RecalDataManager.QUALITY_SCORE_COLUMN_NAME); + final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); - final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalDataManager.EVENT_TYPE_COLUMN_NAME)); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); tempQUALarray[2] = event.index; qualTable.put(getRecalDatum(reportTable, i, false), tempQUALarray); @@ -183,9 +184,9 @@ public class RecalibrationReport { */ private void parseReadGroupTable(final GATKReportTable reportTable, final NestedIntegerArray rgTable) { for ( int i = 0; i < reportTable.getNumRows(); i++ ) { - final Object rg = reportTable.get(i, RecalDataManager.READGROUP_COLUMN_NAME); + final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); - final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalDataManager.EVENT_TYPE_COLUMN_NAME)); + final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); tempRGarray[1] = event.index; rgTable.put(getRecalDatum(reportTable, i, true), tempRGarray); @@ -193,13 +194,13 @@ public class RecalibrationReport { } private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { - final long nObservations = (Long) reportTable.get(row, RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME); - final long nErrors = (Long) reportTable.get(row, RecalDataManager.NUMBER_ERRORS_COLUMN_NAME); - final double empiricalQuality = (Double) reportTable.get(row, RecalDataManager.EMPIRICAL_QUALITY_COLUMN_NAME); + final long nObservations = (Long) reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME); + final long nErrors = (Long) reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); + final double empiricalQuality = (Double) reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME); final double estimatedQReported = hasEstimatedQReportedColumn ? // the estimatedQreported column only exists in the ReadGroup table - (Double) reportTable.get(row, RecalDataManager.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table - Byte.parseByte((String) reportTable.get(row, RecalDataManager.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table + (Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table + Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table final RecalDatum datum = new RecalDatum(nObservations, nErrors, (byte)1); datum.setEstimatedQReported(estimatedQReported); @@ -218,8 +219,8 @@ public class RecalibrationReport { final Long[] counts = new Long[QualityUtils.MAX_QUAL_SCORE + 1]; for ( int i = 0; i < table.getNumRows(); i++ ) { final byte originalQual = (byte)i; - final Object quantizedObject = table.get(i, RecalDataManager.QUANTIZED_VALUE_COLUMN_NAME); - final Object countObject = table.get(i, RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME); + final Object quantizedObject = table.get(i, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); + final Object countObject = table.get(i, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); final byte quantizedQual = Byte.parseByte(quantizedObject.toString()); final long quantizedCount = Long.parseLong(countObject.toString()); quals[originalQual] = quantizedQual; @@ -239,7 +240,7 @@ public class RecalibrationReport { for ( int i = 0; i < table.getNumRows(); i++ ) { final String argument = table.get(i, "Argument").toString(); - Object value = table.get(i, RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME); + Object value = table.get(i, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); if (value.equals("null")) value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport @@ -250,10 +251,10 @@ public class RecalibrationReport { RAC.DO_NOT_USE_STANDARD_COVARIATES = Boolean.parseBoolean((String) value); else if (argument.equals("solid_recal_mode")) - RAC.SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.recalModeFromString((String) value); + RAC.SOLID_RECAL_MODE = RecalUtils.SOLID_RECAL_MODE.recalModeFromString((String) value); else if (argument.equals("solid_nocall_strategy")) - RAC.SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value); + RAC.SOLID_NOCALL_STRATEGY = RecalUtils.SOLID_NOCALL_STRATEGY.nocallStrategyFromString((String) value); else if (argument.equals("mismatches_context_size")) RAC.MISMATCHES_CONTEXT_SIZE = Integer.parseInt((String) value); @@ -307,7 +308,7 @@ public class RecalibrationReport { } public void output(PrintStream output) { - RecalDataManager.outputRecalibrationReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, output); + RecalUtils.outputRecalibrationReport(argumentTable, quantizationInfo, recalibrationTables, requestedCovariates, output); } public RecalibrationArgumentCollection getRAC() { diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index 0416b5eb9..f37e69c9a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -25,9 +25,7 @@ package org.broadinstitute.sting.utils.recalibration; -import org.broadinstitute.sting.gatk.walkers.bqsr.Covariate; -import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; -import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDatum; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BinaryTagCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/BinaryTagCovariate.java similarity index 89% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BinaryTagCovariate.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/BinaryTagCovariate.java index a89586c2c..cebdebf9d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BinaryTagCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/BinaryTagCovariate.java @@ -1,5 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration.covariates; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java index 5fe8809fb..4c20284d9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java @@ -23,8 +23,10 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration.covariates; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; import org.broadinstitute.sting.utils.clipping.ReadClipper; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java similarity index 94% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java index 1ad5346fa..c613135bb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/Covariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/Covariate.java @@ -1,5 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration.covariates; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* @@ -89,8 +91,3 @@ public interface Covariate { public int maximumKeyValue(); } -interface RequiredCovariate extends Covariate {} - -interface StandardCovariate extends Covariate {} - -interface ExperimentalCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java similarity index 97% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java index f0ff8f2bd..4f15419c7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java @@ -1,5 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration.covariates; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.UserException; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java new file mode 100644 index 000000000..72df2a410 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ExperimentalCovariate.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.utils.recalibration.covariates; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public interface ExperimentalCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java similarity index 92% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java index dd7060ff8..3ef8ee931 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/QualityScoreCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/QualityScoreCovariate.java @@ -1,5 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration.covariates; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java similarity index 94% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java rename to public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java index f04d27b7a..85568dac9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java @@ -1,5 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration.covariates; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java new file mode 100644 index 000000000..50755dbcf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/RequiredCovariate.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.utils.recalibration.covariates; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public interface RequiredCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java new file mode 100644 index 000000000..444954f25 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/StandardCovariate.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.utils.recalibration.covariates; + +/** + * [Short one sentence description of this walker] + *

+ *

+ * [Functionality of this walker] + *

+ *

+ *

Input

+ *

+ * [Input description] + *

+ *

+ *

Output

+ *

+ * [Output description] + *

+ *

+ *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T $WalkerName
+ *  
+ * + * @author Your Name + * @since Date created + */ +public interface StandardCovariate extends Covariate {} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 659615cf4..c9b3a2df8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; -import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; +import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java index 8e9f2533f..f1ffbe80f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.utils.recalibration.RecalUtils; import org.testng.Assert; import org.testng.annotations.Test; @@ -33,15 +34,15 @@ public class BQSRGathererUnitTest { for (GATKReportTable originalTable : originalReport.getTables()) { GATKReportTable calculatedTable = calculatedReport.getTable(originalTable.getTableName()); List columnsToTest = new LinkedList(); - columnsToTest.add(RecalDataManager.NUMBER_OBSERVATIONS_COLUMN_NAME); - columnsToTest.add(RecalDataManager.NUMBER_ERRORS_COLUMN_NAME); - if (originalTable.getTableName().equals(RecalDataManager.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL - columnsToTest.add(RecalDataManager.ARGUMENT_VALUE_COLUMN_NAME); + columnsToTest.add(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME); + columnsToTest.add(RecalUtils.NUMBER_ERRORS_COLUMN_NAME); + if (originalTable.getTableName().equals(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL + columnsToTest.add(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 1); } - else if (originalTable.getTableName().equals(RecalDataManager.QUANTIZED_REPORT_TABLE_TITLE)) { - columnsToTest.add(RecalDataManager.QUANTIZED_COUNT_COLUMN_NAME); + else if (originalTable.getTableName().equals(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE)) { + columnsToTest.add(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java similarity index 89% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java index 553b7e237..2556448ad 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/ContextCovariateUnitTest.java @@ -1,5 +1,8 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.ContextCovariate; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.clipping.ClippingRepresentation; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java similarity index 90% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java index 3fa1e916d..c3d93b2cb 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/CycleCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java @@ -1,5 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariatesUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java similarity index 92% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariatesUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java index 37994cf12..dac26cb53 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadCovariatesUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/ReadCovariatesUnitTest.java @@ -1,5 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -41,7 +43,7 @@ public class ReadCovariatesUnitTest { requestedCovariates[2] = coCov; requestedCovariates[3] = cyCov; - ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates); + ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); // check that the length is correct Assert.assertEquals(rc.getMismatchesKeySet().length, length); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java similarity index 88% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java index a83508353..78a74d259 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/ReadGroupCovariateUnitTest.java @@ -1,5 +1,7 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.ReadGroupCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java similarity index 95% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index e4a77c016..387cc94d6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -1,9 +1,10 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; +package org.broadinstitute.sting.utils.recalibration; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -72,7 +73,7 @@ public class RecalibrationReportUnitTest { final int expectedKeys = expectedNumberOfKeys(4, length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); int nKeys = 0; // keep track of how many keys were produced - final ReadCovariates rc = RecalDataManager.computeCovariates(read, requestedCovariates); + final ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); final RecalibrationTables recalibrationTables = new RecalibrationTables(requestedCovariates); final NestedIntegerArray rgTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); From 57b45bfb1e284abd335ad9965776b6228fd2e504 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 27 Jul 2012 14:34:13 -0400 Subject: [PATCH 07/43] Extensive unit tests, contacts, and documentation for RecalDatum --- .../sting/utils/QualityUtils.java | 5 + .../sting/utils/recalibration/RecalDatum.java | 102 ++++++++++++++---- .../org/broadinstitute/sting/BaseTest.java | 10 +- 3 files changed, 93 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 4acc0e2c3..1242e5b00 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -69,9 +69,14 @@ public class QualityUtils { * @return a probability (0.0 - 1.0) */ static private double qualToErrorProbRaw(int qual) { + return qualToErrorProb((double) qual); + } + + public static double qualToErrorProb(final double qual) { return Math.pow(10.0, ((double) qual)/-10.0); } + static public double qualToErrorProb(byte qual) { return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 43ff54378..6ee5df68b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.recalibration; */ import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; @@ -40,6 +41,17 @@ import java.util.Random; * User: rpoplin * Date: Nov 3, 2009 */ +@Invariant({ + "estimatedQReported >= 0.0", + "! Double.isNaN(estimatedQReported)", + "! Double.isInfinite(estimatedQReported)", + "empiricalQuality >= 0.0 || empiricalQuality == UNINITIALIZED", + "! Double.isNaN(empiricalQuality)", + "! Double.isInfinite(empiricalQuality)", + "numObservations >= 0", + "numMismatches >= 0", + "numMismatches <= numObservations" +}) public class RecalDatum { private static final double UNINITIALIZED = -1.0; @@ -74,13 +86,28 @@ public class RecalDatum { // //--------------------------------------------------------------------------------------------------------------- + /** + * Create a new RecalDatum with given observation and mismatch counts, and an reported quality + * + * @param _numObservations + * @param _numMismatches + * @param reportedQuality + */ public RecalDatum(final long _numObservations, final long _numMismatches, final byte reportedQuality) { + if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); + if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); + if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0"); + numObservations = _numObservations; numMismatches = _numMismatches; estimatedQReported = reportedQuality; empiricalQuality = UNINITIALIZED; } + /** + * Copy copy into this recal datum, overwriting all of this objects data + * @param copy + */ public RecalDatum(final RecalDatum copy) { this.numObservations = copy.getNumObservations(); this.numMismatches = copy.getNumMismatches(); @@ -88,6 +115,12 @@ public class RecalDatum { this.empiricalQuality = copy.empiricalQuality; } + /** + * Add in all of the data from other into this object, updating the reported quality from the expected + * error rate implied by the two reported qualities + * + * @param other + */ public synchronized void combine(final RecalDatum other) { final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); increment(other.getNumObservations(), other.getNumMismatches()); @@ -95,26 +128,26 @@ public class RecalDatum { empiricalQuality = UNINITIALIZED; } - @Requires("empiricalQuality == UNINITIALIZED") - @Ensures("empiricalQuality != UNINITIALIZED") - private synchronized final void calcEmpiricalQuality() { - // cache the value so we don't call log over and over again - final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); - // smoothing is one error and one non-error observation, for example - final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); - final double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); - empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); - } - public synchronized void setEstimatedQReported(final double estimatedQReported) { + if ( estimatedQReported < 0 ) throw new IllegalArgumentException("estimatedQReported < 0"); + if ( Double.isInfinite(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is infinite"); + if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN"); + this.estimatedQReported = estimatedQReported; } public final double getEstimatedQReported() { return estimatedQReported; } + public final byte getEstimatedQReportedAsByte() { + return (byte)(int)(Math.round(getEstimatedQReported())); + } public synchronized void setEmpiricalQuality(final double empiricalQuality) { + if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0"); + if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite"); + if ( Double.isNaN(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is NaN"); + this.empiricalQuality = empiricalQuality; } @@ -133,14 +166,6 @@ public class RecalDatum { return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported()); } - private double calcExpectedErrors() { - return (double) getNumObservations() * qualToErrorProb(estimatedQReported); - } - - private double qualToErrorProb(final double qual) { - return Math.pow(10.0, qual / -10.0); - } - public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { final Random random = new Random(); final int nObservations = random.nextInt(maxObservations); @@ -176,6 +201,7 @@ public class RecalDatum { } public synchronized void setNumObservations(final long numObservations) { + if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); this.numObservations = numObservations; empiricalQuality = UNINITIALIZED; } @@ -184,29 +210,67 @@ public class RecalDatum { return numMismatches; } + @Requires({"numMismatches >= 0"}) public synchronized void setNumMismatches(final long numMismatches) { + if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); this.numMismatches = numMismatches; empiricalQuality = UNINITIALIZED; } + @Requires({"by >= 0"}) public synchronized void incrementNumObservations(final long by) { numObservations += by; empiricalQuality = UNINITIALIZED; } + @Requires({"by >= 0"}) public synchronized void incrementNumMismatches(final long by) { numMismatches += by; empiricalQuality = UNINITIALIZED; } + @Requires({"incObservations >= 0", "incMismatches >= 0"}) + @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) public synchronized void increment(final long incObservations, final long incMismatches) { incrementNumObservations(incObservations); incrementNumMismatches(incMismatches); } + @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"}) public synchronized void increment(final boolean isError) { incrementNumObservations(1); if ( isError ) incrementNumMismatches(1); } + + // ------------------------------------------------------------------------------------- + // + // Private implementation helper functions + // + // ------------------------------------------------------------------------------------- + + /** + * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) + */ + @Requires("empiricalQuality == UNINITIALIZED") + @Ensures("empiricalQuality != UNINITIALIZED") + private synchronized final void calcEmpiricalQuality() { + // cache the value so we don't call log over and over again + final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); + // smoothing is one error and one non-error observation, for example + final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); + final double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); + } + + /** + * calculate the expected number of errors given the estimated Q reported and the number of observations + * in this datum. + * + * @return a positive (potentially fractional) estimate of the number of errors + */ + @Ensures("result >= 0.0") + private double calcExpectedErrors() { + return (double) getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); + } } diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index af4891856..76e25a3c0 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -282,12 +282,12 @@ public abstract class BaseTest { private static final double DEFAULT_FLOAT_TOLERANCE = 1e-1; public static final void assertEqualsDoubleSmart(final Object actual, final Double expected) { - Assert.assertTrue(actual instanceof Double); + Assert.assertTrue(actual instanceof Double, "Not a double"); assertEqualsDoubleSmart((double)(Double)actual, (double)expected); } public static final void assertEqualsDoubleSmart(final Object actual, final Double expected, final double tolerance) { - Assert.assertTrue(actual instanceof Double); + Assert.assertTrue(actual instanceof Double, "Not a double"); assertEqualsDoubleSmart((double)(Double)actual, (double)expected, tolerance); } @@ -303,13 +303,13 @@ public abstract class BaseTest { public static final void assertEqualsDoubleSmart(final double actual, final double expected, final double tolerance) { if ( Double.isNaN(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isNaN(actual)); + Assert.assertTrue(Double.isNaN(actual), "expected is nan, actual is not"); else if ( Double.isInfinite(expected) ) // NaN == NaN => false unfortunately - Assert.assertTrue(Double.isInfinite(actual)); + Assert.assertTrue(Double.isInfinite(actual), "expected is infinite, actual is not"); else { final double delta = Math.abs(actual - expected); final double ratio = Math.abs(actual / expected - 1.0); - Assert.assertTrue(delta < tolerance || ratio < tolerance); + Assert.assertTrue(delta < tolerance || ratio < tolerance, "expected = " + expected + " actual = " + actual + " not within tolerance " + tolerance); } } } From 315d25409f34f409022c7ccb0271be83cb3b6900 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 29 Jul 2012 13:13:13 -0400 Subject: [PATCH 08/43] Improvement to RecalDatum and VisualizeContextTree -- Reorganize functions in RecalDatum so that error rate can be computed indepentently. Added unit tests. Removed equals() method, which is a buggy without it's associated implementation for hashcode -- New class RecalDatumTree based on QualIntervals that inherits from RecalDatum but includes the concept of sub data -- VisualizeContextTree now uses RecalDatumTree and can trivially compute the penalty function for merging nodes, which it displays in the graph --- .../sting/utils/recalibration/RecalDatum.java | 85 +++++++++++++------ .../utils/recalibration/RecalDatumTree.java | 76 +++++++++++++++++ 2 files changed, 133 insertions(+), 28 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 6ee5df68b..249422c17 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -136,6 +136,14 @@ public class RecalDatum { this.estimatedQReported = estimatedQReported; } + public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { + final Random random = new Random(); + final int nObservations = random.nextInt(maxObservations); + final int nErrors = random.nextInt(maxErrors); + final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); + return new RecalDatum(nObservations, nErrors, (byte)qual); + } + public final double getEstimatedQReported() { return estimatedQReported; } @@ -143,6 +151,29 @@ public class RecalDatum { return (byte)(int)(Math.round(getEstimatedQReported())); } + //--------------------------------------------------------------------------------------------------------------- + // + // Empirical quality score -- derived from the num mismatches and observations + // + //--------------------------------------------------------------------------------------------------------------- + + /** + * Returns the error rate (in real space) of this interval, or 0 if there are no obserations + * @return the empirical error rate ~= N errors / N obs + */ + @Ensures("result >= 0.0") + public double getEmpiricalErrorRate() { + if ( numObservations == 0 ) + return 0.0; + else { + // cache the value so we don't call log over and over again + final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); + // smoothing is one error and one non-error observation, for example + final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); + return doubleMismatches / doubleObservations; + } + } + public synchronized void setEmpiricalQuality(final double empiricalQuality) { if ( empiricalQuality < 0 ) throw new IllegalArgumentException("empiricalQuality < 0"); if ( Double.isInfinite(empiricalQuality) ) throw new IllegalArgumentException("empiricalQuality is infinite"); @@ -157,6 +188,16 @@ public class RecalDatum { return empiricalQuality; } + public final byte getEmpiricalQualityAsByte() { + return (byte)(Math.round(getEmpiricalQuality())); + } + + //--------------------------------------------------------------------------------------------------------------- + // + // increment methods + // + //--------------------------------------------------------------------------------------------------------------- + @Override public String toString() { return String.format("%d,%d,%d", getNumObservations(), getNumMismatches(), (byte) Math.floor(getEmpiricalQuality())); @@ -166,29 +207,21 @@ public class RecalDatum { return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported()); } - public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { - final Random random = new Random(); - final int nObservations = random.nextInt(maxObservations); - final int nErrors = random.nextInt(maxErrors); - final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); - return new RecalDatum(nObservations, nErrors, (byte)qual); - } - - /** - * We don't compare the estimated quality reported because it may be different when read from - * report tables. - * - * @param o the other recal datum - * @return true if the two recal datums have the same number of observations, errors and empirical quality. - */ - @Override - public boolean equals(Object o) { - if (!(o instanceof RecalDatum)) - return false; - RecalDatum other = (RecalDatum) o; - return super.equals(o) && - MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0; - } +// /** +// * We don't compare the estimated quality reported because it may be different when read from +// * report tables. +// * +// * @param o the other recal datum +// * @return true if the two recal datums have the same number of observations, errors and empirical quality. +// */ +// @Override +// public boolean equals(Object o) { +// if (!(o instanceof RecalDatum)) +// return false; +// RecalDatum other = (RecalDatum) o; +// return super.equals(o) && +// MathUtils.compareDoubles(this.empiricalQuality, other.empiricalQuality, 0.001) == 0; +// } //--------------------------------------------------------------------------------------------------------------- // @@ -255,11 +288,7 @@ public class RecalDatum { @Requires("empiricalQuality == UNINITIALIZED") @Ensures("empiricalQuality != UNINITIALIZED") private synchronized final void calcEmpiricalQuality() { - // cache the value so we don't call log over and over again - final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); - // smoothing is one error and one non-error observation, for example - final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); - final double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations); + final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java new file mode 100644 index 000000000..210ea53bf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java @@ -0,0 +1,76 @@ +package org.broadinstitute.sting.utils.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +/** + * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one + * + * @author Mark DePristo + * @since 07/27/12 + */ +public class RecalDatumTree extends RecalDatum { + final Set subnodes; + + protected RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual) { + this(nObservations, nErrors, reportedQual, new HashSet()); + } + + public RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual, final Set subnodes) { + super(nObservations, nErrors, reportedQual); + this.subnodes = new HashSet(subnodes); + } + + public double getPenalty() { + return calcPenalty(getEmpiricalErrorRate()); + } + + public void addSubnode(final RecalDatumTree sub) { + subnodes.add(sub); + } + + public boolean isLeaf() { + return subnodes.isEmpty(); + } + + /** + * Calculate the penalty of this interval, given the overall error rate for the interval + * + * If the globalErrorRate is e, this value is: + * + * sum_i |log10(e_i) - log10(e)| * nObservations_i + * + * each the index i applies to all leaves of the tree accessible from this interval + * (found recursively from subnodes as necessary) + * + * @param globalErrorRate overall error rate in real space against which we calculate the penalty + * @return the cost of approximating the bins in this interval with the globalErrorRate + */ + @Requires("globalErrorRate >= 0.0") + @Ensures("result >= 0.0") + private double calcPenalty(final double globalErrorRate) { + if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty + return 0.0; + + if ( isLeaf() ) { + // this is leave node + return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); + // TODO -- how we can generalize this calculation? +// if ( this.qEnd <= minInterestingQual ) +// // It's free to merge up quality scores below the smallest interesting one +// return 0; +// else { +// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); +// } + } else { + double sum = 0; + for ( final RecalDatumTree hrd : subnodes) + sum += hrd.calcPenalty(globalErrorRate); + return sum; + } + } +} From 93640b382ebe79d269ada3c006530d8a3f49330f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 30 Jul 2012 08:31:38 -0400 Subject: [PATCH 09/43] Preliminary version of adaptive context covariate algorithm -- Works according to visual inspection of output tree --- .../utils/recalibration/RecalDatumNode.java | 213 ++++++++++++++++++ .../utils/recalibration/RecalDatumTree.java | 76 ------- 2 files changed, 213 insertions(+), 76 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java new file mode 100644 index 000000000..62ea67d7c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -0,0 +1,213 @@ +package org.broadinstitute.sting.utils.recalibration; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.HashSet; +import java.util.Set; + +/** + * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one + * + * @author Mark DePristo + * @since 07/27/12 + */ +public class RecalDatumNode { + protected static Logger logger = Logger.getLogger(RecalDatumNode.class); + private final static double UNINITIALIZED = -1.0; + private final T recalDatum; + private double fixedPenalty = UNINITIALIZED; + private final Set> subnodes; + + public RecalDatumNode(final T recalDatum) { + this(recalDatum, new HashSet>()); + } + + @Override + public String toString() { + return recalDatum.toString(); + } + + public RecalDatumNode(final T recalDatum, final Set> subnodes) { + this(recalDatum, UNINITIALIZED, subnodes); + } + + protected RecalDatumNode(final T recalDatum, final double fixedPenalty) { + this(recalDatum, fixedPenalty, new HashSet>()); + } + + protected RecalDatumNode(final T recalDatum, final double fixedPenalty, final Set> subnodes) { + this.recalDatum = recalDatum; + this.fixedPenalty = fixedPenalty; + this.subnodes = new HashSet>(subnodes); + } + + public T getRecalDatum() { + return recalDatum; + } + + public Set> getSubnodes() { + return subnodes; + } + + public double getPenalty() { + if ( fixedPenalty != UNINITIALIZED ) + return fixedPenalty; + else + return calcPenalty(recalDatum.getEmpiricalErrorRate()); + } + + public double calcAndSetFixedPenalty(final boolean doEntireTree) { + fixedPenalty = calcPenalty(recalDatum.getEmpiricalErrorRate()); + if ( doEntireTree ) + for ( final RecalDatumNode sub : subnodes ) + sub.calcAndSetFixedPenalty(doEntireTree); + return fixedPenalty; + } + + public void addSubnode(final RecalDatumNode sub) { + subnodes.add(sub); + } + + public boolean isLeaf() { + return subnodes.isEmpty(); + } + + public int getNumBranches() { + return subnodes.size(); + } + + public double getMinNodePenalty() { + if ( isLeaf() ) + return Double.MAX_VALUE; + else { + double minPenalty = getPenalty(); + for ( final RecalDatumNode sub : subnodes ) + minPenalty = Math.min(minPenalty, sub.getMinNodePenalty()); + return minPenalty; + } + } + + public int maxDepth() { + int subMax = 0; + for ( final RecalDatumNode sub : subnodes ) + subMax = Math.max(subMax, sub.maxDepth()); + return subMax + 1; + } + + public int size() { + int size = 1; + for ( final RecalDatumNode sub : subnodes ) + size += sub.size(); + return size; + } + + /** + * Calculate the penalty of this interval, given the overall error rate for the interval + * + * If the globalErrorRate is e, this value is: + * + * sum_i |log10(e_i) - log10(e)| * nObservations_i + * + * each the index i applies to all leaves of the tree accessible from this interval + * (found recursively from subnodes as necessary) + * + * @param globalErrorRate overall error rate in real space against which we calculate the penalty + * @return the cost of approximating the bins in this interval with the globalErrorRate + */ + @Requires("globalErrorRate >= 0.0") + @Ensures("result >= 0.0") + private double calcPenalty(final double globalErrorRate) { + if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty + return 0.0; + + if ( isLeaf() ) { + // this is leave node + return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * recalDatum.getNumObservations(); + // TODO -- how we can generalize this calculation? +// if ( this.qEnd <= minInterestingQual ) +// // It's free to merge up quality scores below the smallest interesting one +// return 0; +// else { +// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); +// } + } else { + double sum = 0; + for ( final RecalDatumNode hrd : subnodes) + sum += hrd.calcPenalty(globalErrorRate); + return sum; + } + } + + public RecalDatumNode pruneToDepth(final int maxDepth) { + if ( maxDepth < 1 ) + throw new IllegalArgumentException("maxDepth < 1"); + else { + final Set> subPruned = new HashSet>(getNumBranches()); + if ( maxDepth > 1 ) + for ( final RecalDatumNode sub : subnodes ) + subPruned.add(sub.pruneToDepth(maxDepth - 1)); + return new RecalDatumNode(getRecalDatum(), fixedPenalty, subPruned); + } + } + + public RecalDatumNode pruneByPenalty(final int maxElements) { + RecalDatumNode root = this; + + while ( root.size() > maxElements ) { + // remove the lowest penalty element, and continue + root = root.removeLowestPenaltyNode(); + } + + // our size is below the target, so we are good, return + return root; + } + + /** + * Find the lowest penalty node in the tree, and return a tree without it + * + * Note this excludes the current (root) node + * + * @return + */ + private RecalDatumNode removeLowestPenaltyNode() { + final RecalDatumNode oneRemoved = removeFirstNodeWithPenalty(getMinNodePenalty()).getFirst(); + if ( oneRemoved == null ) + throw new IllegalStateException("Removed our root node, wow, didn't expect that"); + return oneRemoved; + } + + private Pair, Boolean> removeFirstNodeWithPenalty(final double penaltyToRemove) { + if ( getPenalty() == penaltyToRemove ) { + logger.info("Removing " + this + " with penalty " + penaltyToRemove); + if ( isLeaf() ) + throw new IllegalStateException("Trying to remove a leaf node from the tree! " + this + " " + penaltyToRemove); + // node is the thing we are going to remove, but without any subnodes + final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty); + return new Pair, Boolean>(node, true); + } else { + // did we remove something in a sub branch? + boolean removedSomething = false; + + // our sub nodes with the penalty node removed + final Set> sub = new HashSet>(getNumBranches()); + + for ( final RecalDatumNode sub1 : subnodes ) { + if ( removedSomething ) { + // already removed something, just add sub1 back to sub + sub.add(sub1); + } else { + // haven't removed anything yet, so try + final Pair, Boolean> maybeRemoved = sub1.removeFirstNodeWithPenalty(penaltyToRemove); + removedSomething = maybeRemoved.getSecond(); + sub.add(maybeRemoved.getFirst()); + } + } + + final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty, sub); + return new Pair, Boolean>(node, removedSomething); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java deleted file mode 100644 index 210ea53bf..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumTree.java +++ /dev/null @@ -1,76 +0,0 @@ -package org.broadinstitute.sting.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; - -/** - * A tree of recal datum, where each contains a set of sub datum representing sub-states of the higher level one - * - * @author Mark DePristo - * @since 07/27/12 - */ -public class RecalDatumTree extends RecalDatum { - final Set subnodes; - - protected RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual) { - this(nObservations, nErrors, reportedQual, new HashSet()); - } - - public RecalDatumTree(final long nObservations, final long nErrors, final byte reportedQual, final Set subnodes) { - super(nObservations, nErrors, reportedQual); - this.subnodes = new HashSet(subnodes); - } - - public double getPenalty() { - return calcPenalty(getEmpiricalErrorRate()); - } - - public void addSubnode(final RecalDatumTree sub) { - subnodes.add(sub); - } - - public boolean isLeaf() { - return subnodes.isEmpty(); - } - - /** - * Calculate the penalty of this interval, given the overall error rate for the interval - * - * If the globalErrorRate is e, this value is: - * - * sum_i |log10(e_i) - log10(e)| * nObservations_i - * - * each the index i applies to all leaves of the tree accessible from this interval - * (found recursively from subnodes as necessary) - * - * @param globalErrorRate overall error rate in real space against which we calculate the penalty - * @return the cost of approximating the bins in this interval with the globalErrorRate - */ - @Requires("globalErrorRate >= 0.0") - @Ensures("result >= 0.0") - private double calcPenalty(final double globalErrorRate) { - if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty - return 0.0; - - if ( isLeaf() ) { - // this is leave node - return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); - // TODO -- how we can generalize this calculation? -// if ( this.qEnd <= minInterestingQual ) -// // It's free to merge up quality scores below the smallest interesting one -// return 0; -// else { -// return (Math.abs(Math.log10(getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * getNumObservations(); -// } - } else { - double sum = 0; - for ( final RecalDatumTree hrd : subnodes) - sum += hrd.calcPenalty(globalErrorRate); - return sum; - } - } -} From 0c4e729e13d0ce5e24dc28cc14664dd1519632a9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 30 Jul 2012 15:44:33 -0400 Subject: [PATCH 10/43] Working version of adaptive context calculations -- Uses chi2 test for independences to determine if subcontext is worth representing. Give excellent visual results -- Writes out analysis output file producing excellent results in R -- Trivial reformatting of MathUtils --- .../utils/recalibration/RecalDatumNode.java | 128 +++++++++++++++--- 1 file changed, 112 insertions(+), 16 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java index 62ea67d7c..3af91be16 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -2,7 +2,9 @@ package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.apache.commons.math.stat.inference.ChiSquareTestImpl; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.collections.Pair; import java.util.HashSet; @@ -15,8 +17,9 @@ import java.util.Set; * @since 07/27/12 */ public class RecalDatumNode { + private final static boolean USE_CHI2 = true; protected static Logger logger = Logger.getLogger(RecalDatumNode.class); - private final static double UNINITIALIZED = -1.0; + private final static double UNINITIALIZED = Double.NEGATIVE_INFINITY; private final T recalDatum; private double fixedPenalty = UNINITIALIZED; private final Set> subnodes; @@ -56,11 +59,11 @@ public class RecalDatumNode { if ( fixedPenalty != UNINITIALIZED ) return fixedPenalty; else - return calcPenalty(recalDatum.getEmpiricalErrorRate()); + return calcPenalty(); } public double calcAndSetFixedPenalty(final boolean doEntireTree) { - fixedPenalty = calcPenalty(recalDatum.getEmpiricalErrorRate()); + fixedPenalty = calcPenalty(); if ( doEntireTree ) for ( final RecalDatumNode sub : subnodes ) sub.calcAndSetFixedPenalty(doEntireTree); @@ -79,14 +82,23 @@ public class RecalDatumNode { return subnodes.size(); } - public double getMinNodePenalty() { + /** + * Total penalty is the sum of leaf node penalties + * + * This algorithm assumes that penalties have been fixed before pruning, as leaf nodes by + * definition have 0 penalty unless they represent a pruned tree with underlying -- but now + * pruned -- subtrees + * + * @return + */ + public double totalPenalty() { if ( isLeaf() ) - return Double.MAX_VALUE; + return getPenalty(); else { - double minPenalty = getPenalty(); + double sum = 0.0; for ( final RecalDatumNode sub : subnodes ) - minPenalty = Math.min(minPenalty, sub.getMinNodePenalty()); - return minPenalty; + sum += sub.totalPenalty(); + return sum; } } @@ -97,6 +109,17 @@ public class RecalDatumNode { return subMax + 1; } + public int minDepth() { + if ( isLeaf() ) + return 1; + else { + int subMin = Integer.MAX_VALUE; + for ( final RecalDatumNode sub : subnodes ) + subMin = Math.min(subMin, sub.minDepth()); + return subMin + 1; + } + } + public int size() { int size = 1; for ( final RecalDatumNode sub : subnodes ) @@ -104,6 +127,58 @@ public class RecalDatumNode { return size; } + public int numLeaves() { + if ( isLeaf() ) + return 1; + else { + int size = 0; + for ( final RecalDatumNode sub : subnodes ) + size += sub.numLeaves(); + return size; + } + } + + private double calcPenalty() { + if ( USE_CHI2 ) + return calcPenaltyChi2(); + else + return calcPenaltyLog10(getRecalDatum().getEmpiricalErrorRate()); + } + + private double calcPenaltyChi2() { + if ( isLeaf() ) + return 0.0; + else { + final long[][] counts = new long[subnodes.size()][2]; + + int i = 0; + for ( RecalDatumNode subnode : subnodes ) { + counts[i][0] = subnode.getRecalDatum().getNumMismatches(); + counts[i][1] = subnode.getRecalDatum().getNumObservations(); + i++; + } + + final double chi2 = new ChiSquareTestImpl().chiSquare(counts); + +// StringBuilder x = new StringBuilder(); +// StringBuilder y = new StringBuilder(); +// for ( int k = 0; k < counts.length; k++) { +// if ( k != 0 ) { +// x.append(", "); +// y.append(", "); +// } +// x.append(counts[k][0]); +// y.append(counts[k][1]); +// } +// logger.info("x = c(" + x.toString() + ")"); +// logger.info("y = c(" + y.toString() + ")"); +// logger.info("chi2 = " + chi2); + + return chi2; + //return Math.log10(chi2); + } + } + /** * Calculate the penalty of this interval, given the overall error rate for the interval * @@ -119,7 +194,7 @@ public class RecalDatumNode { */ @Requires("globalErrorRate >= 0.0") @Ensures("result >= 0.0") - private double calcPenalty(final double globalErrorRate) { + private double calcPenaltyLog10(final double globalErrorRate) { if ( globalErrorRate == 0.0 ) // there were no observations, so there's no penalty return 0.0; @@ -136,7 +211,7 @@ public class RecalDatumNode { } else { double sum = 0; for ( final RecalDatumNode hrd : subnodes) - sum += hrd.calcPenalty(globalErrorRate); + sum += hrd.calcPenaltyLog10(globalErrorRate); return sum; } } @@ -173,17 +248,38 @@ public class RecalDatumNode { * @return */ private RecalDatumNode removeLowestPenaltyNode() { - final RecalDatumNode oneRemoved = removeFirstNodeWithPenalty(getMinNodePenalty()).getFirst(); + final Pair, Double> nodeToRemove = getMinPenaltyNode(); + logger.info("Removing " + nodeToRemove.getFirst() + " with penalty " + nodeToRemove.getSecond()); + + final Pair, Boolean> result = removeNode(nodeToRemove.getFirst()); + + if ( ! result.getSecond() ) + throw new IllegalStateException("Never removed any node!"); + + final RecalDatumNode oneRemoved = result.getFirst(); if ( oneRemoved == null ) throw new IllegalStateException("Removed our root node, wow, didn't expect that"); return oneRemoved; } - private Pair, Boolean> removeFirstNodeWithPenalty(final double penaltyToRemove) { - if ( getPenalty() == penaltyToRemove ) { - logger.info("Removing " + this + " with penalty " + penaltyToRemove); + private Pair, Double> getMinPenaltyNode() { + final double myValue = isLeaf() ? Double.MAX_VALUE : getPenalty(); + Pair, Double> maxNode = new Pair, Double>(this, myValue); + + for ( final RecalDatumNode sub : subnodes ) { + final Pair, Double> subFind = sub.getMinPenaltyNode(); + if ( subFind.getSecond() < maxNode.getSecond() ) { + maxNode = subFind; + } + } + + return maxNode; + } + + private Pair, Boolean> removeNode(final RecalDatumNode nodeToRemove) { + if ( this == nodeToRemove ) { if ( isLeaf() ) - throw new IllegalStateException("Trying to remove a leaf node from the tree! " + this + " " + penaltyToRemove); + throw new IllegalStateException("Trying to remove a leaf node from the tree! " + this + " " + nodeToRemove); // node is the thing we are going to remove, but without any subnodes final RecalDatumNode node = new RecalDatumNode(getRecalDatum(), fixedPenalty); return new Pair, Boolean>(node, true); @@ -200,7 +296,7 @@ public class RecalDatumNode { sub.add(sub1); } else { // haven't removed anything yet, so try - final Pair, Boolean> maybeRemoved = sub1.removeFirstNodeWithPenalty(penaltyToRemove); + final Pair, Boolean> maybeRemoved = sub1.removeNode(nodeToRemove); removedSomething = maybeRemoved.getSecond(); sub.add(maybeRemoved.getFirst()); } From dad9bb11920b0877e6bd31cbede5f030a62d1d62 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 30 Jul 2012 15:46:00 -0400 Subject: [PATCH 11/43] Changes order of writing BaseRecalibrator results so that if R blows up you still get a meaningful tree --- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index e95a0e6c1..ddd75e232 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -275,13 +275,16 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed public void onTraversalDone(Long result) { logger.info("Calculating quantized quality scores..."); quantizeQualityScores(); + + logger.info("Writing recalibration report..."); + generateReport(); + logger.info("...done!"); + if (!RAC.NO_PLOTS) { logger.info("Generating recalibration plots..."); generatePlots(); } - logger.info("Writing recalibration report..."); - generateReport(); - logger.info("...done!"); + logger.info("Processed: " + result + " sites"); } From 762a3d9b50425deefcf20fc7798a5278522b9e23 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 30 Jul 2012 15:47:36 -0400 Subject: [PATCH 12/43] Move BQSR.R to utils/recalibration in R --- .../sting/{gatk/walkers/bqsr => utils/recalibration}/BQSR.R | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename public/R/scripts/org/broadinstitute/sting/{gatk/walkers/bqsr => utils/recalibration}/BQSR.R (100%) diff --git a/public/R/scripts/org/broadinstitute/sting/gatk/walkers/bqsr/BQSR.R b/public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R similarity index 100% rename from public/R/scripts/org/broadinstitute/sting/gatk/walkers/bqsr/BQSR.R rename to public/R/scripts/org/broadinstitute/sting/utils/recalibration/BQSR.R From f7133ffc319ec56a6a19890d6526f3307972d84b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 31 Jul 2012 08:10:51 -0400 Subject: [PATCH 13/43] Cleanup syntax errors from BQSR reorganization --- .../src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 895de3578..4b56ce309 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -30,12 +30,12 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.walkers.bqsr.EventType; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.recalibration.EventType; import java.util.ArrayList; import java.util.Arrays; From 10111450aa20010fb16301006a471ba399f9ae8a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 31 Jul 2012 15:37:22 -0400 Subject: [PATCH 16/43] Fixed AlignmentUtils bug for handling Ns in the CIGAR string. Added a UG integration test that calls a BAM with such reads (provided by a user on GetSatisfaction). --- .../sting/utils/sam/AlignmentUtils.java | 9 +++------ .../UnifiedGenotyperIntegrationTest.java | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 4b56ce309..2c388a1e0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -476,7 +476,6 @@ public class AlignmentUtils { } break; case D: - case N: if (!isDeletion) { alignmentPos += elementLength; } else { @@ -498,6 +497,7 @@ public class AlignmentUtils { break; case H: case P: + case N: break; default: throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); @@ -516,16 +516,13 @@ public class AlignmentUtils { final int elementLength = ce.getLength(); switch (ce.getOperator()) { - case I: - case S: - break; case D: case N: - alignmentLength += elementLength; - break; case M: alignmentLength += elementLength; break; + case I: + case S: case H: case P: break; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index f35eb4404..f26486a71 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -373,13 +373,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- // - // testing SnpEff + // testing MinIndelFraction // // -------------------------------------------------------------------------------------------------------------- final static String assessMinIndelFraction = baseCommandIndelsb37 + " -I " + validationDataLocation + "978604.bam -L 1:978,586-978,626 -o %s --sites_only -rf Sample -goodSM 7377 -goodSM 22-0022 -goodSM 134 -goodSM 344029-53 -goodSM 14030"; - + @Test public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -403,4 +403,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); executeTest("test minIndelFraction 1.0", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing Ns in CIGAR + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testNsInCigar() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "testWithNs.bam", 1, + Arrays.asList("22c9fd65ce3298bd7fbf400c9c209f29")); + executeTest("test calling on reads with Ns in CIGAR", spec); + } } From ab53d73459e58032567ab597a871598aab0280fd Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 31 Jul 2012 15:50:32 -0400 Subject: [PATCH 17/43] Quick fix to user error catching --- .../src/org/broadinstitute/sting/gatk/CommandLineGATK.java | 4 ++-- .../walkers/genotyper/UnifiedGenotyperIntegrationTest.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 306ebdd0e..312d31727 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -130,12 +130,12 @@ public class CommandLineGATK extends CommandLineExecutable { // can't close tribble index when writing if ( message.indexOf("Unable to close index for") != -1 ) - exitSystemWithUserError(new UserException(t.getCause().getMessage())); + exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage())); // disk is full if ( message.indexOf("No space left on device") != -1 ) exitSystemWithUserError(new UserException(t.getMessage())); - if ( t.getCause().getMessage().indexOf("No space left on device") != -1 ) + if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 ) exitSystemWithUserError(new UserException(t.getCause().getMessage())); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index f26486a71..d976e3e22 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -413,7 +413,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "testWithNs.bam", 1, + "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1, Arrays.asList("22c9fd65ce3298bd7fbf400c9c209f29")); executeTest("test calling on reads with Ns in CIGAR", spec); } From 6cb10cef9638fe4d57cfe6168cb11fa3be8c2230 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 31 Jul 2012 16:00:36 -0400 Subject: [PATCH 18/43] Fixed older GS reported bug. Actually, the problem really lies in Picard (can't set max records in RAM without it throwing an exception, reported on their JIRA) so I just masked out the problem by removing this never-used argument from this rarely-used tool. --- .../gatk/io/storage/SAMFileWriterStorage.java | 1 + .../sting/gatk/walkers/indels/LeftAlignIndels.java | 14 +------------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java index cb8786be1..300e801e6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java @@ -62,6 +62,7 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage { @Output(required=false, doc="Output bam") protected StingSAMFileWriter writer = null; - /** - * If set too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool - * may run out of memory. We recommend that you additionally tell Java to use a temp directory with plenty of available - * space (by setting java.io.tempdir on the command-line). - */ - @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the output writer", required=false) - protected int MAX_RECORDS_IN_RAM = 500000; - - public void initialize() { - // set up the output writer - if ( writer != null ) - writer.setMaxRecordsInRam(MAX_RECORDS_IN_RAM); - } + public void initialize() {} private void emit(final SAMRecord read) { if ( writer != null ) From 4a23f3cd110d59f1f2cd1e82b72c15c7fdda0f2a Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Tue, 31 Jul 2012 16:34:20 -0400 Subject: [PATCH 19/43] Simple cleanup of pool caller code - since usage is much more general than just calling pools, AF calculation models and GL calculation models are renamed from Pool -> GeneralPloidy. Also, don't have users specify special arguments for -glm and -pnrm. Instead, when running UG with sample ploidy != 2, the correct general ploidy modules are automatically detected and loaded. -glm now reverts to old [SNP|INDEL|BOTH] usage --- ...GeneralPloidyExactAFCalculationModel.java} | 22 +++++++-------- ... => GeneralPloidyGenotypeLikelihoods.java} | 14 +++++----- ...yGenotypeLikelihoodsCalculationModel.java} | 14 +++++----- ...eneralPloidyIndelGenotypeLikelihoods.java} | 18 ++++++------- ...lGenotypeLikelihoodsCalculationModel.java} | 10 +++---- ... GeneralPloidySNPGenotypeLikelihoods.java} | 8 +++--- ...PGenotypeLikelihoodsCalculationModel.java} | 8 +++--- ...eralPloidyAFCalculationModelUnitTest.java} | 6 ++--- ...ralPloidyGenotypeLikelihoodsUnitTest.java} | 27 +++++++++---------- ...enotyperGeneralPloidyIntegrationTest.java} | 16 +++++------ .../AlleleFrequencyCalculationModel.java | 3 +-- .../GenotypeLikelihoodsCalculationModel.java | 7 +++-- .../walkers/genotyper/UnifiedGenotyper.java | 3 ++- .../genotyper/UnifiedGenotyperEngine.java | 27 ++++++++++++------- 14 files changed, 93 insertions(+), 90 deletions(-) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolAFCalculationModel.java => GeneralPloidyExactAFCalculationModel.java} (95%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolGenotypeLikelihoods.java => GeneralPloidyGenotypeLikelihoods.java} (97%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolGenotypeLikelihoodsCalculationModel.java => GeneralPloidyGenotypeLikelihoodsCalculationModel.java} (94%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolIndelGenotypeLikelihoods.java => GeneralPloidyIndelGenotypeLikelihoods.java} (92%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolIndelGenotypeLikelihoodsCalculationModel.java => GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java} (90%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolSNPGenotypeLikelihoods.java => GeneralPloidySNPGenotypeLikelihoods.java} (97%) rename protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolSNPGenotypeLikelihoodsCalculationModel.java => GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java} (91%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolAFCalculationModelUnitTest.java => GeneralPloidyAFCalculationModelUnitTest.java} (96%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolGenotypeLikelihoodsUnitTest.java => GeneralPloidyGenotypeLikelihoodsUnitTest.java} (93%) rename protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/{PoolCallerIntegrationTest.java => UnifiedGenotyperGeneralPloidyIntegrationTest.java} (88%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java similarity index 95% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java index b8be24cad..ba19638e0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.io.PrintStream; import java.util.*; -public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { +public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel { static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them final protected UnifiedArgumentCollection UAC; @@ -42,7 +42,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static boolean VERBOSE = false; - protected PoolAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { + protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); ploidy = UAC.samplePloidy; this.UAC = UAC; @@ -140,7 +140,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { for ( final double[] likelihoods : GLs ) { final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); - final int[] acCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(1+numOriginalAltAlleles,ploidy,PLindexOfBestGL); + final int[] acCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(1 + numOriginalAltAlleles, ploidy, PLindexOfBestGL); // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele for (int k=1; k < acCount.length;k++) { if (acCount[k] > 0) @@ -238,7 +238,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { return newPool; } - // todo - refactor, function almost identical except for log10LofK computation in PoolGenotypeLikelihoods + // todo - refactor, function almost identical except for log10LofK computation in GeneralPloidyGenotypeLikelihoods /** * * @param set ExactACset holding conformation to be computed @@ -301,7 +301,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { continue; - PoolGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset); + GeneralPloidyGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset); } @@ -341,14 +341,14 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { // Say L1(K) = Pr(D|AC1=K) * choose(m1,K) // and L2(K) = Pr(D|AC2=K) * choose(m2,K) - PoolGenotypeLikelihoods.SumIterator firstIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); + GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); final double[] x = originalPool.getLikelihoodsAsVector(true); while(firstIterator.hasNext()) { x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); firstIterator.next(); } - PoolGenotypeLikelihoods.SumIterator secondIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); + GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); final double[] y = yy.clone(); while(secondIterator.hasNext()) { y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); @@ -357,7 +357,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { // initialize output to -log10(choose(m1+m2,[k1 k2...]) final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); - final PoolGenotypeLikelihoods.SumIterator outputIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); + final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K @@ -419,7 +419,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount); // for current conformation, get all possible ways to break vector K into two components G1 and G2 - final PoolGenotypeLikelihoods.SumIterator innerIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); + final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY; while (innerIterator.hasNext()) { // check if breaking current conformation into g1 and g2 is feasible. @@ -617,7 +617,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { if ( numOriginalAltAlleles == numNewAltAlleles) { newLikelihoods = originalLikelihoods; } else { - newLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(),allelesToUse); + newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); // might need to re-normalize newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true); @@ -668,7 +668,7 @@ public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel { // find the genotype with maximum likelihoods final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods); - final int[] mlAlleleCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex); + final int[] mlAlleleCount = GeneralPloidyGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex); final ArrayList alleleFreqs = new ArrayList(); final ArrayList alleleCounts = new ArrayList(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java similarity index 97% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java index 438acbacd..6b0831323 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java @@ -37,7 +37,7 @@ import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import java.util.*; -public abstract class PoolGenotypeLikelihoods { +public abstract class GeneralPloidyGenotypeLikelihoods { protected final int numChromosomes; private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 @@ -67,8 +67,8 @@ public abstract class PoolGenotypeLikelihoods { private static final boolean FAST_GL_COMPUTATION = true; // constructor with given logPL elements - public PoolGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy, - final HashMap perLaneErrorModels, final boolean ignoreLaneInformation) { + public GeneralPloidyGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy, + final HashMap perLaneErrorModels, final boolean ignoreLaneInformation) { this.alleles = alleles; this.nAlleles = alleles.size(); numChromosomes = ploidy; @@ -101,7 +101,7 @@ public abstract class PoolGenotypeLikelihoods { Arrays.fill(log10Likelihoods, MIN_LIKELIHOOD); } else { if (logLikelihoods.length != likelihoodDim) - throw new ReviewedStingException("BUG: inconsistent parameters when creating PoolGenotypeLikelihoods object"); + throw new ReviewedStingException("BUG: inconsistent parameters when creating GeneralPloidyGenotypeLikelihoods object"); log10Likelihoods = logLikelihoods; //.clone(); // is clone needed? } @@ -174,7 +174,7 @@ public abstract class PoolGenotypeLikelihoods { final int numAlleles = currentState.length; final int ploidy = restrictSumTo; - linearIndex = PoolGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy); + linearIndex = GeneralPloidyGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy); } else throw new ReviewedStingException("BUG: Not supported"); @@ -308,7 +308,7 @@ public abstract class PoolGenotypeLikelihoods { public static double[] subsetToAlleles(final double[] oldLikelihoods, final int numChromosomes, final List originalAlleles, final List allelesToSubset) { - int newPLSize = PoolGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes); + int newPLSize = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes); double[] newPLs = new double[newPLSize]; @@ -357,7 +357,7 @@ public abstract class PoolGenotypeLikelihoods { newCount[idx] = pVec[permutationKey[idx]]; // get corresponding index from new count - int outputIdx = PoolGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes); + int outputIdx = GeneralPloidyGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes); newPLs[outputIdx] = pl; if (VERBOSE) { System.out.println("Old Key:"+Arrays.toString(pVec)); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java similarity index 94% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java index 685091678..f6ce818be 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java @@ -39,7 +39,7 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; -public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { +public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { //protected Set laneIDs; public enum Model { @@ -52,7 +52,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi final protected UnifiedArgumentCollection UAC; - protected PoolGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { + protected GeneralPloidyGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC,logger); this.UAC = UAC; @@ -137,11 +137,11 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi protected static class PoolGenotypeData { public final String name; - public final PoolGenotypeLikelihoods GL; + public final GeneralPloidyGenotypeLikelihoods GL; public final int depth; public final List alleles; - public PoolGenotypeData(final String name, final PoolGenotypeLikelihoods GL, final int depth, final List alleles) { + public PoolGenotypeData(final String name, final GeneralPloidyGenotypeLikelihoods GL, final int depth, final List alleles) { this.name = name; this.GL = GL; this.depth = depth; @@ -236,7 +236,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); // create the GenotypeLikelihoods object - final PoolGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO); + final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO); // actually compute likelihoods final int nGoodBases = GL.add(pileup, UAC); if ( nGoodBases > 0 ) @@ -268,7 +268,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi for ( PoolGenotypeData sampleData : GLs ) { // extract from multidimensional array - final double[] myLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(),sampleData.GL.numChromosomes, + final double[] myLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(), sampleData.GL.numChromosomes, allAlleles, alleles); // normalize in log space so that max element is zero. @@ -327,7 +327,7 @@ public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLi Abstract methods - must be implemented in derived classes */ - protected abstract PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles, + protected abstract GeneralPloidyGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles, final double[] logLikelihoods, final int ploidy, final HashMap perLaneErrorModels, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java similarity index 92% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoods.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java index 33b7b8b90..4f42f820e 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java @@ -18,7 +18,7 @@ import java.util.*; * Time: 10:06 AM * To change this template use File | Settings | File Templates. */ -public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods { +public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotypeLikelihoods { final PairHMMIndelErrorModel pairModel; final LinkedHashMap haplotypeMap; final ReferenceContext refContext; @@ -27,14 +27,14 @@ public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods { final byte refBase; - public PoolIndelGenotypeLikelihoods(final List alleles, - final double[] logLikelihoods, - final int ploidy, - final HashMap perLaneErrorModels, - final boolean ignoreLaneInformation, - final PairHMMIndelErrorModel pairModel, - final LinkedHashMap haplotypeMap, - final ReferenceContext referenceContext) { + public GeneralPloidyIndelGenotypeLikelihoods(final List alleles, + final double[] logLikelihoods, + final int ploidy, + final HashMap perLaneErrorModels, + final boolean ignoreLaneInformation, + final PairHMMIndelErrorModel pairModel, + final LinkedHashMap haplotypeMap, + final ReferenceContext referenceContext) { super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation); this.pairModel = pairModel; this.haplotypeMap = haplotypeMap; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java similarity index 90% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java index 1fef76116..5eeadddae 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java @@ -32,13 +32,11 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; -public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel { +public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends GeneralPloidyGenotypeLikelihoodsCalculationModel { private static final int MAX_NUM_ALLELES_TO_GENOTYPE = 4; private PairHMMIndelErrorModel pairModel; @@ -59,7 +57,7 @@ public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLi } */ - protected PoolIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) { + protected GeneralPloidyIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) { super(UAC, logger); @@ -69,14 +67,14 @@ public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLi } - protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles, + protected GeneralPloidyGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles, final double[] logLikelihoods, final int ploidy, final HashMap perLaneErrorModels, final boolean useBQAedPileup, final ReferenceContext ref, final boolean ignoreLaneInformation){ - return new PoolIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref); + return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref); } protected List getInitialAllelesToUse(final RefMetaDataTracker tracker, diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java similarity index 97% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java index f763392ae..944372907 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java @@ -23,7 +23,7 @@ import static java.lang.Math.pow; * and posteriors given a pile of bases and quality scores * */ -public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implements Cloneable*/ { +public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLikelihoods/* implements Cloneable*/ { final List myAlleles; final int[] alleleIndices; @@ -42,8 +42,8 @@ public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implem * @param useBQAedPileup Use BAQed pileup * @param ignoreLaneInformation If true, lane info is ignored */ - public PoolSNPGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy, - final HashMap perLaneErrorModels, final boolean useBQAedPileup,final boolean ignoreLaneInformation) { + public GeneralPloidySNPGenotypeLikelihoods(final List alleles, final double[] logLikelihoods, final int ploidy, + final HashMap perLaneErrorModels, final boolean useBQAedPileup, final boolean ignoreLaneInformation) { super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation); this.useBAQedPileup = useBQAedPileup; @@ -52,7 +52,7 @@ public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implem Allele refAllele = alleles.get(0); //sanity check: by construction, first allele should ALWAYS be the reference alleles if (!refAllele.isReference()) - throw new ReviewedStingException("BUG: First allele in list passed to PoolSNPGenotypeLikelihoods should be reference!"); + throw new ReviewedStingException("BUG: First allele in list passed to GeneralPloidySNPGenotypeLikelihoods should be reference!"); refByte = refAllele.getBases()[0]; // by construction, first allele in list is always ref! diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java similarity index 91% rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoodsCalculationModel.java rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java index 61f505445..30d614455 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java @@ -35,22 +35,22 @@ import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; -public class PoolSNPGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel { +public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends GeneralPloidyGenotypeLikelihoodsCalculationModel { - protected PoolSNPGenotypeLikelihoodsCalculationModel( UnifiedArgumentCollection UAC, Logger logger) { + protected GeneralPloidySNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); } - protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles, + protected GeneralPloidyGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List alleles, final double[] logLikelihoods, final int ploidy, final HashMap perLaneErrorModels, final boolean useBQAedPileup, final ReferenceContext ref, final boolean ignoreLaneInformation) { - return new PoolSNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO); + return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO); } protected List getInitialAllelesToUse(final RefMetaDataTracker tracker, diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java similarity index 96% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModelUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java index 5a6f7df0f..983f562d2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolAFCalculationModelUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java @@ -19,7 +19,7 @@ import java.util.Arrays; * Time: 7:44 AM * To change this template use File | Settings | File Templates. */ -public class PoolAFCalculationModelUnitTest extends BaseTest { +public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest { static double[] AA1, AB1, BB1; static double[] AA2, AB2, AC2, BB2, BC2, CC2; @@ -138,10 +138,10 @@ public class PoolAFCalculationModelUnitTest extends BaseTest { public void testGLs(GetGLsTest cfg) { final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles); - final int len = PoolGenotypeLikelihoods.getNumLikelihoodElements(1+cfg.numAltAlleles,cfg.ploidy*cfg.GLs.size()); + final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); double[] priors = new double[len]; // flat priors - PoolAFCalculationModel.combineSinglePools(cfg.GLs, 1+cfg.numAltAlleles, cfg.ploidy, priors, result); + GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result); int nameIndex = 1; for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java similarity index 93% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java index 5d78dd248..f95ba66b2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsUnitTest.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; @@ -42,7 +41,7 @@ import java.io.PrintStream; import java.util.*; -public class PoolGenotypeLikelihoodsUnitTest { +public class GeneralPloidyGenotypeLikelihoodsUnitTest { final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); final Logger logger = Logger.getLogger(Walker.class); @@ -61,7 +60,7 @@ public class PoolGenotypeLikelihoodsUnitTest { public void testStoringLikelihoodElements() { - // basic test storing a given PL vector in a PoolGenotypeLikelihoods object and then retrieving it back + // basic test storing a given PL vector in a GeneralPloidyGenotypeLikelihoods object and then retrieving it back int ploidy = 20; int numAlleles = 4; @@ -79,7 +78,7 @@ public class PoolGenotypeLikelihoodsUnitTest { for (int k=0; k < gls.length; k++) gls[k]= (double)k; - PoolGenotypeLikelihoods gl = new PoolSNPGenotypeLikelihoods(alleles, gls,ploidy, null, false,true); + GeneralPloidyGenotypeLikelihoods gl = new GeneralPloidySNPGenotypeLikelihoods(alleles, gls,ploidy, null, false,true); double[] glnew = gl.getLikelihoods(); Assert.assertEquals(gls, glnew); @@ -91,7 +90,7 @@ public class PoolGenotypeLikelihoodsUnitTest { for (int ploidy = 2; ploidy < 10; ploidy++) { for (int nAlleles = 2; nAlleles < 10; nAlleles++) - Assert.assertEquals(PoolGenotypeLikelihoods.getNumLikelihoodElements(nAlleles,ploidy), + Assert.assertEquals(GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(nAlleles, ploidy), GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy)); } @@ -103,7 +102,7 @@ public class PoolGenotypeLikelihoodsUnitTest { // create iterator, compare linear index given by iterator with closed form function int numAlleles = 4; int ploidy = 2; - PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles, ploidy); + GeneralPloidyGenotypeLikelihoods.SumIterator iterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles, ploidy); while(iterator.hasNext()) { System.out.format("\n%d:",iterator.getLinearIndex()); @@ -112,7 +111,7 @@ public class PoolGenotypeLikelihoodsUnitTest { System.out.format("%d ",aa); - int computedIdx = PoolGenotypeLikelihoods.getLinearIndex(a, numAlleles, ploidy); + int computedIdx = GeneralPloidyGenotypeLikelihoods.getLinearIndex(a, numAlleles, ploidy); System.out.format("Computed idx = %d\n",computedIdx); iterator.next(); } @@ -141,7 +140,7 @@ public class PoolGenotypeLikelihoodsUnitTest { allelesToSubset.add(Allele.create("A",false)); allelesToSubset.add(Allele.create("C",false)); - double[] newGLs = PoolGenotypeLikelihoods.subsetToAlleles(oldLikelihoods, ploidy, + double[] newGLs = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(oldLikelihoods, ploidy, originalAlleles, allelesToSubset); @@ -171,7 +170,7 @@ public class PoolGenotypeLikelihoodsUnitTest { @Test public void testIndexIterator() { int[] seed = new int[]{1,2,3,4}; - PoolGenotypeLikelihoods.SumIterator iterator = runIterator(seed,-1); + GeneralPloidyGenotypeLikelihoods.SumIterator iterator = runIterator(seed,-1); // Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed)); Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1); @@ -229,12 +228,12 @@ public class PoolGenotypeLikelihoodsUnitTest { } - private PoolGenotypeLikelihoods.SumIterator runIterator(int[] seed, int restrictSumTo) { - PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(seed, restrictSumTo); + private GeneralPloidyGenotypeLikelihoods.SumIterator runIterator(int[] seed, int restrictSumTo) { + GeneralPloidyGenotypeLikelihoods.SumIterator iterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(seed, restrictSumTo); while(iterator.hasNext()) { int[] a = iterator.getCurrentVector(); - int idx = PoolGenotypeLikelihoods.getLinearIndex(a, a.length, restrictSumTo); + int idx = GeneralPloidyGenotypeLikelihoods.getLinearIndex(a, a.length, restrictSumTo); if (VERBOSE) { System.out.format("%d:",iterator.getLinearIndex()); for (int i=0; i < seed.length; i++) @@ -454,7 +453,7 @@ public class PoolGenotypeLikelihoodsUnitTest { // get now likelihoods for this - final PoolSNPGenotypeLikelihoods GL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noiselessErrorModels, false, true); + final GeneralPloidySNPGenotypeLikelihoods GL = new GeneralPloidySNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noiselessErrorModels, false, true); final int nGoodBases = GL.add(alignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE); if (VERBOSE) { System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\nGLs:", depth,ac,altDepth, nSamplesPerPool); @@ -483,7 +482,7 @@ public class PoolGenotypeLikelihoodsUnitTest { // get now likelihoods for this - final PoolSNPGenotypeLikelihoods noisyGL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noisyErrorModels, false,true); + final GeneralPloidySNPGenotypeLikelihoods noisyGL = new GeneralPloidySNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noisyErrorModels, false,true); noisyGL.add(noisyAlignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE); mlPair = noisyGL.getMostLikelyACCount(); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java similarity index 88% rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index acf95c906..9b3103274 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -12,7 +12,7 @@ import org.testng.annotations.Test; * Time: 11:28 AM * To change this template use File | Settings | File Templates. */ -public class PoolCallerIntegrationTest extends WalkerTest { +public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { final static String REF = b37KGReference; final String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list"; final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam"; @@ -24,21 +24,21 @@ public class PoolCallerIntegrationTest extends WalkerTest { final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf"; final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf"; private void PC_MT_Test(String bam, String args, String name, String md5) { - final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL", + final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -ignoreLane ", REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } private void PC_LSV_Test(String args, String name, String model, String md5) { - final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL", + final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane ", REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) { - final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL", + final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane", REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); @@ -46,22 +46,22 @@ public class PoolCallerIntegrationTest extends WalkerTest { @Test public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","d76e3b910259da819f1e1b2adc68ba8d"); + PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","d76e3b910259da819f1e1b2adc68ba8d"); } @Test public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","ffadcdaee613dab975197bed0fc78da3"); + PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","ffadcdaee613dab975197bed0fc78da3"); } @Test public void testINDEL_maxAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","POOLINDEL","96087fe9240e3656cc2a4e0ff0174d5b"); + PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","96087fe9240e3656cc2a4e0ff0174d5b"); } @Test public void testINDEL_maxAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","POOLINDEL","6fdae7093831ecfc82a06dd707d62fe9"); + PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","6fdae7093831ecfc82a06dd707d62fe9"); } @Test diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 432bbd6d7..08a333486 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -46,8 +46,7 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { public enum Model { /** The default model with the best performance in all cases */ - EXACT, - POOL + EXACT } protected int N; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 4253ff3ad..6fdc926d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -59,10 +59,9 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { public enum Model { SNP, INDEL, - BOTH, - POOLSNP, - POOLINDEL, - POOLBOTH + GeneralPloidySNP, + GeneralPloidyINDEL, + BOTH } public enum GENOTYPING_MODE { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 6b279fd95..c1c1339f5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -241,7 +241,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif } else { // in full mode: check for consistency in ploidy/pool calling arguments // check for correct calculation models - if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) { +/* if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) { // polyploidy requires POOL GL and AF calculation models to be specified right now if (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLSNP && UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLINDEL && UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLBOTH) { @@ -252,6 +252,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2"); } + */ // get all of the unique sample names if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) { samples.clear(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index f73ab2471..f4bd196ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -50,6 +50,7 @@ import java.util.*; public class UnifiedGenotyperEngine { public static final String LOW_QUAL_FILTER_NAME = "LowQual"; + private static final String GPSTRING = "GeneralPloidy"; public static final String NUMBER_OF_DISCOVERED_ALLELES_KEY = "NDA"; @@ -273,7 +274,7 @@ public class UnifiedGenotyperEngine { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model.name()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); + return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -640,6 +641,9 @@ public class UnifiedGenotyperEngine { if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) modelPrefix = UAC.GLmodel.name().toUpperCase().replaceAll("BOTH",""); + if (!UAC.GLmodel.name().contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) + modelPrefix = GPSTRING + modelPrefix; + // if we're genotyping given alleles and we have a requested SNP at this position, do SNP if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { final VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, refContext, rawContext.getLocation(), false, logger, UAC.alleles); @@ -648,17 +652,13 @@ public class UnifiedGenotyperEngine { if ( vcInput.isSNP() ) { // ignore SNPs if the user chose INDEL mode only - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") || UAC.GLmodel.name().toUpperCase().contains("SNP") ) models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"SNP")); - else if ( UAC.GLmodel.name().toUpperCase().contains("SNP") ) - models.add(UAC.GLmodel); - } + } else if ( vcInput.isIndel() || vcInput.isMixed() ) { // ignore INDELs if the user chose SNP mode only - if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") ) + if ( UAC.GLmodel.name().toUpperCase().contains("BOTH") || UAC.GLmodel.name().toUpperCase().contains("INDEL") ) models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); - else if (UAC.GLmodel.name().toUpperCase().contains("INDEL")) - models.add(UAC.GLmodel); } // No support for other types yet } @@ -668,7 +668,7 @@ public class UnifiedGenotyperEngine { models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+"INDEL")); } else { - models.add(UAC.GLmodel); + models.add(GenotypeLikelihoodsCalculationModel.Model.valueOf(modelPrefix+UAC.GLmodel.name().toUpperCase())); } } @@ -730,12 +730,19 @@ public class UnifiedGenotyperEngine { } private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { + List> afClasses = new PluginManager(AlleleFrequencyCalculationModel.class).getPlugins(); + // user-specified name + String afModelName = UAC.AFmodel.name(); + + if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) + afModelName = GPSTRING + afModelName; + for (int i = 0; i < afClasses.size(); i++) { Class afClass = afClasses.get(i); String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase(); - if (UAC.AFmodel.name().equalsIgnoreCase(key)) { + if (afModelName.equalsIgnoreCase(key)) { try { Object args[] = new Object[]{UAC,N,logger,verboseWriter}; Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class); From 7cf4b63d763ebbecac23676da19d6328abe45359 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 1 Aug 2012 09:23:04 -0400 Subject: [PATCH 20/43] Disabling indel quals in BaseRecalibrator as it should be, not PrintReads. --- .../sting/queue/qscripts/DataProcessingPipeline.scala | 2 +- .../sting/queue/qscripts/PacbioProcessingPipeline.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 56e2ddebb..56f6460fb 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -347,6 +347,7 @@ class DataProcessingPipeline extends QScript { this.knownSites ++= qscript.dbSNP this.covariate ++= Seq("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "ContextCovariate") this.input_file :+= inBam + this.disable_indel_quals = true this.out = outRecalFile if (!defaultPlatform.isEmpty) this.default_platform = defaultPlatform if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString) @@ -359,7 +360,6 @@ class DataProcessingPipeline extends QScript { case class recal (inBam: File, inRecalFile: File, outBam: File) extends PrintReads with CommandLineGATKArgs { this.input_file :+= inBam this.BQSR = inRecalFile - this.disable_indel_quals = true this.baq = CalculationMode.CALCULATE_AS_NECESSARY this.out = outBam if (!qscript.intervalString.isEmpty) this.intervalsString ++= Seq(qscript.intervalString) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index b7246ca8f..a4a6636fe 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -166,6 +166,7 @@ class PacbioProcessingPipeline extends QScript { this.knownSites :+= dbSNP this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "ContextCovariate") this.input_file :+= inBam + this.disable_indel_quals = true this.out = outRecalFile this.analysisName = queueLogDir + outRecalFile + ".covariates" this.jobName = queueLogDir + outRecalFile + ".covariates" @@ -178,7 +179,6 @@ class PacbioProcessingPipeline extends QScript { this.input_file :+= inBam this.BQSR = inRecalFile this.out = outBam - this.disable_indel_quals = true this.isIntermediate = false this.analysisName = queueLogDir + outBam + ".recalibration" this.jobName = queueLogDir + outBam + ".recalibration" From 56f8afab971d4c67c4d07c8a270e9d2b697cb731 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 1 Aug 2012 09:50:00 -0400 Subject: [PATCH 21/43] Requested by Geraldine: adding a utility to register deprecated walkers (and the major version of the first release since they were removed) so that the User Error printed out for e.g. CountCovariates now states: Walker CountCovariates is no longer available in the GATK; it has been deprecated since version 2.0. --- .../sting/gatk/GenomeAnalysisEngine.java | 35 +++++++++++++++++++ .../sting/utils/exceptions/UserException.java | 6 ++-- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index eab9fde79..5d6fb75ed 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -274,6 +274,38 @@ public class GenomeAnalysisEngine { //return result; } + // TODO -- Let's move this to a utility class in unstable - but which one? + // ************************************************************************************** + // * Handle Deprecated Walkers * + // ************************************************************************************** + + // Mapping from walker name to major version number where the walker first disappeared + private static Map deprecatedGATKWalkers = new HashMap(); + static { + deprecatedGATKWalkers.put("CountCovariates", "2.0"); + deprecatedGATKWalkers.put("TableRecalibration", "2.0"); + } + + /** + * Utility method to check whether a given walker has been deprecated in a previous GATK release + * + * @param walkerName the walker class name (not the full package) to check + */ + public static boolean isDeprecatedWalker(final String walkerName) { + return deprecatedGATKWalkers.containsKey(walkerName); + } + + /** + * Utility method to check whether a given walker has been deprecated in a previous GATK release + * + * @param walkerName the walker class name (not the full package) to check + */ + public static String getDeprecatedMajorVersionNumber(final String walkerName) { + return deprecatedGATKWalkers.get(walkerName); + } + + // ************************************************************************************** + /** * Retrieves an instance of the walker based on the walker name. * @@ -287,6 +319,9 @@ public class GenomeAnalysisEngine { if ( isGATKLite() && GATKLiteUtils.isAvailableOnlyInFullGATK(walkerName) ) { e = new UserException.NotSupportedInGATKLite("the " + walkerName + " walker is available only in the full version of the GATK"); } + else if ( isDeprecatedWalker(walkerName) ) { + e = new UserException.DeprecatedWalker(walkerName, getDeprecatedMajorVersionNumber(walkerName)); + } throw e; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 9ce4e82a1..52da31362 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -316,9 +316,9 @@ public class UserException extends ReviewedStingException { - public static class MissingWalker extends UserException { - public MissingWalker(String walkerName, String message) { - super(String.format("Walker %s is not available: %s", walkerName, message)); + public static class DeprecatedWalker extends UserException { + public DeprecatedWalker(String walkerName, String version) { + super(String.format("Walker %s is no longer available in the GATK; it has been deprecated since version %s", walkerName, version)); } } From 05bf6e3726ad661289cda8e80e1889dd82422407 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 1 Aug 2012 10:27:00 -0400 Subject: [PATCH 22/43] Updating md5s in pipeline tests so that they finally pass --- .../sting/queue/pipeline/DataProcessingPipelineTest.scala | 4 ++-- .../sting/queue/pipeline/PacbioProcessingPipelineTest.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala index 724f35cf6..778e6c28a 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -41,7 +41,7 @@ class DataProcessingPipelineTest { " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf", " -test ", " -p " + projectName).mkString - spec.fileMD5s += testOut -> "0f0b32e94640a8d548b4c1134ad4c075" + spec.fileMD5s += testOut -> "021db3a52ee40d9193895213aaa05dd5" PipelineTest.executeTest(spec) } @@ -60,7 +60,7 @@ class DataProcessingPipelineTest { " -bwa /home/unix/carneiro/bin/bwa", " -bwape ", " -p " + projectName).mkString - spec.fileMD5s += testOut -> "6b4f13d22b45d7d617ee959fdc278ed2" + spec.fileMD5s += testOut -> "94e32ee974aa15d05ee7d3e99f2f8c8f" PipelineTest.executeTest(spec) } diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala index 6e4c5ed3e..62ee7ce90 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala @@ -40,7 +40,7 @@ class PacbioProcessingPipelineTest { " -blasr ", " -test ", " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf").mkString - spec.fileMD5s += testOut -> "2f2026882a2850bb14a858524158d5a8" + spec.fileMD5s += testOut -> "8b5f9e8e1ba5bd686e3084eaeea23889" PipelineTest.executeTest(spec) } } From a4a41458efc3b3745356b0ddc89fceee4cef2137 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 1 Aug 2012 10:33:41 -0400 Subject: [PATCH 23/43] Update docs of FastaAlternateReferenceMaker as promised in older GS thread --- .../sting/gatk/walkers/fasta/FastaAlternateReference.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java index 92549b821..28cfdd5cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java @@ -47,7 +47,8 @@ import java.util.List; *

* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. - * Note that if there are multiple variants at a site, it takes the first one seen. + * Note that if there are multiple variants at a site, it chooses one of them randomly. + * Also note that this tool works only for SNPs and for simple indels (but not for things like complex substitutions). * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). * *

Input

From 459832ee161f88631b56681a0dfdcc9fbe224cf9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 1 Aug 2012 10:45:04 -0400 Subject: [PATCH 24/43] Fixed bug in FastaAlternateReferenceMaker when input VCF has overlapping deletions as reported a while back on GS --- .../sting/gatk/walkers/fasta/FastaAlternateReference.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java index 28cfdd5cd..8fbd37e30 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java @@ -47,8 +47,10 @@ import java.util.List; *

* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. - * Note that if there are multiple variants at a site, it chooses one of them randomly. - * Also note that this tool works only for SNPs and for simple indels (but not for things like complex substitutions). + * Several important notes: + * 1) if there are multiple variants that start at a site, it chooses one of them randomly. + * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. + * 3) this tool works only for SNPs and for simple indels (but not for things like complex substitutions). * Reference bases for each interval will be output as a separate fasta sequence (named numerically in order). * *

Input

@@ -103,7 +105,7 @@ public class FastaAlternateReference extends FastaReference { String refBase = String.valueOf((char)ref.getBase()); // Check to see if we have a called snp - for ( VariantContext vc : tracker.getValues(variants) ) { + for ( VariantContext vc : tracker.getValues(variants, ref.getLocus()) ) { if ( vc.isFiltered() ) continue; From 01265f78e66b1f30069b5afe999e07e8d8cc3753 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Wed, 1 Aug 2012 10:50:00 -0400 Subject: [PATCH 25/43] Add sanity check and possible bug fix for forum user: if haplotypes cannot be created from given alleles when genotyping indels (e.g. too close to contig boundary, etc.) in pool mode, empty allele list, signifying site can't be genotyped --- .../PoolIndelGenotypeLikelihoodsCalculationModel.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java index c2bac4455..c87ed04a0 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolIndelGenotypeLikelihoodsCalculationModel.java @@ -105,6 +105,10 @@ public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLi haplotypeMap.clear(); } IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap); + + // sanity check: if haplotype map couldn't be created, clear allele list + if (haplotypeMap.isEmpty()) + alleles.clear(); return alleles; } From 22bf0528280630b1ae16e04a465d85a0b6e2922d Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 1 Aug 2012 13:17:16 -0400 Subject: [PATCH 26/43] Fixing BQSR GATK docs --- .../sting/gatk/walkers/bqsr/BaseRecalibrator.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index b2400a49d..06f42cbad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -58,7 +58,7 @@ import java.util.ArrayList; * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group, - * reported quality score, cycle, and dinucleotide). Since there is a large amount of data one can then calculate an empirical + * reported quality score, cycle, and context). Since there is a large amount of data one can then calculate an empirical * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations. * The output file is a table (of the several covariate values, num observations, num mismatches, empirical quality score). *

@@ -90,7 +90,7 @@ import java.util.ArrayList; *

Examples

*
  * java -Xmx4g -jar GenomeAnalysisTK.jar \
- *   -T BaseQualityScoreRecalibrator \
+ *   -T BaseRecalibrator \
  *   -I my_reads.bam \
  *   -R resources/Homo_sapiens_assembly18.fasta \
  *   -knownSites bundle/hg18/dbsnp_132.hg18.vcf \

From 4093909a56ad3366308da40656cf2317b594e7ab Mon Sep 17 00:00:00 2001
From: Ryan Poplin 
Date: Wed, 1 Aug 2012 14:30:24 -0400
Subject: [PATCH 27/43] Updating VQSR docs. Removing references to old best
 practices pages.

---
 .../sting/gatk/walkers/bqsr/BQSR.R            |  1 +
 .../ApplyRecalibration.java                   |  6 ++---
 .../VariantRecalibrator.java                  | 22 ++++---------------
 3 files changed, 7 insertions(+), 22 deletions(-)

diff --git a/public/R/scripts/org/broadinstitute/sting/gatk/walkers/bqsr/BQSR.R b/public/R/scripts/org/broadinstitute/sting/gatk/walkers/bqsr/BQSR.R
index fd301cd97..6c4dace1d 100644
--- a/public/R/scripts/org/broadinstitute/sting/gatk/walkers/bqsr/BQSR.R
+++ b/public/R/scripts/org/broadinstitute/sting/gatk/walkers/bqsr/BQSR.R
@@ -1,4 +1,5 @@
 library("ggplot2")
+library("tools") #For compactPDF in R 2.13+
 
 args <- commandArgs(TRUE)
 data <- read.csv(args[1])
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
index 744a094c6..073f56707 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java
@@ -58,14 +58,11 @@ import java.util.*;
  * to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
  * slightly lower quality level.
  *
- * 

- * See the GATK wiki for a tutorial and example recalibration accuracy plots. - * *

Input

*

* The input raw variants to be recalibrated. *

- * The recalibration table file in CSV format that was generated by the VariantRecalibrator walker. + * The recalibration table file in VCF format that was generated by the VariantRecalibrator walker. *

* The tranches file that was generated by the VariantRecalibrator walker. * @@ -82,6 +79,7 @@ import java.util.*; * --ts_filter_level 99.0 \ * -tranchesFile path/to/output.tranches \ * -recalFile path/to/output.recal \ + * -mode SNP \ * -o path/to/output.recalibrated.filtered.vcf *

* diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 2a424e15b..a2909d4ca 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -38,7 +38,6 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; @@ -48,7 +47,6 @@ import org.broadinstitute.sting.utils.io.Resource; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory; import java.io.File; import java.io.FileNotFoundException; @@ -73,15 +71,9 @@ import java.util.*; * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. * *

- * NOTE: Please see our best practices wiki page for our recommendations on which annotations to use for specific project designs. - * - *

* NOTE: In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). * See http://www.r-project.org for more info on how to download and install R. * - *

- * See the GATK wiki for a tutorial and example recalibration accuracy plots. - * *

Input

*

* The input raw variants to be recalibrated. @@ -90,7 +82,7 @@ import java.util.*; * *

Output

*

- * A recalibration table file in CSV format that is used by the ApplyRecalibration walker. + * A recalibration table file in VCF format that is used by the ApplyRecalibration walker. *

* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data. * @@ -102,8 +94,9 @@ import java.util.*; * -input NA12878.HiSeq.WGS.bwa.cleaned.raw.subset.b37.vcf \ * -resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.b37.sites.vcf \ * -resource:omni,known=false,training=true,truth=false,prior=12.0 1000G_omni2.5.b37.sites.vcf \ - * -resource:dbsnp,known=true,training=false,truth=false,prior=8.0 dbsnp_132.b37.vcf \ - * -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an MQ \ + * -resource:dbsnp,known=true,training=false,truth=false,prior=6.0 dbsnp_135.b37.vcf \ + * -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an InbreedingCoeff \ + * -mode SNP \ * -recalFile path/to/output.recal \ * -tranchesFile path/to/output.tranches \ * -rscriptFile path/to/output.plots.R @@ -187,9 +180,6 @@ public class VariantRecalibrator extends RodWalker Date: Wed, 1 Aug 2012 14:52:49 -0400 Subject: [PATCH 28/43] Another fix to VQSR docs --- .../gatk/walkers/variantrecalibration/VariantRecalibrator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index a2909d4ca..5e2125fb2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -63,7 +63,7 @@ import java.util.*; * The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set. * One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call. * The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship - * between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the the probability that a SNP is a true genetic + * between SNP call annotations (QD, MQ, HaplotypeScore, and ReadPosRankSum, for example) and the the probability that a SNP is a true genetic * variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided * as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive * error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the From 2b25df3d538ca4e1197786a500642c35a1b367b1 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Mon, 30 Jul 2012 10:19:57 -0400 Subject: [PATCH 29/43] Add removeProgramRecords argument * Add unit test for the removeProgramRecords --- .../sting/gatk/GenomeAnalysisEngine.java | 3 +- .../arguments/GATKArgumentCollection.java | 3 + .../gatk/datasources/reads/SAMDataSource.java | 21 +++++- .../reads/SAMDataSourceUnitTest.java | 73 +++++++++++++++++++ 4 files changed, 95 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 5d6fb75ed..228f8351a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -813,7 +813,8 @@ public class GenomeAnalysisEngine { getWalkerBAQQualityMode(), refReader, getBaseRecalibration(), - argCollection.defaultBaseQualities); + argCollection.defaultBaseQualities, + argCollection.removeProgramRecords); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 3fd3857c5..91a008cbc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -249,6 +249,9 @@ public class GATKArgumentCollection { @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT; + @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Should we remove program records from the SAM header", required = false) + public boolean removeProgramRecords = false; + @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 0fa4234b3..7f0a0c4c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -89,6 +89,11 @@ public class SAMDataSource { */ private final SAMFileReader.ValidationStringency validationStringency; + /** + * Do we want to remove the program records from this data source? + */ + private final boolean removeProgramRecords; + /** * Store BAM indices for each reader present. */ @@ -200,7 +205,8 @@ public class SAMDataSource { BAQ.QualityMode.DONT_MODIFY, null, // no BAQ null, // no BQSR - (byte) -1); + (byte) -1, + false); } /** @@ -233,7 +239,8 @@ public class SAMDataSource { BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, BaseRecalibration bqsrApplier, - byte defaultBaseQualities) { + byte defaultBaseQualities, + boolean removeProgramRecords) { this.readMetrics = new ReadMetrics(); this.genomeLocParser = genomeLocParser; @@ -249,6 +256,7 @@ public class SAMDataSource { dispatcher = null; validationStringency = strictness; + this.removeProgramRecords = removeProgramRecords; if(readBufferSize != null) ReadShard.setReadBufferSize(readBufferSize); else { @@ -748,7 +756,7 @@ public class SAMDataSource { private synchronized void createNewResource() { if(allResources.size() > maxEntries) throw new ReviewedStingException("Cannot create a new resource pool. All resources are in use."); - SAMReaders readers = new SAMReaders(readerIDs, validationStringency); + SAMReaders readers = new SAMReaders(readerIDs, validationStringency, removeProgramRecords); allResources.add(readers); availableResources.add(readers); } @@ -777,9 +785,11 @@ public class SAMDataSource { /** * Derive a new set of readers from the Reads metadata. * @param readerIDs reads to load. + * TODO: validationStringency is not used here * @param validationStringency validation stringency. + * @param removeProgramRecords indicate whether to clear program records from the readers */ - public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { + public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency, boolean removeProgramRecords) { final int totalNumberOfFiles = readerIDs.size(); int readerNumber = 1; final SimpleTimer timer = new SimpleTimer().start(); @@ -790,6 +800,9 @@ public class SAMDataSource { long lastTick = timer.currentTime(); for(final SAMReaderID readerID: readerIDs) { final ReaderInitializer init = new ReaderInitializer(readerID).call(); + if (removeProgramRecords) { + init.reader.getFileHeader().setProgramRecords(new ArrayList()); + } if (threadAllocation.getNumIOThreads() > 0) { inputStreams.put(init.readerID, init.blockInputStream); // get from initializer } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index 1c5dab254..f2c546317 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -24,9 +24,12 @@ package org.broadinstitute.sting.gatk.datasources.reads; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; import static org.testng.Assert.fail; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMProgramRecord; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; @@ -36,6 +39,7 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.AfterMethod; @@ -143,4 +147,73 @@ public class SAMDataSourceUnitTest extends BaseTest { fail("testLinearBreakIterateAll: We Should get a UserException.CouldNotReadInputFile exception"); } } + + /** Test that we clear program records when requested */ + @Test + public void testRemoveProgramRecords() { + logger.warn("Executing testRemoveProgramRecords"); + + // setup the data + readers.add(new SAMReaderID(new File(b37GoodBAM),new Tags())); + + // use defaults + SAMDataSource data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false); + + List defaultProgramRecords = data.getHeader().getProgramRecords(); + assertTrue(defaultProgramRecords.size() != 0, "testRemoveProgramRecords: No program records found when using default constructor"); + + boolean removeProgramRecords = false; + data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false, + BAQ.CalculationMode.OFF, + BAQ.QualityMode.DONT_MODIFY, + null, // no BAQ + null, // no BQSR + (byte) -1, + removeProgramRecords); + + List dontRemoveProgramRecords = data.getHeader().getProgramRecords(); + assertEquals(dontRemoveProgramRecords, defaultProgramRecords, "testRemoveProgramRecords: default program records differ from removeProgramRecords = false"); + + removeProgramRecords = true; + data = new SAMDataSource(readers, + new ThreadAllocation(), + null, + genomeLocParser, + false, + SAMFileReader.ValidationStringency.SILENT, + null, + null, + new ValidationExclusion(), + new ArrayList(), + false, + BAQ.CalculationMode.OFF, + BAQ.QualityMode.DONT_MODIFY, + null, // no BAQ + null, // no BQSR + (byte) -1, + removeProgramRecords); + + List doRemoveProgramRecords = data.getHeader().getProgramRecords(); + assertTrue(doRemoveProgramRecords.isEmpty(), "testRemoveProgramRecords: program records not cleared when removeProgramRecords = true"); + } } From ccac77d888f3a0c2cd5e3d8d2c0e7c13c21596af Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 1 Aug 2012 11:57:16 -0400 Subject: [PATCH 30/43] Bugfix for incorrect allele counting in IndelSummary -- Previous version would count all alt alleles as present in a sample, even if only 1 were present, because of the way VariantEval subsetted VCs -- Updated code for subsetting VCs by sample to be clearer about how it handles rederiving alleles -- Update a few pieces of code to get previous correct behavior -- Updated a few MD5s as now ref calls at sites in dbSNP are counted as having a comp sites, and therefore show up in known sites when Novelty strat is on (which I think is correct) -- Walkers that used old subsetting function with true are now using clearer version that does rederive alleles by default --- .../walkers/phasing/ReadBackedPhasing.java | 2 +- .../GLBasedSampleSelector.java | 2 +- .../GTBasedSampleSelector.java | 2 +- .../gatk/walkers/varianteval/VariantEval.java | 5 ++++- .../evaluators/TiTvVariantEvaluator.java | 9 ++++++--- .../varianteval/stratifications/Novelty.java | 2 +- .../varianteval/util/VariantEvalUtils.java | 12 +++++------- .../utils/variantcontext/VariantContext.java | 18 +++++++++++++++--- .../VariantEvalIntegrationTest.java | 6 +++--- .../VariantContextBenchmark.java | 2 +- 10 files changed, 38 insertions(+), 22 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index f16deb701..f49e8f8c0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -288,7 +288,7 @@ public class ReadBackedPhasing extends RodWalker samplesToPhase) { // for ( String sample : samplesToPhase ) // logger.debug(String.format(" Sample %s has genotype %s, het = %s", sample, vc.getGenotype(sample), vc.getGenotype(sample).isHet() )); - VariantContext subvc = vc.subContextFromSamples(samplesToPhase, true); + VariantContext subvc = vc.subContextFromSamples(samplesToPhase); // logger.debug("original VC = " + vc); // logger.debug("sub VC = " + subvc); return VariantContextUtils.pruneVariantContext(subvc, KEYS_TO_KEEP_IN_REDUCED_VCF); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java index e54dc6388..3e48520a7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GLBasedSampleSelector.java @@ -43,7 +43,7 @@ public class GLBasedSampleSelector extends SampleSelector { return true; // want to include a site in the given samples if it is *likely* to be variant (via the EXACT model) // first subset to the samples - VariantContext subContext = vc.subContextFromSamples(samples, true); + VariantContext subContext = vc.subContextFromSamples(samples); // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java index 0f55524a6..de832b108 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/validationsiteselector/GTBasedSampleSelector.java @@ -45,7 +45,7 @@ public class GTBasedSampleSelector extends SampleSelector{ if ( samples == null || samples.isEmpty() ) return true; - VariantContext subContext = vc.subContextFromSamples(samples, false); + VariantContext subContext = vc.subContextFromSamples(samples); if ( subContext.isPolymorphicInSamples() ) { return true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index 0b395bc62..58cd14737 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -500,7 +500,10 @@ public class VariantEval extends RodWalker implements TreeRedu @Requires({"eval != null", "comp != null"}) private EvalCompMatchType doEvalAndCompMatch(final VariantContext eval, final VariantContext comp, boolean requireStrictAlleleMatch) { - // find all of the matching comps + if ( comp.getType() == VariantContext.Type.NO_VARIATION || eval.getType() == VariantContext.Type.NO_VARIATION ) + // if either of these are NO_VARIATION they are LENIENT matches + return EvalCompMatchType.LENIENT; + if ( comp.getType() != eval.getType() ) return EvalCompMatchType.NO_MATCH; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index 6c4fcd26d..fe2437976 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -57,9 +57,12 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv } } - public void update2(VariantContext vc1, VariantContext vc2, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if (vc1 != null) updateTiTv(vc1, false); - if (vc2 != null) updateTiTv(vc2, true); + @Override + public void update2(VariantContext eval, VariantContext comp, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if (eval != null) + updateTiTv(eval, false); + if (comp != null) + updateTiTv(comp, true); } @Override diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java index 693bdf198..2ad08d806 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Novelty.java @@ -28,7 +28,7 @@ public class Novelty extends VariantStratifier implements StandardStratification final Collection knownComps = tracker.getValues(knowns, ref.getLocus()); for ( final VariantContext c : knownComps ) { // loop over sites, looking for something that matches the type eval - if ( eval.getType() == c.getType() ) { + if ( eval.getType() == c.getType() || eval.getType() == VariantContext.Type.NO_VARIATION ) { return KNOWN_STATES; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 3dcc1f85f..e84b0b10e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -197,7 +197,9 @@ public class VariantEvalUtils { * @return a new VariantContext with just the requested samples */ public VariantContext getSubsetOfVariantContext(VariantContext vc, Set sampleNames) { - return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, false)); + // if we want to preserve AC0 sites as polymorphic we need to not rederive alleles + final boolean deriveAlleles = variantEvalWalker.ignoreAC0Sites(); + return ensureAnnotations(vc, vc.subContextFromSamples(sampleNames, deriveAlleles)); } public VariantContext ensureAnnotations(final VariantContext vc, final VariantContext vcsub) { @@ -262,12 +264,8 @@ public class VariantEvalUtils { // First, filter the VariantContext to represent only the samples for evaluation VariantContext vcsub = vc; - if (subsetBySample && vc.hasGenotypes()) { - if ( variantEvalWalker.isSubsettingToSpecificSamples() ) - vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); - else - vcsub = ensureAnnotations(vc, vc); - } + if (subsetBySample && vc.hasGenotypes()) + vcsub = getSubsetOfVariantContext(vc, variantEvalWalker.getSampleNamesForEvaluation()); if ((byFilter || !vcsub.isFiltered())) { addMapping(mapping, VariantEval.getAllSampleName(), vcsub); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 979400350..2211cfe5e 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -334,12 +334,14 @@ public class VariantContext implements Feature { // to enable tribble integratio * in this VC is returned as the set of alleles in the subContext, even if * some of those alleles aren't in the samples * + * WARNING: BE CAREFUL WITH rederiveAllelesFromGenotypes UNLESS YOU KNOW WHAT YOU ARE DOING? + * * @param sampleNames the sample names - * @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples + * @param rederiveAllelesFromGenotypes if true, returns the alleles to just those in use by the samples, true should be default * @return new VariantContext subsetting to just the given samples */ public VariantContext subContextFromSamples(Set sampleNames, final boolean rederiveAllelesFromGenotypes ) { - if ( sampleNames.containsAll(getSampleNames()) ) { + if ( sampleNames.containsAll(getSampleNames()) && ! rederiveAllelesFromGenotypes ) { return this; // fast path when you don't have any work to do } else { VariantContextBuilder builder = new VariantContextBuilder(this); @@ -355,8 +357,18 @@ public class VariantContext implements Feature { // to enable tribble integratio } } + /** + * @see #subContextFromSamples(java.util.Set, boolean) with rederiveAllelesFromGenotypes = true + * + * @param sampleNames + * @return + */ + public VariantContext subContextFromSamples(final Set sampleNames) { + return subContextFromSamples(sampleNames, true); + } + public VariantContext subContextFromSample(String sampleName) { - return subContextFromSamples(Collections.singleton(sampleName), true); + return subContextFromSamples(Collections.singleton(sampleName)); } /** diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index d9a91c4c2..94e52c2b9 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -34,7 +34,7 @@ import java.util.Arrays; import java.util.List; public class VariantEvalIntegrationTest extends WalkerTest { - private static String variantEvalTestDataRoot = validationDataLocation + "VariantEval/"; + private static String variantEvalTestDataRoot = privateTestDir + "VariantEval/"; private static String fundamentalTestVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.snps_and_indels.vcf"; private static String fundamentalTestSNPsVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.vcf"; private static String fundamentalTestSNPsWithMLEVCF = variantEvalTestDataRoot + "FundamentalsTest.annotated.db.subset.final.withMLE.vcf"; @@ -122,7 +122,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("e62a3bd9914d48e2bb2fb4f5dfc5ebc0") + Arrays.asList("40abbc9be663aed8ee1158f832463ca8") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNovelty", spec); } @@ -144,7 +144,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-o %s" ), 1, - Arrays.asList("087a2d9943c53e7f49663667c3305c7e") + Arrays.asList("106a0e8753e839c0a2c030eb4b165fa9") ); executeTest("testFundamentalsCountVariantsSNPsandIndelsWithNoveltyAndFilter", spec); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java index 7c522eadf..0e5522e3a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextBenchmark.java @@ -152,7 +152,7 @@ public class VariantContextBenchmark extends SimpleBenchmark { public void run(final VariantContext vc) { if ( samples == null ) samples = new HashSet(new ArrayList(vc.getSampleNames()).subList(0, nSamplesToTake)); - VariantContext sub = vc.subContextFromSamples(samples, true); + VariantContext sub = vc.subContextFromSamples(samples); sub.getNSamples(); } }; From c3c3d18611f06e2831d3ed39263b44ed106f0df1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 1 Aug 2012 17:09:22 -0400 Subject: [PATCH 32/43] Update BCF2 to put PASS as offset 0 not at the end -- Unfortunately this commit breaks backward compatibility with all existing BCF2 files... --- .../sting/utils/codecs/bcf2/BCF2Decoder.java | 2 +- .../sting/utils/codecs/bcf2/BCF2Utils.java | 10 +++------- .../utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java | 2 +- .../sting/utils/codecs/vcf/VCFIntegrationTest.java | 2 +- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java index a13be21c5..2619a4dae 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Decoder.java @@ -202,7 +202,7 @@ public final class BCF2Decoder { return null; else { final String s = new String(bytes, 0, goodLength); - return BCF2Utils.isCollapsedString(s) ? BCF2Utils.exploreStringList(s) : s; + return BCF2Utils.isCollapsedString(s) ? BCF2Utils.explodeStringList(s) : s; } } catch ( IOException e ) { throw new ReviewedStingException("readByte failure", e); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index 43e933948..1beacd070 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -79,23 +79,19 @@ public final class BCF2Utils { final Set seen = new HashSet(); final ArrayList dict = new ArrayList(); - boolean sawPASS = false; + dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field + // set up the strings dictionary for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { if ( line instanceof VCFIDHeaderLine && ! (line instanceof VCFContigHeaderLine) ) { final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; if ( ! seen.contains(idLine.getID())) { - sawPASS = sawPASS || idLine.getID().equals(VCFConstants.PASSES_FILTERS_v4); dict.add(idLine.getID()); seen.add(idLine.getID()); } } } - - if ( ! sawPASS ) - dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field - return dict; } @@ -177,7 +173,7 @@ public final class BCF2Utils { */ @Requires({"collapsed != null", "isCollapsedString(collapsed)"}) @Ensures("result != null") - public static final List exploreStringList(final String collapsed) { + public static final List explodeStringList(final String collapsed) { assert isCollapsedString(collapsed); final String[] exploded = collapsed.substring(1).split(","); return Arrays.asList(exploded); diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java index a0feef186..7569ce90d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/bcf2/BCF2EncoderDecoderUnitTest.java @@ -351,7 +351,7 @@ public class BCF2EncoderDecoderUnitTest extends BaseTest { public void testEncodingListOfString(List strings, String expected) throws IOException { final String collapsed = BCF2Utils.collapseStringList(strings); Assert.assertEquals(collapsed, expected); - Assert.assertEquals(BCF2Utils.exploreStringList(collapsed), strings); + Assert.assertEquals(BCF2Utils.explodeStringList(collapsed), strings); } // ----------------------------------------------------------------- diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 3948ba971..86c8c968d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -77,7 +77,7 @@ public class VCFIntegrationTest extends WalkerTest { String testVCF = privateTestDir + "ex2.vcf"; String baseCommand = "-R " + b36KGReference + " --no_cmdline_in_header -o %s "; String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("a04a0fc22fedb516c663e56e51fc1e27")); + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("e8f721ce81e4fdadba13c5291027057f")); executeTest("Test writing samtools WEx BCF example", spec1); } From 0381fd7c83a076471f571f012f792fff7adfcd20 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 2 Aug 2012 11:25:10 -0400 Subject: [PATCH 34/43] Hmm, I thought I used the right md5s last time. Let's try again. --- .../sting/queue/pipeline/DataProcessingPipelineTest.scala | 4 ++-- .../sting/queue/pipeline/PacbioProcessingPipelineTest.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala index 778e6c28a..3fb9e0efa 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -41,7 +41,7 @@ class DataProcessingPipelineTest { " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf", " -test ", " -p " + projectName).mkString - spec.fileMD5s += testOut -> "021db3a52ee40d9193895213aaa05dd5" + spec.fileMD5s += testOut -> "60d39ae909fdd049920b54e0965b6d3c" PipelineTest.executeTest(spec) } @@ -60,7 +60,7 @@ class DataProcessingPipelineTest { " -bwa /home/unix/carneiro/bin/bwa", " -bwape ", " -p " + projectName).mkString - spec.fileMD5s += testOut -> "94e32ee974aa15d05ee7d3e99f2f8c8f" + spec.fileMD5s += testOut -> "61ca3237afdfabf78ee27a5bb80dae59" PipelineTest.executeTest(spec) } diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala index 62ee7ce90..74e947377 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala @@ -40,7 +40,7 @@ class PacbioProcessingPipelineTest { " -blasr ", " -test ", " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf").mkString - spec.fileMD5s += testOut -> "8b5f9e8e1ba5bd686e3084eaeea23889" + spec.fileMD5s += testOut -> "61b06e8b78a93e6644657e6d38851084" PipelineTest.executeTest(spec) } } From e3f89fb0549c8f2c751338ffeeda76678c4d94c4 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 2 Aug 2012 11:33:21 -0400 Subject: [PATCH 35/43] Missing/malformed GATK report files are user errors --- .../src/org/broadinstitute/sting/gatk/report/GATKReport.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index bec1ea543..47bc48f81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -89,9 +89,9 @@ public class GATKReport { reader = new BufferedReader(new FileReader(file)); reportHeader = reader.readLine(); } catch (FileNotFoundException e) { - throw new ReviewedStingException("Could not open file : " + file); + throw new UserException.CouldNotReadInputFile(file, "it does not exist"); } catch (IOException e) { - throw new ReviewedStingException("Could not read file : " + file); + throw new UserException.CouldNotReadInputFile(file, e); } From cb8bc18aeb7780af067f90351566a538fe2f7df8 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 2 Aug 2012 11:41:06 -0400 Subject: [PATCH 36/43] Fix for error in HaplotypeCaller. HC has a UG argument collection for the UG engine but some of those arguments aren't appropriate to set. --- .../sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java | 1 + 1 file changed, 1 insertion(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index f5f707690..6bb894323 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -239,6 +239,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem Set samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); samplesList.addAll( samples ); // initialize the UnifiedGenotyper Engine which is used to call into the exact model + UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; // the GLmodel isn't used by the HaplotypeCaller but it is dangerous to let the user change this argument UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling From fb5dabce18bbcd06ce60392ff1380faec50d8c08 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 2 Aug 2012 16:44:39 -0400 Subject: [PATCH 37/43] Update BCF2 to include a minor version number so we can rev (and report errors) with BCF2 -- We are no likely to fail with an error when reading old BCF files, rather than just giving bad results -- Added new class BCFVersion that consolidates all of the version management of BCF --- .../sting/utils/codecs/bcf2/BCF2Codec.java | 22 ++++- .../sting/utils/codecs/bcf2/BCF2Utils.java | 44 +++++----- .../sting/utils/codecs/bcf2/BCFVersion.java | 80 +++++++++++++++++++ .../variantcontext/writer/BCF2Writer.java | 6 +- .../utils/codecs/vcf/VCFIntegrationTest.java | 2 +- 5 files changed, 122 insertions(+), 32 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCFVersion.java diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java index 0f9cc34e7..0776b3aa8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Codec.java @@ -53,6 +53,11 @@ public final class BCF2Codec implements FeatureCodec { final protected static Logger logger = Logger.getLogger(BCF2Codec.class); private final static boolean FORBID_SYMBOLICS = false; + private final static int ALLOWED_MAJOR_VERSION = 2; + private final static int MIN_MINOR_VERSION = 1; + + private BCFVersion bcfVersion = null; + private VCFHeader header = null; /** @@ -131,8 +136,16 @@ public final class BCF2Codec implements FeatureCodec { public FeatureCodecHeader readHeader( final PositionalBufferedStream inputStream ) { try { // note that this reads the magic as well, and so does double duty - if ( ! BCF2Utils.startsWithBCF2Magic(inputStream) ) - error("Input stream does not begin with BCF2 magic"); + bcfVersion = BCFVersion.readBCFVersion(inputStream); + if ( bcfVersion == null ) + error("Input stream does not contain a BCF encoded file; BCF magic header info not found"); + + if ( bcfVersion.getMajorVersion() != ALLOWED_MAJOR_VERSION ) + error("BCF2Codec can only process BCF2 files, this file has major version " + bcfVersion.getMajorVersion()); + if ( bcfVersion.getMinorVersion() < MIN_MINOR_VERSION ) + error("BCF2Codec can only process BCF2 files with minor version >= " + MIN_MINOR_VERSION + " but this file has minor version " + bcfVersion.getMinorVersion()); + + logger.info("BCF version " + bcfVersion); final int headerSizeInBytes = BCF2Utils.readInt(BCF2Type.INT32.getSizeInBytes(), inputStream); @@ -187,7 +200,8 @@ public final class BCF2Codec implements FeatureCodec { FileInputStream fis = null; try { fis = new FileInputStream(path); - return BCF2Utils.startsWithBCF2Magic(fis); + final BCFVersion version = BCFVersion.readBCFVersion(fis); + return version != null && version.getMajorVersion() == ALLOWED_MAJOR_VERSION; } catch ( FileNotFoundException e ) { return false; } catch ( IOException e ) { @@ -196,7 +210,7 @@ public final class BCF2Codec implements FeatureCodec { try { if ( fis != null ) fis.close(); } catch ( IOException e ) { - ; // do nothing + // do nothing } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index 1beacd070..d3f3fc102 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -41,8 +41,6 @@ import java.util.*; * @since 5/12 */ public final class BCF2Utils { - public static final byte[] MAGIC_HEADER_LINE = "BCF\2".getBytes(); - public static final int MAX_ALLELES_IN_GENOTYPES = 127; public static final int OVERFLOW_ELEMENT_MARKER = 15; @@ -75,7 +73,7 @@ public final class BCF2Utils { */ @Requires("header != null") @Ensures({"result != null", "new HashSet(result).size() == result.size()"}) - public final static ArrayList makeDictionary(final VCFHeader header) { + public static ArrayList makeDictionary(final VCFHeader header) { final Set seen = new HashSet(); final ArrayList dict = new ArrayList(); @@ -96,43 +94,37 @@ public final class BCF2Utils { } @Requires({"nElements >= 0", "type != null"}) - public final static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) { + public static byte encodeTypeDescriptor(final int nElements, final BCF2Type type ) { int encodeSize = Math.min(nElements, OVERFLOW_ELEMENT_MARKER); byte typeByte = (byte)((0x0F & encodeSize) << 4 | (type.getID() & 0x0F)); return typeByte; } @Ensures("result >= 0") - public final static int decodeSize(final byte typeDescriptor) { + public static int decodeSize(final byte typeDescriptor) { return (0xF0 & typeDescriptor) >> 4; } @Ensures("result >= 0") - public final static int decodeTypeID(final byte typeDescriptor) { + public static int decodeTypeID(final byte typeDescriptor) { return typeDescriptor & 0x0F; } @Ensures("result != null") - public final static BCF2Type decodeType(final byte typeDescriptor) { + public static BCF2Type decodeType(final byte typeDescriptor) { return ID_TO_ENUM[decodeTypeID(typeDescriptor)]; } - public final static boolean sizeIsOverflow(final byte typeDescriptor) { + public static boolean sizeIsOverflow(final byte typeDescriptor) { return decodeSize(typeDescriptor) == OVERFLOW_ELEMENT_MARKER; } @Requires("nElements >= 0") - public final static boolean willOverflow(final long nElements) { + public static boolean willOverflow(final long nElements) { return nElements > MAX_INLINE_ELEMENTS; } - public final static boolean startsWithBCF2Magic(final InputStream stream) throws IOException { - final byte[] magicBytes = new byte[BCF2Utils.MAGIC_HEADER_LINE.length]; - stream.read(magicBytes); - return Arrays.equals(magicBytes, BCF2Utils.MAGIC_HEADER_LINE); - } - - public final static byte readByte(final InputStream stream) { + public static byte readByte(final InputStream stream) { // TODO -- shouldn't be capturing error here try { return (byte)(stream.read() & 0xFF); @@ -151,7 +143,7 @@ public final class BCF2Utils { */ @Requires({"strings != null", "strings.size() > 1"}) @Ensures("result != null") - public static final String collapseStringList(final List strings) { + public static String collapseStringList(final List strings) { final StringBuilder b = new StringBuilder(); for ( final String s : strings ) { if ( s != null ) { @@ -173,14 +165,14 @@ public final class BCF2Utils { */ @Requires({"collapsed != null", "isCollapsedString(collapsed)"}) @Ensures("result != null") - public static final List explodeStringList(final String collapsed) { + public static List explodeStringList(final String collapsed) { assert isCollapsedString(collapsed); final String[] exploded = collapsed.substring(1).split(","); return Arrays.asList(exploded); } @Requires("s != null") - public static final boolean isCollapsedString(final String s) { + public static boolean isCollapsedString(final String s) { return s.charAt(0) == ','; } @@ -222,7 +214,7 @@ public final class BCF2Utils { } @Ensures("result.isIntegerType()") - public final static BCF2Type determineIntegerType(final int value) { + public static BCF2Type determineIntegerType(final int value) { for ( final BCF2Type potentialType : INTEGER_TYPES_BY_SIZE) { if ( potentialType.withinRange(value) ) return potentialType; @@ -232,7 +224,7 @@ public final class BCF2Utils { } @Ensures("result.isIntegerType()") - public final static BCF2Type determineIntegerType(final int[] values) { + public static BCF2Type determineIntegerType(final int[] values) { // literally a copy of the code below, but there's no general way to unify lists and arrays in java BCF2Type maxType = BCF2Type.INT8; for ( final int value : values ) { @@ -258,7 +250,7 @@ public final class BCF2Utils { */ @Requires({"t1.isIntegerType()","t2.isIntegerType()"}) @Ensures("result.isIntegerType()") - public final static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) { + public static BCF2Type maxIntegerType(final BCF2Type t1, final BCF2Type t2) { switch ( t1 ) { case INT8: return t2; case INT16: return t2 == BCF2Type.INT32 ? t2 : t1; @@ -268,7 +260,7 @@ public final class BCF2Utils { } @Ensures("result.isIntegerType()") - public final static BCF2Type determineIntegerType(final List values) { + public static BCF2Type determineIntegerType(final List values) { BCF2Type maxType = BCF2Type.INT8; for ( final int value : values ) { final BCF2Type type1 = determineIntegerType(value); @@ -293,7 +285,7 @@ public final class BCF2Utils { * @param o * @return */ - public final static List toList(final Object o) { + public static List toList(final Object o) { if ( o == null ) return Collections.emptyList(); else if ( o instanceof List ) return (List)o; else return Collections.singletonList(o); @@ -301,7 +293,7 @@ public final class BCF2Utils { @Requires({"stream != null", "bytesForEachInt > 0"}) - public final static int readInt(int bytesForEachInt, final InputStream stream) { + public static int readInt(int bytesForEachInt, final InputStream stream) { switch ( bytesForEachInt ) { case 1: { return (byte)(readByte(stream)); @@ -319,7 +311,7 @@ public final class BCF2Utils { } } - public final static void encodeRawBytes(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException { + public static void encodeRawBytes(final int value, final BCF2Type type, final OutputStream encodeStream) throws IOException { switch ( type.getSizeInBytes() ) { case 1: encodeStream.write(0xFF & value); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCFVersion.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCFVersion.java new file mode 100644 index 000000000..742da7c0c --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCFVersion.java @@ -0,0 +1,80 @@ +package org.broadinstitute.sting.utils.codecs.bcf2; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Arrays; + +/** + * Simple holder for BCF version information + * + * User: depristo + * Date: 8/2/12 + * Time: 2:16 PM + */ +public class BCFVersion { + /** + * BCF2 begins with the MAGIC info BCF_M_m where M is the major version (currently 2) + * and m is the minor version, currently 1 + */ + public static final byte[] MAGIC_HEADER_START = "BCF".getBytes(); + + final int majorVersion; + final int minorVersion; + + public BCFVersion(int majorVersion, int minorVersion) { + this.majorVersion = majorVersion; + this.minorVersion = minorVersion; + } + + /** + * @return the major version number of this BCF file + */ + public int getMajorVersion() { + return majorVersion; + } + + /** + * @return the minor version number of this BCF file + */ + public int getMinorVersion() { + return minorVersion; + } + + /** + * Return a new BCFVersion object describing the major and minor version of the BCF file in stream + * + * Note that stream must be at the very start of the file. + * + * @param stream + * @return a BCFVersion object, or null if stream doesn't contain a BCF file + * @throws IOException + */ + public static BCFVersion readBCFVersion(final InputStream stream) throws IOException { + final byte[] magicBytes = new byte[MAGIC_HEADER_START.length]; + stream.read(magicBytes); + if ( Arrays.equals(magicBytes, MAGIC_HEADER_START) ) { + // we're a BCF file + final int majorByte = stream.read(); + final int minorByte = stream.read(); + return new BCFVersion( majorByte, minorByte ); + } else + return null; + } + + /** + * Write out the BCF magic information indicating this is a BCF file with corresponding major and minor versions + * @param out + * @throws IOException + */ + public void write(final OutputStream out) throws IOException { + out.write(MAGIC_HEADER_START); + out.write(getMajorVersion() & 0xFF); + out.write(getMinorVersion() & 0xFF); + } + + @Override + public String toString() { + return String.format("BCF%d.%d", getMajorVersion(), getMinorVersion()); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index b5da206ad..32377d09e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -31,6 +31,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; +import org.broadinstitute.sting.utils.codecs.bcf2.BCFVersion; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -83,6 +84,9 @@ import java.util.*; * @since 06/12 */ class BCF2Writer extends IndexingVariantContextWriter { + public static final int MAJOR_VERSION = 2; + public static final int MINOR_VERSION = 1; + /** * If true, we will write out the undecoded raw bytes for a genotypes block, if it * is found in the input VC. This can be very dangerous as the genotype encoding @@ -153,7 +157,7 @@ class BCF2Writer extends IndexingVariantContextWriter { writer.close(); final byte[] headerBytes = capture.toByteArray(); - outputStream.write(BCF2Utils.MAGIC_HEADER_LINE); + new BCFVersion(MAJOR_VERSION, MINOR_VERSION).write(outputStream); BCF2Utils.encodeRawBytes(headerBytes.length, BCF2Type.INT32, outputStream); outputStream.write(headerBytes); } catch (IOException e) { diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 86c8c968d..71fc1d464 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -57,7 +57,7 @@ public class VCFIntegrationTest extends WalkerTest { String baseCommand = "-R " + b37KGReference + " --no_cmdline_in_header -o %s "; String test1 = baseCommand + "-T SelectVariants -V " + testVCF; - WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("")); + WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList("bdab26dd7648a806dbab01f64db2bdab")); executeTest("Test reading and writing 1000G Phase I SVs", spec1); } From e04989f76d3b2771f83cb33900c31a5518503e36 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 3 Aug 2012 09:42:21 -0400 Subject: [PATCH 39/43] Bugfix for new PASS position in dictionary in BCF2 --- .../org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java index d3f3fc102..c79abe2ae 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/bcf2/BCF2Utils.java @@ -77,7 +77,9 @@ public final class BCF2Utils { final Set seen = new HashSet(); final ArrayList dict = new ArrayList(); - dict.add(VCFConstants.PASSES_FILTERS_v4); // special case the special PASS field + // special case the special PASS field which doesn't show up in the FILTER field definitions + seen.add(VCFConstants.PASSES_FILTERS_v4); + dict.add(VCFConstants.PASSES_FILTERS_v4); // set up the strings dictionary for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { From 524d7ea306d18a687d9dd3ba2454746bab63a936 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Tue, 31 Jul 2012 15:09:36 -0400 Subject: [PATCH 40/43] Choose whether to keep program records based on Walker * Add keepProgramRecords argument * Make removeProgramRecords / keepProgramRecords override default --- .../sting/gatk/GenomeAnalysisEngine.java | 11 ++++++++++- .../sting/gatk/arguments/GATKArgumentCollection.java | 5 ++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 228f8351a..967fd3236 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -797,6 +797,15 @@ public class GenomeAnalysisEngine { if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF) throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested."); + if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) + throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); + + // LocusWalkers don't use program records, so remove them by default to save memory + boolean removeProgramRecords = (this.walker instanceof LocusWalker) || argCollection.removeProgramRecords; + + if (argCollection.keepProgramRecords) + removeProgramRecords = false; + return new SAMDataSource( samReaderIDs, threadAllocation, @@ -814,7 +823,7 @@ public class GenomeAnalysisEngine { refReader, getBaseRecalibration(), argCollection.defaultBaseQualities, - argCollection.removeProgramRecords); + removeProgramRecords); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 91a008cbc..972116952 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -249,9 +249,12 @@ public class GATKArgumentCollection { @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT; - @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Should we remove program records from the SAM header", required = false) + @Argument(fullName = "remove_program_records", shortName = "rpr", doc = "Should we override the Walker's default and remove program records from the SAM header", required = false) public boolean removeProgramRecords = false; + @Argument(fullName = "keep_program_records", shortName = "kpr", doc = "Should we override the Walker's default and keep program records from the SAM header", required = false) + public boolean keepProgramRecords = false; + @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; From addbfd6437a47c1f3b9f9c6ffd51184561ed08dc Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 2 Aug 2012 14:21:30 -0400 Subject: [PATCH 41/43] Add a RemoveProgramRecords annotation * Add the RemoveProgramRecords annotation to LocusWalker --- .../sting/gatk/GenomeAnalysisEngine.java | 3 +-- .../sting/gatk/walkers/LocusWalker.java | 1 + .../gatk/walkers/RemoveProgramRecords.java | 21 +++++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 967fd3236..56fcf0652 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -800,8 +800,7 @@ public class GenomeAnalysisEngine { if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); - // LocusWalkers don't use program records, so remove them by default to save memory - boolean removeProgramRecords = (this.walker instanceof LocusWalker) || argCollection.removeProgramRecords; + boolean removeProgramRecords = argCollection.removeProgramRecords || walker.getClass().isAnnotationPresent(RemoveProgramRecords.class); if (argCollection.keepProgramRecords) removeProgramRecords = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index e94d01d5a..2a92d8831 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -19,6 +19,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES}) @PartitionBy(PartitionType.LOCUS) @ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) +@RemoveProgramRecords public abstract class LocusWalker extends Walker { // Do we actually want to operate on the context? public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java new file mode 100644 index 000000000..d9abc7925 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/RemoveProgramRecords.java @@ -0,0 +1,21 @@ +package org.broadinstitute.sting.gatk.walkers; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 8/2/12 + * Time: 1:58 PM + * To change this template use File | Settings | File Templates. + */ + +import java.lang.annotation.*; + +/** + * Indicates that program records should be removed from SAM headers by default for this walker + */ +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface RemoveProgramRecords { +} From 51bd03cc36cd709b0a5a070246729858bba5bc54 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 2 Aug 2012 14:46:00 -0400 Subject: [PATCH 42/43] Add RemoveProgramRecords annotation to ActiveRegionWalker --- .../broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index b2975cbbf..aba508b3e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -32,6 +32,7 @@ import java.util.List; @PartitionBy(PartitionType.READ) @ActiveRegionExtension(extension=50,maxRegion=1500) @ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, MappingQualityUnavailableFilter.class}) +@RemoveProgramRecords public abstract class ActiveRegionWalker extends Walker { @Output(fullName="activeRegionOut", shortName="ARO", doc="Output the active region to this interval list file", required = false) From f40d0a0a28935382296d26e3de577f81fe458198 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 3 Aug 2012 10:44:36 -0400 Subject: [PATCH 43/43] Updating VQSR to work with the MNP and symbolic variants that are coming out of the HaplotypeCaller. Integration tests change because of the MNPs in dbSNP. --- .../ApplyRecalibration.java | 2 +- .../VariantDataManager.java | 31 +++++++++++++++++-- .../VariantRecalibrator.java | 8 +---- ...ntRecalibrationWalkersIntegrationTest.java | 6 ++-- 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 073f56707..011f3471c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -196,7 +196,7 @@ public class ApplyRecalibration extends RodWalker implements T for( final VariantContext vc : VCs ) { - if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { + if( VariantDataManager.checkVariationClass( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { final VariantContext recalDatum = getMatchingRecalVC(vc, recals); if( recalDatum == null ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index d6df4ff1b..45fdad4f8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -273,11 +274,37 @@ public class VariantDataManager { } private boolean isValidVariant( final VariantContext evalVC, final VariantContext trainVC, final boolean TRUST_ALL_POLYMORPHIC) { - return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && - ((evalVC.isSNP() && trainVC.isSNP()) || ((evalVC.isIndel()||evalVC.isMixed()) && (trainVC.isIndel()||trainVC.isMixed()))) && + return trainVC != null && trainVC.isNotFiltered() && trainVC.isVariant() && checkVariationClass( evalVC, trainVC ) && (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); } + protected static boolean checkVariationClass( final VariantContext evalVC, final VariantContext trainVC ) { + switch( trainVC.getType() ) { + case SNP: + case MNP: + return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.SNP ); + case INDEL: + case MIXED: + case SYMBOLIC: + return checkVariationClass( evalVC, VariantRecalibratorArgumentCollection.Mode.INDEL ); + default: + return false; + } + } + + protected static boolean checkVariationClass( final VariantContext evalVC, final VariantRecalibratorArgumentCollection.Mode mode ) { + switch( mode ) { + case SNP: + return evalVC.isSNP() || evalVC.isMNP(); + case INDEL: + return evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); + case BOTH: + return true; + default: + throw new ReviewedStingException( "Encountered unknown recal mode: " + mode ); + } + } + public void writeOutRecalibrationTable( final VariantContextWriter recalWriter ) { // we need to sort in coordinate order in order to produce a valid VCF Collections.sort( data, new Comparator() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 5e2125fb2..ab2ff6176 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -245,7 +245,7 @@ public class VariantRecalibrator extends RodWalker