From 5b9a1af7febf01463a409e15c8d9844b26e3ce4f Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Mon, 30 Jul 2012 09:56:10 -0400 Subject: [PATCH] Intermediate fix for pool GL unit test: fix up artificial read pileup provider to give consistent data. b) Increase downsampling in pool integration tests with reference sample, and shorten MT tests so they don't last too long --- .../gatk/walkers/genotyper/ErrorModel.java | 11 ++-- .../genotyper/PoolSNPGenotypeLikelihoods.java | 8 ++- .../genotyper/PoolCallerIntegrationTest.java | 20 +++---- .../PoolGenotypeLikelihoodsUnitTest.java | 21 +++++-- .../ArtificialReadPileupTestProvider.java | 56 +++++++++++-------- 5 files changed, 70 insertions(+), 46 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java index 864414de9..8e4ca9595 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java @@ -68,10 +68,10 @@ public class ErrorModel { break; } } + haplotypeMap = new LinkedHashMap(); if (refSampleVC.isIndel()) { pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); - haplotypeMap = new LinkedHashMap(); indelLikelihoodMap = new HashMap>(); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements } @@ -96,7 +96,8 @@ public class ErrorModel { final int readCounts[] = new int[refSamplePileup.getNumberOfElements()]; //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()]; final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles()); - perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts); + if (!haplotypeMap.isEmpty()) + perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts); } int idx = 0; for (PileupElement refPileupElement : refSamplePileup) { @@ -108,7 +109,7 @@ public class ErrorModel { if (DEBUG) System.out.println(m); isMatch |= m; } - if (refSampleVC.isIndel()) { + if (refSampleVC.isIndel() && !haplotypeMap.isEmpty()) { // ignore match/mismatch if reads, as determined by their likelihood, are not informative double[] perAlleleLikelihoods = perReadLikelihoods[idx++]; if (!isInformativeElement(perAlleleLikelihoods)) @@ -173,10 +174,10 @@ public class ErrorModel { // if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch if (allele.isReference()) { // for a ref allele, any base mismatch or new indel is a mismatch. - if(allele.getBases().length>0 ) + if(allele.getBases().length>0) // todo - can't check vs. allele because allele is not padded so it doesn't include the reference base at this location // could clean up/simplify this when unpadding is removed - return (pileupElement.getBase() == refBase); + return (pileupElement.getBase() == refBase && !pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart()); else // either null allele to compare, or ref/alt lengths are different (indel by definition). // if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java index 1e445270f..f763392ae 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/PoolSNPGenotypeLikelihoods.java @@ -5,6 +5,7 @@ import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -48,7 +49,12 @@ public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implem myAlleles = new ArrayList(alleles); - refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref! + Allele refAllele = alleles.get(0); + //sanity check: by construction, first allele should ALWAYS be the reference alleles + if (!refAllele.isReference()) + throw new ReviewedStingException("BUG: First allele in list passed to PoolSNPGenotypeLikelihoods should be reference!"); + + refByte = refAllele.getBases()[0]; // by construction, first allele in list is always ref! if (myAlleles.size() < BaseUtils.BASES.length) { // likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones, diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java index c6b1a8b7f..a2b478b0e 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolCallerIntegrationTest.java @@ -18,27 +18,27 @@ public class PoolCallerIntegrationTest extends WalkerTest { final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam"; final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf"; final String REFSAMPLE_NAME = "NA12878"; - final String MTINTERVALS = "MT"; + final String MTINTERVALS = "MT:1-3000"; final String LSVINTERVALS = "20:40,000,000-41,000,000"; final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf"; final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf"; final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf"; private void PC_MT_Test(String bam, String args, String name, String md5) { - final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL", + final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL", REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } private void PC_LSV_Test(String args, String name, String model, String md5) { - final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL", + final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL", REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); } private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) { - final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL", + final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL", REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s"; final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); executeTest("testPoolCaller:"+name+" args=" + args, spec); @@ -46,33 +46,33 @@ public class PoolCallerIntegrationTest extends WalkerTest { @Test public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","36b8db57f65be1cc3d2d9d7f9f3f26e4"); + PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","d8cba4ec4267d7d766081fcead845d08"); } @Test public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","d1339990291648495bfcf4404f051478"); + PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","8e9b7e89c439b430e95b146a7540c72e"); } @Test public void testINDEL_maxAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","POOLINDEL","b66e7150603310fd57ee7bf9fc590706"); + PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","POOLINDEL","96087fe9240e3656cc2a4e0ff0174d5b"); } @Test public void testINDEL_maxAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","POOLINDEL","ccdae3fc4d2c922f956a186aaad51c29"); + PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","POOLINDEL","6fdae7093831ecfc82a06dd707d62fe9"); } @Test public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","fa5ee7c957c473a80f3a7f3c35dc80b5"); + PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","6b27634214530d379db70391a9cfc2d7"); } @Test public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "6907c8617d49bb57b33f8704ce7f0323"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "e74d4c73ece45d7fb676b99364df4f1a"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java index c97c4ed28..9a785acda 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/PoolGenotypeLikelihoodsUnitTest.java @@ -392,8 +392,6 @@ public class PoolGenotypeLikelihoodsUnitTest { final byte refByte = readPileupTestProvider.getRefByte(); final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T'; - final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte); - final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte); final List allAlleles = new ArrayList(); // this contains only ref Allele up to now final Set laneIDs = new TreeSet(); @@ -411,17 +409,28 @@ public class PoolGenotypeLikelihoodsUnitTest { for (String laneID : laneIDs) noisyErrorModels.put(laneID, Q30ErrorModel); + // all first ref allele + allAlleles.add(Allele.create(refByte,true)); for (byte b: BaseUtils.BASES) { - if (refByte == b) - allAlleles.add(Allele.create(b,true)); - else + if (refByte != b) allAlleles.add(Allele.create(b, false)); } + final int refIdx = 0; + int altIdx = -1; + + for (int k=0; k < allAlleles.size(); k++) + if (altByte == allAlleles.get(k).getBases()[0]) { + altIdx = k; + break; + } + + + PrintStream out = null; if (SIMULATE_NOISY_PILEUP) { try { - out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table")); + out = new PrintStream(new File("GLUnitTest.table")); // out = new PrintStream(new File("/Users/delangel/GATK/Sting_unstable/GLUnitTest.table")); } catch (Exception e) {} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java index 77769a5fe..8011ea66b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java @@ -62,9 +62,9 @@ public class ArtificialReadPileupTestProvider { List sampleNames = new ArrayList(); private String sampleName(int i) { return sampleNames.get(i); } private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } - public final int offset = 5; + public final int locStart = 5; // 1-based public final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,offset,offset); + public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,locStart,locStart); //1-based public final GenomeLoc window = genomeLocParser.createGenomeLoc(artificialContig,artificialRefStart,10); public final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,this.refBases.getBytes()); @@ -103,22 +103,22 @@ public class ArtificialReadPileupTestProvider { boolean addBaseErrors, int phredScaledBaseErrorRate) { // RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext); - + String refStr = new String(new byte[]{referenceContext.getBase()}); ArrayList vcAlleles = new ArrayList(); Allele refAllele, altAllele; if (eventLength == 0) {// SNP case - refAllele =Allele.create(referenceContext.getBase(),true); + refAllele =Allele.create(refStr,true); altAllele = Allele.create(altBases.substring(0,1), false); } else if (eventLength>0){ // insertion - refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true); - altAllele = Allele.create(altBases.substring(0,eventLength), false); + refAllele = Allele.create(refStr, true); + altAllele = Allele.create(refStr+altBases.substring(0,eventLength), false); } else { // deletion - refAllele =Allele.create(refBases.substring(offset,offset+Math.abs(eventLength)),true); - altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false); + refAllele =Allele.create(refBases.substring(locStart-1,locStart+Math.abs(eventLength)-1),true); + altAllele = Allele.create(refBases.substring(locStart-1,locStart), false); } int stop = loc.getStart(); vcAlleles.add(refAllele); @@ -153,18 +153,15 @@ public class ArtificialReadPileupTestProvider { int[] numReadsPerAllele, String sample, boolean addErrors, int phredScaledErrorRate) { List pileupElements = new ArrayList(); int readStart = contigStart; - int offset = (contigStop-contigStart+1)/2; - int refAlleleLength = 0; + + int refAlleleLength = vc.getReference().getBases().length; int readCounter = 0; int alleleCounter = 0; for (Allele allele: vc.getAlleles()) { - if (allele.isReference()) - refAlleleLength = allele.getBases().length; - int alleleLength = allele.getBases().length; for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) { - byte[] readBases = trueHaplotype(allele, offset, refAlleleLength); + byte[] readBases = trueHaplotype(allele, locStart, vc.getReference()); if (addErrors) addBaseErrors(readBases, phredScaledErrorRate); @@ -176,20 +173,20 @@ public class ArtificialReadPileupTestProvider { read.setReadBases(readBases); read.setReadName(artificialReadName+readCounter++); - boolean isBeforeDeletion = false, isBeforeInsertion = false; + boolean isBeforeDeletion = alleleLengthrefAlleleLength; + + int eventLength = alleleLength - refAlleleLength; if (allele.isReference()) read.setCigarString(readBases.length + "M"); else { - isBeforeDeletion = alleleLengthrefAlleleLength; if (isBeforeDeletion || isBeforeInsertion) - read.setCigarString(offset+"M"+ alleleLength + (isBeforeDeletion?"D":"I") + - (readBases.length-offset)+"M"); + read.setCigarString(locStart+"M"+ eventLength + (isBeforeDeletion?"D":"I") + + (readBases.length-locStart)+"M"); else // SNP case read.setCigarString(readBases.length+"M"); } - int eventLength = (isBeforeDeletion?refAlleleLength:(isBeforeInsertion?alleleLength:0)); read.setReadPairedFlag(false); read.setAlignmentStart(readStart); read.setMappingQuality(artificialMappingQuality); @@ -198,7 +195,7 @@ public class ArtificialReadPileupTestProvider { read.setAttribute("RG", sampleRG(sample).getReadGroupId()); - pileupElements.add(new PileupElement(read,offset,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength),eventLength)); + pileupElements.add(new PileupElement(read,locStart-1,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength-1),eventLength)); } alleleCounter++; } @@ -206,11 +203,22 @@ public class ArtificialReadPileupTestProvider { return new ReadBackedPileupImpl(loc,pileupElements); } - private byte[] trueHaplotype(Allele allele, int offset, int refAlleleLength) { + /** + * create haplotype based on a particular allele + * @param allele Allele of interest. ASSUMED TO INCLUDE REF BASE AT startPosition! + * @param startPosition 1-based start position of allele + * @param refAllele REF allele + * @return + */ + private byte[] trueHaplotype(Allele allele, int startPosition, Allele refAllele) { + // create haplotype based on a particular allele - String prefix = refBases.substring(offset); + // startPosition is 1-based. + // so, if startPosition == 5, we need to include positions 1 to 4 , or indeces 0 to 3 of string + String prefix = refBases.substring(0,startPosition-1); String alleleBases = new String(allele.getBases()); - String postfix = refBases.substring(offset+refAlleleLength,refBases.length()); + // where to start postfix? We have (startPosition-1) prefix bases + refAllele.length bases before postfix + String postfix = refBases.substring(startPosition -1 + refAllele.getBases().length,refBases.length()); return (prefix+alleleBases+postfix).getBytes();