Intermediate fix for pool GL unit test: a) fix up artificial read pileup provider to give consistent data. b) Increase downsampling in pool integration tests with reference sample, and shorten MT tests so they don't last too long

This commit is contained in:
Guillermo del Angel 2012-07-30 09:56:10 -04:00
parent 2ae890155c
commit 5b9a1af7fe
5 changed files with 70 additions and 46 deletions

View File

@ -68,10 +68,10 @@ public class ErrorModel {
break;
}
}
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
if (refSampleVC.isIndel()) {
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
indelLikelihoodMap = new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
}
@ -96,7 +96,8 @@ public class ErrorModel {
final int readCounts[] = new int[refSamplePileup.getNumberOfElements()];
//perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()];
final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts);
if (!haplotypeMap.isEmpty())
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts);
}
int idx = 0;
for (PileupElement refPileupElement : refSamplePileup) {
@ -108,7 +109,7 @@ public class ErrorModel {
if (DEBUG) System.out.println(m);
isMatch |= m;
}
if (refSampleVC.isIndel()) {
if (refSampleVC.isIndel() && !haplotypeMap.isEmpty()) {
// ignore match/mismatch if reads, as determined by their likelihood, are not informative
double[] perAlleleLikelihoods = perReadLikelihoods[idx++];
if (!isInformativeElement(perAlleleLikelihoods))
@ -173,10 +174,10 @@ public class ErrorModel {
// if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
if (allele.isReference()) {
// for a ref allele, any base mismatch or new indel is a mismatch.
if(allele.getBases().length>0 )
if(allele.getBases().length>0)
// todo - can't check vs. allele because allele is not padded so it doesn't include the reference base at this location
// could clean up/simplify this when unpadding is removed
return (pileupElement.getBase() == refBase);
return (pileupElement.getBase() == refBase && !pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
else
// either null allele to compare, or ref/alt lengths are different (indel by definition).
// if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch

View File

@ -5,6 +5,7 @@ import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -48,7 +49,12 @@ public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implem
myAlleles = new ArrayList<Allele>(alleles);
refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref!
Allele refAllele = alleles.get(0);
//sanity check: by construction, first allele should ALWAYS be the reference allele
if (!refAllele.isReference())
throw new ReviewedStingException("BUG: First allele in list passed to PoolSNPGenotypeLikelihoods should be reference!");
refByte = refAllele.getBases()[0]; // by construction, first allele in list is always ref!
if (myAlleles.size() < BaseUtils.BASES.length) {
// likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones,

View File

@ -18,27 +18,27 @@ public class PoolCallerIntegrationTest extends WalkerTest {
final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
final String REFSAMPLE_NAME = "NA12878";
final String MTINTERVALS = "MT";
final String MTINTERVALS = "MT:1-3000";
final String LSVINTERVALS = "20:40,000,000-41,000,000";
final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
private void PC_MT_Test(String bam, String args, String name, String md5) {
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL",
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL",
REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testPoolCaller:"+name+" args=" + args, spec);
}
private void PC_LSV_Test(String args, String name, String model, String md5) {
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL",
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL",
REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testPoolCaller:"+name+" args=" + args, spec);
}
private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) {
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL",
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL",
REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testPoolCaller:"+name+" args=" + args, spec);
@ -46,33 +46,33 @@ public class PoolCallerIntegrationTest extends WalkerTest {
@Test
public void testBOTH_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","36b8db57f65be1cc3d2d9d7f9f3f26e4");
PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","d8cba4ec4267d7d766081fcead845d08");
}
@Test
public void testINDEL_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","d1339990291648495bfcf4404f051478");
PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","8e9b7e89c439b430e95b146a7540c72e");
}
@Test
public void testINDEL_maxAlleles2_ploidy3_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","POOLINDEL","b66e7150603310fd57ee7bf9fc590706");
PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","POOLINDEL","96087fe9240e3656cc2a4e0ff0174d5b");
}
@Test
public void testINDEL_maxAlleles2_ploidy1_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","POOLINDEL","ccdae3fc4d2c922f956a186aaad51c29");
PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","POOLINDEL","6fdae7093831ecfc82a06dd707d62fe9");
}
@Test
public void testMT_SNP_DISCOVERY_sp4() {
PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","fa5ee7c957c473a80f3a7f3c35dc80b5");
PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","6b27634214530d379db70391a9cfc2d7");
}
@Test
public void testMT_SNP_GGA_sp10() {
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "6907c8617d49bb57b33f8704ce7f0323");
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "e74d4c73ece45d7fb676b99364df4f1a");
}
}

View File

@ -392,8 +392,6 @@ public class PoolGenotypeLikelihoodsUnitTest {
final byte refByte = readPileupTestProvider.getRefByte();
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte);
final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte);
final List<Allele> allAlleles = new ArrayList<Allele>(); // this contains only ref Allele up to now
final Set<String> laneIDs = new TreeSet<String>();
@ -411,17 +409,28 @@ public class PoolGenotypeLikelihoodsUnitTest {
for (String laneID : laneIDs)
noisyErrorModels.put(laneID, Q30ErrorModel);
// all first ref allele
allAlleles.add(Allele.create(refByte,true));
for (byte b: BaseUtils.BASES) {
if (refByte == b)
allAlleles.add(Allele.create(b,true));
else
if (refByte != b)
allAlleles.add(Allele.create(b, false));
}
final int refIdx = 0;
int altIdx = -1;
for (int k=0; k < allAlleles.size(); k++)
if (altByte == allAlleles.get(k).getBases()[0]) {
altIdx = k;
break;
}
PrintStream out = null;
if (SIMULATE_NOISY_PILEUP) {
try {
out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table"));
out = new PrintStream(new File("GLUnitTest.table"));
// out = new PrintStream(new File("/Users/delangel/GATK/Sting_unstable/GLUnitTest.table"));
}
catch (Exception e) {}

View File

@ -62,9 +62,9 @@ public class ArtificialReadPileupTestProvider {
List<String> sampleNames = new ArrayList<String>();
private String sampleName(int i) { return sampleNames.get(i); }
private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); }
public final int offset = 5;
public final int locStart = 5; // 1-based
public final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,offset,offset);
public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,locStart,locStart); //1-based
public final GenomeLoc window = genomeLocParser.createGenomeLoc(artificialContig,artificialRefStart,10);
public final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,this.refBases.getBytes());
@ -103,22 +103,22 @@ public class ArtificialReadPileupTestProvider {
boolean addBaseErrors, int phredScaledBaseErrorRate) {
// RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext);
String refStr = new String(new byte[]{referenceContext.getBase()});
ArrayList<Allele> vcAlleles = new ArrayList<Allele>();
Allele refAllele, altAllele;
if (eventLength == 0) {// SNP case
refAllele =Allele.create(referenceContext.getBase(),true);
refAllele =Allele.create(refStr,true);
altAllele = Allele.create(altBases.substring(0,1), false);
} else if (eventLength>0){
// insertion
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
altAllele = Allele.create(altBases.substring(0,eventLength), false);
refAllele = Allele.create(refStr, true);
altAllele = Allele.create(refStr+altBases.substring(0,eventLength), false);
}
else {
// deletion
refAllele =Allele.create(refBases.substring(offset,offset+Math.abs(eventLength)),true);
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
refAllele =Allele.create(refBases.substring(locStart-1,locStart+Math.abs(eventLength)-1),true);
altAllele = Allele.create(refBases.substring(locStart-1,locStart), false);
}
int stop = loc.getStart();
vcAlleles.add(refAllele);
@ -153,18 +153,15 @@ public class ArtificialReadPileupTestProvider {
int[] numReadsPerAllele, String sample, boolean addErrors, int phredScaledErrorRate) {
List<PileupElement> pileupElements = new ArrayList<PileupElement>();
int readStart = contigStart;
int offset = (contigStop-contigStart+1)/2;
int refAlleleLength = 0;
int refAlleleLength = vc.getReference().getBases().length;
int readCounter = 0;
int alleleCounter = 0;
for (Allele allele: vc.getAlleles()) {
if (allele.isReference())
refAlleleLength = allele.getBases().length;
int alleleLength = allele.getBases().length;
for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) {
byte[] readBases = trueHaplotype(allele, offset, refAlleleLength);
byte[] readBases = trueHaplotype(allele, locStart, vc.getReference());
if (addErrors)
addBaseErrors(readBases, phredScaledErrorRate);
@ -176,20 +173,20 @@ public class ArtificialReadPileupTestProvider {
read.setReadBases(readBases);
read.setReadName(artificialReadName+readCounter++);
boolean isBeforeDeletion = false, isBeforeInsertion = false;
boolean isBeforeDeletion = alleleLength<refAlleleLength;
boolean isBeforeInsertion = alleleLength>refAlleleLength;
int eventLength = alleleLength - refAlleleLength;
if (allele.isReference())
read.setCigarString(readBases.length + "M");
else {
isBeforeDeletion = alleleLength<refAlleleLength;
isBeforeInsertion = alleleLength>refAlleleLength;
if (isBeforeDeletion || isBeforeInsertion)
read.setCigarString(offset+"M"+ alleleLength + (isBeforeDeletion?"D":"I") +
(readBases.length-offset)+"M");
read.setCigarString(locStart+"M"+ eventLength + (isBeforeDeletion?"D":"I") +
(readBases.length-locStart)+"M");
else // SNP case
read.setCigarString(readBases.length+"M");
}
int eventLength = (isBeforeDeletion?refAlleleLength:(isBeforeInsertion?alleleLength:0));
read.setReadPairedFlag(false);
read.setAlignmentStart(readStart);
read.setMappingQuality(artificialMappingQuality);
@ -198,7 +195,7 @@ public class ArtificialReadPileupTestProvider {
read.setAttribute("RG", sampleRG(sample).getReadGroupId());
pileupElements.add(new PileupElement(read,offset,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength),eventLength));
pileupElements.add(new PileupElement(read,locStart-1,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength-1),eventLength));
}
alleleCounter++;
}
@ -206,11 +203,22 @@ public class ArtificialReadPileupTestProvider {
return new ReadBackedPileupImpl(loc,pileupElements);
}
private byte[] trueHaplotype(Allele allele, int offset, int refAlleleLength) {
/**
* create haplotype based on a particular allele
* @param allele Allele of interest. ASSUMED TO INCLUDE REF BASE AT startPosition!
* @param startPosition 1-based start position of allele
* @param refAllele REF allele
* @return
*/
private byte[] trueHaplotype(Allele allele, int startPosition, Allele refAllele) {
// create haplotype based on a particular allele
String prefix = refBases.substring(offset);
// startPosition is 1-based.
// so, if startPosition == 5, we need to include positions 1 to 4, or indices 0 to 3 of the string
String prefix = refBases.substring(0,startPosition-1);
String alleleBases = new String(allele.getBases());
String postfix = refBases.substring(offset+refAlleleLength,refBases.length());
// where to start postfix? We have (startPosition-1) prefix bases + refAllele.length bases before postfix
String postfix = refBases.substring(startPosition -1 + refAllele.getBases().length,refBases.length());
return (prefix+alleleBases+postfix).getBytes();