Intermediate fix for pool GL unit test: fix up artificial read pileup provider to give consistent data. b) Increase downsampling in pool integration tests with reference sample, and shorten MT tests so they don't last too long
This commit is contained in:
parent
2ae890155c
commit
5b9a1af7fe
|
|
@ -68,10 +68,10 @@ public class ErrorModel {
|
|||
break;
|
||||
}
|
||||
}
|
||||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
if (refSampleVC.isIndel()) {
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
indelLikelihoodMap = new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();
|
||||
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
|
||||
}
|
||||
|
|
@ -96,7 +96,8 @@ public class ErrorModel {
|
|||
final int readCounts[] = new int[refSamplePileup.getNumberOfElements()];
|
||||
//perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()];
|
||||
final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
|
||||
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts);
|
||||
if (!haplotypeMap.isEmpty())
|
||||
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts);
|
||||
}
|
||||
int idx = 0;
|
||||
for (PileupElement refPileupElement : refSamplePileup) {
|
||||
|
|
@ -108,7 +109,7 @@ public class ErrorModel {
|
|||
if (DEBUG) System.out.println(m);
|
||||
isMatch |= m;
|
||||
}
|
||||
if (refSampleVC.isIndel()) {
|
||||
if (refSampleVC.isIndel() && !haplotypeMap.isEmpty()) {
|
||||
// ignore match/mismatch if reads, as determined by their likelihood, are not informative
|
||||
double[] perAlleleLikelihoods = perReadLikelihoods[idx++];
|
||||
if (!isInformativeElement(perAlleleLikelihoods))
|
||||
|
|
@ -173,10 +174,10 @@ public class ErrorModel {
|
|||
// if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
|
||||
if (allele.isReference()) {
|
||||
// for a ref allele, any base mismatch or new indel is a mismatch.
|
||||
if(allele.getBases().length>0 )
|
||||
if(allele.getBases().length>0)
|
||||
// todo - can't check vs. allele because allele is not padded so it doesn't include the reference base at this location
|
||||
// could clean up/simplify this when unpadding is removed
|
||||
return (pileupElement.getBase() == refBase);
|
||||
return (pileupElement.getBase() == refBase && !pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
|
||||
else
|
||||
// either null allele to compare, or ref/alt lengths are different (indel by definition).
|
||||
// if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import net.sf.samtools.SAMUtils;
|
|||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -48,7 +49,12 @@ public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implem
|
|||
|
||||
myAlleles = new ArrayList<Allele>(alleles);
|
||||
|
||||
refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref!
|
||||
Allele refAllele = alleles.get(0);
|
||||
//sanity check: by construction, first allele should ALWAYS be the reference alleles
|
||||
if (!refAllele.isReference())
|
||||
throw new ReviewedStingException("BUG: First allele in list passed to PoolSNPGenotypeLikelihoods should be reference!");
|
||||
|
||||
refByte = refAllele.getBases()[0]; // by construction, first allele in list is always ref!
|
||||
|
||||
if (myAlleles.size() < BaseUtils.BASES.length) {
|
||||
// likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones,
|
||||
|
|
|
|||
|
|
@ -18,27 +18,27 @@ public class PoolCallerIntegrationTest extends WalkerTest {
|
|||
final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
|
||||
final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
|
||||
final String REFSAMPLE_NAME = "NA12878";
|
||||
final String MTINTERVALS = "MT";
|
||||
final String MTINTERVALS = "MT:1-3000";
|
||||
final String LSVINTERVALS = "20:40,000,000-41,000,000";
|
||||
final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
|
||||
final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
|
||||
final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
|
||||
private void PC_MT_Test(String bam, String args, String name, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL",
|
||||
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL",
|
||||
REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
private void PC_LSV_Test(String args, String name, String model, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL",
|
||||
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL",
|
||||
REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL",
|
||||
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane -pnrm POOL",
|
||||
REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
|
|
@ -46,33 +46,33 @@ public class PoolCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testBOTH_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","36b8db57f65be1cc3d2d9d7f9f3f26e4");
|
||||
PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","d8cba4ec4267d7d766081fcead845d08");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testINDEL_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","d1339990291648495bfcf4404f051478");
|
||||
PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","POOLINDEL","8e9b7e89c439b430e95b146a7540c72e");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testINDEL_maxAlleles2_ploidy3_Pools_noRef() {
|
||||
PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","POOLINDEL","b66e7150603310fd57ee7bf9fc590706");
|
||||
PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","POOLINDEL","96087fe9240e3656cc2a4e0ff0174d5b");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testINDEL_maxAlleles2_ploidy1_Pools_noRef() {
|
||||
PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","POOLINDEL","ccdae3fc4d2c922f956a186aaad51c29");
|
||||
PC_LSV_Test_NoRef(" -maxAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","POOLINDEL","6fdae7093831ecfc82a06dd707d62fe9");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","fa5ee7c957c473a80f3a7f3c35dc80b5");
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","6b27634214530d379db70391a9cfc2d7");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMT_SNP_GGA_sp10() {
|
||||
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "6907c8617d49bb57b33f8704ce7f0323");
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "e74d4c73ece45d7fb676b99364df4f1a");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -392,8 +392,6 @@ public class PoolGenotypeLikelihoodsUnitTest {
|
|||
final byte refByte = readPileupTestProvider.getRefByte();
|
||||
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
|
||||
|
||||
final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte);
|
||||
final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte);
|
||||
|
||||
final List<Allele> allAlleles = new ArrayList<Allele>(); // this contains only ref Allele up to now
|
||||
final Set<String> laneIDs = new TreeSet<String>();
|
||||
|
|
@ -411,17 +409,28 @@ public class PoolGenotypeLikelihoodsUnitTest {
|
|||
for (String laneID : laneIDs)
|
||||
noisyErrorModels.put(laneID, Q30ErrorModel);
|
||||
|
||||
// all first ref allele
|
||||
allAlleles.add(Allele.create(refByte,true));
|
||||
for (byte b: BaseUtils.BASES) {
|
||||
if (refByte == b)
|
||||
allAlleles.add(Allele.create(b,true));
|
||||
else
|
||||
if (refByte != b)
|
||||
allAlleles.add(Allele.create(b, false));
|
||||
}
|
||||
|
||||
final int refIdx = 0;
|
||||
int altIdx = -1;
|
||||
|
||||
for (int k=0; k < allAlleles.size(); k++)
|
||||
if (altByte == allAlleles.get(k).getBases()[0]) {
|
||||
altIdx = k;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
|
||||
PrintStream out = null;
|
||||
if (SIMULATE_NOISY_PILEUP) {
|
||||
try {
|
||||
out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table"));
|
||||
out = new PrintStream(new File("GLUnitTest.table"));
|
||||
// out = new PrintStream(new File("/Users/delangel/GATK/Sting_unstable/GLUnitTest.table"));
|
||||
}
|
||||
catch (Exception e) {}
|
||||
|
|
|
|||
|
|
@ -62,9 +62,9 @@ public class ArtificialReadPileupTestProvider {
|
|||
List<String> sampleNames = new ArrayList<String>();
|
||||
private String sampleName(int i) { return sampleNames.get(i); }
|
||||
private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); }
|
||||
public final int offset = 5;
|
||||
public final int locStart = 5; // 1-based
|
||||
public final GenomeLocParser genomeLocParser = new GenomeLocParser(header.getSequenceDictionary());
|
||||
public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,offset,offset);
|
||||
public final GenomeLoc loc = genomeLocParser.createGenomeLoc(artificialContig,locStart,locStart); //1-based
|
||||
public final GenomeLoc window = genomeLocParser.createGenomeLoc(artificialContig,artificialRefStart,10);
|
||||
public final ReferenceContext referenceContext = new ReferenceContext(genomeLocParser,loc,window,this.refBases.getBytes());
|
||||
|
||||
|
|
@ -103,22 +103,22 @@ public class ArtificialReadPileupTestProvider {
|
|||
boolean addBaseErrors, int phredScaledBaseErrorRate) {
|
||||
// RefMetaDataTracker tracker = new RefMetaDataTracker(null,referenceContext);
|
||||
|
||||
|
||||
String refStr = new String(new byte[]{referenceContext.getBase()});
|
||||
ArrayList<Allele> vcAlleles = new ArrayList<Allele>();
|
||||
Allele refAllele, altAllele;
|
||||
if (eventLength == 0) {// SNP case
|
||||
refAllele =Allele.create(referenceContext.getBase(),true);
|
||||
refAllele =Allele.create(refStr,true);
|
||||
altAllele = Allele.create(altBases.substring(0,1), false);
|
||||
|
||||
} else if (eventLength>0){
|
||||
// insertion
|
||||
refAllele = Allele.create(Allele.NULL_ALLELE_STRING, true);
|
||||
altAllele = Allele.create(altBases.substring(0,eventLength), false);
|
||||
refAllele = Allele.create(refStr, true);
|
||||
altAllele = Allele.create(refStr+altBases.substring(0,eventLength), false);
|
||||
}
|
||||
else {
|
||||
// deletion
|
||||
refAllele =Allele.create(refBases.substring(offset,offset+Math.abs(eventLength)),true);
|
||||
altAllele = Allele.create(Allele.NULL_ALLELE_STRING, false);
|
||||
refAllele =Allele.create(refBases.substring(locStart-1,locStart+Math.abs(eventLength)-1),true);
|
||||
altAllele = Allele.create(refBases.substring(locStart-1,locStart), false);
|
||||
}
|
||||
int stop = loc.getStart();
|
||||
vcAlleles.add(refAllele);
|
||||
|
|
@ -153,18 +153,15 @@ public class ArtificialReadPileupTestProvider {
|
|||
int[] numReadsPerAllele, String sample, boolean addErrors, int phredScaledErrorRate) {
|
||||
List<PileupElement> pileupElements = new ArrayList<PileupElement>();
|
||||
int readStart = contigStart;
|
||||
int offset = (contigStop-contigStart+1)/2;
|
||||
int refAlleleLength = 0;
|
||||
|
||||
int refAlleleLength = vc.getReference().getBases().length;
|
||||
int readCounter = 0;
|
||||
int alleleCounter = 0;
|
||||
for (Allele allele: vc.getAlleles()) {
|
||||
if (allele.isReference())
|
||||
refAlleleLength = allele.getBases().length;
|
||||
|
||||
int alleleLength = allele.getBases().length;
|
||||
|
||||
for ( int d = 0; d < numReadsPerAllele[alleleCounter]; d++ ) {
|
||||
byte[] readBases = trueHaplotype(allele, offset, refAlleleLength);
|
||||
byte[] readBases = trueHaplotype(allele, locStart, vc.getReference());
|
||||
if (addErrors)
|
||||
addBaseErrors(readBases, phredScaledErrorRate);
|
||||
|
||||
|
|
@ -176,20 +173,20 @@ public class ArtificialReadPileupTestProvider {
|
|||
read.setReadBases(readBases);
|
||||
read.setReadName(artificialReadName+readCounter++);
|
||||
|
||||
boolean isBeforeDeletion = false, isBeforeInsertion = false;
|
||||
boolean isBeforeDeletion = alleleLength<refAlleleLength;
|
||||
boolean isBeforeInsertion = alleleLength>refAlleleLength;
|
||||
|
||||
int eventLength = alleleLength - refAlleleLength;
|
||||
if (allele.isReference())
|
||||
read.setCigarString(readBases.length + "M");
|
||||
else {
|
||||
isBeforeDeletion = alleleLength<refAlleleLength;
|
||||
isBeforeInsertion = alleleLength>refAlleleLength;
|
||||
if (isBeforeDeletion || isBeforeInsertion)
|
||||
read.setCigarString(offset+"M"+ alleleLength + (isBeforeDeletion?"D":"I") +
|
||||
(readBases.length-offset)+"M");
|
||||
read.setCigarString(locStart+"M"+ eventLength + (isBeforeDeletion?"D":"I") +
|
||||
(readBases.length-locStart)+"M");
|
||||
else // SNP case
|
||||
read.setCigarString(readBases.length+"M");
|
||||
}
|
||||
|
||||
int eventLength = (isBeforeDeletion?refAlleleLength:(isBeforeInsertion?alleleLength:0));
|
||||
read.setReadPairedFlag(false);
|
||||
read.setAlignmentStart(readStart);
|
||||
read.setMappingQuality(artificialMappingQuality);
|
||||
|
|
@ -198,7 +195,7 @@ public class ArtificialReadPileupTestProvider {
|
|||
read.setAttribute("RG", sampleRG(sample).getReadGroupId());
|
||||
|
||||
|
||||
pileupElements.add(new PileupElement(read,offset,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength),eventLength));
|
||||
pileupElements.add(new PileupElement(read,locStart-1,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases.substring(0,alleleLength-1),eventLength));
|
||||
}
|
||||
alleleCounter++;
|
||||
}
|
||||
|
|
@ -206,11 +203,22 @@ public class ArtificialReadPileupTestProvider {
|
|||
return new ReadBackedPileupImpl(loc,pileupElements);
|
||||
}
|
||||
|
||||
private byte[] trueHaplotype(Allele allele, int offset, int refAlleleLength) {
|
||||
/**
|
||||
* create haplotype based on a particular allele
|
||||
* @param allele Allele of interest. ASSUMED TO INCLUDE REF BASE AT startPosition!
|
||||
* @param startPosition 1-based start position of allele
|
||||
* @param refAllele REF allele
|
||||
* @return
|
||||
*/
|
||||
private byte[] trueHaplotype(Allele allele, int startPosition, Allele refAllele) {
|
||||
|
||||
// create haplotype based on a particular allele
|
||||
String prefix = refBases.substring(offset);
|
||||
// startPosition is 1-based.
|
||||
// so, if startPosition == 5, we need to include positions 1 to 4 , or indeces 0 to 3 of string
|
||||
String prefix = refBases.substring(0,startPosition-1);
|
||||
String alleleBases = new String(allele.getBases());
|
||||
String postfix = refBases.substring(offset+refAlleleLength,refBases.length());
|
||||
// where to start postfix? We have (startPosition-1) prefix bases + refAllele.length bases before postfix
|
||||
String postfix = refBases.substring(startPosition -1 + refAllele.getBases().length,refBases.length());
|
||||
|
||||
return (prefix+alleleBases+postfix).getBytes();
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue