number of stashed changes are lurking in here. In order of importance:

- Fix for M_Trieb's error report on the forum, and addition of integration tests to cover the walker.
 - Addition of StructuralIndel as a class of variation within the VariantContext. These are for variants with a full alt allele that's >150bp in length.
 - Adaptation of the MVLikelihoodRatio to work for a set of trios (takes the max over the trios of the MVLR)
 - InsertSizeDistribution changed to use the new gatk report output (it was previously broken)
 - RetrogeneDiscovery changed to be compatible with the new gatk report
 - A maxIndelSize argument added to SelectVariants
 - ByTranscriptEvaluator rewritten for cleanliness
 - VariantRecalibrator modified to not exclude structural indels from recalibration if the mode is INDEL
 - Documentation added to DepthOfCoverageIntegrationTest (no, don't yell at chartl ;_; )

Also sorry for the long commit history behind this that is the result of fixing merge conflicts. Because this *also* fixes a conflict (from git stash apply), for some reason I can't rebase all of them away. I'm pretty sure some of the commit notes say "this note isn't important because I'm going to rebase it anyway".
This commit is contained in:
Christopher Hartl 2012-08-21 07:03:50 -04:00
parent 3dfe8df262
commit ba8622ff0d
7 changed files with 260 additions and 81 deletions

View File

@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.MendelianViolation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
@ -28,9 +29,13 @@ import java.util.*;
public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
private MendelianViolation mendelianViolation = null;
private String motherId;
private String fatherId;
private String childId;
private Set<Trio> trios;
private class Trio {
String motherId;
String fatherId;
String childId;
}
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
if ( mendelianViolation == null ) {
@ -38,17 +43,27 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP );
}
else {
throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio.");
throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line.");
}
}
Map<String,Object> toRet = new HashMap<String,Object>(1);
boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() &&
vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() &&
vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods();
if ( hasAppropriateGenotypes )
toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId));
//double pNoMV = 1.0;
double maxMVLR = Double.MIN_VALUE;
for ( Trio trio : trios ) {
boolean hasAppropriateGenotypes = vc.hasGenotype(trio.motherId) && vc.getGenotype(trio.motherId).hasLikelihoods() &&
vc.hasGenotype(trio.fatherId) && vc.getGenotype(trio.fatherId).hasLikelihoods() &&
vc.hasGenotype(trio.childId) && vc.getGenotype(trio.childId).hasLikelihoods();
if ( hasAppropriateGenotypes ) {
Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.motherId,trio.fatherId,trio.childId);
maxMVLR = likR > maxMVLR ? likR : maxMVLR;
//pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR)));
}
}
//double pSomeMV = 1.0-pNoMV;
//toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV));
toRet.put("MVLR",maxMVLR);
return toRet;
}
@ -58,25 +73,24 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment
public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
private boolean checkAndSetSamples(SampleDB db){
trios = new HashSet<Trio>();
Set<String> families = db.getFamilyIDs();
if(families.size() != 1)
return false;
Set<Sample> family = db.getFamily(families.iterator().next());
if(family.size() != 3)
return false;
Iterator<Sample> sampleIter = family.iterator();
Sample sample;
for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){
if(sample.getParents().size()==2){
motherId = sample.getMaternalID();
fatherId = sample.getPaternalID();
childId = sample.getID();
return true;
for ( String familyString : families ) {
Set<Sample> family = db.getFamily(familyString);
Iterator<Sample> sampleIterator = family.iterator();
Sample sample;
for ( sample = sampleIterator.next(); sampleIterator.hasNext(); sample=sampleIterator.next()) {
if ( sample.getParents().size() == 2 ) {
Trio trio = new Trio();
trio.childId = sample.getID();
trio.fatherId = sample.getFather().getID();
trio.motherId = sample.getMother().getID();
trios.add(trio);
}
}
}
return false;
return trios.size() > 0;
}
}

View File

@ -297,7 +297,7 @@ public class VariantDataManager {
case SNP:
return evalVC.isSNP() || evalVC.isMNP();
case INDEL:
return evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic();
return evalVC.isStructuralIndel() || evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic();
case BOTH:
return true;
default:

View File

@ -322,6 +322,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
@Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false)
private boolean justRead = false;
@Argument(doc="indel size select",required=false,fullName="maxIndelSize")
private int maxIndelSize = Integer.MAX_VALUE;
/* Private class used to store the intermediate variants in the integer random selection process */
private static class RandomVariantStructure {
@ -541,6 +544,9 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
if (!selectedTypes.contains(vc.getType()))
continue;
if ( badIndelSize(vc) )
continue;
VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS);
if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) {
@ -572,6 +578,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> implements TreeR
return 1;
}
private boolean badIndelSize(final VariantContext vc) {
if ( vc.getReference().length() > maxIndelSize ) {
return true;
}
for ( Allele a : vc.getAlternateAlleles() ) {
if ( a.length() > maxIndelSize ) {
return true;
}
}
return false;
}
private boolean hasPLs(final VariantContext vc) {
for ( Genotype g : vc.getGenotypes() ) {
if ( g.hasLikelihoods() )

View File

@ -76,47 +76,11 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private List<String> famOrder = new ArrayList<String>();
public void initialize() {
vv.variantCollection = variantCollection;
vv.dbsnp = dbsnp;
vv.DO_NOT_VALIDATE_FILTERED = true;
vv.type = ValidateVariants.ValidationType.REF;
initializeValidator();
writeBedHeader();
Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
// create temporary output streams and buffers
// write magic bits into the ped file
try {
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
// ultimately, the bed will be in individual-major mode
} catch (IOException e) {
throw new ReviewedStingException("error writing to output file.");
}
// write to the fam file, the first six columns of the standard ped file
// first, load data from the input meta data file
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
logger.debug("Reading in metadata...");
try {
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
for ( String line : new XReadLines(metaDataFile) ) {
String[] famSplit = line.split("\\t");
String sid = famSplit[1];
outFam.printf("%s%n",line);
}
} else {
for ( String line : new XReadLines(metaDataFile) ) {
logger.debug(line);
String[] split = line.split("\\t");
String sampleID = split[0];
String keyVals = split[1];
HashMap<String,String> values = new HashMap<String, String>();
for ( String kvp : keyVals.split(";") ) {
String[] kvp_split = kvp.split("=");
values.put(kvp_split[0],kvp_split[1]);
}
metaValues.put(sampleID,values);
}
}
} catch (FileNotFoundException e) {
throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
}
// family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype
int dummyID = 0; // increments for dummy parental and family IDs used
// want to be especially careful to maintain order here
@ -126,21 +90,23 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
continue;
}
for ( String sample : header.getValue().getGenotypeSamples() ) {
Map<String,String> mVals = metaValues.get(sample);
if ( mVals == null ) {
throw new UserException("No metadata provided for sample "+sample);
if ( ! metaDataFile.getAbsolutePath().endsWith(".fam") ) {
Map<String,String> mVals = sampleMetaValues.get(sample);
if ( mVals == null ) {
throw new UserException("No metadata provided for sample "+sample);
}
if ( ! mVals.containsKey("phenotype") ) {
throw new UserException("No phenotype data provided for sample "+sample);
}
String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
String pheno = mVals.get("phenotype");
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
}
if ( ! mVals.containsKey("phenotype") ) {
throw new UserException("No phenotype data provided for sample "+sample);
}
String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
String pheno = mVals.get("phenotype");
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
try {
File temp = File.createTempFile(sample, ".tmp");
File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
printMap.put(sample,new PrintStream(temp));
tempFiles.put(sample,temp);
} catch (IOException e) {
@ -216,6 +182,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
// reset the buffer for this sample
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
}
byteCount = 0;
}
genotypeCount = 0;
}
@ -337,4 +304,69 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF.");
}
}
private void initializeValidator() {
vv.variantCollection = variantCollection;
vv.dbsnp = dbsnp;
vv.DO_NOT_VALIDATE_FILTERED = true;
vv.type = ValidateVariants.ValidationType.REF;
}
private void writeBedHeader() {
// write magic bits into the ped file
try {
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
// ultimately, the bed will be in individual-major mode
} catch (IOException e) {
throw new ReviewedStingException("error writing to output file.");
}
}
private Map<String,Map<String,String>> parseMetaData() {
// write to the fam file, the first six columns of the standard ped file
// first, load data from the input meta data file
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
logger.debug("Reading in metadata...");
try {
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
for ( String line : new XReadLines(metaDataFile) ) {
String[] famSplit = line.split("\\s+");
if ( famSplit.length != 6 ) {
throw new UserException("Line of the fam file is malformatted. Expected 6 entries. Line is "+line);
}
String sid = famSplit[1];
String fid = famSplit[0];
String mom = famSplit[2];
String dad = famSplit[3];
String sex = famSplit[4];
String pheno = famSplit[5];
HashMap<String,String> values = new HashMap<String, String>();
values.put("mom",mom);
values.put("dad",dad);
values.put("fid",fid);
values.put("sex",sex);
values.put("phenotype",pheno);
metaValues.put(sid,values);
outFam.printf("%s%n",line);
}
} else {
for ( String line : new XReadLines(metaDataFile) ) {
logger.debug(line);
String[] split = line.split("\\s+");
String sampleID = split[0];
String keyVals = split[1];
HashMap<String,String> values = new HashMap<String, String>();
for ( String kvp : keyVals.split(";") ) {
String[] kvp_split = kvp.split("=");
values.put(kvp_split[0],kvp_split[1]);
}
metaValues.put(sampleID,values);
}
}
} catch (FileNotFoundException e) {
throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
}
return metaValues;
}
}

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.apache.commons.math.stat.descriptive.rank.Max;
import org.apache.log4j.Logger;
import org.broad.tribble.Feature;
import org.broad.tribble.TribbleException;
@ -178,9 +179,8 @@ import java.util.*;
*/
public class VariantContext implements Feature { // to enable tribble integration
private final static boolean WARN_ABOUT_BAD_END = true;
private final static long MAX_ALLELE_SIZE_FOR_NON_SV = 150;
final protected static Logger logger = Logger.getLogger(VariantContext.class);
private boolean fullyDecoded = false;
protected CommonInfo commonInfo = null;
public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR;
@ -458,6 +458,7 @@ public class VariantContext implements Feature { // to enable tribble integratio
SNP,
MNP, // a multi-nucleotide polymorphism
INDEL,
STRUCTURAL_INDEL,
SYMBOLIC,
MIXED,
}
@ -530,6 +531,18 @@ public class VariantContext implements Feature { // to enable tribble integratio
return getType() == Type.SYMBOLIC;
}
public boolean isStructuralIndel() {
return getType() == Type.STRUCTURAL_INDEL;
}
/**
*
* @return true if the variant is symbolic or a large indel
*/
public boolean isSymbolicOrSV() {
return isSymbolic() || isStructuralIndel();
}
public boolean isMNP() {
return getType() == Type.MNP;
}
@ -1250,6 +1263,14 @@ public class VariantContext implements Feature { // to enable tribble integratio
// performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type
// is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point
// in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL.
// Because a number of structural variation callers write the whole alternate allele into the VCF where possible,
// this can result in insertion/deletion alleles of structural variant size, e.g. 151+. As of July 2012, we now
// classify these as structural events, rather than indel events, as we think differently about the mechanism,
// representation, and handling of these events. Check for this case here:
if ( ref.length() > MAX_ALLELE_SIZE_FOR_NON_SV || allele.length() > MAX_ALLELE_SIZE_FOR_NON_SV )
return Type.STRUCTURAL_INDEL;
return Type.INDEL;
// old incorrect logic:

View File

@ -9,7 +9,7 @@ import java.util.Arrays;
import java.util.List;
/**
* IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl
* Integration tests for the Depth of Coverage walker
*
* @Author chartl
* @Date Feb 25, 2010

View File

@ -0,0 +1,92 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 8/20/12
* Time: 9:57 PM
* To change this template use File | Settings | File Templates.
*/
public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
public static final String VTBP_DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Validation_Data/VariantsToBinaryPed/";
public static String baseTestString(String inputVCF, String inputMetaData, int gq) {
return "-T VariantsToBinaryPed -R " + b37KGReference +
" -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) +
" -bim %s -fam %s -bed %s";
}
@Test
public void testNA12878Alone() {
String testName = "testNA12878Alone";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","8e8bc0b5e69f22c54c0960f13c25d26c","02f1c462ebc8576e399d0e94f729fd95")
);
executeTest(testName, spec);
}
@Test
public void testNA12878AloneMetaData() {
String testName = "testNA12878AloneMetaData";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","02f1c462ebc8576e399d0e94f729fd95")
);
executeTest(testName, spec);
}
@Test
public void testCEUTrio() {
String testName = "testCEUTrio";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10),
3,
Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","900f22c6d49a6ba0774466e99592e51d","7887d2e0bf605dbcd0688c552cdb99d5")
);
executeTest(testName, spec);
}
@Test
public void testCEUTrioMetaData() {
String testName = "testCEUTrioMetaData";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("CEUTrio.subset.vcf", "CEUTrio.metadata.txt",10),
3,
Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","2113d2cc0a059e35b1565196b7c5d98f","7887d2e0bf605dbcd0688c552cdb99d5")
);
executeTest(testName, spec);
}
@Test
public void testMalformedFam() {
String testName = "testMalformedFam";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("CEUTrio.subset.vcf", "CEUTrio.malformed.fam",10),
3,
UserException.class
);
executeTest(testName, spec);
}
}