diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index b6f24433e..7b0db6855 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -28,9 +29,13 @@ import java.util.*; public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; - private String motherId; - private String fatherId; - private String childId; + private Set trios; + + private class Trio { + String motherId; + String fatherId; + String childId; + } public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( mendelianViolation == null ) { @@ -38,17 +43,27 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } else { - throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio."); + throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line."); } } Map toRet = new HashMap(1); - boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() && - vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() && - vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods(); - if ( hasAppropriateGenotypes ) - toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId)); + //double pNoMV = 1.0; + double maxMVLR = Double.MIN_VALUE; + for ( Trio trio : trios ) { + boolean hasAppropriateGenotypes = vc.hasGenotype(trio.motherId) && vc.getGenotype(trio.motherId).hasLikelihoods() && + vc.hasGenotype(trio.fatherId) && vc.getGenotype(trio.fatherId).hasLikelihoods() && + vc.hasGenotype(trio.childId) && vc.getGenotype(trio.childId).hasLikelihoods(); + if ( hasAppropriateGenotypes ) { + Double likR = mendelianViolation.violationLikelihoodRatio(vc,trio.motherId,trio.fatherId,trio.childId); + maxMVLR = likR > maxMVLR ? likR : maxMVLR; + //pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR))); + } + } + //double pSomeMV = 1.0-pNoMV; + //toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV)); + toRet.put("MVLR",maxMVLR); return toRet; } @@ -58,25 +73,24 @@ public class MVLikelihoodRatio extends InfoFieldAnnotation implements Experiment public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } private boolean checkAndSetSamples(SampleDB db){ + trios = new HashSet(); Set families = db.getFamilyIDs(); - if(families.size() != 1) - return false; - - Set family = db.getFamily(families.iterator().next()); - if(family.size() != 3) - return false; - - Iterator sampleIter = family.iterator(); - Sample sample; - for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){ - if(sample.getParents().size()==2){ - motherId = sample.getMaternalID(); - fatherId = sample.getPaternalID(); - childId = sample.getID(); - return true; + for ( String familyString : families ) { + Set family = db.getFamily(familyString); + Iterator sampleIterator = family.iterator(); + Sample sample; + for ( sample = sampleIterator.next(); sampleIterator.hasNext(); sample=sampleIterator.next()) { + if ( sample.getParents().size() == 2 ) { + Trio trio = new Trio(); + trio.childId = sample.getID(); + trio.fatherId = sample.getFather().getID(); + trio.motherId = sample.getMother().getID(); + trios.add(trio); + } } } - return false; + + return trios.size() > 0; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e88505f99..1f06cc249 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -297,7 +297,7 @@ public class VariantDataManager { case SNP: return evalVC.isSNP() || evalVC.isMNP(); case INDEL: - return evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); + return evalVC.isStructuralIndel() || evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); case BOTH: return true; default: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index bfd9aa52f..4c0c0cabf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -322,6 +322,9 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="justRead", doc="If true, we won't actually write the output file. For efficiency testing only", required=false) private boolean justRead = false; + @Argument(doc="indel size select",required=false,fullName="maxIndelSize") + private int maxIndelSize = Integer.MAX_VALUE; + /* Private class used to store the intermediate variants in the integer random selection process */ private static class RandomVariantStructure { @@ -541,6 +544,9 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; + if ( badIndelSize(vc) ) + continue; + VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS); if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) { @@ -572,6 +578,20 @@ public class SelectVariants extends RodWalker implements TreeR return 1; } + private boolean badIndelSize(final VariantContext vc) { + if ( vc.getReference().length() > maxIndelSize ) { + return true; + } + + for ( Allele a : vc.getAlternateAlleles() ) { + if ( a.length() > maxIndelSize ) { + return true; + } + } + + return false; + } + private boolean hasPLs(final VariantContext vc) { for ( Genotype g : vc.getGenotypes() ) { if ( g.hasLikelihoods() ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index 3fba8fa77..7111bac46 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -76,47 +76,11 @@ public class VariantsToBinaryPed extends RodWalker { private List famOrder = new ArrayList(); public void initialize() { - vv.variantCollection = variantCollection; - vv.dbsnp = dbsnp; - vv.DO_NOT_VALIDATE_FILTERED = true; - vv.type = ValidateVariants.ValidationType.REF; + initializeValidator(); + writeBedHeader(); + Map> sampleMetaValues = parseMetaData(); // create temporary output streams and buffers - // write magic bits into the ped file - try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); - // ultimately, the bed will be in individual-major mode - } catch (IOException e) { - throw new ReviewedStingException("error writing to output file."); - } - // write to the fam file, the first six columns of the standard ped file - // first, load data from the input meta data file - Map> metaValues = new HashMap>(); - logger.debug("Reading in metadata..."); - try { - if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { - for ( String line : new XReadLines(metaDataFile) ) { - String[] famSplit = line.split("\\t"); - String sid = famSplit[1]; - outFam.printf("%s%n",line); - } - } else { - for ( String line : new XReadLines(metaDataFile) ) { - logger.debug(line); - String[] split = line.split("\\t"); - String sampleID = split[0]; - String keyVals = split[1]; - HashMap values = new HashMap(); - for ( String kvp : keyVals.split(";") ) { - String[] kvp_split = kvp.split("="); - values.put(kvp_split[0],kvp_split[1]); - } - metaValues.put(sampleID,values); - } - } - } catch (FileNotFoundException e) { - throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); - } // family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype int dummyID = 0; // increments for dummy parental and family IDs used // want to be especially careful to maintain order here @@ -126,21 +90,23 @@ public class VariantsToBinaryPed extends RodWalker { continue; } for ( String sample : header.getValue().getGenotypeSamples() ) { - Map mVals = metaValues.get(sample); - if ( mVals == null ) { - throw new UserException("No metadata provided for sample "+sample); + if ( ! metaDataFile.getAbsolutePath().endsWith(".fam") ) { + Map mVals = sampleMetaValues.get(sample); + if ( mVals == null ) { + throw new UserException("No metadata provided for sample "+sample); + } + if ( ! mVals.containsKey("phenotype") ) { + throw new UserException("No phenotype data provided for sample "+sample); + } + String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); + String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); + String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); + String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; + String pheno = mVals.get("phenotype"); + outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); } - if ( ! mVals.containsKey("phenotype") ) { - throw new UserException("No phenotype data provided for sample "+sample); - } - String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID); - String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID); - String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID); - String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; - String pheno = mVals.get("phenotype"); - outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); try { - File temp = File.createTempFile(sample, ".tmp"); + File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); printMap.put(sample,new PrintStream(temp)); tempFiles.put(sample,temp); } catch (IOException e) { @@ -216,6 +182,7 @@ public class VariantsToBinaryPed extends RodWalker { // reset the buffer for this sample genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); } + byteCount = 0; } genotypeCount = 0; } @@ -337,4 +304,69 @@ public class VariantsToBinaryPed extends RodWalker { throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF."); } } + + private void initializeValidator() { + vv.variantCollection = variantCollection; + vv.dbsnp = dbsnp; + vv.DO_NOT_VALIDATE_FILTERED = true; + vv.type = ValidateVariants.ValidationType.REF; + } + + private void writeBedHeader() { + // write magic bits into the ped file + try { + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + // ultimately, the bed will be in individual-major mode + } catch (IOException e) { + throw new ReviewedStingException("error writing to output file."); + } + } + + private Map> parseMetaData() { + // write to the fam file, the first six columns of the standard ped file + // first, load data from the input meta data file + Map> metaValues = new HashMap>(); + logger.debug("Reading in metadata..."); + try { + if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { + for ( String line : new XReadLines(metaDataFile) ) { + String[] famSplit = line.split("\\s+"); + if ( famSplit.length != 6 ) { + throw new UserException("Line of the fam file is malformatted. Expected 6 entries. Line is "+line); + } + String sid = famSplit[1]; + String fid = famSplit[0]; + String mom = famSplit[2]; + String dad = famSplit[3]; + String sex = famSplit[4]; + String pheno = famSplit[5]; + HashMap values = new HashMap(); + values.put("mom",mom); + values.put("dad",dad); + values.put("fid",fid); + values.put("sex",sex); + values.put("phenotype",pheno); + metaValues.put(sid,values); + outFam.printf("%s%n",line); + } + } else { + for ( String line : new XReadLines(metaDataFile) ) { + logger.debug(line); + String[] split = line.split("\\s+"); + String sampleID = split[0]; + String keyVals = split[1]; + HashMap values = new HashMap(); + for ( String kvp : keyVals.split(";") ) { + String[] kvp_split = kvp.split("="); + values.put(kvp_split[0],kvp_split[1]); + } + metaValues.put(sampleID,values); + } + } + } catch (FileNotFoundException e) { + throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e); + } + + return metaValues; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 1fe6b8652..8015889f5 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.apache.commons.math.stat.descriptive.rank.Max; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broad.tribble.TribbleException; @@ -178,9 +179,8 @@ import java.util.*; */ public class VariantContext implements Feature { // to enable tribble integration private final static boolean WARN_ABOUT_BAD_END = true; + private final static long MAX_ALLELE_SIZE_FOR_NON_SV = 150; final protected static Logger logger = Logger.getLogger(VariantContext.class); - - private boolean fullyDecoded = false; protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; @@ -458,6 +458,7 @@ public class VariantContext implements Feature { // to enable tribble integratio SNP, MNP, // a multi-nucleotide polymorphism INDEL, + STRUCTURAL_INDEL, SYMBOLIC, MIXED, } @@ -530,6 +531,18 @@ public class VariantContext implements Feature { // to enable tribble integratio return getType() == Type.SYMBOLIC; } + public boolean isStructuralIndel() { + return getType() == Type.STRUCTURAL_INDEL; + } + + /** + * + * @return true if the variant is symbolic or a large indel + */ + public boolean isSymbolicOrSV() { + return isSymbolic() || isStructuralIndel(); + } + public boolean isMNP() { return getType() == Type.MNP; } @@ -1250,6 +1263,14 @@ public class VariantContext implements Feature { // to enable tribble integratio // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. + + // Because a number of structural variation callers write the whole alternate allele into the VCF where possible, + // this can result in insertion/deletion alleles of structural variant size, e.g. 151+. As of July 2012, we now + // classify these as structural events, rather than indel events, as we think differently about the mechanism, + // representation, and handling of these events. Check for this case here: + if ( ref.length() > MAX_ALLELE_SIZE_FOR_NON_SV || allele.length() > MAX_ALLELE_SIZE_FOR_NON_SV ) + return Type.STRUCTURAL_INDEL; + return Type.INDEL; // old incorrect logic: diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 6f1370008..9bec1b75d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -9,7 +9,7 @@ import java.util.Arrays; import java.util.List; /** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * Integration tests for the Depth of Coverage walker * * @Author chartl * @Date Feb 25, 2010 diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java new file mode 100644 index 000000000..07e82b869 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -0,0 +1,92 @@ +package org.broadinstitute.sting.gatk.walkers.variantutils; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/20/12 + * Time: 9:57 PM + * To change this template use File | Settings | File Templates. + */ +public class VariantsToBinaryPedIntegrationTest extends WalkerTest { + + public static final String VTBP_DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Validation_Data/VariantsToBinaryPed/"; + + public static String baseTestString(String inputVCF, String inputMetaData, int gq) { + return "-T VariantsToBinaryPed -R " + b37KGReference + + " -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) + + " -bim %s -fam %s -bed %s"; + + } + + @Test + public void testNA12878Alone() { + String testName = "testNA12878Alone"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","8e8bc0b5e69f22c54c0960f13c25d26c","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testNA12878AloneMetaData() { + String testName = "testNA12878AloneMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","02f1c462ebc8576e399d0e94f729fd95") + ); + + executeTest(testName, spec); + } + + @Test + public void testCEUTrio() { + String testName = "testCEUTrio"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","900f22c6d49a6ba0774466e99592e51d","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } + + @Test + public void testCEUTrioMetaData() { + String testName = "testCEUTrioMetaData"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.metadata.txt",10), + 3, + Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","2113d2cc0a059e35b1565196b7c5d98f","7887d2e0bf605dbcd0688c552cdb99d5") + ); + + executeTest(testName, spec); + } + + @Test + public void testMalformedFam() { + String testName = "testMalformedFam"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("CEUTrio.subset.vcf", "CEUTrio.malformed.fam",10), + 3, + UserException.class + ); + + executeTest(testName, spec); + } +} + +