Commit changes in Variants To Binary Ped to the stable repository to be available prior to next release.

This commit is contained in:
Christopher Hartl 2012-09-27 00:13:32 -04:00
parent 7cf9911924
commit 55cdf4f9b7
4 changed files with 296 additions and 53 deletions

View File

@ -7,7 +7,9 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -15,6 +17,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@ -34,6 +37,28 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@ArgumentCollection @ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
/**
* The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This
* is what Plink describes as a fam file. An example fam file is (note that there is no header):
*
* CEUTrio NA12878 NA12891 NA12892 2 -9
* CEUTrio NA12891 UNKN1 UNKN2 2 -9
* CEUTrio NA12892 UNKN3 UNKN4 1 -9
*
* where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex)
*
* An alternate format is a two-column key-value file
*
* NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9
* NA12891 fid=CEUTrio;sex=2;phenotype=-9
* NA12892 fid=CEUTrio;sex=1;phenotype=-9
*
* wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs.
*
* Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the
* command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the
* alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data.
*/
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " + @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " +
"(in which case it will be copied to the file you provide as fam output).") "(in which case it will be copied to the file you provide as fam output).")
File metaDataFile; File metaDataFile;
@ -76,47 +101,11 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private List<String> famOrder = new ArrayList<String>(); private List<String> famOrder = new ArrayList<String>();
public void initialize() { public void initialize() {
vv.variantCollection = variantCollection; initializeValidator();
vv.dbsnp = dbsnp; writeBedHeader();
vv.DO_NOT_VALIDATE_FILTERED = true; Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
vv.type = ValidateVariants.ValidationType.REF;
// create temporary output streams and buffers // create temporary output streams and buffers
// write magic bits into the ped file
try {
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
// ultimately, the bed will be in individual-major mode
} catch (IOException e) {
throw new ReviewedStingException("error writing to output file.");
}
// write to the fam file, the first six columns of the standard ped file
// first, load data from the input meta data file
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
logger.debug("Reading in metadata...");
try {
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
for ( String line : new XReadLines(metaDataFile) ) {
String[] famSplit = line.split("\\t");
String sid = famSplit[1];
outFam.printf("%s%n",line);
}
} else {
for ( String line : new XReadLines(metaDataFile) ) {
logger.debug(line);
String[] split = line.split("\\t");
String sampleID = split[0];
String keyVals = split[1];
HashMap<String,String> values = new HashMap<String, String>();
for ( String kvp : keyVals.split(";") ) {
String[] kvp_split = kvp.split("=");
values.put(kvp_split[0],kvp_split[1]);
}
metaValues.put(sampleID,values);
}
}
} catch (FileNotFoundException e) {
throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
}
// family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype // family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype
int dummyID = 0; // increments for dummy parental and family IDs used int dummyID = 0; // increments for dummy parental and family IDs used
// want to be especially careful to maintain order here // want to be especially careful to maintain order here
@ -126,21 +115,29 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
continue; continue;
} }
for ( String sample : header.getValue().getGenotypeSamples() ) { for ( String sample : header.getValue().getGenotypeSamples() ) {
Map<String,String> mVals = metaValues.get(sample); if ( ! metaDataFile.getAbsolutePath().endsWith(".fam") ) {
if ( mVals == null ) { Map<String,String> mVals = sampleMetaValues.get(sample);
throw new UserException("No metadata provided for sample "+sample); if ( mVals == null ) {
throw new UserException("No metadata provided for sample "+sample);
}
if ( ! mVals.containsKey("phenotype") ) {
throw new UserException("No phenotype data provided for sample "+sample);
}
String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
String pheno = mVals.get("phenotype");
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
} else {
// even if a fam file is input, we can't diverge the bed file from the fam file, which
// could lead to a malformed plink trio. Fail fast if there's any extra sample in the VCF.
if ( ! sampleMetaValues.containsKey(sample) ) {
throw new UserException("No metadata provided for sample "+sample);
}
} }
if ( ! mVals.containsKey("phenotype") ) {
throw new UserException("No phenotype data provided for sample "+sample);
}
String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
String pheno = mVals.get("phenotype");
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
try { try {
File temp = File.createTempFile(sample, ".tmp"); File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
printMap.put(sample,new PrintStream(temp)); printMap.put(sample,new PrintStream(temp));
tempFiles.put(sample,temp); tempFiles.put(sample,temp);
} catch (IOException e) { } catch (IOException e) {
@ -216,6 +213,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
// reset the buffer for this sample // reset the buffer for this sample
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
} }
byteCount = 0;
} }
genotypeCount = 0; genotypeCount = 0;
} }
@ -305,7 +303,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getFlippedEncoding(Genotype g, int offset) { private byte getFlippedEncoding(Genotype g, int offset) {
byte b; byte b;
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) { if ( ! checkGQIsGood(g) ) {
b = NO_CALL; b = NO_CALL;
} else if ( g.isHomRef() ) { } else if ( g.isHomRef() ) {
b = HOM_VAR; b = HOM_VAR;
@ -320,6 +318,16 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
return (byte) (b << (2*offset)); return (byte) (b << (2*offset));
} }
private boolean checkGQIsGood(Genotype genotype) {
if ( genotype.hasGQ() ) {
return genotype.getGQ() >= minGenotypeQuality;
} else if ( genotype.hasLikelihoods() ) {
return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality;
}
return false;
}
private static String getID(VariantContext v) { private static String getID(VariantContext v) {
if ( v.hasID() ) { if ( v.hasID() ) {
return v.getID(); return v.getID();
@ -337,4 +345,69 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF."); throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF.");
} }
} }
private void initializeValidator() {
vv.variantCollection = variantCollection;
vv.dbsnp = dbsnp;
vv.DO_NOT_VALIDATE_FILTERED = true;
vv.type = ValidateVariants.ValidationType.REF;
}
private void writeBedHeader() {
// write magic bits into the ped file
try {
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
// ultimately, the bed will be in individual-major mode
} catch (IOException e) {
throw new ReviewedStingException("error writing to output file.");
}
}
private Map<String,Map<String,String>> parseMetaData() {
// write to the fam file, the first six columns of the standard ped file
// first, load data from the input meta data file
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
logger.debug("Reading in metadata...");
try {
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
for ( String line : new XReadLines(metaDataFile) ) {
String[] famSplit = line.split("\\s+");
if ( famSplit.length != 6 ) {
throw new UserException("Line of the fam file is malformatted. Expected 6 entries. Line is "+line);
}
String sid = famSplit[1];
String fid = famSplit[0];
String mom = famSplit[2];
String dad = famSplit[3];
String sex = famSplit[4];
String pheno = famSplit[5];
HashMap<String,String> values = new HashMap<String, String>();
values.put("mom",mom);
values.put("dad",dad);
values.put("fid",fid);
values.put("sex",sex);
values.put("phenotype",pheno);
metaValues.put(sid,values);
outFam.printf("%s%n",line);
}
} else {
for ( String line : new XReadLines(metaDataFile) ) {
logger.debug(line);
String[] split = line.split("\\s+");
String sampleID = split[0];
String keyVals = split[1];
HashMap<String,String> values = new HashMap<String, String>();
for ( String kvp : keyVals.split(";") ) {
String[] kvp_split = kvp.split("=");
values.put(kvp_split[0],kvp_split[1]);
}
metaValues.put(sampleID,values);
}
}
} catch (FileNotFoundException e) {
throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
}
return metaValues;
}
} }

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Arrays; import java.util.Arrays;
import java.util.EnumMap; import java.util.EnumMap;
import java.util.List;
public class GenotypeLikelihoods { public class GenotypeLikelihoods {
private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5; private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5;
@ -167,10 +168,36 @@ public class GenotypeLikelihoods {
//Return the neg log10 Genotype Quality (GQ) for the given genotype //Return the neg log10 Genotype Quality (GQ) for the given genotype
//Returns Double.NEGATIVE_INFINITY in case of missing genotype //Returns Double.NEGATIVE_INFINITY in case of missing genotype
/**
* This is really dangerous and returns completely wrong results for genotypes from a multi-allelic context.
* Use getLog10GQ(Genotype,VariantContext) or getLog10GQ(Genotype,List<Allele>) in place of it.
*
* If you **know** you're biallelic, use getGQLog10FromLikelihoods directly.
* @param genotype - actually a genotype type (no call, hom ref, het, hom var)
* @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best).
*/
@Deprecated
public double getLog10GQ(GenotypeType genotype){ public double getLog10GQ(GenotypeType genotype){
return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector()); return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector());
} }
@Requires({"genotypeAlleles != null","genotypeAlleles.size()==2","contextAlleles != null","contextAlleles.size() >= 1"})
private double getLog10GQ(List<Allele> genotypeAlleles,List<Allele> contextAlleles) {
int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0));
int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1));
int plIndex = calculatePLindex(allele1Index,allele2Index);
return getGQLog10FromLikelihoods(plIndex,getAsVector());
}
public double getLog10GQ(Genotype genotype, List<Allele> vcAlleles ) {
return getLog10GQ(genotype.getAlleles(),vcAlleles);
}
public double getLog10GQ(Genotype genotype, VariantContext context) {
return getLog10GQ(genotype,context.getAlleles());
}
public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){ public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){
if(likelihoods == null) if(likelihoods == null)
return Double.NEGATIVE_INFINITY; return Double.NEGATIVE_INFINITY;

View File

@ -0,0 +1,117 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 8/20/12
* Time: 9:57 PM
* To change this template use File | Settings | File Templates.
*/
public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
public static final String VTBP_DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Validation_Data/VariantsToBinaryPed/";
public static String baseTestString(String inputVCF, String inputMetaData, int gq) {
return "-T VariantsToBinaryPed -R " + b37KGReference +
" -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) +
" -bim %s -fam %s -bed %s";
}
@Test
public void testNA12878Alone() {
String testName = "testNA12878Alone";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","8e8bc0b5e69f22c54c0960f13c25d26c","02f1c462ebc8576e399d0e94f729fd95")
);
executeTest(testName, spec);
}
@Test
public void testNA12878AloneMetaData() {
String testName = "testNA12878AloneMetaData";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","02f1c462ebc8576e399d0e94f729fd95")
);
executeTest(testName, spec);
}
@Test
public void testCEUTrio() {
String testName = "testCEUTrio";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10),
3,
Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","900f22c6d49a6ba0774466e99592e51d","7887d2e0bf605dbcd0688c552cdb99d5")
);
executeTest(testName, spec);
}
@Test
public void testCEUTrioMetaData() {
String testName = "testCEUTrioMetaData";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("CEUTrio.subset.vcf", "CEUTrio.metadata.txt",10),
3,
Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","2113d2cc0a059e35b1565196b7c5d98f","7887d2e0bf605dbcd0688c552cdb99d5")
);
executeTest(testName, spec);
}
@Test
public void testMalformedFam() {
String testName = "testMalformedFam";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("CEUTrio.subset.vcf", "CEUTrio.malformed.fam",10),
3,
UserException.class
);
executeTest(testName, spec);
}
@Test
public void testFailFast() {
String testName = "testFailFast";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("HapMap.testFailFast.vcf", "HapMap_only_famids.fam",10),
3,
UserException.class
);
executeTest(testName, spec);
}
@Test
public void testFailFastMeta() {
String testName = "testFailFastMeta";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("HapMap.testFailFast.vcf", "HapMap_only_famids.metadata.txt",10),
3,
UserException.class
);
executeTest(testName, spec);
}
}

View File

@ -29,12 +29,15 @@ package org.broadinstitute.sting.utils.variantcontext;
// the imports for unit testing. // the imports for unit testing.
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.EnumMap; import java.util.EnumMap;
import java.util.List;
/** /**
@ -44,6 +47,7 @@ public class GenotypeLikelihoodsUnitTest {
double [] v = new double[]{-10.5, -1.25, -5.11}; double [] v = new double[]{-10.5, -1.25, -5.11};
final static String vGLString = "-10.50,-1.25,-5.11"; final static String vGLString = "-10.50,-1.25,-5.11";
final static String vPLString = "93,0,39"; final static String vPLString = "93,0,39";
double[] triAllelic = new double[]{-4.2,-2.0,-3.0,-1.6,0.0,-4.0}; //AA,AB,AC,BB,BC,CC
@Test @Test
public void testFromVector2() { public void testFromVector2() {
@ -139,6 +143,28 @@ public class GenotypeLikelihoodsUnitTest {
} }
} }
// this test is completely broken, the method is wrong.
public void testGetQualFromLikelihoodsMultiAllelicBroken() {
GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic);
double actualGQ = gl.getLog10GQ(GenotypeType.HET);
double expectedGQ = 1.6;
Assert.assertEquals(actualGQ,expectedGQ);
}
public void testGetQualFromLikelihoodsMultiAllelic() {
GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic);
Allele ref = Allele.create(BaseUtils.A,true);
Allele alt1 = Allele.create(BaseUtils.C);
Allele alt2 = Allele.create(BaseUtils.T);
List<Allele> allAlleles = Arrays.asList(ref,alt1,alt2);
List<Allele> gtAlleles = Arrays.asList(alt1,alt2);
GenotypeBuilder gtBuilder = new GenotypeBuilder();
gtBuilder.alleles(gtAlleles);
double actualGQ = gl.getLog10GQ(gtBuilder.make(),allAlleles);
double expectedGQ = 1.6;
Assert.assertEquals(actualGQ,expectedGQ);
}
private void assertDoubleArraysAreEqual(double[] v1, double[] v2) { private void assertDoubleArraysAreEqual(double[] v1, double[] v2) {
Assert.assertEquals(v1.length, v2.length); Assert.assertEquals(v1.length, v2.length);
for ( int i = 0; i < v1.length; i++ ) { for ( int i = 0; i < v1.length; i++ ) {