Commit changes in Variants To Binary Ped to the stable repository to be available prior to next release.

This commit is contained in:
Christopher Hartl 2012-09-27 00:13:32 -04:00
parent 7cf9911924
commit 55cdf4f9b7
4 changed files with 296 additions and 53 deletions

View File

@ -7,7 +7,9 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -15,6 +17,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
@ -34,6 +37,28 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
/**
* The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This
* is what Plink describes as a fam file. An example fam file is (note that there is no header):
*
* CEUTrio NA12878 NA12891 NA12892 2 -9
* CEUTrio NA12891 UNKN1 UNKN2 2 -9
* CEUTrio NA12892 UNKN3 UNKN4 1 -9
*
* where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex)
*
* An alternate format is a two-column key-value file
*
* NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9
* NA12891 fid=CEUTrio;sex=2;phenotype=-9
* NA12892 fid=CEUTrio;sex=1;phenotype=-9
*
* wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs.
*
* Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the
* command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the
* alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data.
*/
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " +
"(in which case it will be copied to the file you provide as fam output).")
File metaDataFile;
@ -76,47 +101,11 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private List<String> famOrder = new ArrayList<String>();
public void initialize() {
vv.variantCollection = variantCollection;
vv.dbsnp = dbsnp;
vv.DO_NOT_VALIDATE_FILTERED = true;
vv.type = ValidateVariants.ValidationType.REF;
initializeValidator();
writeBedHeader();
Map<String,Map<String,String>> sampleMetaValues = parseMetaData();
// create temporary output streams and buffers
// write magic bits into the ped file
try {
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
// ultimately, the bed will be in individual-major mode
} catch (IOException e) {
throw new ReviewedStingException("error writing to output file.");
}
// write to the fam file, the first six columns of the standard ped file
// first, load data from the input meta data file
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
logger.debug("Reading in metadata...");
try {
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
for ( String line : new XReadLines(metaDataFile) ) {
String[] famSplit = line.split("\\t");
String sid = famSplit[1];
outFam.printf("%s%n",line);
}
} else {
for ( String line : new XReadLines(metaDataFile) ) {
logger.debug(line);
String[] split = line.split("\\t");
String sampleID = split[0];
String keyVals = split[1];
HashMap<String,String> values = new HashMap<String, String>();
for ( String kvp : keyVals.split(";") ) {
String[] kvp_split = kvp.split("=");
values.put(kvp_split[0],kvp_split[1]);
}
metaValues.put(sampleID,values);
}
}
} catch (FileNotFoundException e) {
throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
}
// family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype
int dummyID = 0; // increments for dummy parental and family IDs used
// want to be especially careful to maintain order here
@ -126,21 +115,29 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
continue;
}
for ( String sample : header.getValue().getGenotypeSamples() ) {
Map<String,String> mVals = metaValues.get(sample);
if ( mVals == null ) {
throw new UserException("No metadata provided for sample "+sample);
if ( ! metaDataFile.getAbsolutePath().endsWith(".fam") ) {
Map<String,String> mVals = sampleMetaValues.get(sample);
if ( mVals == null ) {
throw new UserException("No metadata provided for sample "+sample);
}
if ( ! mVals.containsKey("phenotype") ) {
throw new UserException("No phenotype data provided for sample "+sample);
}
String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
String pheno = mVals.get("phenotype");
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
} else {
// even if a fam file is input, we can't diverge the bed file from the fam file, which
// could lead to a malformed plink trio. Fail fast if there's any extra sample in the VCF.
if ( ! sampleMetaValues.containsKey(sample) ) {
throw new UserException("No metadata provided for sample "+sample);
}
}
if ( ! mVals.containsKey("phenotype") ) {
throw new UserException("No phenotype data provided for sample "+sample);
}
String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
String pheno = mVals.get("phenotype");
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
try {
File temp = File.createTempFile(sample, ".tmp");
File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp");
printMap.put(sample,new PrintStream(temp));
tempFiles.put(sample,temp);
} catch (IOException e) {
@ -216,6 +213,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
// reset the buffer for this sample
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
}
byteCount = 0;
}
genotypeCount = 0;
}
@ -305,7 +303,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private byte getFlippedEncoding(Genotype g, int offset) {
byte b;
if ( g.hasGQ() && g.getGQ() < minGenotypeQuality ) {
if ( ! checkGQIsGood(g) ) {
b = NO_CALL;
} else if ( g.isHomRef() ) {
b = HOM_VAR;
@ -320,6 +318,16 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
return (byte) (b << (2*offset));
}
private boolean checkGQIsGood(Genotype genotype) {
if ( genotype.hasGQ() ) {
return genotype.getGQ() >= minGenotypeQuality;
} else if ( genotype.hasLikelihoods() ) {
return GenotypeLikelihoods.getGQLog10FromLikelihoods(genotype.getType().ordinal()-1,genotype.getLikelihoods().getAsVector()) >= minGenotypeQuality;
}
return false;
}
private static String getID(VariantContext v) {
if ( v.hasID() ) {
return v.getID();
@ -337,4 +345,69 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF.");
}
}
private void initializeValidator() {
vv.variantCollection = variantCollection;
vv.dbsnp = dbsnp;
vv.DO_NOT_VALIDATE_FILTERED = true;
vv.type = ValidateVariants.ValidationType.REF;
}
private void writeBedHeader() {
// write magic bits into the ped file
try {
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
// ultimately, the bed will be in individual-major mode
} catch (IOException e) {
throw new ReviewedStingException("error writing to output file.");
}
}
private Map<String,Map<String,String>> parseMetaData() {
// write to the fam file, the first six columns of the standard ped file
// first, load data from the input meta data file
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
logger.debug("Reading in metadata...");
try {
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
for ( String line : new XReadLines(metaDataFile) ) {
String[] famSplit = line.split("\\s+");
if ( famSplit.length != 6 ) {
throw new UserException("Line of the fam file is malformatted. Expected 6 entries. Line is "+line);
}
String sid = famSplit[1];
String fid = famSplit[0];
String mom = famSplit[2];
String dad = famSplit[3];
String sex = famSplit[4];
String pheno = famSplit[5];
HashMap<String,String> values = new HashMap<String, String>();
values.put("mom",mom);
values.put("dad",dad);
values.put("fid",fid);
values.put("sex",sex);
values.put("phenotype",pheno);
metaValues.put(sid,values);
outFam.printf("%s%n",line);
}
} else {
for ( String line : new XReadLines(metaDataFile) ) {
logger.debug(line);
String[] split = line.split("\\s+");
String sampleID = split[0];
String keyVals = split[1];
HashMap<String,String> values = new HashMap<String, String>();
for ( String kvp : keyVals.split(";") ) {
String[] kvp_split = kvp.split("=");
values.put(kvp_split[0],kvp_split[1]);
}
metaValues.put(sampleID,values);
}
}
} catch (FileNotFoundException e) {
throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
}
return metaValues;
}
}

View File

@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
public class GenotypeLikelihoods {
private final static int NUM_LIKELIHOODS_CACHE_N_ALLELES = 5;
@ -167,10 +168,36 @@ public class GenotypeLikelihoods {
//Return the neg log10 Genotype Quality (GQ) for the given genotype
//Returns Double.NEGATIVE_INFINITY in case of missing genotype
/**
* This is really dangerous and returns completely wrong results for genotypes from a multi-allelic context.
* Use getLog10GQ(Genotype,VariantContext) or getLog10GQ(Genotype,List<Allele>) in place of it.
*
* If you **know** you're biallelic, use getGQLog10FromLikelihoods directly.
* @param genotype - actually a genotype type (no call, hom ref, het, hom var)
* @return an unsafe quantity that could be negative. In the bi-allelic case, the GQ resulting from best minus next best (if the type is the best).
*/
@Deprecated
public double getLog10GQ(GenotypeType genotype){
return getGQLog10FromLikelihoods(genotype.ordinal() - 1 /* NO_CALL IS FIRST */, getAsVector());
}
@Requires({"genotypeAlleles != null","genotypeAlleles.size()==2","contextAlleles != null","contextAlleles.size() >= 1"})
private double getLog10GQ(List<Allele> genotypeAlleles,List<Allele> contextAlleles) {
int allele1Index = contextAlleles.indexOf(genotypeAlleles.get(0));
int allele2Index = contextAlleles.indexOf(genotypeAlleles.get(1));
int plIndex = calculatePLindex(allele1Index,allele2Index);
return getGQLog10FromLikelihoods(plIndex,getAsVector());
}
public double getLog10GQ(Genotype genotype, List<Allele> vcAlleles ) {
return getLog10GQ(genotype.getAlleles(),vcAlleles);
}
public double getLog10GQ(Genotype genotype, VariantContext context) {
return getLog10GQ(genotype,context.getAlleles());
}
public static double getGQLog10FromLikelihoods(int iOfChoosenGenotype, double[] likelihoods){
if(likelihoods == null)
return Double.NEGATIVE_INFINITY;

View File

@ -0,0 +1,117 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.WalkerTest;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.Test;
import org.testng.annotations.DataProvider;
import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 8/20/12
* Time: 9:57 PM
* To change this template use File | Settings | File Templates.
*/
public class VariantsToBinaryPedIntegrationTest extends WalkerTest {
public static final String VTBP_DATA_DIR = "/humgen/gsa-hpprojects/GATK/data/Validation_Data/VariantsToBinaryPed/";
public static String baseTestString(String inputVCF, String inputMetaData, int gq) {
return "-T VariantsToBinaryPed -R " + b37KGReference +
" -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) +
" -bim %s -fam %s -bed %s";
}
@Test
public void testNA12878Alone() {
String testName = "testNA12878Alone";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.fam",10),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","8e8bc0b5e69f22c54c0960f13c25d26c","02f1c462ebc8576e399d0e94f729fd95")
);
executeTest(testName, spec);
}
@Test
public void testNA12878AloneMetaData() {
String testName = "testNA12878AloneMetaData";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10),
3,
Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","02f1c462ebc8576e399d0e94f729fd95")
);
executeTest(testName, spec);
}
@Test
public void testCEUTrio() {
String testName = "testCEUTrio";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("CEUTrio.subset.vcf", "CEUTrio.fam",10),
3,
Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","900f22c6d49a6ba0774466e99592e51d","7887d2e0bf605dbcd0688c552cdb99d5")
);
executeTest(testName, spec);
}
@Test
public void testCEUTrioMetaData() {
String testName = "testCEUTrioMetaData";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("CEUTrio.subset.vcf", "CEUTrio.metadata.txt",10),
3,
Arrays.asList("59b93fbb4bb31309b3adc83ba96dd1a2","2113d2cc0a059e35b1565196b7c5d98f","7887d2e0bf605dbcd0688c552cdb99d5")
);
executeTest(testName, spec);
}
@Test
public void testMalformedFam() {
String testName = "testMalformedFam";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("CEUTrio.subset.vcf", "CEUTrio.malformed.fam",10),
3,
UserException.class
);
executeTest(testName, spec);
}
@Test
public void testFailFast() {
String testName = "testFailFast";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("HapMap.testFailFast.vcf", "HapMap_only_famids.fam",10),
3,
UserException.class
);
executeTest(testName, spec);
}
@Test
public void testFailFastMeta() {
String testName = "testFailFastMeta";
WalkerTestSpec spec = new WalkerTestSpec(
baseTestString("HapMap.testFailFast.vcf", "HapMap_only_famids.metadata.txt",10),
3,
UserException.class
);
executeTest(testName, spec);
}
}

View File

@ -29,12 +29,15 @@ package org.broadinstitute.sting.utils.variantcontext;
// the imports for unit testing.
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.Assert;
import org.testng.annotations.Test;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.List;
/**
@ -44,6 +47,7 @@ public class GenotypeLikelihoodsUnitTest {
double [] v = new double[]{-10.5, -1.25, -5.11};
final static String vGLString = "-10.50,-1.25,-5.11";
final static String vPLString = "93,0,39";
double[] triAllelic = new double[]{-4.2,-2.0,-3.0,-1.6,0.0,-4.0}; //AA,AB,AC,BB,BC,CC
@Test
public void testFromVector2() {
@ -139,6 +143,28 @@ public class GenotypeLikelihoodsUnitTest {
}
}
// this test is completely broken, the method is wrong.
public void testGetQualFromLikelihoodsMultiAllelicBroken() {
GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic);
double actualGQ = gl.getLog10GQ(GenotypeType.HET);
double expectedGQ = 1.6;
Assert.assertEquals(actualGQ,expectedGQ);
}
public void testGetQualFromLikelihoodsMultiAllelic() {
GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic);
Allele ref = Allele.create(BaseUtils.A,true);
Allele alt1 = Allele.create(BaseUtils.C);
Allele alt2 = Allele.create(BaseUtils.T);
List<Allele> allAlleles = Arrays.asList(ref,alt1,alt2);
List<Allele> gtAlleles = Arrays.asList(alt1,alt2);
GenotypeBuilder gtBuilder = new GenotypeBuilder();
gtBuilder.alleles(gtAlleles);
double actualGQ = gl.getLog10GQ(gtBuilder.make(),allAlleles);
double expectedGQ = 1.6;
Assert.assertEquals(actualGQ,expectedGQ);
}
private void assertDoubleArraysAreEqual(double[] v1, double[] v2) {
Assert.assertEquals(v1.length, v2.length);
for ( int i = 0; i < v1.length; i++ ) {