Introducing: VariantsToPed, the world's most annoying walker! And also a busted QScript to run it that I need Khalid's help debugging ( frownie face ). Note that VariantsToPed and PlinkSeq generate the same binary file (up to strand flips...thanks PlinkSeq), so I know it's working properly. Hooray!

This commit is contained in:
Christopher Hartl 2012-02-01 10:39:03 -05:00
parent 25d943f706
commit 810996cfca
2 changed files with 360 additions and 0 deletions

View File

@ -0,0 +1,198 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;
/**
* Yet another VCF to Ped converter. The world actually does need one that will
* work efficiently on large VCFs (or at least give a progress bar). This
* produces a binary ped file in SNP-major mode.
*/
public class VariantsToPed extends RodWalker<Integer,Integer> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file (in which case it will be copied to the file you provide as fam output)")
File metaDataFile;
@Output(shortName="bed",fullName = "bed",required=true,doc="output ped file")
PrintStream outBed;
@Output(shortName="bim",fullName="bim",required=true,doc="output map file")
PrintStream outBim;
@Output(shortName="fam",fullName="fam",required=true,doc="output fam file")
PrintStream outFam;
private ValidateVariants vv = new ValidateVariants();
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
private static final byte HOM_REF = 0x0;
private static final byte HOM_VAR = 0x3;
private static final byte HET = 0x2;
private static final byte NO_CALL = 0x1;
// note that HET and NO_CALL are flippd from the documentation: that's because
// plink actually reads these in backwards; and we want to use a shift operator
// to put these in the appropriate location
public void initialize() {
vv.variantCollection = variantCollection;
vv.dbsnp = dbsnp;
vv.DO_NOT_VALIDATE_FILTERED = true;
vv.type = ValidateVariants.ValidationType.REF;
// write magic bits into the ped file
try {
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x1 });
} catch (IOException e) {
throw new ReviewedStingException("error writing to output file.");
}
// write to the fam file, the first six columns of the standard ped file
// first, load data from the input meta data file
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
try {
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
for ( String line : new XReadLines(metaDataFile) ) {
outFam.printf("%s%n",line);
}
} else {
for ( String line : new XReadLines(metaDataFile) ) {
String[] split = line.split("\\t");
String sampleID = split[0];
String keyVals = split[1];
HashMap<String,String> values = new HashMap<String, String>();
for ( String kvp : keyVals.split(";") ) {
String[] kvp_split = kvp.split("=");
values.put(kvp_split[0],kvp_split[1]);
}
metaValues.put(sampleID,values);
}
}
} catch (FileNotFoundException e) {
throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
}
// family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype
int dummyID = 0; // increments for dummy parental and family IDs used
// want to be especially careful to maintain order here
Map<String,VCFHeader> headers = VCFUtils.getVCFHeadersFromRods(getToolkit());
for ( Map.Entry<String,VCFHeader> header : headers.entrySet() ) {
if ( ! header.getKey().equals(variantCollection.variants.getName()) && ! metaDataFile.getAbsolutePath().endsWith(".fam") ) {
continue;
}
for ( String sample : header.getValue().getGenotypeSamples() ) {
Map<String,String> mVals = metaValues.get(sample);
if ( mVals == null ) {
throw new UserException("No metadata provided for sample "+sample);
}
if ( ! mVals.containsKey("phenotype") ) {
throw new UserException("No phenotype data provided for sample "+sample);
}
String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
String pheno = mVals.get("phenotype");
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,pid,sample,mid,sex,pheno);
}
}
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
tracker.getFirstValue(variantCollection.variants).isFiltered() ||
! tracker.getFirstValue(variantCollection.variants).isSNP() ||
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
return 0;
}
try {
vv.map(tracker,ref,context);
} catch (UserException e) {
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
"Please run ValidateVariants for more detailed information.");
}
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
// write an entry into the map file
outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(),
vc.getReference().getBaseString(),vc.getAlternateAllele(0).getBaseString());
// write an entry into the bed file
int buf = 0;
int idx = 0;
byte out = 0x0;
byte[] toWrite = new byte[1+(vc.getNSamples()/4)];
for (Genotype g : vc.getGenotypes() ) {
out |= getEncoding(g,buf);
if ( buf == 3 ) {
toWrite[idx] = out;
buf = 0;
out = 0x0;
idx++;
} else {
buf++;
}
}
if ( out != 0x0 ) {
toWrite[idx]=out;
}
try {
outBed.write(toWrite);
} catch (IOException e) {
throw new ReviewedStingException("Error writing to output file");
}
return 1;
}
public Integer reduce(Integer m, Integer r) {
return r + m;
}
public Integer reduceInit() {
return 0;
}
private static byte getEncoding(Genotype g, int offset) {
byte b;
if ( g.isHomRef() ) {
b = HOM_REF;
} else if ( g.isHomVar() ) {
b = HOM_VAR;
} else if ( g.isHet() ) {
b = HET;
} else {
b = NO_CALL;
}
return (byte) (b << (2*offset));
}
private static String getID(VariantContext v) {
if ( v.hasID() ) {
return v.getID();
} else {
return String.format("SNP-%s-%d",v.getChr(),v.getStart());
}
}
}

View File

@ -0,0 +1,162 @@
package org.broadinstitute.sting.queue.qscripts.lib

import collection.JavaConversions._
import java.io._
import org.broadinstitute.sting.commandline.Input
import org.broadinstitute.sting.commandline.{Argument, Output}
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk.VariantsToPed
import org.broadinstitute.sting.queue.function.InProcessFunction
import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
import org.broadinstitute.sting.utils.text.XReadLines
/**
 * Driver QScript for VariantsToPed: splits an interval list into chunks,
 * scatters the VCF-to-binary-ped conversion across them, and gathers the
 * per-chunk outputs into a single .bed/.bim/.fam fileset.
 *
 * User: chartl
 * Date: 1/31/12
 * Time: 10:46 PM
 */
class VcfToPed extends QScript {
  @Input(shortName = "V", fullName="Variants", required=true,doc="VCF to convert to ped")
  var variants : File = _

  @Output(shortName = "B", fullName="Bed",required=true,doc="Name of the ped output file (fam and bim will use the root of this file)")
  var bed : File = _

  @Input(shortName = "M", fullName="Meta",required=true,doc="The sample metadata file, can be a .fam or [NAME]\\tkey1=val1;key2=val2")
  var meta : File = _

  @Input(shortName = "Int", fullName="Intervals",required=false,doc="Intervals. If not specified script will produce them and exit.")
  var intervals : File = _

  @Argument(shortName="R",fullName="Ref",required=false,doc="Reference file")
  var ref : File = new File("/humgen/1kg/references/human_g1k_v37.fasta")

  @Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file")
  var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")

  val tmpdir : File = System.getProperty("java.io.tmpdir")

  /** Interval lines accumulated before a chunk is dispatched. */
  private val chunkSize = 100000

  /**
   * Build a fully-configured VariantsToPed job for one chunk of intervals,
   * writing its .bed/.bim/.fam into tmpdir under a per-chunk name.
   * (Extracted from the two near-identical copies in the original script;
   * this also fixes the "_%".format(chunk) malformed format string, which
   * threw at runtime, and applies memoryLimit to the tail chunk as well.)
   */
  private def chunkJob(subListFile: File, chunk: Int) : VariantsToPed = {
    val toPed = new VariantsToPed
    toPed.memoryLimit = 2
    toPed.reference_sequence = ref
    toPed.intervals :+= subListFile
    toPed.dbsnp = dbsnp
    toPed.variant = variants
    toPed.metaData = meta
    val base = bed.getName.stripSuffix(".bed") + "_%d".format(chunk)
    toPed.bed = new File(tmpdir, base + ".bed")
    toPed.bim = new File(tmpdir, base + ".bim")
    toPed.fam = new File(tmpdir, base + ".fam")
    toPed
  }

  def script = {
    if ( intervals == null ) {
      // No intervals supplied: extract them from the VCF and stop, as documented.
      val ivals : File = swapExt(variants, ".vcf", ".intervals.list")
      add(new VCFExtractIntervals(variants, ivals, false))
    } else {
      val iXRL = new XReadLines(intervals)
      var chunk = 1
      var subListFile = swapExt(tmpdir, variants, ".vcf", ".chunk%d.list".format(chunk))
      var subList = new PrintStream(subListFile)
      var nL = 0
      var bedOuts : List[File] = Nil
      var bimOuts : List[File] = Nil
      var lastFam : File = null
      // Stream the interval list, dispatching a conversion job per chunk.
      while ( iXRL.hasNext ) {
        subList.printf("%s%n", iXRL.next())
        nL += 1
        if ( nL > chunkSize ) {
          subList.close()
          val toPed = chunkJob(subListFile, chunk)
          add(toPed)
          bedOuts :+= toPed.bed
          bimOuts :+= toPed.bim
          lastFam = toPed.fam
          // start the next chunk list
          chunk += 1
          subListFile = swapExt(tmpdir, variants, ".vcf", ".chunk%d.list".format(chunk))
          subList = new PrintStream(subListFile)
          nL = 0
        }
      }
      // Dispatch the final partial chunk, if any.
      if ( nL > 0 ) {
        subList.close()
        val toPed = chunkJob(subListFile, chunk)
        add(toPed)
        bedOuts :+= toPed.bed
        bimOuts :+= toPed.bim
        lastFam = toPed.fam
      }
      // Gather the per-chunk outputs into the final fileset.
      val gatherUP = new MyPedGather
      gatherUP.binPed = bedOuts
      gatherUP.bim = bimOuts
      gatherUP.outPed = bed
      gatherUP.outBim = swapExt(bed, ".bed", ".bim")
      add(gatherUP)

      // Copy the fam from the last chunk (presumably identical across chunks,
      // since every job sees the same metadata -- verify).
      class copyFam extends InProcessFunction {
        @Input(doc="fam") var inFam = lastFam
        @Output(doc="fam") var outFam = swapExt(bed, ".bed", ".fam")
        def run = {
          val stream = new PrintStream(outFam)
          asScalaIterator(new XReadLines(inFam)).foreach( line => stream.printf("%s%n", line) )
          stream.close()
        }
      }
      add(new copyFam)
    }
  }

  /**
   * Concatenates per-chunk .bed files (stripping each chunk's 3 magic bytes
   * and re-emitting a single header) and concatenates the .bim files.
   */
  class MyPedGather extends InProcessFunction {
    @Input(doc="Peds to be merged") var binPed: List[File] = Nil
    @Input(doc="Bims to be merged") var bim : List[File] = Nil
    @Output(doc="The final Ped to write to") var outPed : File = _
    @Output(doc="The final bim to write to") var outBim : File = _

    def run : Unit = {
      var stream : PrintStream = new PrintStream(outPed)
      // plink magic bytes plus the SNP-major-mode flag
      stream.write(Array[Byte](0x6c.toByte, 0x1b.toByte, 0x1.toByte))
      binPed.foreach( f => {
        val in = new FileInputStream(f)
        in.skip(3) // skip this chunk's own magic bytes
        // FIX: the original wrote the byte *before* testing for EOF, which
        // appended a spurious 0xFF byte per chunk; test first, then write.
        var b = in.read()
        while ( b != -1 ) {
          stream.write(b)
          b = in.read()
        }
        in.close()
      })
      stream.close()
      stream = new PrintStream(outBim)
      bim.foreach( f => asScalaIterator(new XReadLines(f)).foreach( x => stream.printf("%s%n", x) ) )
      stream.close()
    }
  }
}