Introducing: VariantsToPed, the world's most annoying walker! And also a busted QScript to run it that I need Khalid's help debugging ( frownie face ). Note that VariantsToPed and PlinkSeq generate the same binary file (up to strand flips...thanks PlinkSeq), so I know it's working properly. Hooray!
This commit is contained in:
parent
25d943f706
commit
810996cfca
|
|
@ -0,0 +1,198 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Yet another VCF to Ped converter. The world actually does need one that will
|
||||
* work efficiently on large VCFs (or at least give a progress bar). This
|
||||
* produces a binary ped file in SNP-major mode.
|
||||
*/
|
||||
public class VariantsToPed extends RodWalker<Integer,Integer> {
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
|
||||
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file (in which case it will be copied to the file you provide as fam output)")
|
||||
File metaDataFile;
|
||||
|
||||
@Output(shortName="bed",fullName = "bed",required=true,doc="output ped file")
|
||||
PrintStream outBed;
|
||||
|
||||
@Output(shortName="bim",fullName="bim",required=true,doc="output map file")
|
||||
PrintStream outBim;
|
||||
|
||||
@Output(shortName="fam",fullName="fam",required=true,doc="output fam file")
|
||||
PrintStream outFam;
|
||||
|
||||
private ValidateVariants vv = new ValidateVariants();
|
||||
|
||||
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
|
||||
|
||||
private static final byte HOM_REF = 0x0;
|
||||
private static final byte HOM_VAR = 0x3;
|
||||
private static final byte HET = 0x2;
|
||||
private static final byte NO_CALL = 0x1;
|
||||
|
||||
// note that HET and NO_CALL are flippd from the documentation: that's because
|
||||
// plink actually reads these in backwards; and we want to use a shift operator
|
||||
// to put these in the appropriate location
|
||||
|
||||
public void initialize() {
|
||||
vv.variantCollection = variantCollection;
|
||||
vv.dbsnp = dbsnp;
|
||||
vv.DO_NOT_VALIDATE_FILTERED = true;
|
||||
vv.type = ValidateVariants.ValidationType.REF;
|
||||
// write magic bits into the ped file
|
||||
try {
|
||||
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x1 });
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("error writing to output file.");
|
||||
}
|
||||
// write to the fam file, the first six columns of the standard ped file
|
||||
// first, load data from the input meta data file
|
||||
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
|
||||
try {
|
||||
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
|
||||
for ( String line : new XReadLines(metaDataFile) ) {
|
||||
outFam.printf("%s%n",line);
|
||||
}
|
||||
} else {
|
||||
for ( String line : new XReadLines(metaDataFile) ) {
|
||||
String[] split = line.split("\\t");
|
||||
String sampleID = split[0];
|
||||
String keyVals = split[1];
|
||||
HashMap<String,String> values = new HashMap<String, String>();
|
||||
for ( String kvp : keyVals.split(";") ) {
|
||||
String[] kvp_split = kvp.split("=");
|
||||
values.put(kvp_split[0],kvp_split[1]);
|
||||
}
|
||||
metaValues.put(sampleID,values);
|
||||
}
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
|
||||
}
|
||||
// family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype
|
||||
int dummyID = 0; // increments for dummy parental and family IDs used
|
||||
// want to be especially careful to maintain order here
|
||||
Map<String,VCFHeader> headers = VCFUtils.getVCFHeadersFromRods(getToolkit());
|
||||
for ( Map.Entry<String,VCFHeader> header : headers.entrySet() ) {
|
||||
if ( ! header.getKey().equals(variantCollection.variants.getName()) && ! metaDataFile.getAbsolutePath().endsWith(".fam") ) {
|
||||
continue;
|
||||
}
|
||||
for ( String sample : header.getValue().getGenotypeSamples() ) {
|
||||
Map<String,String> mVals = metaValues.get(sample);
|
||||
if ( mVals == null ) {
|
||||
throw new UserException("No metadata provided for sample "+sample);
|
||||
}
|
||||
if ( ! mVals.containsKey("phenotype") ) {
|
||||
throw new UserException("No phenotype data provided for sample "+sample);
|
||||
}
|
||||
String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
|
||||
String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
|
||||
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
|
||||
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
|
||||
String pheno = mVals.get("phenotype");
|
||||
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,pid,sample,mid,sex,pheno);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
|
||||
tracker.getFirstValue(variantCollection.variants).isFiltered() ||
|
||||
! tracker.getFirstValue(variantCollection.variants).isSNP() ||
|
||||
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
|
||||
return 0;
|
||||
}
|
||||
try {
|
||||
vv.map(tracker,ref,context);
|
||||
} catch (UserException e) {
|
||||
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
|
||||
"Please run ValidateVariants for more detailed information.");
|
||||
}
|
||||
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
|
||||
// write an entry into the map file
|
||||
outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(),
|
||||
vc.getReference().getBaseString(),vc.getAlternateAllele(0).getBaseString());
|
||||
// write an entry into the bed file
|
||||
int buf = 0;
|
||||
int idx = 0;
|
||||
byte out = 0x0;
|
||||
byte[] toWrite = new byte[1+(vc.getNSamples()/4)];
|
||||
for (Genotype g : vc.getGenotypes() ) {
|
||||
out |= getEncoding(g,buf);
|
||||
if ( buf == 3 ) {
|
||||
toWrite[idx] = out;
|
||||
buf = 0;
|
||||
out = 0x0;
|
||||
idx++;
|
||||
} else {
|
||||
buf++;
|
||||
}
|
||||
}
|
||||
if ( out != 0x0 ) {
|
||||
toWrite[idx]=out;
|
||||
}
|
||||
try {
|
||||
outBed.write(toWrite);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error writing to output file");
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer m, Integer r) {
|
||||
return r + m;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
private static byte getEncoding(Genotype g, int offset) {
|
||||
byte b;
|
||||
if ( g.isHomRef() ) {
|
||||
b = HOM_REF;
|
||||
} else if ( g.isHomVar() ) {
|
||||
b = HOM_VAR;
|
||||
} else if ( g.isHet() ) {
|
||||
b = HET;
|
||||
} else {
|
||||
b = NO_CALL;
|
||||
}
|
||||
|
||||
return (byte) (b << (2*offset));
|
||||
}
|
||||
|
||||
private static String getID(VariantContext v) {
|
||||
if ( v.hasID() ) {
|
||||
return v.getID();
|
||||
} else {
|
||||
return String.format("SNP-%s-%d",v.getChr(),v.getStart());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,162 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.lib

import java.io._

import collection.JavaConversions._

import org.broadinstitute.sting.commandline.Input
import org.broadinstitute.sting.commandline.{Argument, Output}
import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk.VariantsToPed
import org.broadinstitute.sting.queue.function.InProcessFunction
import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: chartl
|
||||
* Date: 1/31/12
|
||||
* Time: 10:46 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
|
||||
/**
 * Converts a VCF to a binary plink fileset (bed/bim/fam) by chunking the
 * interval list, running VariantsToPed on each chunk, and gathering the chunk
 * outputs back into a single fileset. If no interval list is given, a first
 * pass just extracts the intervals from the VCF and exits.
 */
class VcfToPed extends QScript {

  @Input(shortName = "V", fullName="Variants", required=true,doc="VCF to convert to ped")
  var variants : File = _

  @Output(shortName = "B", fullName="Bed",required=true,doc="Name of the ped output file (fam and bim will use the root of this file)")
  var bed : File = _

  @Input(shortName = "M", fullName="Meta",required=true,doc="The sample metadata file, can be a .fam or [NAME]\\tkey1=val1;key2=val2")
  var meta : File = _

  @Input(shortName = "Int", fullName="Intervals",required=false,doc="Intervals. If not specified script will produce them and exit.")
  var intervals : File = _

  @Argument(shortName="R",fullName="Ref",required=false,doc="Reference file")
  var ref : File = new File("/humgen/1kg/references/human_g1k_v37.fasta")

  @Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file")
  var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")

  val tmpdir : File = System.getProperty("java.io.tmpdir")

  // number of interval lines per VariantsToPed chunk
  val chunkSize : Int = 100000

  /**
   * Configures one VariantsToPed job over the given interval list and returns
   * the job together with its (bed, bim, fam) outputs, all placed in tmpdir.
   * Extracted so both the full-chunk and final-partial-chunk paths build their
   * jobs identically (the original duplicated this and omitted memoryLimit on
   * the second copy).
   */
  private def makeChunkJob(chunkIdx: Int, intervalList: File) : (VariantsToPed, File, File, File) = {
    val toPed : VariantsToPed = new VariantsToPed
    toPed.memoryLimit = 2
    toPed.reference_sequence = ref
    toPed.intervals :+= intervalList
    toPed.dbsnp = dbsnp
    toPed.variant = variants
    toPed.metaData = meta
    // BUGFIX: was "_%".format(chunkIdx) -- a bare '%' has no conversion and
    // throws UnknownFormatConversionException at runtime
    val base : String = bed.getName.stripSuffix(".bed") + "_%d".format(chunkIdx)
    val tBed = new File(tmpdir, base + ".bed")
    val bim = new File(tmpdir, base + ".bim")
    val fam = new File(tmpdir, base + ".fam")
    toPed.bed = tBed
    toPed.bim = bim
    toPed.fam = fam
    (toPed, tBed, bim, fam)
  }

  def script = {
    if ( intervals == null ) {
      // first pass: just extract the interval list from the VCF and stop
      val ivals : File = swapExt(variants,".vcf",".intervals.list")
      val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false)
      add(extract)
    } else {
      val iXRL = new XReadLines(intervals)
      var chunk = 1
      var subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk))
      var subList = new PrintStream(subListFile)
      var nL = 0
      var bedOuts : List[File] = Nil
      var bimOuts : List[File] = Nil
      var lastFam : File = null
      while ( iXRL.hasNext ) {
        subList.printf("%s%n",iXRL.next())
        nL += 1
        if ( nL > chunkSize ) {
          // this chunk is full: close its interval list and queue its job
          subList.close()
          val (toPed, tBed, bim, fam) = makeChunkJob(chunk, subListFile)
          add(toPed)
          bedOuts :+= tBed
          bimOuts :+= bim
          lastFam = fam
          // start the next chunk's interval list
          chunk += 1
          subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk))
          subList = new PrintStream(subListFile)
          nL = 0
        }
      }
      // BUGFIX: the final stream was only closed when the last chunk was
      // non-empty, leaking it (and possibly unflushed lines) otherwise
      subList.close()

      if ( nL > 0 ) {
        // queue the final, partially-filled chunk
        val (toPed, tBed, bim, fam) = makeChunkJob(chunk, subListFile)
        add(toPed)
        bedOuts :+= tBed
        bimOuts :+= bim
        lastFam = fam
      }

      val gatherUP = new MyPedGather
      gatherUP.binPed = bedOuts
      gatherUP.bim = bimOuts
      gatherUP.outPed = bed
      gatherUP.outBim = swapExt(bed,".bed",".bim")

      add(gatherUP)

      // every chunk writes an identical fam file, so copying the last one
      // produced gives the fileset-wide fam
      class copyFam extends InProcessFunction {
        @Input(doc="fam") var inFam = lastFam
        @Output(doc="fam") var outFam = swapExt(bed,".bed",".fam")

        def run = {
          val stream = new PrintStream(outFam)
          asScalaIterator(new XReadLines(inFam)).foreach( line => {
            stream.printf("%s%n",line)
          })
          stream.close()
        }
      }

      add(new copyFam)
    }

  }

  /**
   * Gathers the per-chunk outputs: concatenates the bed files (stripping each
   * chunk's 3 magic-number bytes, writing a single magic header up front) and
   * concatenates the bim files in order.
   */
  class MyPedGather extends InProcessFunction {
    @Input(doc="Peds to be merged") var binPed: List[File] = Nil
    @Input(doc="Bims to be merged") var bim : List[File] = Nil
    @Output(doc="The final Ped to write to") var outPed : File = _
    @Output(doc="The final bim to write to") var outBim : File = _

    def run : Unit = {
      var stream : PrintStream = new PrintStream(outPed)
      // plink magic number + SNP-major mode flag, written exactly once
      stream.write((List[Byte](0x6c.toByte,0x1b.toByte,0x1.toByte)).toArray)
      binPed.foreach( pedFile => {
        val in = new FileInputStream(pedFile)
        in.skip(3) // each chunk starts with its own copy of the magic bytes
        // BUGFIX: the original do/while wrote the terminating -1 read (an
        // extra 0xff byte) after every chunk, corrupting the merged bed;
        // it also never closed the input streams.
        var b = in.read()
        while ( b != -1 ) {
          stream.write(b)
          b = in.read()
        }
        in.close()
      })
      stream.close()

      stream = new PrintStream(outBim)
      bim.map(u => new XReadLines(u)).foreach( u => {
        asScalaIterator(u).foreach( x => {
          stream.printf("%s%n",x)
        })
      })

      stream.close()
    }
  }

}
|
||||
Loading…
Reference in New Issue