Introducing: VariantsToPed, the world's most annoying walker! And also a busted QScript to run it that I need Khalid's help debugging ( frownie face ). Note that VariantsToPed and PlinkSeq generate the same binary file (up to strand flips...thanks PlinkSeq), so I know it's working properly. Hooray!
This commit is contained in:
parent
25d943f706
commit
810996cfca
|
|
@ -0,0 +1,198 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.variantutils;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.commandline.*;
|
||||||
|
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||||
|
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||||
|
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
|
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||||
|
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||||
|
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PrintStream;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Yet another VCF to Ped converter. The world actually does need one that will
|
||||||
|
* work efficiently on large VCFs (or at least give a progress bar). This
|
||||||
|
* produces a binary ped file in SNP-major mode.
|
||||||
|
*/
|
||||||
|
public class VariantsToPed extends RodWalker<Integer,Integer> {
|
||||||
|
@ArgumentCollection
|
||||||
|
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||||
|
|
||||||
|
@ArgumentCollection
|
||||||
|
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||||
|
|
||||||
|
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file (in which case it will be copied to the file you provide as fam output)")
|
||||||
|
File metaDataFile;
|
||||||
|
|
||||||
|
@Output(shortName="bed",fullName = "bed",required=true,doc="output ped file")
|
||||||
|
PrintStream outBed;
|
||||||
|
|
||||||
|
@Output(shortName="bim",fullName="bim",required=true,doc="output map file")
|
||||||
|
PrintStream outBim;
|
||||||
|
|
||||||
|
@Output(shortName="fam",fullName="fam",required=true,doc="output fam file")
|
||||||
|
PrintStream outFam;
|
||||||
|
|
||||||
|
private ValidateVariants vv = new ValidateVariants();
|
||||||
|
|
||||||
|
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
|
||||||
|
|
||||||
|
private static final byte HOM_REF = 0x0;
|
||||||
|
private static final byte HOM_VAR = 0x3;
|
||||||
|
private static final byte HET = 0x2;
|
||||||
|
private static final byte NO_CALL = 0x1;
|
||||||
|
|
||||||
|
// note that HET and NO_CALL are flippd from the documentation: that's because
|
||||||
|
// plink actually reads these in backwards; and we want to use a shift operator
|
||||||
|
// to put these in the appropriate location
|
||||||
|
|
||||||
|
public void initialize() {
|
||||||
|
vv.variantCollection = variantCollection;
|
||||||
|
vv.dbsnp = dbsnp;
|
||||||
|
vv.DO_NOT_VALIDATE_FILTERED = true;
|
||||||
|
vv.type = ValidateVariants.ValidationType.REF;
|
||||||
|
// write magic bits into the ped file
|
||||||
|
try {
|
||||||
|
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x1 });
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new ReviewedStingException("error writing to output file.");
|
||||||
|
}
|
||||||
|
// write to the fam file, the first six columns of the standard ped file
|
||||||
|
// first, load data from the input meta data file
|
||||||
|
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
|
||||||
|
try {
|
||||||
|
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
|
||||||
|
for ( String line : new XReadLines(metaDataFile) ) {
|
||||||
|
outFam.printf("%s%n",line);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for ( String line : new XReadLines(metaDataFile) ) {
|
||||||
|
String[] split = line.split("\\t");
|
||||||
|
String sampleID = split[0];
|
||||||
|
String keyVals = split[1];
|
||||||
|
HashMap<String,String> values = new HashMap<String, String>();
|
||||||
|
for ( String kvp : keyVals.split(";") ) {
|
||||||
|
String[] kvp_split = kvp.split("=");
|
||||||
|
values.put(kvp_split[0],kvp_split[1]);
|
||||||
|
}
|
||||||
|
metaValues.put(sampleID,values);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (FileNotFoundException e) {
|
||||||
|
throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
|
||||||
|
}
|
||||||
|
// family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype
|
||||||
|
int dummyID = 0; // increments for dummy parental and family IDs used
|
||||||
|
// want to be especially careful to maintain order here
|
||||||
|
Map<String,VCFHeader> headers = VCFUtils.getVCFHeadersFromRods(getToolkit());
|
||||||
|
for ( Map.Entry<String,VCFHeader> header : headers.entrySet() ) {
|
||||||
|
if ( ! header.getKey().equals(variantCollection.variants.getName()) && ! metaDataFile.getAbsolutePath().endsWith(".fam") ) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for ( String sample : header.getValue().getGenotypeSamples() ) {
|
||||||
|
Map<String,String> mVals = metaValues.get(sample);
|
||||||
|
if ( mVals == null ) {
|
||||||
|
throw new UserException("No metadata provided for sample "+sample);
|
||||||
|
}
|
||||||
|
if ( ! mVals.containsKey("phenotype") ) {
|
||||||
|
throw new UserException("No phenotype data provided for sample "+sample);
|
||||||
|
}
|
||||||
|
String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
|
||||||
|
String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
|
||||||
|
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
|
||||||
|
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
|
||||||
|
String pheno = mVals.get("phenotype");
|
||||||
|
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,pid,sample,mid,sex,pheno);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||||
|
if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
|
||||||
|
tracker.getFirstValue(variantCollection.variants).isFiltered() ||
|
||||||
|
! tracker.getFirstValue(variantCollection.variants).isSNP() ||
|
||||||
|
! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
vv.map(tracker,ref,context);
|
||||||
|
} catch (UserException e) {
|
||||||
|
throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
|
||||||
|
"Please run ValidateVariants for more detailed information.");
|
||||||
|
}
|
||||||
|
|
||||||
|
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
|
||||||
|
// write an entry into the map file
|
||||||
|
outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(),
|
||||||
|
vc.getReference().getBaseString(),vc.getAlternateAllele(0).getBaseString());
|
||||||
|
// write an entry into the bed file
|
||||||
|
int buf = 0;
|
||||||
|
int idx = 0;
|
||||||
|
byte out = 0x0;
|
||||||
|
byte[] toWrite = new byte[1+(vc.getNSamples()/4)];
|
||||||
|
for (Genotype g : vc.getGenotypes() ) {
|
||||||
|
out |= getEncoding(g,buf);
|
||||||
|
if ( buf == 3 ) {
|
||||||
|
toWrite[idx] = out;
|
||||||
|
buf = 0;
|
||||||
|
out = 0x0;
|
||||||
|
idx++;
|
||||||
|
} else {
|
||||||
|
buf++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if ( out != 0x0 ) {
|
||||||
|
toWrite[idx]=out;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
outBed.write(toWrite);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new ReviewedStingException("Error writing to output file");
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduce(Integer m, Integer r) {
|
||||||
|
return r + m;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Integer reduceInit() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static byte getEncoding(Genotype g, int offset) {
|
||||||
|
byte b;
|
||||||
|
if ( g.isHomRef() ) {
|
||||||
|
b = HOM_REF;
|
||||||
|
} else if ( g.isHomVar() ) {
|
||||||
|
b = HOM_VAR;
|
||||||
|
} else if ( g.isHet() ) {
|
||||||
|
b = HET;
|
||||||
|
} else {
|
||||||
|
b = NO_CALL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (byte) (b << (2*offset));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getID(VariantContext v) {
|
||||||
|
if ( v.hasID() ) {
|
||||||
|
return v.getID();
|
||||||
|
} else {
|
||||||
|
return String.format("SNP-%s-%d",v.getChr(),v.getStart());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,162 @@
|
||||||
|
package org.broadinstitute.sting.queue.qscripts.lib
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.queue.QScript
|
||||||
|
import org.broadinstitute.sting.commandline.Input
|
||||||
|
import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
|
||||||
|
import org.broadinstitute.sting.utils.text.XReadLines
|
||||||
|
import collection.JavaConversions._
|
||||||
|
import java.io._
|
||||||
|
import org.broadinstitute.sting.queue.extensions.gatk.VariantsToPed
|
||||||
|
|
||||||
|
/**
 * Queue script that converts a VCF to binary plink (bed/bim/fam) format.
 * If no intervals are given it only emits an interval-extraction job and exits;
 * otherwise it chunks the interval list, runs one VariantsToPed job per chunk,
 * and gathers the per-chunk outputs into the final bed/bim/fam triple.
 */
class VcfToPed extends QScript {

  @Input(shortName = "V", fullName="Variants", required=true,doc="VCF to convert to ped")
  var variants : File = _

  @Output(shortName = "B", fullName="Bed",required=true,doc="Name of the ped output file (fam and bim will use the root of this file)")
  var bed : File = _

  @Input(shortName = "M", fullName="Meta",required=true,doc="The sample metadata file, can be a .fam or [NAME]\\tkey1=val1;key2=val2")
  var meta : File = _

  @Input(shortName = "Int", fullName="Intervals",required=false,doc="Intervals. If not specified script will produce them and exit.")
  var intervals : File = _

  @Argument(shortName="R",fullName="Ref",required=false,doc="Reference file")
  var ref : File = new File("/humgen/1kg/references/human_g1k_v37.fasta")

  @Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file")
  var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")

  val tmpdir : File = System.getProperty("java.io.tmpdir")

  def script = {
    if ( intervals == null ) {
      // no intervals supplied: just produce them and exit
      val ivals : File = swapExt(variants,".vcf",".intervals.list")
      val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false)
      add(extract)
    } else {
      val iXRL = new XReadLines(intervals)
      var chunk = 1
      var subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk))
      var subList = new PrintStream(subListFile)
      var nL = 0                           // interval lines in the current chunk
      var bedOuts : List[File] = Nil       // per-chunk bed outputs, in order
      var bimOuts : List[File] = Nil       // per-chunk bim outputs, in order
      var lastFam : File = null            // every chunk writes the same fam; keep any one

      while ( iXRL.hasNext ) {
        subList.printf("%s%n",iXRL.next())
        nL += 1
        if ( nL > 100000 ) {
          // chunk is full: emit a VariantsToPed job over it and start the next one
          subList.close()
          val (tBed, bim, fam) = addChunkJob(subListFile, chunk)
          bedOuts :+= tBed
          bimOuts :+= bim
          lastFam = fam
          chunk += 1
          subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk))
          subList = new PrintStream(subListFile)
          nL = 0
        }
      }

      // close the trailing sublist unconditionally (previously it leaked when empty)
      subList.close()
      if ( nL > 0 ) {
        val (tBed, bim, fam) = addChunkJob(subListFile, chunk)
        bedOuts :+= tBed
        bimOuts :+= bim
        lastFam = fam
      }

      val gatherUP = new MyPedGather
      gatherUP.binPed = bedOuts
      gatherUP.bim = bimOuts
      gatherUP.outPed = bed
      gatherUP.outBim = swapExt(bed,".bed",".bim")

      add(gatherUP)

      // the fam file is identical across chunks, so copying any one of them suffices
      class copyFam extends InProcessFunction {
        @Input(doc="fam") var inFam = lastFam
        @Output(doc="fam") var outFam = swapExt(bed,".bed",".fam")

        def run = {
          val stream = new PrintStream(outFam)
          asScalaIterator(new XReadLines(inFam)).foreach( u => {
            stream.printf("%s%n",u)
          })
          stream.close()
        }
      }

      add(new copyFam)
    }
  }

  /**
   * Builds and adds a VariantsToPed job over the given interval sublist.
   * Returns the job's (bed, bim, fam) output files.
   */
  private def addChunkJob(subListFile: File, chunk: Int) : (File, File, File) = {
    val toPed : VariantsToPed = new VariantsToPed
    toPed.memoryLimit = 2
    toPed.reference_sequence = ref
    toPed.intervals :+= new File(subListFile)
    toPed.dbsnp = dbsnp
    toPed.variant = variants
    toPed.metaData = meta
    // bug fix: the original used "_%".format(chunk), an invalid format string
    // ("%" with no conversion) which throws at runtime; "%d" was intended
    val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
    val tBed = new File(tmpdir,base+".bed")
    val bim = new File(tmpdir,base+".bim")
    val fam = new File(tmpdir,base+".fam")
    toPed.bed = tBed
    toPed.bim = bim
    toPed.fam = fam
    add(toPed)
    (tBed, bim, fam)
  }

  /**
   * Gathers per-chunk binary ped and bim files: writes one fresh magic header,
   * then concatenates each chunk's payload (skipping its 3-byte header), and
   * concatenates the bim files line by line.
   */
  class MyPedGather extends InProcessFunction {
    @Input(doc="Peds to be merged") var binPed: List[File] = Nil
    @Input(doc="Bims to be merged") var bim : List[File] = Nil
    @Output(doc="The final Ped to write to") var outPed : File = _
    @Output(doc="The final bim to write to") var outBim : File = _

    def run : Unit = {
      var stream : PrintStream = new PrintStream(outPed)
      // plink magic bytes + SNP-major mode flag
      stream.write((List[Byte](0x6c.toByte,0x1b.toByte,0x1.toByte)).toArray)
      binPed.foreach( f => {
        val in = new FileInputStream(f)
        in.skip(3) // skip this chunk's own magic header
        var b = in.read()
        while ( b != -1 ) {
          // bug fix: the original do/while wrote the -1 EOF sentinel (byte 0xff)
          // into the output after every chunk, corrupting the merged bed
          stream.write(b)
          b = in.read()
        }
        in.close()
      })
      stream.close()

      stream = new PrintStream(outBim)
      bim.foreach( f => {
        asScalaIterator(new XReadLines(f)).foreach( x => {
          stream.printf("%s%n",x)
        })
      })

      stream.close()
    }
  }

}
|
||||||
Loading…
Reference in New Issue