package org.broadinstitute.sting.gatk.walkers.variantutils;

import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintStream;
import java.util.*;

/**
 * Yet another VCF to Ped converter. The world actually does need one that will
 * work efficiently on large VCFs (or at least give a progress bar). This
 * produces a binary ped file (bed/bim/fam trio) in SNP-major mode.
 */
public class VariantsToPed extends RodWalker<Integer, Integer> {
    @ArgumentCollection
    protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();

    @ArgumentCollection
    protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

    @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file (in which case it will be copied to the file you provide as fam output)")
    File metaDataFile;

    @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file")
    PrintStream outBed;

    @Output(shortName="bim",fullName="bim",required=true,doc="output map file")
    PrintStream outBim;

    @Output(shortName="fam",fullName="fam",required=true,doc="output fam file")
    PrintStream outFam;

    // embedded validator, run on every record before it is emitted
    private ValidateVariants vv = new ValidateVariants();

    // NOTE(review): this value is ~1.33 "cM per bp", which looks like an intended
    // cM-per-Mb approximation -- confirm the units before relying on the bim
    // genetic-distance column
    private static double APPROX_CM_PER_BP = 1000000.0/750000.0;

    // 2-bit plink genotype encodings, packed four to a byte
    private static final byte HOM_REF = 0x0;
    private static final byte HOM_VAR = 0x3;
    private static final byte HET = 0x2;
    private static final byte NO_CALL = 0x1;

    // note that HET and NO_CALL are flipped from the documentation: that's because
    // plink actually reads these in backwards; and we want to use a shift operator
    // to put these in the appropriate location

    /**
     * Writes the bed magic number, copies/builds the fam file from the metadata
     * input, and configures the embedded validator.
     */
    public void initialize() {
        // configure the validator: REF-match checks only, skip filtered sites
        vv.variantCollection = variantCollection;
        vv.dbsnp = dbsnp;
        vv.DO_NOT_VALIDATE_FILTERED = true;
        vv.type = ValidateVariants.ValidationType.REF;
        // write magic bits into the ped file: 0x6c 0x1b identifies the bed format,
        // 0x01 selects SNP-major mode
        try {
            outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x1 });
        } catch (IOException e) {
            // preserve the cause so the failure is diagnosable
            throw new ReviewedStingException("error writing to output file.", e);
        }
        // write to the fam file, the first six columns of the standard ped file
        // first, load data from the input meta data file
        Map<String, Map<String, String>> metaValues = new HashMap<String, Map<String, String>>();
        try {
            if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
                // a .fam file is already in the final format: copy it verbatim
                for ( String line : new XReadLines(metaDataFile) ) {
                    outFam.printf("%s%n",line);
                }
            } else {
                // otherwise each line is: SAMPLE_ID<TAB>key1=val1;key2=val2;...
                for ( String line : new XReadLines(metaDataFile) ) {
                    String[] split = line.split("\\t");
                    String sampleID = split[0];
                    String keyVals = split[1];
                    HashMap<String, String> values = new HashMap<String, String>();
                    for ( String kvp : keyVals.split(";") ) {
                        String[] kvp_split = kvp.split("=");
                        values.put(kvp_split[0],kvp_split[1]);
                    }
                    metaValues.put(sampleID,values);
                }
            }
        } catch (FileNotFoundException e) {
            throw new UserException("Meta data file not found: "+metaDataFile.getAbsolutePath(),e);
        }
        // family ID, individual ID, Paternal ID, Maternal ID, Sex, Phenotype
        int dummyID = 0; // increments for dummy parental and family IDs used
        // want to be especially careful to maintain order here
        Map<String, VCFHeader> headers = VCFUtils.getVCFHeadersFromRods(getToolkit());
        for ( Map.Entry<String, VCFHeader> header : headers.entrySet() ) {
            // when a .fam file was supplied it was already copied verbatim above and
            // there is no per-sample metadata to look up; the original condition let
            // every .fam run fall through to the lookup below and fail.
            // also skip any rod that is not the primary variants track.
            if ( metaDataFile.getAbsolutePath().endsWith(".fam") ||
                 ! header.getKey().equals(variantCollection.variants.getName()) ) {
                continue;
            }
            for ( String sample : header.getValue().getGenotypeSamples() ) {
                Map<String, String> mVals = metaValues.get(sample);
                if ( mVals == null ) {
                    throw new UserException("No metadata provided for sample "+sample);
                }
                if ( ! mVals.containsKey("phenotype") ) {
                    throw new UserException("No phenotype data provided for sample "+sample);
                }
                String fid = mVals.containsKey("fid") ? mVals.get("fid") : String.format("dummy_%d",++dummyID);
                String pid = mVals.containsKey("dad") ? mVals.get("dad") : String.format("dummy_%d",++dummyID);
                String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
                String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
                String pheno = mVals.get("phenotype");
                // plink .fam column order is FID, IID, father, mother, sex, phenotype:
                // the individual ID must come second (the original printed the father
                // ID in the IID column)
                outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
            }
        }
    }

    /**
     * Emits one bim line and one packed SNP-major bed record for each unfiltered,
     * biallelic SNP; returns 1 for emitted sites and 0 for skipped ones.
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        // only unfiltered biallelic SNPs are representable in a plink bed
        if ( tracker == null || ! tracker.hasValues(variantCollection.variants) ||
                tracker.getFirstValue(variantCollection.variants).isFiltered() ||
                ! tracker.getFirstValue(variantCollection.variants).isSNP() ||
                ! tracker.getFirstValue(variantCollection.variants).isBiallelic()) {
            return 0;
        }
        try {
            vv.map(tracker,ref,context);
        } catch (UserException e) {
            throw new UserException("Input VCF file is invalid; we cannot guarantee the resulting ped file. "+
                    "Please run ValidateVariants for more detailed information.",e);
        }

        VariantContext vc = tracker.getFirstValue(variantCollection.variants);
        // write an entry into the map (bim) file
        outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(),
                vc.getReference().getBaseString(),vc.getAlternateAllele(0).getBaseString());
        // write an entry into the bed file: four 2-bit genotypes per byte
        int buf = 0;    // 2-bit slot within the byte currently being assembled (0-3)
        int idx = 0;    // next index to fill in toWrite
        byte out = 0x0; // byte currently being assembled
        // exactly ceil(nSamples/4) bytes; the original allocated 1 + n/4, which is
        // one byte too many whenever nSamples is a multiple of four, padding every
        // bed record with a spurious zero byte
        byte[] toWrite = new byte[(vc.getNSamples() + 3) / 4];
        for (Genotype g : vc.getGenotypes() ) {
            out |= getEncoding(g,buf);
            if ( buf == 3 ) {
                toWrite[idx] = out;
                buf = 0;
                out = 0x0;
                idx++;
            } else {
                buf++;
            }
        }
        if ( buf != 0 ) {
            // flush the final partial byte (unused slots stay 0x0)
            toWrite[idx] = out;
        }
        try {
            outBed.write(toWrite);
        } catch (IOException e) {
            throw new ReviewedStingException("Error writing to output file", e);
        }

        return 1;
    }

    /** Sums the per-site results (count of emitted records). */
    public Integer reduce(Integer m, Integer r) {
        return r + m;
    }

    /** Starting count for the reduce. */
    public Integer reduceInit() {
        return 0;
    }

    /**
     * Returns the 2-bit plink encoding for a genotype, shifted into the slot
     * given by offset (0-3) within the packed byte.
     */
    private static byte getEncoding(Genotype g, int offset) {
        byte b;
        if ( g.isHomRef() ) {
            b = HOM_REF;
        } else if ( g.isHomVar() ) {
            b = HOM_VAR;
        } else if ( g.isHet() ) {
            b = HET;
        } else {
            b = NO_CALL;
        }

        return (byte) (b << (2*offset));
    }

    /**
     * Returns the site's rs-ID if present, otherwise a synthetic
     * "SNP-chrom-pos" identifier for the bim file.
     */
    private static String getID(VariantContext v) {
        if ( v.hasID() ) {
            return v.getID();
        } else {
            return String.format("SNP-%s-%d",v.getChr(),v.getStart());
        }
    }
}
package org.broadinstitute.sting.queue.qscripts.lib

import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.commandline.Input
import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
import org.broadinstitute.sting.utils.text.XReadLines
import collection.JavaConversions._
import java.io._
import org.broadinstitute.sting.queue.extensions.gatk.VariantsToPed

/**
 * Converts a VCF into a binary plink ped (bed/bim/fam) by scattering
 * VariantsToPed over ~100k-site interval chunks and gathering the outputs.
 * If no interval list is given, the first run only emits the
 * interval-extraction job; rerun with -Int pointing at its output.
 */
class VcfToPed extends QScript {

  @Input(shortName = "V", fullName="Variants", required=true,doc="VCF to convert to ped")
  var variants : File = _

  @Output(shortName = "B", fullName="Bed",required=true,doc="Name of the ped output file (fam and bim will use the root of this file)")
  var bed : File = _

  @Input(shortName = "M", fullName="Meta",required=true,doc="The sample metadata file, can be a .fam or [NAME]\\tkey1=val1;key2=val2")
  var meta : File = _

  @Input(shortName = "Int", fullName="Intervals",required=false,doc="Intervals. If not specified script will produce them and exit.")
  var intervals : File = _

  @Argument(shortName="R",fullName="Ref",required=false,doc="Reference file")
  var ref : File = new File("/humgen/1kg/references/human_g1k_v37.fasta")

  @Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file")
  var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")

  // scratch directory for per-chunk interval lists and outputs
  val tmpdir : File = System.getProperty("java.io.tmpdir")

  def script = {
    if ( intervals == null ) {
      // no intervals yet: schedule the extraction job and stop here
      val ivals : File = swapExt(variants,".vcf",".intervals.list")
      val extract : VCFExtractIntervals = new VCFExtractIntervals(variants,ivals,false)
      add(extract)
    } else {
      var iXRL = new XReadLines(intervals)
      var chunk = 1;
      var subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk))
      var subList = new PrintStream(subListFile)
      var nL = 0;                        // lines accumulated in the current chunk
      var bedOuts : List[File] = Nil;    // per-chunk bed outputs, in order
      var bimOuts : List[File] = Nil     // per-chunk bim outputs, in order
      var lastFam : File = null;         // any chunk's fam will do -- they're identical
      while ( iXRL.hasNext ) {
        subList.printf("%s%n",iXRL.next())
        nL = nL + 1
        if ( nL > 100000 ) {
          // chunk full: schedule a VariantsToPed job over it
          val toPed : VariantsToPed = new VariantsToPed
          toPed.memoryLimit = 2
          toPed.reference_sequence = ref
          toPed.intervals :+= new File(subListFile)
          toPed.dbsnp = dbsnp
          toPed.variant = variants
          toPed.metaData = meta
          // "_%d": the original used "_%", which is an invalid format string and
          // throws UnknownFormatConversionException at runtime
          lazy val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
          lazy val tBed = new File(tmpdir,base+".bed")
          lazy val bim = new File(tmpdir,base+".bim")
          lazy val fam = new File(tmpdir,base+".fam")
          toPed.bed = tBed
          toPed.bim = bim
          toPed.fam = fam
          add(toPed)
          subList.close()
          chunk = chunk + 1
          subListFile = swapExt(tmpdir,variants,".vcf",".chunk%d.list".format(chunk))
          subList = new PrintStream(subListFile)
          bedOuts :+= tBed
          bimOuts :+= bim
          lastFam = fam
          nL = 0;
        }
      }

      if ( nL > 0 ) {
        // schedule the final, partial chunk
        val toPed : VariantsToPed = new VariantsToPed
        toPed.memoryLimit = 2 // match the memory limit used for the full chunks
        toPed.reference_sequence = ref
        toPed.intervals :+= new File(subListFile)
        toPed.dbsnp = dbsnp
        toPed.variant = variants
        toPed.metaData = meta
        // "_%d" again -- see the note above
        lazy val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
        lazy val tBed = new File(tmpdir,base+".bed")
        lazy val bim = new File(tmpdir,base+".bim")
        lazy val fam = new File(tmpdir,base+".fam")
        toPed.bed = tBed
        toPed.bim = bim
        toPed.fam = fam
        lastFam = fam
        add(toPed)
        subList.close()
        bedOuts :+= tBed
        bimOuts :+= bim
      } else {
        // the interval count divided evenly: the last sub-list is empty and unused,
        // but the stream still has to be released
        subList.close()
      }

      var gatherUP = new MyPedGather
      gatherUP.binPed = bedOuts
      gatherUP.bim = bimOuts
      gatherUP.outPed = bed
      gatherUP.outBim = swapExt(bed,".bed",".bim")

      add(gatherUP)

      // copies one chunk's fam (they are all identical) to the final fam output
      class copyFam extends InProcessFunction {
        @Input(doc="fam") var inFam = lastFam
        @Output(doc="fam") var outFam = swapExt(bed,".bed",".fam")

        def run = {
          var stream = new PrintStream(outFam)
          asScalaIterator(new XReadLines(inFam)).foreach( u => {
            stream.printf("%s%n",u)
          })
          stream.close()
        }
      }

      add(new copyFam)
    }

  }

  /**
   * Gathers the per-chunk bed and bim files into the final outputs:
   * one bed magic header followed by each chunk's payload, and the
   * bim lines concatenated in chunk order.
   */
  class MyPedGather extends InProcessFunction {
    @Input(doc="Peds to be merged") var binPed: List[File] = Nil
    @Input(doc="Bims to be merged") var bim : List[File] = Nil
    @Output(doc="The final Ped to write to") var outPed : File = _
    @Output(doc="The final bim to write to") var outBim : File = _

    def run : Unit = {
      var stream : PrintStream = new PrintStream(outPed)
      // bed magic number + SNP-major flag, written once for the merged file
      stream.write((List[Byte](0x6c.toByte,0x1b.toByte,0x1.toByte)).toArray)
      binPed.map(u => new FileInputStream(u) ).foreach( u => {
        u.skip(3) // drop each chunk's own magic-number header
        // copy the payload; the original do/while wrote the -1 EOF sentinel as a
        // trailing 0xff byte for every chunk, corrupting the merged bed
        var b = u.read()
        while ( b != -1 ) {
          stream.write(b)
          b = u.read()
        }
        u.close()
      })
      stream.close()

      stream = new PrintStream(outBim)
      bim.map(u => new XReadLines(u)).foreach( u => {
        asScalaIterator(u).foreach( x => {
          stream.printf("%s%n",x)
        })
      })

      stream.close()
    }
  }

}