From a6837d31d42a65bef7afd1415b4b9188c7326581 Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Tue, 3 Apr 2012 16:13:16 -0400 Subject: [PATCH] Success! A fast and low-memory converter from VCF into a binary ped file. This is mostly so I don't have to listen to Pierre/Jason complain about how slow and inefficient plinkseq is at converting; or at transposting. This automatically writes to individual-major mode. It will eat up space on /tmp if you don't run with -Djava.io.tmpdir, so be careful if you use it. --- ...ntsToPed.java => VariantsToBinaryPed.java} | 204 +++++++++++++++--- 1 file changed, 170 insertions(+), 34 deletions(-) rename public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/{VariantsToPed.java => VariantsToBinaryPed.java} (51%) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java similarity index 51% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index d8b01e91d..aaf3bb5cd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.R.RScriptExecutorException; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; @@ -15,26 +16,26 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** * Yet another VCF to Ped converter. The world actually does need one that will * work efficiently on large VCFs (or at least give a progress bar). This - * produces a binary ped file in SNP-major mode. + * produces a binary ped file in individual major mode. */ -public class VariantsToPed extends RodWalker { +public class VariantsToBinaryPed extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @ArgumentCollection protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); - @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file (in which case it will be copied to the file you provide as fam output)") + @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " + + "(in which case it will be copied to the file you provide as fam output).") File metaDataFile; @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file") @@ -49,6 +50,9 @@ public class VariantsToPed extends RodWalker { @Argument(shortName="mgq",fullName="minGenotypeQuality",required=true,doc="If genotype quality is lower than this value, output NO_CALL") int minGenotypeQuality = 0; + @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") + boolean majorAlleleFirst = false; + private ValidateVariants vv = new ValidateVariants(); private static double APPROX_CM_PER_BP = 1000000.0/750000.0; @@ -58,31 +62,48 @@ public class VariantsToPed extends RodWalker { private static final byte HET = 0x2; private static final byte NO_CALL = 0x1; - // note that HET and NO_CALL are flippd from the documentation: that's because + private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples + + // note that HET and NO_CALL are flipped from the documentation: that's because // plink actually reads these in backwards; and we want to use a shift operator // to put these in the appropriate location + private Map printMap = new HashMap(); + private Map tempFiles = new HashMap(); + private Map genotypeBuffer = new HashMap(); + private int genotypeCount = 0; + private int byteCount = 0; + private List famOrder = new ArrayList(); + public void initialize() { vv.variantCollection = variantCollection; vv.dbsnp = dbsnp; vv.DO_NOT_VALIDATE_FILTERED = true; vv.type = ValidateVariants.ValidationType.REF; + // create temporary output streams and buffers + // write magic bits into the ped file try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x1 }); + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + // ultimately, the bed will be in individual-major mode } catch (IOException e) { throw new ReviewedStingException("error writing to output file."); } // write to the fam file, the first six columns of the standard ped file // first, load data from the input meta data file Map> metaValues = new HashMap>(); + Set samplesToUse = new HashSet(); + logger.debug("Reading in metadata..."); try { if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) { for ( String line : new XReadLines(metaDataFile) ) { + String[] famSplit = line.split("\\t"); + String sid = famSplit[1]; outFam.printf("%s%n",line); } } else { for ( String line : new XReadLines(metaDataFile) ) { + logger.debug(line); String[] split = line.split("\\t"); String sampleID = split[0]; String keyVals = split[1]; @@ -119,6 +140,15 @@ public class VariantsToPed extends RodWalker { String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3"; String pheno = mVals.get("phenotype"); outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno); + try { + File temp = File.createTempFile(sample, ".tmp"); + printMap.put(sample,new PrintStream(temp)); + tempFiles.put(sample,temp); + } catch (IOException e) { + throw new ReviewedStingException("Error creating temporary file",e); + } + genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); + famOrder.add(sample); } } } @@ -138,32 +168,57 @@ public class VariantsToPed extends RodWalker { } VariantContext vc = tracker.getFirstValue(variantCollection.variants); + String refOut; + String altOut; + boolean altMajor; + if ( majorAlleleFirst ) { + // want to use the major allele as ref + HashMap ats = new HashMap(vc.getAttributes()); + if ( ! vc.hasAttribute("AF") ) { + VariantContextUtils.calculateChromosomeCounts(vc,ats,true); + } + if ( getAF(ats.get("AF")) > 0.5 ) { + refOut = vc.getAlternateAllele(0).getBaseString(); + altOut = vc.getReference().getBaseString(); + altMajor = true; + } else { + refOut = vc.getReference().getBaseString(); + altOut = vc.getAlternateAllele(0).getBaseString(); + altMajor = false; + } + } else { + refOut = vc.getReference().getBaseString(); + altOut = vc.getAlternateAllele(0).getBaseString(); + altMajor = false; + } // write an entry into the map file outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(), - vc.getReference().getBaseString(),vc.getAlternateAllele(0).getBaseString()); - // write an entry into the bed file - int buf = 0; - int idx = 0; - byte out = 0x0; - byte[] toWrite = new byte[1+(vc.getNSamples()/4)]; - for (Genotype g : vc.getGenotypes() ) { - out |= getEncoding(g,buf); - if ( buf == 3 ) { - toWrite[idx] = out; - buf = 0; - out = 0x0; - idx++; - } else { - buf++; + refOut,altOut); + // store genotypes per sample into the buffer + for ( Genotype g : vc.getGenotypes() ) { + String sample = g.getSampleName(); + byte[] samBuf = genotypeBuffer.get(sample); + byte enc = getEncoding(g,genotypeCount,altMajor); + samBuf[byteCount] |= enc; + } + genotypeCount++; + if ( genotypeCount % 4 == 0 ) { + byteCount++; + if ( byteCount >= BUFFER_SIZE ) { + // dump the buffer to the print streams + for ( String sample : printMap.keySet() ) { + OutputStream samOut = printMap.get(sample); + // print the buffer for this sample + try { + samOut.write(genotypeBuffer.get(sample)); + } catch ( IOException e ) { + throw new ReviewedStingException("Error writing to temporary bed file.",e); + } + // reset the buffer for this sample + genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); + } } - } - if ( out != 0x0 ) { - toWrite[idx]=out; - } - try { - outBed.write(toWrite); - } catch (IOException e) { - throw new ReviewedStingException("Error writing to output file"); + genotypeCount = 0; } return 1; @@ -177,7 +232,61 @@ public class VariantsToPed extends RodWalker { return 0; } - private byte getEncoding(Genotype g, int offset) { + public void onTraversalDone(Integer numSites) { + logger.info(String.format("%d sites processed!",numSites)); + // push out the remaining genotypes and close stream + for ( String sample : printMap.keySet() ) { + try { + int lim = byteCount + (genotypeCount > 0 ? 1 : 0); + printMap.get(sample).write(genotypeBuffer.get(sample),0,lim); + } catch (IOException e) { + throw new ReviewedStingException("Error closing temporary file.",e); + } + + try { + printMap.get(sample).close(); + } catch (IOException e) { + throw new ReviewedStingException("Error closing temporary file.",e); + } + } + for ( String sample : famOrder ) { + logger.info("Merging genotypes for "+sample); + FileInputStream inStream; + try { + inStream = new FileInputStream(tempFiles.get(sample)); + } catch (IOException e) { + throw new ReviewedStingException("Error opening temp file for input.",e); + } + + + try { + int ttr = numSites/4 + (genotypeCount > 0 ? 1 : 0); + for ( ; ttr > BUFFER_SIZE ; ttr -= BUFFER_SIZE ) { + byte[] readGenotypes = new byte[BUFFER_SIZE]; + inStream.read(readGenotypes); + outBed.write(readGenotypes); + } + if ( ttr > 0 ) { + byte[] readGenotypes = new byte[ttr]; + inStream.read(readGenotypes); + outBed.write(readGenotypes); + } + } catch (IOException e) { + throw new ReviewedStingException("Error reading form temp file for input.",e); + } + } + + } + + private byte getEncoding(Genotype g, int offset, boolean altMajor) { + if ( ! altMajor ) { + return getStandardEncoding(g,offset); + } + + return getFlippedEncoding(g,offset); + } + + private byte getStandardEncoding(Genotype g, int offset) { byte b; if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) { b = NO_CALL; @@ -194,11 +303,38 @@ public class VariantsToPed extends RodWalker { return (byte) (b << (2*offset)); } + private byte getFlippedEncoding(Genotype g, int offset) { + byte b; + if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) { + b = NO_CALL; + } else if ( g.isHomRef() ) { + b = HOM_VAR; + } else if ( g.isHomVar() ) { + b = HOM_REF; + } else if ( g.isHet() ) { + b = HET; + } else { + b = NO_CALL; + } + + return (byte) (b << (2*offset)); + } + private static String getID(VariantContext v) { if ( v.hasID() ) { return v.getID(); } else { - return String.format("SNP-%s-%d",v.getChr(),v.getStart()); + return String.format("Var-%s-%d",v.getChr(),v.getStart()); + } + } + + private double getAF(Object o) { + if ( (o instanceof String) ) { + return Double.parseDouble((String) o); + } else if ( (o instanceof Double) ) { + return (Double) o; + } else { + throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF."); } } }