Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
5abb07da5d
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.utils.R.RScriptExecutorException;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
|
|
@ -15,26 +16,26 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Yet another VCF to Ped converter. The world actually does need one that will
|
||||
* work efficiently on large VCFs (or at least give a progress bar). This
|
||||
* produces a binary ped file in SNP-major mode.
|
||||
* produces a binary ped file in individual major mode.
|
||||
*/
|
||||
public class VariantsToPed extends RodWalker<Integer,Integer> {
|
||||
public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
@ArgumentCollection
|
||||
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
|
||||
|
||||
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file (in which case it will be copied to the file you provide as fam output)")
|
||||
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " +
|
||||
"(in which case it will be copied to the file you provide as fam output).")
|
||||
File metaDataFile;
|
||||
|
||||
@Output(shortName="bed",fullName = "bed",required=true,doc="output ped file")
|
||||
|
|
@ -49,6 +50,9 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
|
|||
@Argument(shortName="mgq",fullName="minGenotypeQuality",required=true,doc="If genotype quality is lower than this value, output NO_CALL")
|
||||
int minGenotypeQuality = 0;
|
||||
|
||||
@Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele")
|
||||
boolean majorAlleleFirst = false;
|
||||
|
||||
private ValidateVariants vv = new ValidateVariants();
|
||||
|
||||
private static double APPROX_CM_PER_BP = 1000000.0/750000.0;
|
||||
|
|
@ -58,31 +62,48 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
|
|||
private static final byte HET = 0x2;
|
||||
private static final byte NO_CALL = 0x1;
|
||||
|
||||
// note that HET and NO_CALL are flippd from the documentation: that's because
|
||||
private static final int BUFFER_SIZE = 1000; //4k genotypes per sample = Nmb for N*1000 samples
|
||||
|
||||
// note that HET and NO_CALL are flipped from the documentation: that's because
|
||||
// plink actually reads these in backwards; and we want to use a shift operator
|
||||
// to put these in the appropriate location
|
||||
|
||||
private Map<String,OutputStream> printMap = new HashMap<String,OutputStream>();
|
||||
private Map<String,File> tempFiles = new HashMap<String,File>();
|
||||
private Map<String,byte[]> genotypeBuffer = new HashMap<String,byte[]>();
|
||||
private int genotypeCount = 0;
|
||||
private int byteCount = 0;
|
||||
private List<String> famOrder = new ArrayList<String>();
|
||||
|
||||
public void initialize() {
|
||||
vv.variantCollection = variantCollection;
|
||||
vv.dbsnp = dbsnp;
|
||||
vv.DO_NOT_VALIDATE_FILTERED = true;
|
||||
vv.type = ValidateVariants.ValidationType.REF;
|
||||
// create temporary output streams and buffers
|
||||
|
||||
// write magic bits into the ped file
|
||||
try {
|
||||
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x1 });
|
||||
outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0});
|
||||
// ultimately, the bed will be in individual-major mode
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("error writing to output file.");
|
||||
}
|
||||
// write to the fam file, the first six columns of the standard ped file
|
||||
// first, load data from the input meta data file
|
||||
Map<String,Map<String,String>> metaValues = new HashMap<String,Map<String,String>>();
|
||||
Set<String> samplesToUse = new HashSet<String>();
|
||||
logger.debug("Reading in metadata...");
|
||||
try {
|
||||
if ( metaDataFile.getAbsolutePath().endsWith(".fam") ) {
|
||||
for ( String line : new XReadLines(metaDataFile) ) {
|
||||
String[] famSplit = line.split("\\t");
|
||||
String sid = famSplit[1];
|
||||
outFam.printf("%s%n",line);
|
||||
}
|
||||
} else {
|
||||
for ( String line : new XReadLines(metaDataFile) ) {
|
||||
logger.debug(line);
|
||||
String[] split = line.split("\\t");
|
||||
String sampleID = split[0];
|
||||
String keyVals = split[1];
|
||||
|
|
@ -119,6 +140,15 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
|
|||
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
|
||||
String pheno = mVals.get("phenotype");
|
||||
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
|
||||
try {
|
||||
File temp = File.createTempFile(sample, ".tmp");
|
||||
printMap.put(sample,new PrintStream(temp));
|
||||
tempFiles.put(sample,temp);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error creating temporary file",e);
|
||||
}
|
||||
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
|
||||
famOrder.add(sample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -138,32 +168,57 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
|
|||
}
|
||||
|
||||
VariantContext vc = tracker.getFirstValue(variantCollection.variants);
|
||||
String refOut;
|
||||
String altOut;
|
||||
boolean altMajor;
|
||||
if ( majorAlleleFirst ) {
|
||||
// want to use the major allele as ref
|
||||
HashMap<String,Object> ats = new HashMap<String,Object>(vc.getAttributes());
|
||||
if ( ! vc.hasAttribute("AF") ) {
|
||||
VariantContextUtils.calculateChromosomeCounts(vc,ats,true);
|
||||
}
|
||||
if ( getAF(ats.get("AF")) > 0.5 ) {
|
||||
refOut = vc.getAlternateAllele(0).getBaseString();
|
||||
altOut = vc.getReference().getBaseString();
|
||||
altMajor = true;
|
||||
} else {
|
||||
refOut = vc.getReference().getBaseString();
|
||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
||||
altMajor = false;
|
||||
}
|
||||
} else {
|
||||
refOut = vc.getReference().getBaseString();
|
||||
altOut = vc.getAlternateAllele(0).getBaseString();
|
||||
altMajor = false;
|
||||
}
|
||||
// write an entry into the map file
|
||||
outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(),
|
||||
vc.getReference().getBaseString(),vc.getAlternateAllele(0).getBaseString());
|
||||
// write an entry into the bed file
|
||||
int buf = 0;
|
||||
int idx = 0;
|
||||
byte out = 0x0;
|
||||
byte[] toWrite = new byte[1+(vc.getNSamples()/4)];
|
||||
for (Genotype g : vc.getGenotypes() ) {
|
||||
out |= getEncoding(g,buf);
|
||||
if ( buf == 3 ) {
|
||||
toWrite[idx] = out;
|
||||
buf = 0;
|
||||
out = 0x0;
|
||||
idx++;
|
||||
} else {
|
||||
buf++;
|
||||
refOut,altOut);
|
||||
// store genotypes per sample into the buffer
|
||||
for ( Genotype g : vc.getGenotypes() ) {
|
||||
String sample = g.getSampleName();
|
||||
byte[] samBuf = genotypeBuffer.get(sample);
|
||||
byte enc = getEncoding(g,genotypeCount,altMajor);
|
||||
samBuf[byteCount] |= enc;
|
||||
}
|
||||
genotypeCount++;
|
||||
if ( genotypeCount % 4 == 0 ) {
|
||||
byteCount++;
|
||||
if ( byteCount >= BUFFER_SIZE ) {
|
||||
// dump the buffer to the print streams
|
||||
for ( String sample : printMap.keySet() ) {
|
||||
OutputStream samOut = printMap.get(sample);
|
||||
// print the buffer for this sample
|
||||
try {
|
||||
samOut.write(genotypeBuffer.get(sample));
|
||||
} catch ( IOException e ) {
|
||||
throw new ReviewedStingException("Error writing to temporary bed file.",e);
|
||||
}
|
||||
// reset the buffer for this sample
|
||||
genotypeBuffer.put(sample,new byte[BUFFER_SIZE]);
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( out != 0x0 ) {
|
||||
toWrite[idx]=out;
|
||||
}
|
||||
try {
|
||||
outBed.write(toWrite);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error writing to output file");
|
||||
genotypeCount = 0;
|
||||
}
|
||||
|
||||
return 1;
|
||||
|
|
@ -177,7 +232,61 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
|
|||
return 0;
|
||||
}
|
||||
|
||||
private byte getEncoding(Genotype g, int offset) {
|
||||
public void onTraversalDone(Integer numSites) {
|
||||
logger.info(String.format("%d sites processed!",numSites));
|
||||
// push out the remaining genotypes and close stream
|
||||
for ( String sample : printMap.keySet() ) {
|
||||
try {
|
||||
int lim = byteCount + (genotypeCount > 0 ? 1 : 0);
|
||||
printMap.get(sample).write(genotypeBuffer.get(sample),0,lim);
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error closing temporary file.",e);
|
||||
}
|
||||
|
||||
try {
|
||||
printMap.get(sample).close();
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error closing temporary file.",e);
|
||||
}
|
||||
}
|
||||
for ( String sample : famOrder ) {
|
||||
logger.info("Merging genotypes for "+sample);
|
||||
FileInputStream inStream;
|
||||
try {
|
||||
inStream = new FileInputStream(tempFiles.get(sample));
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error opening temp file for input.",e);
|
||||
}
|
||||
|
||||
|
||||
try {
|
||||
int ttr = numSites/4 + (genotypeCount > 0 ? 1 : 0);
|
||||
for ( ; ttr > BUFFER_SIZE ; ttr -= BUFFER_SIZE ) {
|
||||
byte[] readGenotypes = new byte[BUFFER_SIZE];
|
||||
inStream.read(readGenotypes);
|
||||
outBed.write(readGenotypes);
|
||||
}
|
||||
if ( ttr > 0 ) {
|
||||
byte[] readGenotypes = new byte[ttr];
|
||||
inStream.read(readGenotypes);
|
||||
outBed.write(readGenotypes);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Error reading form temp file for input.",e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private byte getEncoding(Genotype g, int offset, boolean altMajor) {
|
||||
if ( ! altMajor ) {
|
||||
return getStandardEncoding(g,offset);
|
||||
}
|
||||
|
||||
return getFlippedEncoding(g,offset);
|
||||
}
|
||||
|
||||
private byte getStandardEncoding(Genotype g, int offset) {
|
||||
byte b;
|
||||
if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) {
|
||||
b = NO_CALL;
|
||||
|
|
@ -194,11 +303,38 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
|
|||
return (byte) (b << (2*offset));
|
||||
}
|
||||
|
||||
private byte getFlippedEncoding(Genotype g, int offset) {
|
||||
byte b;
|
||||
if ( g.hasAttribute(VCFConstants.GENOTYPE_QUALITY_KEY) && ((Integer) g.getAttribute(VCFConstants.GENOTYPE_QUALITY_KEY)) < minGenotypeQuality ) {
|
||||
b = NO_CALL;
|
||||
} else if ( g.isHomRef() ) {
|
||||
b = HOM_VAR;
|
||||
} else if ( g.isHomVar() ) {
|
||||
b = HOM_REF;
|
||||
} else if ( g.isHet() ) {
|
||||
b = HET;
|
||||
} else {
|
||||
b = NO_CALL;
|
||||
}
|
||||
|
||||
return (byte) (b << (2*offset));
|
||||
}
|
||||
|
||||
private static String getID(VariantContext v) {
|
||||
if ( v.hasID() ) {
|
||||
return v.getID();
|
||||
} else {
|
||||
return String.format("SNP-%s-%d",v.getChr(),v.getStart());
|
||||
return String.format("Var-%s-%d",v.getChr(),v.getStart());
|
||||
}
|
||||
}
|
||||
|
||||
private double getAF(Object o) {
|
||||
if ( (o instanceof String) ) {
|
||||
return Double.parseDouble((String) o);
|
||||
} else if ( (o instanceof Double) ) {
|
||||
return (Double) o;
|
||||
} else {
|
||||
throw new UserException("Allele frequency appears to be neither String nor Double. Please check the header of your VCF.");
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue