Allow VCFs without PLs to be converted to a bed file with genotypes other than no-call (by setting the minimum GQ to <=0). Performance enhancements to GRM suite.

This commit is contained in:
Christopher Hartl 2012-10-03 21:36:35 -04:00
parent 1be8a88909
commit ca31ddf2a5
1 changed files with 23 additions and 18 deletions

View File

@ -27,9 +27,7 @@ import java.io.*;
import java.util.*; import java.util.*;
/** /**
* Yet another VCF to Ped converter. The world actually does need one that will * Converts a VCF file to a binary plink Ped file (.bed/.bim/.fam)
* work efficiently on large VCFs (or at least give a progress bar). This
* produces a binary ped file in individual major mode.
*/ */
@DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} ) @DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=0,stop=100)) @Reference(window=@Window(start=0,stop=100))
@ -43,24 +41,25 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
/** /**
* The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This * The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This
* is what Plink describes as a fam file. An example fam file is (note that there is no header): * is what Plink describes as a fam file. An example fam file is (note that there is no header):
* * <p><p>
* CEUTrio NA12878 NA12891 NA12892 2 -9 * CEUTrio NA12878 NA12891 NA12892 2 -9</p><p>
* CEUTrio NA12891 UNKN1 UNKN2 2 -9 * CEUTrio NA12891 UNKN1 UNKN2 2 -9</p><p>
* CEUTrio NA12892 UNKN3 UNKN4 1 -9 * CEUTrio NA12892 UNKN3 UNKN4 1 -9</p><p>
* * </p>
* where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex) * where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex)
* * <p>
* An alternate format is a two-column key-value file * An alternate format is a two-column key-value file
* * </p><p><p>
* NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9 * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9</p><p>
* NA12891 fid=CEUTrio;sex=2;phenotype=-9 * NA12891 fid=CEUTrio;sex=2;phenotype=-9</p><p>
* NA12892 fid=CEUTrio;sex=1;phenotype=-9 * NA12892 fid=CEUTrio;sex=1;phenotype=-9</p><p>
* * </p><p>
* wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs. * wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs.
* * </p><p>
* Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the * Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the
* command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the * command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the
* alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data. * alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data.
* </p>
*/ */
@Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " + @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " +
"(in which case it will be copied to the file you provide as fam output).") "(in which case it will be copied to the file you provide as fam output).")
@ -107,6 +106,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
private int genotypeCount = 0; private int genotypeCount = 0;
private int byteCount = 0; private int byteCount = 0;
private List<String> famOrder = new ArrayList<String>(); private List<String> famOrder = new ArrayList<String>();
private long totalByteCount = 0l;
private long totalGenotypeCount = 0l;
public void initialize() { public void initialize() {
writeBedHeader(); writeBedHeader();
@ -217,6 +218,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
public void writeIndividualMajor(VariantContext vc, boolean altMajor) { public void writeIndividualMajor(VariantContext vc, boolean altMajor) {
// store genotypes per sample into the buffer // store genotypes per sample into the buffer
for ( Genotype g : vc.getGenotypes() ) { for ( Genotype g : vc.getGenotypes() ) {
++totalGenotypeCount;
String sample = g.getSampleName(); String sample = g.getSampleName();
byte[] samBuf = genotypeBuffer.get(sample); byte[] samBuf = genotypeBuffer.get(sample);
byte enc = getEncoding(g,genotypeCount,altMajor); byte enc = getEncoding(g,genotypeCount,altMajor);
@ -260,7 +262,8 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
genotypeCount = 0; genotypeCount = 0;
} }
} }
totalGenotypeCount += famOrder.size();
totalByteCount += bytes.length;
try { try {
outBed.write(bytes); outBed.write(bytes);
} catch (IOException e) { } catch (IOException e) {
@ -277,7 +280,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
} }
public void onTraversalDone(Integer numSites) { public void onTraversalDone(Integer numSites) {
logger.info(String.format("%d sites processed!",numSites)); logger.info(String.format("%d sites processed for a total of %d genotypes encoded in %d bytes",numSites,totalGenotypeCount,totalByteCount));
if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { if ( mode == OutputMode.INDIVIDUAL_MAJOR ) {
mergeGenotypeTempFiles(numSites); mergeGenotypeTempFiles(numSites);
@ -317,11 +320,13 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
byte[] readGenotypes = new byte[BUFFER_SIZE]; byte[] readGenotypes = new byte[BUFFER_SIZE];
inStream.read(readGenotypes); inStream.read(readGenotypes);
outBed.write(readGenotypes); outBed.write(readGenotypes);
totalByteCount += BUFFER_SIZE;
} }
if ( ttr > 0 ) { if ( ttr > 0 ) {
byte[] readGenotypes = new byte[ttr]; byte[] readGenotypes = new byte[ttr];
inStream.read(readGenotypes); inStream.read(readGenotypes);
outBed.write(readGenotypes); outBed.write(readGenotypes);
totalByteCount += ttr;
} }
inStream.close(); inStream.close();
} catch (IOException e) { } catch (IOException e) {
@ -380,7 +385,7 @@ public class VariantsToBinaryPed extends RodWalker<Integer,Integer> {
return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality; return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality;
} }
return false; return minGenotypeQuality <= 0;
} }
private static String getID(VariantContext v) { private static String getID(VariantContext v) {