Important refactoring of the VQSR recal file format: we now use a VCF instead of a CSV file.

The most important reason for this change is that we no longer need to read the entire recal file into memory up front in ApplyRecalibration.  For 1000G calling this was prohibitive in terms of memory requirements.  Now we go through the rod system and pull in just the records we need at a given position.

As an added bonus, once BCF2 is live we can drastically cut down the sizes of these recal files (which can grow large for whole genome calling).
This commit is contained in:
Eric Banks 2012-04-17 22:38:18 -04:00
parent ea793d8e27
commit 6d03bce0d3
4 changed files with 54 additions and 46 deletions

View File

@ -37,14 +37,11 @@ import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.collections.NestedHashMap;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.XReadLines;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.File;
import java.io.FileNotFoundException;
import java.util.*;
/**
@ -98,9 +95,9 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input;
@Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true)
private File RECAL_FILE;
protected RodBinding<VariantContext> recal;
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
private File TRANCHES_FILE;
protected File TRANCHES_FILE;
/////////////////////////////
// Outputs
@ -123,8 +120,6 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
/////////////////////////////
final private List<Tranche> tranches = new ArrayList<Tranche>();
final private Set<String> inputNames = new HashSet<String>();
final private NestedHashMap lodMap = new NestedHashMap();
final private NestedHashMap annotationMap = new NestedHashMap();
final private Set<String> ignoreInputFilterSet = new TreeSet<String>();
//---------------------------------------------------------------------------------------------------------------
@ -174,20 +169,6 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
final VCFHeader vcfHeader = new VCFHeader(hInfo, samples);
vcfWriter.writeHeader(vcfHeader);
try {
logger.info("Reading in recalibration table...");
for ( final String line : new XReadLines( RECAL_FILE ) ) {
final String[] vals = line.split(",");
lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys
}
} catch ( FileNotFoundException e ) {
throw new UserException.CouldNotReadInputFile(RECAL_FILE, e);
} catch ( Exception e ) {
throw new UserException.MalformedFile(RECAL_FILE, "Could not parse LOD and annotation information in input recal file. File is somehow malformed.");
}
}
//---------------------------------------------------------------------------------------------------------------
@ -202,21 +183,27 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
return 1;
}
for( VariantContext vc : tracker.getValues(input, context.getLocation()) ) {
for( final VariantContext vc : tracker.getValues(input, context.getLocation()) ) {
if( vc != null ) {
if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
VariantContextBuilder builder = new VariantContextBuilder(vc);
String filterString = null;
final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() );
if( lod == null ) {
if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) {
final VariantContext recalDatum = tracker.getFirstValue(recal, context.getLocation());
if( recalDatum == null ) {
throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc );
}
final double lod = recalDatum.getAttributeAsDouble(VariantRecalibrator.VQS_LOD_KEY, Double.NEGATIVE_INFINITY);
if( lod == Double.NEGATIVE_INFINITY ) {
throw new UserException("Encountered a malformed record in the input recal file. There is no lod for the record at: " + vc );
}
VariantContextBuilder builder = new VariantContextBuilder(vc);
String filterString = null;
// Annotate the new record with its VQSLOD and the worst performing annotation
builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod));
builder.attribute(VariantRecalibrator.CULPRIT_KEY, worstAnnotation);
builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod);
builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY));
for( int i = tranches.size() - 1; i >= 0; i-- ) {
final Tranche tranche = tranches.get(i);

View File

@ -30,14 +30,16 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.*;
/**
* Created by IntelliJ IDEA.
@ -285,11 +287,31 @@ public class VariantDataManager {
(TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples());
}
public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) {
public void writeOutRecalibrationTable( final VCFWriter recalWriter ) {
// we need to sort in coordinate order in order to produce a valid VCF
Collections.sort( data, new Comparator<VariantDatum>() {
public int compare(VariantDatum vd1, VariantDatum vd2) {
return vd1.loc.compareTo(vd2.loc);
}} );
// create dummy alleles to be used
final List<Allele> alleles = new ArrayList<Allele>(2);
alleles.add(Allele.create("N", true));
alleles.add(Allele.create("<VQSR>", false));
final VCFHeader vcfHeader = new VCFHeader( null, Collections.<String>emptySet() );
recalWriter.writeHeader(vcfHeader);
// to be used for the important INFO tags
final HashMap<String, Object> attributes = new HashMap<String, Object>(3);
for( final VariantDatum datum : data ) {
RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s",
datum.contig, datum.start, datum.stop, datum.lod,
(datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")));
attributes.put(VCFConstants.END_KEY, datum.loc.getStop());
attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod));
attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"));
VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStop(), alleles).attributes(attributes);
recalWriter.add(builder.make());
}
}
}

View File

@ -25,6 +25,8 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.broadinstitute.sting.utils.GenomeLoc;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
@ -46,9 +48,7 @@ public class VariantDatum implements Comparable<VariantDatum> {
public double originalQual;
public double prior;
public int consensusCount;
public String contig;
public int start;
public int stop;
public GenomeLoc loc;
public int worstAnnotation;
public MultivariateGaussian assignment; // used in K-means implementation

View File

@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.R.RScriptExecutor;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.io.Resource;
@ -136,7 +137,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Outputs
/////////////////////////////
@Output(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true)
private PrintStream RECAL_FILE;
private VCFWriter recalWriter;
@Output(fullName="tranches_file", shortName="tranchesFile", doc="The output tranches file used by ApplyRecalibration", required=true)
private File TRANCHES_FILE;
@ -246,9 +247,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Populate the datum with lots of fields from the VariantContext, unfortunately the VC is too big so we just pull in only the things we absolutely need.
dataManager.decodeAnnotations( datum, vc, true ); //BUGBUG: when run with HierarchicalMicroScheduler this is non-deterministic because order of calls depends on load of machine
datum.contig = vc.getChr();
datum.start = vc.getStart();
datum.stop = vc.getEnd();
datum.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vc);
datum.originalQual = vc.getPhredScaledQual();
datum.isSNP = vc.isSNP() && vc.isBiallelic();
datum.isTransition = datum.isSNP && VariantContextUtils.isTransition(vc);
@ -345,7 +344,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
}
logger.info( "Writing out recalibration table..." );
dataManager.writeOutRecalibrationTable( RECAL_FILE );
dataManager.writeOutRecalibrationTable( recalWriter );
if( RSCRIPT_FILE != null ) {
logger.info( "Writing out visualization Rscript file...");
createVisualizationScript( dataManager.getRandomDataForPlotting( 6000 ), goodModel, badModel, lodCutoff );