diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 6b36f4e1b..5b1d69f14 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -35,16 +35,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; import java.io.File; -import java.io.FileNotFoundException; import java.util.*; /** @@ -86,8 +84,8 @@ import java.util.*; * */ -@PartitionBy(PartitionType.NONE) -public class ApplyRecalibration extends RodWalker { +@PartitionBy(PartitionType.LOCUS) +public class ApplyRecalibration extends RodWalker implements TreeReducible { ///////////////////////////// // Inputs @@ -98,9 +96,9 @@ public class ApplyRecalibration extends RodWalker { @Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true) public List> input; @Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true) - private File RECAL_FILE; + protected RodBinding recal; @Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true) - private File TRANCHES_FILE; + protected File TRANCHES_FILE; ///////////////////////////// // Outputs @@ -112,7 +110,7 @@ public class ApplyRecalibration extends RodWalker { // Command Line Arguments ///////////////////////////// @Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false) - private double TS_FILTER_LEVEL = 99.0; + protected double TS_FILTER_LEVEL = 99.0; @Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false) private String[] IGNORE_INPUT_FILTERS = null; @Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false) @@ -123,8 +121,6 @@ public class ApplyRecalibration extends RodWalker { ///////////////////////////// final private List tranches = new ArrayList(); final private Set inputNames = new HashSet(); - final private NestedHashMap lodMap = new NestedHashMap(); - final private NestedHashMap annotationMap = new NestedHashMap(); final private Set ignoreInputFilterSet = new TreeSet(); //--------------------------------------------------------------------------------------------------------------- @@ -174,20 +170,6 @@ public class ApplyRecalibration extends RodWalker { final VCFHeader vcfHeader = new VCFHeader(hInfo, samples); vcfWriter.writeHeader(vcfHeader); - - try { - logger.info("Reading in recalibration table..."); - for ( final String line : new XReadLines( RECAL_FILE ) ) { - final String[] vals = line.split(","); - lodMap.put( Double.parseDouble(vals[3]), vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys - annotationMap.put( vals[4], vals[0], Integer.parseInt(vals[1]), Integer.parseInt(vals[2]) ); // value comes before the keys - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(RECAL_FILE, e); - } catch ( Exception e ) { - throw new UserException.MalformedFile(RECAL_FILE, "Could not parse LOD and annotation information in input recal file. File is somehow malformed."); - } - } //--------------------------------------------------------------------------------------------------------------- @@ -202,52 +184,75 @@ public class ApplyRecalibration extends RodWalker { return 1; } - for( VariantContext vc : tracker.getValues(input, context.getLocation()) ) { - if( vc != null ) { - if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { - VariantContextBuilder builder = new VariantContextBuilder(vc); - String filterString = null; + final List VCs = tracker.getValues(input, context.getLocation()); + final List recals = tracker.getValues(recal, context.getLocation()); - final Double lod = (Double) lodMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); - final String worstAnnotation = (String) annotationMap.get( vc.getChr(), vc.getStart(), vc.getEnd() ); - if( lod == null ) { - throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); - } + for( final VariantContext vc : VCs ) { - // Annotate the new record with its VQSLOD and the worst performing annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", lod)); - builder.attribute(VariantRecalibrator.CULPRIT_KEY, worstAnnotation); + if( VariantRecalibrator.checkRecalibrationMode( vc, MODE ) && (vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters())) ) { - for( int i = tranches.size() - 1; i >= 0; i-- ) { - final Tranche tranche = tranches.get(i); - if( lod >= tranche.minVQSLod ) { - if( i == tranches.size() - 1 ) { - filterString = VCFConstants.PASSES_FILTERS_v4; - } else { - filterString = tranche.name; - } - break; - } - } - - if( filterString == null ) { - filterString = tranches.get(0).name+"+"; - } - - if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { - builder.filters(filterString); - } - - vcfWriter.add( builder.make() ); - } else { // valid VC but not compatible with this mode, so just emit the variant untouched - vcfWriter.add( vc ); + final VariantContext recalDatum = getMatchingRecalVC(vc, recals); + if( recalDatum == null ) { + throw new UserException("Encountered input variant which isn't found in the input recal file. Please make sure VariantRecalibrator and ApplyRecalibration were run on the same set of input variants. First seen at: " + vc ); } + + final String lodString = recalDatum.getAttributeAsString(VariantRecalibrator.VQS_LOD_KEY, null); + if( lodString == null ) { + throw new UserException("Encountered a malformed record in the input recal file. There is no lod for the record at: " + vc ); + } + final double lod; + try { + lod = Double.valueOf(lodString); + } catch (NumberFormatException e) { + throw new UserException("Encountered a malformed record in the input recal file. The lod is unreadable for the record at: " + vc ); + } + + VariantContextBuilder builder = new VariantContextBuilder(vc); + String filterString = null; + + // Annotate the new record with its VQSLOD and the worst performing annotation + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lodString); // use the String representation so that we don't lose precision on output + builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); + + for( int i = tranches.size() - 1; i >= 0; i-- ) { + final Tranche tranche = tranches.get(i); + if( lod >= tranche.minVQSLod ) { + if( i == tranches.size() - 1 ) { + filterString = VCFConstants.PASSES_FILTERS_v4; + } else { + filterString = tranche.name; + } + break; + } + } + + if( filterString == null ) { + filterString = tranches.get(0).name+"+"; + } + + if( !filterString.equals(VCFConstants.PASSES_FILTERS_v4) ) { + builder.filters(filterString); + } + + vcfWriter.add( builder.make() ); + } else { // valid VC but not compatible with this mode, so just emit the variant untouched + vcfWriter.add( vc ); } } return 1; // This value isn't used for anything } + private static VariantContext getMatchingRecalVC(final VariantContext target, final List recalVCs) { + for( final VariantContext recalVC : recalVCs ) { + if ( target.getEnd() == recalVC.getEnd() ) { + return recalVC; + } + } + + return null; + } + //--------------------------------------------------------------------------------------------------------------- // // reduce @@ -262,6 +267,10 @@ public class ApplyRecalibration extends RodWalker { return 1; // This value isn't used for anything } + public Integer treeReduce( final Integer lhs, final Integer rhs ) { + return 1; // This value isn't used for anything + } + public void onTraversalDone( final Integer reduceSum ) { } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index a957bfd85..e2d1692d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -30,14 +30,16 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import java.util.*; /** * Created by IntelliJ IDEA. @@ -285,11 +287,28 @@ public class VariantDataManager { (TRUST_ALL_POLYMORPHIC || !trainVC.hasGenotypes() || trainVC.isPolymorphicInSamples()); } - public void writeOutRecalibrationTable( final PrintStream RECAL_FILE ) { + public void writeOutRecalibrationTable( final VCFWriter recalWriter ) { + // we need to sort in coordinate order in order to produce a valid VCF + Collections.sort( data, new Comparator() { + public int compare(VariantDatum vd1, VariantDatum vd2) { + return vd1.loc.compareTo(vd2.loc); + }} ); + + // create dummy alleles to be used + final List alleles = new ArrayList(2); + alleles.add(Allele.create("N", true)); + alleles.add(Allele.create("", false)); + + // to be used for the important INFO tags + final HashMap attributes = new HashMap(3); + for( final VariantDatum datum : data ) { - RECAL_FILE.println(String.format("%s,%d,%d,%.4f,%s", - datum.contig, datum.start, datum.stop, datum.lod, - (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL"))); + attributes.put(VCFConstants.END_KEY, datum.loc.getStop()); + attributes.put(VariantRecalibrator.VQS_LOD_KEY, String.format("%.4f", datum.lod)); + attributes.put(VariantRecalibrator.CULPRIT_KEY, (datum.worstAnnotation != -1 ? annotationKeys.get(datum.worstAnnotation) : "NULL")); + + VariantContextBuilder builder = new VariantContextBuilder("VQSR", datum.loc.getContig(), datum.loc.getStart(), datum.loc.getStart(), alleles).attributes(attributes); + recalWriter.add(builder.make()); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index eb9e98fcb..32350f0fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -25,6 +25,8 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; +import org.broadinstitute.sting.utils.GenomeLoc; + /** * Created by IntelliJ IDEA. * User: rpoplin @@ -46,9 +48,7 @@ public class VariantDatum implements Comparable { public double originalQual; public double prior; public int consensusCount; - public String contig; - public int start; - public int stop; + public GenomeLoc loc; public int worstAnnotation; public MultivariateGaussian assignment; // used in K-means implementation diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 3cdcf4982..f86908dbe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -37,6 +37,8 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.io.Resource; @@ -136,9 +138,11 @@ public class VariantRecalibrator extends RodWalkeremptySet() ); + recalWriter = new StandardVCFWriter(recalFile, getMasterSequenceDictionary(), false); + recalWriter.writeHeader(vcfHeader); } //--------------------------------------------------------------------------------------------------------------- @@ -246,9 +254,7 @@ public class VariantRecalibrator extends RodWalker