diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java index c9b0cd62c..85579645d 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/SimpleIndelROD.java @@ -68,10 +68,17 @@ public class SimpleIndelROD extends TabularROD implements Genotype, AllelicVaria return true; } + public String getSamplesString() { + return (is1KGFormat() && this.get("5") != null ? this.get("5") : ""); + } + public String toString() { StringBuffer sb = new StringBuffer(); sb.append(getLocation().getContig() + "\t" + getLocation().getStart() + "\t"); sb.append(length() + "\t" + (isInsertion() ? "I" : "D") + "\t" + getFWDAlleles().get(0)); + String samples = getSamplesString(); + if ( samples.length() > 0 ) + sb.append("\t" + samples); return sb.toString(); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECAlleleBalance.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECAlleleBalance.java index b008b3715..d80ce4719 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECAlleleBalance.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECAlleleBalance.java @@ -5,6 +5,8 @@ import org.broadinstitute.sting.utils.ReadBackedPileup; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Pair; +import java.util.HashMap; + public class VECAlleleBalance extends RatioFilter { private double lowThreshold = 0.25, highThreshold = 0.75; @@ -14,14 +16,13 @@ public class VECAlleleBalance extends RatioFilter { super("Allele Balance Ratio", VECAlleleBalance.class, Tail.TwoTailed); } - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - String[] argPieces = arguments.split(","); - lowThreshold = Double.valueOf(argPieces[0]); - highThreshold = Double.valueOf(argPieces[1]); - if ( argPieces.length > 2 ) - 
minGenotypeConfidenceToTest = Double.valueOf(argPieces[2]); - } + public void initialize(HashMap args) { + if ( args.get("low") != null ) + lowThreshold = Double.valueOf((String) args.get("low")); + if ( args.get("high") != null ) + highThreshold = Double.valueOf((String) args.get("high")); + if ( args.get("confidence") != null ) + minGenotypeConfidenceToTest = Double.valueOf((String) args.get("confidence")); setLowThreshold(lowThreshold); setHighThreshold(highThreshold); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECClusteredSnps.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECClusteredSnps.java index d1ad57ba0..eb37f8610 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECClusteredSnps.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECClusteredSnps.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.VariantContext; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.StingException; +import java.util.HashMap; public class VECClusteredSnps implements VariantExclusionCriterion { @@ -12,12 +13,11 @@ public class VECClusteredSnps implements VariantExclusionCriterion { private boolean exclude; private long distance; - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - String[] argPieces = arguments.split(","); - window = Integer.valueOf(argPieces[0]); - snpThreshold = Integer.valueOf(argPieces[1]); - } + public void initialize(HashMap args) { + if ( args.get("window") != null ) + window = Integer.valueOf((String) args.get("window")); + if ( args.get("snps") != null ) + snpThreshold = Integer.valueOf((String) args.get("snps")); if ( window < 2 || snpThreshold < 2 ) throw new StingException("Window and threshold values need to be >= 2"); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECDepthOfCoverage.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECDepthOfCoverage.java index 56e0f0693..06096e81e 100755 
--- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECDepthOfCoverage.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECDepthOfCoverage.java @@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.filters; import org.broadinstitute.sting.gatk.contexts.VariantContext; +import java.util.HashMap; + /** * Created by IntelliJ IDEA. @@ -16,10 +18,9 @@ public class VECDepthOfCoverage implements VariantExclusionCriterion { private boolean exclude = false; private int depth; - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - maximum = Integer.valueOf(arguments); - } + public void initialize(HashMap args) { + if ( args.get("max") != null ) + maximum = Integer.valueOf((String) args.get("max")); } public void compute(VariantContextWindow contextWindow) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECFisherStrand.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECFisherStrand.java index 7bdcd64d9..7c1fa7e53 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECFisherStrand.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECFisherStrand.java @@ -8,6 +8,7 @@ import net.sf.samtools.SAMRecord; import cern.jet.math.Arithmetic; import java.util.List; +import java.util.HashMap; public class VECFisherStrand implements VariantExclusionCriterion { @@ -15,10 +16,9 @@ public class VECFisherStrand implements VariantExclusionCriterion { private double pValue; private boolean exclude; - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - pvalueLimit = Double.valueOf(arguments); - } + public void initialize(HashMap args) { + if ( args.get("pvalue") != null ) + pvalueLimit = Double.valueOf((String) args.get("pvalue")); } public void compute(VariantContextWindow contextWindow) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECHomopolymer.java 
b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECHomopolymer.java index 28b8528c5..9e79dc261 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECHomopolymer.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECHomopolymer.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; import java.io.IOException; +import java.util.HashMap; import net.sf.picard.reference.ReferenceSequence; @@ -28,12 +29,11 @@ public class VECHomopolymer implements VariantExclusionCriterion { private boolean exclude = false; - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - String[] argPieces = arguments.split(","); - extent = Integer.valueOf(argPieces[0]); - frac = Float.valueOf(argPieces[1]); - } + public void initialize(HashMap args) { + if ( args.get("extent") != null ) + extent = Integer.valueOf((String) args.get("extent")); + if ( args.get("fraction") != null ) + frac = Float.valueOf((String) args.get("fraction")); File refFile = new File ("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECIndelArtifact.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECIndelArtifact.java index e8166411a..2dfb59f46 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECIndelArtifact.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECIndelArtifact.java @@ -3,15 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.filters; import org.broadinstitute.sting.gatk.contexts.VariantContext; import org.broadinstitute.sting.gatk.refdata.*; +import java.util.HashMap; + public class VECIndelArtifact implements VariantExclusionCriterion { private boolean exclude; private String source = "N/A"; - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - } - } + public void initialize(HashMap arguments) {} 
public void compute(VariantContextWindow contextWindow) { VariantContext context = contextWindow.getContext(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECLodThreshold.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECLodThreshold.java index b376c47a3..c1243218a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECLodThreshold.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECLodThreshold.java @@ -2,15 +2,16 @@ package org.broadinstitute.sting.gatk.walkers.filters; import org.broadinstitute.sting.gatk.contexts.VariantContext; +import java.util.HashMap; + public class VECLodThreshold implements VariantExclusionCriterion { private double lodThreshold = 5.0; private double lod; private boolean exclude; - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - lodThreshold = Double.valueOf(arguments); - } + public void initialize(HashMap args) { + if ( args.get("lod") != null ) + lodThreshold = Double.valueOf((String) args.get("lod")); } public void compute(VariantContextWindow contextWindow) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECMappingQuality.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECMappingQuality.java index 5afe79b48..5f1190155 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECMappingQuality.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECMappingQuality.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.MathUtils; import net.sf.samtools.SAMRecord; import java.util.List; +import java.util.HashMap; public class VECMappingQuality implements VariantExclusionCriterion { @@ -12,10 +13,9 @@ public class VECMappingQuality implements VariantExclusionCriterion { private double rms; private boolean exclude; - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - minQuality = Double.valueOf(arguments); - } + public void 
initialize(HashMap args) { + if ( args.get("min") != null ) + minQuality = Double.valueOf((String) args.get("min")); } public void compute(VariantContextWindow contextWindow) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECMappingQualityZero.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECMappingQualityZero.java index 8b6c15abc..d341e1ce4 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECMappingQualityZero.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECMappingQualityZero.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.VariantContext; import net.sf.samtools.SAMRecord; import java.util.List; +import java.util.HashMap; public class VECMappingQualityZero implements VariantExclusionCriterion { @@ -11,10 +12,9 @@ public class VECMappingQualityZero implements VariantExclusionCriterion { private int mq0Count; private boolean exclude; - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - maximum = Integer.valueOf(arguments); - } + public void initialize(HashMap args) { + if ( args.get("max") != null ) + maximum = Integer.valueOf((String) args.get("max")); } public void compute(VariantContextWindow contextWindow) { diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECNull.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECNull.java index 3f2397f24..592abd6ea 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECNull.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECNull.java @@ -1,11 +1,11 @@ package org.broadinstitute.sting.gatk.walkers.filters; -public class VECNull implements VariantExclusionCriterion { - public void initialize(String arguments) { - } +import java.util.HashMap; - public void compute(VariantContextWindow contextWindow) { - } +public class VECNull implements VariantExclusionCriterion { + public void initialize(HashMap arguments) {} + + public void 
compute(VariantContextWindow contextWindow) {} public double inclusionProbability() { // A hack for now until this filter is actually converted to an empirical filter diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECOnOffGenotypeRatio.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECOnOffGenotypeRatio.java index e25285aa2..25804f3fc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECOnOffGenotypeRatio.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VECOnOffGenotypeRatio.java @@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.walkers.filters; import org.broadinstitute.sting.gatk.refdata.RodGeliText; import org.broadinstitute.sting.utils.*; +import java.util.HashMap; + public class VECOnOffGenotypeRatio extends RatioFilter { private double threshold = 0.8; private double ratio; @@ -11,13 +13,11 @@ public class VECOnOffGenotypeRatio extends RatioFilter { super("On/Off Genotype Ratio", VECOnOffGenotypeRatio.class, Tail.LeftTailed); } - public void initialize(String arguments) { - if (arguments != null && !arguments.isEmpty()) { - String[] argPieces = arguments.split(","); - threshold = Double.valueOf(argPieces[0]); - if ( argPieces.length > 1 ) - minGenotypeConfidenceToTest = Double.valueOf(argPieces[1]); - } + public void initialize(HashMap args) { + if ( args.get("threshold") != null ) + threshold = Double.valueOf((String) args.get("threshold")); + if ( args.get("confidence") != null ) + minGenotypeConfidenceToTest = Double.valueOf((String) args.get("confidence")); setLowThreshold(threshold); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantExclusionCriterion.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantExclusionCriterion.java index 2d2f5b174..53bf1addc 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantExclusionCriterion.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantExclusionCriterion.java @@ -1,8 +1,9 @@ 
package org.broadinstitute.sting.gatk.walkers.filters; -public interface VariantExclusionCriterion { - public void initialize(String arguments); +import java.util.HashMap; +public interface VariantExclusionCriterion { + public void initialize(HashMap arguments); public void compute(VariantContextWindow contextWindow); //public boolean isExcludable(); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index 6e579ac92..686e5261d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -21,7 +21,7 @@ import java.util.*; public class VariantFiltrationWalker extends LocusWalker { @Argument(fullName="variants_out_head", shortName="VOH", doc="File to which modified variants should be written") public String VARIANTS_OUT_HEAD; @Argument(fullName="features", shortName="F", doc="Feature test (optionally with arguments) to apply to genotype posteriors. Syntax: 'testname[:arguments]'", required=false) public String[] FEATURES; - @Argument(fullName="exclusion_criterion", shortName="X", doc="Exclusion test (optionally with arguments) to apply to variant call. Syntax: 'testname[:arguments]'", required=false) public String[] EXCLUSIONS; + @Argument(fullName="exclusion_criterion", shortName="X", doc="Exclusion test (optionally with arguments) to apply to variant call. 
Syntax: 'testname[:key1=arg1,key2=arg2,...]'", required=false) public String[] EXCLUSIONS; @Argument(fullName="inclusion_threshold", shortName="IT", doc="The product of the probability to include variants based on these filters must be greater than the value specified here in order to be included", required=false) public Double INCLUSION_THRESHOLD = 0.9; @Argument(fullName="verbose", shortName="V", doc="Show how the variant likelihoods are changing with the application of each feature") public Boolean VERBOSE = false; @Argument(fullName="list", shortName="ls", doc="List the available features and exclusion criteria and exit") public Boolean LIST = false; @@ -98,15 +98,23 @@ public class VariantFiltrationWalker extends LocusWalker { for (String requestedExclusionString : EXCLUSIONS) { String[] requestedExclusionPieces = requestedExclusionString.split(":"); String requestedExclusionName = requestedExclusionPieces[0]; - String requestedExclusionArgs = (requestedExclusionPieces.length == 2) ? 
requestedExclusionPieces[1] : ""; for ( Class exclusionClass : exclusionClasses ) { String exclusionClassName = rationalizeClassName(exclusionClass); if (requestedExclusionName.equalsIgnoreCase(exclusionClassName)) { try { + HashMap requestedArgs = new HashMap(); + if ( requestedExclusionPieces.length == 2 ) { + String[] argStrings = requestedExclusionPieces[1].split(","); + for (int i = 0; i < argStrings.length; i++ ) { + String[] arg = argStrings[i].split("="); + if ( arg.length == 2 ) + requestedArgs.put(arg[0], arg[1]); + } + } VariantExclusionCriterion vec = (VariantExclusionCriterion) exclusionClass.newInstance(); - vec.initialize(requestedExclusionArgs); + vec.initialize(requestedArgs); requestedExclusions.add(vec); paramsWriter.print(vec.getStudyHeader() + "\t"); diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CallSetsConcordanceWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/CallSetsConcordanceWalker.java deleted file mode 100755 index 9ccc6e0c5..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/CallSetsConcordanceWalker.java +++ /dev/null @@ -1,182 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.io.FileNotFoundException; -import java.io.PrintWriter; - -/** - * Split up two call sets into their various concordance sets - */ 
-@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="callset1",type=AllelicVariant.class),@RMD(name="callset2",type=AllelicVariant.class)}) -public class CallSetsConcordanceWalker extends RefWalker { - @Argument(fullName="outputPrefix", shortName="prefix", doc="File prefix for output; turns on ALL output options below", required=false) - private String PREFIX = null; - @Argument(fullName="bothConfidentSameVar", shortName="confSameVar", doc="File to which calls that are confidently called var (identically) in both sets should be written", required=false) - private String CONF_SAME_VAR = null; - @Argument(fullName="oneConfidentVar", shortName="oneConfVar", doc="File to which calls that are confidently called var in only one of the sets should be written", required=false) - private String CONF_ONE_VAR = null; - @Argument(fullName="bothConfidentSameAllele", shortName="confSameAllele", doc="File to which calls that are confidently called var, with different genotypes, but with the same alt allele should be written", required=false) - private String CONF_SAME_ALLELE = null; - @Argument(fullName="bothConfidentDifferentAllele", shortName="confDiffAllele", doc="File to which calls that are confidently called var, with different genotypes, and with different alt alleles should be written", required=false) - private String CONF_DIFF_ALLELE = null; - @Argument(fullName="confidentRefVsConfidentVar", shortName="confRefVar", doc="File to which calls that are confidently called var in one set and ref in the other should be written", required=false) - private String CONF_REF_VAR = null; - @Argument(fullName="confidentVarWhenCombined", shortName="confComboVar", doc="File to which calls that are not confidently called var in each set separately but are when combined should be written", required=false) - private String CONF_COMBO_VAR = null; - @Argument(fullName="noCoverageVar", shortName="coverageVar", doc="File to which calls that are confidently called var in one 
set and for which there is no coverage in the other set should be written", required=false) - private String NO_COVERAGE_VAR = null; - @Argument(fullName="LOD_threshold", shortName="LOD", doc="LOD score for determining confidence", required=false) - private Double LOD = 5.0; - - private PrintWriter sameVarWriter = null; - private PrintWriter oneVarWriter = null; - private PrintWriter sameAlleleWriter = null; - private PrintWriter diffAlleleWriter = null; - private PrintWriter refVsVarWriter = null; - private PrintWriter comboVarWriter = null; - private PrintWriter coverageVarWriter = null; - - /** - * Prepare the output file and the list of available features. - */ - public void initialize() { - try { - if ( CONF_SAME_VAR != null ) - sameVarWriter = new PrintWriter(CONF_SAME_VAR); - else if ( PREFIX != null ) - sameVarWriter = new PrintWriter(PREFIX + ".sameConfidentVariant.calls"); - if ( CONF_ONE_VAR != null ) - oneVarWriter = new PrintWriter(CONF_ONE_VAR); - else if ( PREFIX != null ) - oneVarWriter = new PrintWriter(PREFIX + ".bothVariantOnlyOneIsConfident.calls"); - if ( CONF_SAME_ALLELE != null ) - sameAlleleWriter = new PrintWriter(CONF_SAME_ALLELE); - else if ( PREFIX != null ) - sameAlleleWriter = new PrintWriter(PREFIX + ".sameVariantAlleleDifferentGenotype.calls"); - if ( CONF_DIFF_ALLELE != null ) - diffAlleleWriter = new PrintWriter(CONF_DIFF_ALLELE); - else if ( PREFIX != null ) - diffAlleleWriter = new PrintWriter(PREFIX + ".differentVariantAllele.calls"); - if ( CONF_REF_VAR != null ) - refVsVarWriter = new PrintWriter(CONF_REF_VAR); - else if ( PREFIX != null ) - refVsVarWriter = new PrintWriter(PREFIX + ".oneRefOneVariant.calls"); - if ( CONF_COMBO_VAR != null ) - comboVarWriter = new PrintWriter(CONF_COMBO_VAR); - else if ( PREFIX != null ) - comboVarWriter = new PrintWriter(PREFIX + ".confidentVariantWhenCombined.calls"); - if ( NO_COVERAGE_VAR != null ) - coverageVarWriter = new PrintWriter(NO_COVERAGE_VAR); - else if ( PREFIX != null ) - 
coverageVarWriter = new PrintWriter(PREFIX + ".oneConfidentVariantOneNoCoverage.calls"); - } catch (FileNotFoundException e) { - throw new StingException(String.format("Could not open file(s) for writing")); - } - } - - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - AllelicVariant call1 = (AllelicVariant)tracker.lookup("callset1", null); - AllelicVariant call2 = (AllelicVariant)tracker.lookup("callset2", null); - - if ( call1 == null || call2 == null ) { - if ( call1 != null && call1.isSNP() && call1.getVariationConfidence() >= LOD ) - printVariant(coverageVarWriter, call1); - else if ( call2 != null && call2.isSNP() && call2.getVariationConfidence() >= LOD ) - printVariant(coverageVarWriter, call2); - return 1; - } - - double bestVsRef1 = call1.getVariationConfidence(); - double bestVsRef2 = call2.getVariationConfidence(); - //double bestVsNext1 = call1.getConsensusConfidence(); - //double bestVsNext2 = call2.getConsensusConfidence(); - String genotype1 = call1.getGenotype().get(0); - String genotype2 = call2.getGenotype().get(0); - - // are they both variants? - if ( call1.isSNP() && call2.isSNP() ) { - // are they both confident calls? 
- if ( bestVsRef1 >= LOD && bestVsRef2 >= LOD ) { - if ( genotype1.equals(genotype2) ) - printVariant(sameVarWriter, call1); - else if ( sameVariantAllele(genotype1, genotype2, ref.getBase()) ) - printVariant(sameAlleleWriter, call1); - else - printVariant(diffAlleleWriter, call1); - } else if ( bestVsRef1 < LOD && bestVsRef2 < LOD && bestVsRef1 + bestVsRef2 >= LOD ) { - printVariant(comboVarWriter, call1); - } else if ( (bestVsRef1 < LOD && bestVsRef2 >= LOD) || (bestVsRef1 >= LOD && bestVsRef2 < LOD) ) { - printVariant(oneVarWriter, call1); - } - } else if ( (call1.isSNP() && call2.isReference() && bestVsRef1 >= LOD) || - (call2.isSNP() && call1.isReference() && bestVsRef2 >= LOD) ) { - printVariant(refVsVarWriter, call1); - } - - return 1; - } - - private boolean sameVariantAllele(String genotype1, String genotype2, char ref) { - if ( genotype1.length() < 2 || genotype2.length() < 2 ) - return genotype1.equals(genotype2); - char altAllele1 = genotype1.charAt(0) != ref ? genotype1.charAt(0) : genotype1.charAt(1); - char altAllele2 = genotype2.charAt(0) != ref ? genotype2.charAt(0) : genotype2.charAt(1); - return altAllele1 == altAllele2; - } - - static void printVariant(PrintWriter writer, AllelicVariant variant) { - if ( writer != null && variant != null ) - writer.println(variant.toString()); - } - - /** - * Increment the number of loci processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of loci processed. - */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - /** - * Tell the user the number of loci processed and close out the new variants file. - * - * @param result the number of loci seen. 
- */ - public void onTraversalDone(Integer result) { - out.printf("Processed %d loci.\n", result); - - if ( sameVarWriter != null ) - sameVarWriter.close(); - if ( oneVarWriter != null ) - oneVarWriter.close(); - if ( sameAlleleWriter != null ) - sameAlleleWriter.close(); - if ( diffAlleleWriter != null ) - diffAlleleWriter.close(); - if ( refVsVarWriter != null ) - refVsVarWriter.close(); - if ( comboVarWriter != null ) - comboVarWriter.close(); - if ( coverageVarWriter != null ) - coverageVarWriter.close(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/VennCallSetsWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/VennCallSetsWalker.java deleted file mode 100755 index aabedbd94..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/VennCallSetsWalker.java +++ /dev/null @@ -1,120 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.RefWalker; -import org.broadinstitute.sting.gatk.walkers.RMD; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.utils.StingException; -import org.broadinstitute.sting.utils.cmdLine.Argument; - -import java.io.FileNotFoundException; -import java.io.PrintWriter; - -/** - * Split up two call sets into their various s"Venn diagram" sets - */ -@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="callset1",type=AllelicVariant.class),@RMD(name="callset2",type=AllelicVariant.class)}) -public class VennCallSetsWalker extends RefWalker { - @Argument(fullName="outputPrefix", shortName="prefix", doc="File prefix for output; turns on ALL output options below", required=false) - private String PREFIX = 
null; - @Argument(fullName="union_out", shortName="union", doc="File to which union of sets should be written", required=false) - String UNION_OUT = null; - @Argument(fullName="intersect_out", shortName="intersect", doc="File to which intersection of sets should be written", required=false) - String INTERSECTION_OUT = null; - @Argument(fullName="set1_unique_out", shortName="set1", doc="File to which calls unique to set1 should be written", required=false) - String SET1_OUT = null; - @Argument(fullName="set2_unique_out", shortName="set2", doc="File to which calls unique to set2 should be written", required=false) - String SET2_OUT = null; - - private PrintWriter union_writer = null; - private PrintWriter intersect_writer = null; - private PrintWriter set1_writer = null; - private PrintWriter set2_writer = null; - - /** - * Prepare the output file and the list of available features. - */ - public void initialize() { - try { - if ( UNION_OUT != null ) - union_writer = new PrintWriter(UNION_OUT); - else if ( PREFIX != null ) - union_writer = new PrintWriter(PREFIX + ".union.calls"); - if ( INTERSECTION_OUT != null ) - intersect_writer = new PrintWriter(INTERSECTION_OUT); - else if ( PREFIX != null ) - intersect_writer = new PrintWriter(PREFIX + ".intersect.calls"); - if ( SET1_OUT != null ) - set1_writer = new PrintWriter(SET1_OUT); - else if ( PREFIX != null ) - set1_writer = new PrintWriter(PREFIX + ".set1Only.calls"); - if ( SET2_OUT != null ) - set2_writer = new PrintWriter(SET2_OUT); - else if ( PREFIX != null ) - set2_writer = new PrintWriter(PREFIX + ".set2Only.calls"); - } catch (FileNotFoundException e) { - throw new StingException(String.format("Could not open file(s) for writing")); - } - } - - /** - * Initialize the number of loci processed to zero. 
- * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - AllelicVariant call1 = (AllelicVariant)tracker.lookup("callset1", null); - AllelicVariant call2 = (AllelicVariant)tracker.lookup("callset2", null); - - // Ignore places where we don't have a variant or where the reference base is ambiguous. - if ( union_writer != null ) - printVariant(union_writer, call1 != null ? call1 : call2); - if ( intersect_writer != null ) - printVariant(intersect_writer, call1 != null && call2 != null ? call1 : null); - if ( set1_writer != null ) - printVariant(set1_writer, call1 != null && call2 == null ? call1 : null); - if ( set2_writer != null ) - printVariant(set2_writer, call1 == null && call2 != null ? call2 : null); - - return 1; - } - - static void printVariant(PrintWriter writer, AllelicVariant variant) { - if ( variant != null ) - writer.println(variant.toString()); - } - - /** - * Increment the number of loci processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of loci processed. - */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - /** - * Tell the user the number of loci processed and close out the new variants file. - * - * @param result the number of loci seen. 
- */ - public void onTraversalDone(Integer result) { - out.printf("Processed %d loci.\n", result); - - if ( union_writer != null ) - union_writer.close(); - if ( intersect_writer != null ) - intersect_writer.close(); - if ( set1_writer != null ) - set1_writer.close(); - if ( set2_writer != null ) - set2_writer.close(); - } -} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/CallsetConcordanceWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/CallsetConcordanceWalker.java new file mode 100755 index 000000000..201bb870a --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/CallsetConcordanceWalker.java @@ -0,0 +1,105 @@ +package org.broadinstitute.sting.playground.gatk.walkers.concordance; + +import org.broadinstitute.sting.gatk.contexts.*; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; + +import java.util.*; + + +/** + * CallsetConcordanceWalker finds the concordance between multiple callsets (different tests are available). + */ +@Requires(value={DataSource.REFERENCE}, + referenceMetaData={@RMD(name="callset1",type=AllelicVariant.class), + @RMD(name="callset2",type=AllelicVariant.class)}) +@Reference(window=@Window(start=-20,stop=20)) +public class CallsetConcordanceWalker extends RefWalker { + @Argument(fullName="concordance_output_path", shortName="O", doc="File path to which split sets should be written", required=false) + private String OUTPUT_PATH = null; + @Argument(fullName="concordanceType", shortName="CT", doc="Concordance subset types to apply to given callsets. 
Syntax: 'type[:key1=arg1,key2=arg2,...]'", required=false) + private String[] TYPES = null; + @Argument(fullName="list", shortName="ls", doc="List the available concordance types and exit", required=false) + private Boolean LIST_ONLY = false; + + private ArrayList requestedTypes; + + private HashMap initializeConcordanceTypes(String prefix) { + HashMap types = new HashMap(); + + // + // Add new concordance types here! + // + types.put("SNPGenotypeConcordance", new SNPGenotypeConcordance(prefix)); + types.put("SimpleVenn", new SimpleVenn(prefix)); + types.put("IndelSubsets", new IndelSubsets(prefix)); + + return types; + } + + /** + * Prepare the output file and the list of available features. + */ + public void initialize() { + HashMap concordanceTypes = initializeConcordanceTypes(OUTPUT_PATH); + + if (LIST_ONLY) { + Iterator types = concordanceTypes.keySet().iterator(); + out.println("\nAvailable concordance types:"); + while ( types.hasNext() ) + out.println("\t" + types.next()); + System.exit(0); + } + + if ( OUTPUT_PATH == null ) + throw new StingException("--concordance_output_path must be specified"); + + requestedTypes = new ArrayList(); + + // initialize requested concordance types + if (TYPES != null) { + for ( String requestedTypeString : TYPES ) { + String[] requestedPieces = requestedTypeString.split(":"); + String requestedType = requestedPieces[0]; + ConcordanceType type = concordanceTypes.get(requestedType); + if ( type == null ) + throw new StingException("The requested concordance type (" + requestedType + ") isn't a valid option"); + + HashMap requestedArgs = new HashMap(); + if ( requestedPieces.length == 2 ) { + String[] argStrings = requestedPieces[1].split(","); + for (int i = 0; i < argStrings.length; i++ ) { + String[] arg = argStrings[i].split("="); + if ( arg.length == 2 ) + requestedArgs.put(arg[0], arg[1]); + } + } + + type.initialize(requestedArgs); + requestedTypes.add(type); + } + } + } + + public Integer reduceInit() { return 0; } + 
+ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + for ( ConcordanceType type : requestedTypes ) + type.computeConcordance(tracker, ref); + + return 1; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + for ( ConcordanceType type : requestedTypes ) + type.cleanup(); + + out.printf("Processed %d loci.\n", result); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/ConcordanceType.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/ConcordanceType.java new file mode 100755 index 000000000..827abb8e6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/ConcordanceType.java @@ -0,0 +1,14 @@ +package org.broadinstitute.sting.playground.gatk.walkers.concordance; + +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; + +import java.util.HashMap; + +public interface ConcordanceType { + + public void initialize(HashMap args); + public void computeConcordance(RefMetaDataTracker tracker, ReferenceContext ref); + public void cleanup(); + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/IndelSubsets.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/IndelSubsets.java new file mode 100755 index 000000000..4fb619df3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/IndelSubsets.java @@ -0,0 +1,109 @@ +package org.broadinstitute.sting.playground.gatk.walkers.concordance; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.StingException; + +import 
java.io.PrintWriter; +import java.io.FileNotFoundException; +import java.util.HashMap; + +/** + * filter an indel callset based on given criteria + */ +public class IndelSubsets implements ConcordanceType { + + public final static int HOMOPOLYMER_MAX = 20; + + // Hint: to disable either of these features, set the value to zero (via command-line arguments); + // then empty files will be output for the "below threshold" set, effectively disabling it + private int homopolymerCutoff = 1; + private int sizeCutoff = 2; + + private PrintWriter[][][][] writers = new PrintWriter[2][2][2][2]; + private String prefix; + + public IndelSubsets(String prefix) { + this.prefix = prefix; + } + + public void initialize(HashMap args) { + if ( args.get("sizeCutoff") != null ) + sizeCutoff = Integer.valueOf(args.get("sizeCutoff")); + if ( args.get("homopolymerCutoff") != null ) + homopolymerCutoff = Integer.valueOf(args.get("homopolymerCutoff")); + + try { + for (int i = 0; i < 2; i++) { + String name1 = i == 0 ? "set1" : "noSet1"; + for (int j = 0; j < 2; j++) { + String name2 = j == 0 ? "set2" : "noSet2"; + for (int k = 0; k < 2; k++) { + String name3 = "size" + (k == 0 ? Integer.toString(sizeCutoff)+"AndUnder" : "GreaterThan"+Integer.toString(sizeCutoff)); + for (int m = 0; m < 2; m++) { + String name4 = "homopolymer" + (m == 0 ? Integer.toString(homopolymerCutoff)+"AndUnder" : "GreaterThan"+Integer.toString(homopolymerCutoff)); + writers[i][j][k][m] = new PrintWriter(prefix + "." + name1 + "." + name2 + "." + name3 + "." + name4 + ".calls"); + } + } + } + } + } catch (FileNotFoundException e) { + throw new StingException(String.format("Could not open file(s) for writing")); + } + } + + public void computeConcordance(RefMetaDataTracker tracker, ReferenceContext ref) { + AllelicVariant indel1 = (AllelicVariant)tracker.lookup("callset1", null); + AllelicVariant indel2 = (AllelicVariant)tracker.lookup("callset2", null); + + int set1 = ( indel1 != null && indel1.isIndel() ? 
0 : 1 ); + int set2 = ( indel2 != null && indel2.isIndel() ? 0 : 1 ); + + // skip this locus if they're both not valid indels + if ( set1 == 1 && set2 == 1 ) + return; + + // only deal with a valid indel + AllelicVariant indel = ( indel1 != null ? indel1 : indel2 ); + + int size = ( indel.length() <= sizeCutoff ? 0 : 1 ); + int homopol = ( homopolymerRunSize(ref, indel) <= homopolymerCutoff ? 0 : 1 ); + + writers[set1][set2][size][homopol].println(indel.toString()); + } + + private int homopolymerRunSize(ReferenceContext ref, AllelicVariant indel) { + char[] bases = ref.getBases(); + GenomeLoc window = ref.getWindow(); + GenomeLoc locus = ref.getLocus(); + + int refBasePos = (int)(locus.getStart() - window.getStart()); + char indelBase = indel.isDeletion() ? bases[refBasePos+1] : indel.getAltBasesFWD().charAt(0); + int leftRun = 0; + for ( int i = refBasePos; i >= 0; i--) { + if ( bases[i] != indelBase ) + break; + leftRun++; + } + + indelBase = indel.isDeletion() ? bases[Math.min(refBasePos+indel.length(),bases.length-1)] : indel.getAltBasesFWD().charAt(indel.getAltBasesFWD().length()-1); + int rightRun = 0; + for ( int i = refBasePos + (indel.isDeletion() ? 
1+indel.length() : 1); i < bases.length; i++) { + if ( bases[i] != indelBase ) + break; + rightRun++; + } + + //System.out.println(String.valueOf(bases) + ": " + leftRun + " / " + rightRun); + return Math.max(leftRun, rightRun); + } + + public void cleanup() { + for (int i = 0; i < 2; i++) + for (int j = 0; j < 2; j++) + for (int k = 0; k < 2; k++) + for (int m = 0; m < 2; m++) + writers[i][j][k][m].close(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/SNPGenotypeConcordance.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/SNPGenotypeConcordance.java new file mode 100755 index 000000000..2c4831155 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/SNPGenotypeConcordance.java @@ -0,0 +1,131 @@ +package org.broadinstitute.sting.playground.gatk.walkers.concordance; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.utils.StingException; + +import java.util.HashMap; +import java.io.FileNotFoundException; +import java.io.PrintWriter; + +/** + * Split up two call sets into their various concordance sets + */ +public class SNPGenotypeConcordance implements ConcordanceType { + + private double LOD = 5.0; + + private PrintWriter sameVarWriter = null; + private PrintWriter oneVarWriter = null; + private PrintWriter sameAlleleWriter = null; + private PrintWriter diffAlleleWriter = null; + private PrintWriter refVsVar1Writer = null; + private PrintWriter refVsVar2Writer = null; + private PrintWriter comboVarWriter = null; + private PrintWriter coverageVar1Writer = null; + private PrintWriter coverageVar2Writer = null; + private String prefix; + + public SNPGenotypeConcordance(String prefix) { + this.prefix = prefix; + } + + public void initialize(HashMap args) { + try { + sameVarWriter = new PrintWriter(prefix + 
".snp_concordance.sameConfidentVariant.calls"); + oneVarWriter = new PrintWriter(prefix + ".snp_concordance.bothVariantOnlyOneIsConfident.calls"); + sameAlleleWriter = new PrintWriter(prefix + ".snp_concordance.sameVariantAlleleDifferentGenotype.calls"); + diffAlleleWriter = new PrintWriter(prefix + ".snp_concordance.differentVariantAllele.calls"); + refVsVar1Writer = new PrintWriter(prefix + ".snp_concordance.set1VariantSet2Ref.calls"); + refVsVar2Writer = new PrintWriter(prefix + ".snp_concordance.set1RefSet2Variant.calls"); + comboVarWriter = new PrintWriter(prefix + ".snp_concordance.confidentVariantWhenCombined.calls"); + coverageVar1Writer = new PrintWriter(prefix + ".snp_concordance.set1ConfidentVariantSet2NoCoverage.calls"); + coverageVar2Writer = new PrintWriter(prefix + ".snp_concordance.set1NoCoverageSet2ConfidentVariant.calls"); + } catch (FileNotFoundException e) { + throw new StingException(String.format("Could not open file(s) for writing")); + } + + if ( args.get("lod") != null ) + LOD = Double.valueOf(args.get("lod")); + } + + public void computeConcordance(RefMetaDataTracker tracker, ReferenceContext ref) { + AllelicVariant call1 = (AllelicVariant)tracker.lookup("callset1", null); + AllelicVariant call2 = (AllelicVariant)tracker.lookup("callset2", null); + + // the only reason they would be null is a lack of coverage + if ( call1 == null || call2 == null ) { + if ( call1 != null && call1.isSNP() && call1.getVariationConfidence() >= LOD ) + printVariant(coverageVar1Writer, call1); + else if ( call2 != null && call2.isSNP() && call2.getVariationConfidence() >= LOD ) + printVariant(coverageVar2Writer, call2); + return; + } + + double bestVsRef1 = call1.getVariationConfidence(); + double bestVsRef2 = call2.getVariationConfidence(); + //double bestVsNext1 = call1.getConsensusConfidence(); + //double bestVsNext2 = call2.getConsensusConfidence(); + String genotype1 = call1.getGenotype().get(0); + String genotype2 = call2.getGenotype().get(0); + + // are 
they both variant SNPs? + if ( call1.isSNP() && call2.isSNP() ) { + + // are they both confident calls? + if ( bestVsRef1 >= LOD && bestVsRef2 >= LOD ) { + // same genotype + if ( genotype1.equals(genotype2) ) + printVariant(sameVarWriter, call1); + + // same allele, different genotype + else if ( sameVariantAllele(genotype1, genotype2, ref.getBase()) ) + printVariant(sameAlleleWriter, call1); + + // different variant allele + else + printVariant(diffAlleleWriter, call1); + } + + // confident only when combined + else if ( bestVsRef1 < LOD && bestVsRef2 < LOD && bestVsRef1 + bestVsRef2 >= LOD ) { + printVariant(comboVarWriter, call1); + } + + // only one is confident variant + else if ( (bestVsRef1 < LOD && bestVsRef2 >= LOD) || (bestVsRef1 >= LOD && bestVsRef2 < LOD) ) { + printVariant(oneVarWriter, call1); + } + } + + // one is variant and the other is ref + else if ( call1.isSNP() && call2.isReference() && bestVsRef1 >= LOD ) + printVariant(refVsVar1Writer, call1); + else if ( call2.isSNP() && call1.isReference() && bestVsRef2 >= LOD ) + printVariant(refVsVar2Writer, call1); + } + + private boolean sameVariantAllele(String genotype1, String genotype2, char ref) { + if ( genotype1.length() < 2 || genotype2.length() < 2 ) + return genotype1.equals(genotype2); + char altAllele1 = genotype1.charAt(0) != ref ? genotype1.charAt(0) : genotype1.charAt(1); + char altAllele2 = genotype2.charAt(0) != ref ? 
genotype2.charAt(0) : genotype2.charAt(1); + return altAllele1 == altAllele2; + } + + private static void printVariant(PrintWriter writer, AllelicVariant variant) { + writer.println(variant.toString()); + } + + public void cleanup() { + sameVarWriter.close(); + oneVarWriter.close(); + sameAlleleWriter.close(); + diffAlleleWriter.close(); + refVsVar1Writer.close(); + refVsVar2Writer.close(); + comboVarWriter.close(); + coverageVar1Writer.close(); + coverageVar2Writer.close(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/SimpleVenn.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/SimpleVenn.java new file mode 100755 index 000000000..b02602bbf --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/concordance/SimpleVenn.java @@ -0,0 +1,77 @@ +package org.broadinstitute.sting.playground.gatk.walkers.concordance; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.utils.StingException; + +import java.io.FileNotFoundException; +import java.io.PrintWriter; +import java.util.HashMap; + +/** + * Split up two call sets into their various "Venn diagram" sets + */ +public class SimpleVenn implements ConcordanceType { + + private PrintWriter union_writer = null; + private PrintWriter intersect_writer = null; + private PrintWriter discord_writer = null; + private PrintWriter set1_writer = null; + private PrintWriter set2_writer = null; + private String prefix; + + public SimpleVenn(String prefix) { + this.prefix = prefix; + } + + public void initialize(HashMap args) { + try { + union_writer = new PrintWriter(prefix + ".venn.union.calls"); + intersect_writer = new PrintWriter(prefix + ".venn.intersection.calls"); + discord_writer = new PrintWriter(prefix + ".venn.discordant.calls"); + set1_writer = new PrintWriter(prefix + ".venn.set1Only.calls"); + set2_writer = 
new PrintWriter(prefix + ".venn.set2Only.calls"); + } catch (FileNotFoundException e) { + throw new StingException(String.format("Could not open file(s) for writing")); + } + } + + public void computeConcordance(RefMetaDataTracker tracker, ReferenceContext ref) { + AllelicVariant call1 = (AllelicVariant)tracker.lookup("callset1", null); + AllelicVariant call2 = (AllelicVariant)tracker.lookup("callset2", null); + + if ( call1 == null && call2 == null ) + return; + + // union + printVariant(union_writer, call1 != null ? call1 : call2); + + // set 1 only + if ( call2 == null ) + printVariant(set1_writer, call1); + + // set 2 only + else if ( call1 == null ) + printVariant(set2_writer, call2); + + // intersection (concordant) + else if ( call1.getAltBasesFWD().equals(call2.getAltBasesFWD()) ) + printVariant(intersect_writer, call1); + + // intersection (discordant) + else + printVariant(discord_writer, call1); + } + + private static void printVariant(PrintWriter writer, AllelicVariant variant) { + writer.println(variant.toString()); + } + + public void cleanup() { + union_writer.close(); + intersect_writer.close(); + discord_writer.close(); + set1_writer.close(); + set2_writer.close(); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelFilterWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelFilterWalker.java deleted file mode 100755 index 6f8ba3705..000000000 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/indels/IndelFilterWalker.java +++ /dev/null @@ -1,91 +0,0 @@ -package org.broadinstitute.sting.playground.gatk.walkers.indels; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.*; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import 
org.broadinstitute.sting.utils.cmdLine.Argument; - -/** - * filter an indel callset based on given criteria - */ -@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="calls",type=AllelicVariant.class)}) -@Allows(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="calls",type=AllelicVariant.class),@RMD(name="otherindels",type=AllelicVariant.class)}) -@Reference(window=@Window(start=-20,stop=20)) -public class IndelFilterWalker extends RefWalker { - @Argument(fullName="homopolymerRunMax", shortName="homopolMax", doc="filter indels within homopolymer runs greater than the given length (max 20)", required=false) - Integer HOMOPOLYMER_MAX = 20; - @Argument(fullName="homopolymerRunMin", shortName="homopolMin", doc="filter indels within homopolymer runs less than the given length", required=false) - Integer HOMOPOLYMER_MIN = 0; - @Argument(fullName="sizeMax", shortName="sizeMax", doc="filter indels greater than a certain size", required=false) - Integer SIZE_MAX = 100; - @Argument(fullName="sizeMin", shortName="sizeMin", doc="filter indels less than a certain size", required=false) - Integer SIZE_MIN = 0; - @Argument(fullName="inOtherSet", shortName="inSet", doc="filter indels that DO NOT occur in the provided 2nd set", required=false) - Boolean IN_OTHER_SET = false; - @Argument(fullName="notInOtherSet", shortName="notInSet", doc="filter indels that DO occur in the provided 2nd set", required=false) - Boolean NOT_IN_OTHER_SET = false; - - public void initialize() { - if ( HOMOPOLYMER_MAX > 20 ) - HOMOPOLYMER_MAX = 20; - } - - public Integer reduceInit() { return 0; } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - AllelicVariant indel = (AllelicVariant)tracker.lookup("calls", null); - - if ( indel == null || !indel.isIndel() ) - return 0; - - if ( indel.length() < SIZE_MIN || indel.length() > SIZE_MAX ) - return 0; - - int homopol = homopolymerRunSize(ref, indel); - if ( homopol < 
HOMOPOLYMER_MIN || homopol > HOMOPOLYMER_MAX ) - return 0; - - AllelicVariant other = (AllelicVariant)tracker.lookup("otherIndels", null); - if ( (IN_OTHER_SET && other == null) || (NOT_IN_OTHER_SET && other != null) ) - return 0; - - out.println(indel); - return 1; - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - out.printf("output %d indels.\n", result); - } - - private int homopolymerRunSize(ReferenceContext ref, AllelicVariant indel) { - char[] bases = ref.getBases(); - GenomeLoc window = ref.getWindow(); - GenomeLoc locus = ref.getLocus(); - - int refBasePos = (int)(locus.getStart() - window.getStart()); - char indelBase = indel.isDeletion() ? bases[refBasePos+1] : indel.getAltBasesFWD().charAt(0); - int leftRun = 0; - for ( int i = refBasePos; i >= 0; i--) { - if ( bases[i] != indelBase ) - break; - leftRun++; - } - - indelBase = indel.isDeletion() ? bases[Math.min(refBasePos+indel.length(),bases.length-1)] : indel.getAltBasesFWD().charAt(indel.getAltBasesFWD().length()-1); - int rightRun = 0; - for ( int i = refBasePos + (indel.isDeletion() ? 
1+indel.length() : 1); i < bases.length; i++) { - if ( bases[i] != indelBase ) - break; - rightRun++; - } - - //System.out.println(String.valueOf(bases) + ": " + leftRun + " / " + rightRun); - return Math.max(leftRun, rightRun); - } -} \ No newline at end of file diff --git a/python/RunPilot2Pipeline.py b/python/RunPilot2Pipeline.py index 971965682..c16a1b083 100755 --- a/python/RunPilot2Pipeline.py +++ b/python/RunPilot2Pipeline.py @@ -47,33 +47,53 @@ if __name__ == "__main__": snp_output = outputDir("calls/unfiltered_snps") filter_output = outputDir("calls/filtered_snps") indel_output = outputDir("calls/indels") + final_bam_dir = "/humgen/gsa-hphome1/projects/1kg_pilot2/useTheseBamsForAnalyses" samples = ["NA12878","NA12891","NA12892","NA19238","NA19239","NA19240"] - DoCs = [76,88,66,56,68,86] + techs = ["SLX"] chrs = range(1, 22) + ["X"] + DoCs = [82,91,70,56,68,86] + +# Official genome-wide Depth of Coverage tables for pilot 2, freeze 5: +# NA12878 NA12891 NA12892 NA19238 NA19239 NA19240 +# 454: 36 18 +# SLX: 82 91 70 56 68 86 +# SOLID: 37 64 +# 454+SLD: 64 77 +# ALL: xx xx for sample, DoC in zip(samples, DoCs): # # Actually do some work here # - myChrs = chrs - if sample in ["NA12891", "NA19239"]: - myChrs = chrs + ["Y"] + MQs = [100,5,5] + def finalBam(tech): + return os.path.join(final_bam_dir, "%s.%s.bam" % ( sample, tech )) + def outputFile(root, tech, name): + return os.path.join(root, "%s.%s.%s" % ( sample, tech, name )) + def badSnps( tech ): + return outputFile(cleaner_output, tech, "realigner.badsnps") + def indelsForFiltering( tech ): + return outputFile(indel_output, tech, "low.calls") - for chr in myChrs: + myTechs = techs + if sample in ["NA12878", "NA19240"]: + myTechs = techs + ["SOLID","454"] - def filepath(tech): - return "/broad/1KG/DCC/ftp/pilot_data/%s/alignment/%s.chrom%s.%s.SRP000032.2009_07.bam" % ( sample, sample, chr, tech ) + for tech,mappingQuality in zip(myTechs,MQs): + + myChrs = chrs + if sample in ["NA12891", "NA19239"]: + myChrs 
= chrs + ["Y"] + def badSnps( tech, chr ): + return os.path.join(cleaner_output, "%s.chr%s.%s.realigner.badsnps" % ( sample, chr, tech )) + + for chr in myChrs: + + bam = "/broad/1KG/DCC/ftp/pilot_data/%s/alignment/%s.chrom%s.%s.SRP000032.2009_07.bam" % ( sample, sample, chr, tech ) - techs = ["SLX", "SOLID", "454"] - MQs = [100,5,5] - solexaBamFile = None - solidBamFile = None - for tech,mappingQuality in zip(techs,MQs): - bam = filepath(tech) def outputFile(root, name): return os.path.join(root, "%s.chr%s.%s.%s" % ( sample, chr, tech, name )) - def MismatchIntervals(bam, outputFile, intervals): return config.gatkCmd('MismatchIntervals') + " -o " + outputFile + " -L " + intervals + " -I " + bam def IndelIntervals(bam, outputFile, intervals): @@ -84,18 +104,10 @@ if __name__ == "__main__": return config.gatkCmd('IntervalCleaner') + " -O " + outputFile + " -L " + intervals + " -I " + bam def Injector(bam, outputFile, intervals, inputfile): return config.gatkCmd('CleanedReadInjector') + " --output_bam " + outputFile + " -L " + intervals + " -I " + bam + " --cleaned_reads " + inputfile - def IndelCaller(bam, outputFile, intervals, fraction): - return config.gatkCmd('IndelGenotyper') + " -O " + outputFile + " -L " + intervals + " -I " + bam + " -minFraction " + fraction - def SnpCaller(bam, outputFile, intervals): - return config.gatkCmd('SingleSampleGenotyper') + " -o " + outputFile + " -L " + intervals + " -I " + bam - def VarFiltration(bam, outputHead, intervals, snpcalls, badsnps, indelcalls, depth, mq): - return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -L " + intervals + " -I " + bam + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSnp," + badsnps + ",indels,SimpleIndel," + indelcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq - def VarFiltration454(bam, outputHead, intervals, snpcalls, depth, mq): - return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -L " + intervals + " -I " + bam + " -B 
variant,Variants," + snpcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq - jobid = None - indelsFileLow = None + mergeid = None + bamToCallFrom = finalBam(tech) if ( tech != "454" ): mismatchIntervalsFile = outputFile(intervals_dir, "mismatches.intervals") @@ -111,7 +123,7 @@ if __name__ == "__main__": jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, mergedIntervalsFile, just_print_commands = OPTIONS.dry, waitID = jobid) cleanedFile = outputFile(cleaner_output, "bam") - badsnpsFile = outputFile(cleaner_output, "realigner.badsnps") + badsnpsFile = badSnps(tech) cmd = CleanIntervals(bam, cleanedFile, mergedIntervalsFile, badsnpsFile) jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, cleanedFile, just_print_commands = OPTIONS.dry, waitID = jobid) @@ -119,52 +131,73 @@ if __name__ == "__main__": cmd = Injector(bam, injectedFile, str(chr), cleanedFile) jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, injectedFile, just_print_commands = OPTIONS.dry, waitID = jobid) - if ( tech == "SLX" ): - solexaBamFile = injectedFile - if ( tech == "SOLID" ): - solidBamFile = injectedFile +# HOW DO I CALL MergeBAMBatch FROM HERE? 
+# def MergeBams(): +# return "MergeBAMBatch.py -d " + final_bam_dir + " -q " + OPTIONS.farmQueue + " -s '" + bamToCallFrom + "\t" + os.path.join(injector_output, "%s.chr*.%s.bam" % ( sample, tech )) + "'" +# cmd = MergeBams() +# jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, bamToCallFrom, just_print_commands = OPTIONS.dry, waitID = jobid) + mergeid = jobid - cmd = "samtools index " + injectedFile - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, injectedFile + ".bai", just_print_commands = OPTIONS.dry, waitID = jobid) + cmd = "cat " + for chr in myChrs: + cmd = cmd + " " + badSnps(tech, chr) + cmd = cmd + " > " + badSnps(tech) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, badSnps(tech), just_print_commands = OPTIONS.dry, waitID = mergeid) - indelsFileLow = outputFile(indel_output, "low.calls") - cmd = IndelCaller(injectedFile, indelsFileLow, str(chr), "0.1") - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileLow, just_print_commands = OPTIONS.dry, waitID = jobid) - - indelsFileHigh = outputFile(indel_output, "high.calls") - cmd = IndelCaller(injectedFile, indelsFileHigh, str(chr), "0.3") - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileHigh, just_print_commands = OPTIONS.dry, waitID = jobid) - - snpsFile = outputFile(snp_output, "calls") - cmd = SnpCaller(injectedFile, snpsFile, str(chr)) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, snpsFile, just_print_commands = OPTIONS.dry, waitID = jobid) - - varFiltFile = os.path.join(filter_output, "%s.chr%s.%s" % ( sample, chr, tech )) - cmd = VarFiltration(injectedFile, varFiltFile, str(chr), snpsFile, badsnpsFile, indelsFileLow, str(DoC), str(mappingQuality)) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, varFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid) - else: - snpsFile = outputFile(snp_output, "calls") - cmd = SnpCaller(bam, snpsFile, str(chr)) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, snpsFile, just_print_commands = OPTIONS.dry, waitID = jobid) - 
- varFiltFile = os.path.join(filter_output, "%s.chr%s.%s" % ( sample, chr, tech )) - cmd = VarFiltration454(bam, varFiltFile, str(chr), snpsFile, str(DoC), str(mappingQuality)) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, varFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid) + def IndelCaller(bam, outputFile, fraction): + return config.gatkCmd('IndelGenotyper') + " -O " + outputFile + " -I " + bam + " -minFraction " + fraction + def SnpCaller(bam, outputFile): + return config.gatkCmd('SingleSampleGenotyper') + " -o " + outputFile + " -I " + bam + def VarFiltration(bam, outputHead, snpcalls, badsnps, indelcalls, depth, mq): + return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -I " + bam + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSnp," + badsnps + ",indels,SimpleIndel," + indelcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq + def VarFiltration454(bam, outputHead, snpcalls, depth, mq): + return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -I " + bam + " -B variant,Variants," + snpcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq + indelsFileHigh = outputFile(indel_output, tech, "high.calls") + cmd = IndelCaller(bamToCallFrom, indelsFileHigh, "0.3") + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileHigh, just_print_commands = OPTIONS.dry, waitID = mergeid) - def outputFile(root, techs, name): - return os.path.join(root, "%s.chr%s.%s.%s" % ( sample, chr, techs, name )) - def SnpCaller(bams, outputFile, intervals): - return config.gatkCmd('SingleSampleGenotyper') + " -o " + outputFile + " -L " + intervals + " ".join(map( lambda x: " -I " + x, bams )) - def VarFiltration(bams, outputHead, intervals, snpcalls, badsnps, indelcalls, depth, mq): - return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -L " + intervals + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSnp," + badsnps + ",indels,SimpleIndel," + indelcalls + " 
-X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq + " ".join(map( lambda x: " -I " + x, bams )) + indelsFileLow = indelsForFiltering(tech) + cmd = IndelCaller(bamToCallFrom, indelsFileLow, "0.1") + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileLow, just_print_commands = OPTIONS.dry, waitID = mergeid) - solid454SnpsFile = outputFile(snp_output, "454-SOLID", "calls") - cmd = SnpCaller([solidBamFile,filepath("454")], solid454SnpsFile, str(chr)) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, solid454SnpsFile, just_print_commands = OPTIONS.dry, waitID = jobid) + snpsFile = outputFile(snp_output, tech, "calls") + cmd = SnpCaller(bamToCallFrom, snpsFile) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, snpsFile, just_print_commands = OPTIONS.dry, waitID = jobid) # wait on the low indel calls + + varFiltFile = os.path.join(filter_output, "%s.%s" % ( sample, tech )) + if ( tech != "454" ): + cmd = VarFiltration(bamToCallFrom, varFiltFile, snpsFile, badsnpsFile, indelsFileLow, str(DoC), str(mappingQuality)) + else: + cmd = VarFiltration454(bamToCallFrom, varFiltFile, snpsFile, str(DoC), str(mappingQuality)) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, varFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid) + + def SnpCaller(bams, outputFile): + return config.gatkCmd('SingleSampleGenotyper') + " -o " + outputFile + " ".join(map( lambda x: " -I " + x, bams )) + def VarFiltration(bams, outputHead, snpcalls, badsnps, indelcalls, depth, mq): + return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSnp," + badsnps + ",indels,SimpleIndel," + indelcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq + " ".join(map( lambda x: " -I " + x, bams )) + +# +# HOW DO I MAKE THESE JOBS DEPEND ON THE MERGE IDS OF THE INDIVIDUAL SAMPLES??? +# (Or until everything else is done?) 
+# + solid454SnpsFile = outputFile(snp_output, "454-SOLID", "calls") + cmd = SnpCaller([finalBam("SOLID"),finalBam("454")], solid454SnpsFile) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, solid454SnpsFile, just_print_commands = OPTIONS.dry, waitID = allMergeIds) + + solid454VarFiltFile = os.path.join(filter_output, "%s.454-SOLID" % ( sample )) + cmd = VarFiltration([finalBam("SOLID"),finalBam("454")], solid454VarFiltFile, solid454SnpsFile, badSnps("SOLID"), indelsForFiltering("SOLID"), str(DoC), str(mappingQuality)) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allVarFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid) + + allSnpsFile = outputFile(snp_output, "allTechs", "calls") + cmd = SnpCaller([finalBam("SLX"),finalBam("SOLID"),finalBam("454")], solid454SnpsFile) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allSnpsFile, just_print_commands = OPTIONS.dry, waitID = allMergeIds) + allVarFiltFile = os.path.join(filter_output, "%s.allTechs" % ( sample )) + cmd = VarFiltration([finalBam("SLX"),finalBam("SOLID"),finalBam("454")], varFiltFile, allSnpsFile, badSnps("SLX"), indelsForFiltering("SLX"), str(DoC), str(mappingQuality)) + jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allVarFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid) +# +# HOW DO I COMBINE ALL OF THE MQs AND DOCs? +# - allSnpsFile = outputFile(snp_output, "allTechs", "calls") - cmd = SnpCaller([solexaBamFile,solidBamFile,filepath("454")], allSnpsFile, str(chr)) - jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allSnpsFile, just_print_commands = OPTIONS.dry, waitID = jobid)