Lots of changes: (I'll send email out in a sec)

1) Moved various disparate concordance / set splitting functionalities to a new parent tool which works like VariantFiltration (i.e. people can write various modules that fit inside and can be run through it).
2) Fixed up argument parsing in VariantFiltration to use key=value format so we don't accidentally mix up values (like I had been doing).
3) Have indel rod print samples


git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1540 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
ebanks 2009-09-07 01:12:09 +00:00
parent 1c3d67f0f3
commit 5dbba6711c
23 changed files with 614 additions and 520 deletions

View File

@ -68,10 +68,17 @@ public class SimpleIndelROD extends TabularROD implements Genotype, AllelicVaria
return true;
}
public String getSamplesString() {
    // Samples are carried in column "5" of 1KG-format records; anything else yields "".
    if ( is1KGFormat() ) {
        String samples = this.get("5");
        if ( samples != null )
            return samples;
    }
    return "";
}
public String toString() {
    // Tab-separated record: contig, start, indel length, I/D flag, first FWD allele,
    // and (when present) the samples string appended as a trailing column.
    // StringBuilder replaces StringBuffer (no synchronization needed here), and the
    // string concatenations inside append() are unrolled into chained appends so the
    // builder actually does the joining.
    StringBuilder sb = new StringBuilder();
    sb.append(getLocation().getContig()).append('\t').append(getLocation().getStart()).append('\t');
    sb.append(length()).append('\t').append(isInsertion() ? "I" : "D").append('\t').append(getFWDAlleles().get(0));
    String samples = getSamplesString();
    if ( samples.length() > 0 )
        sb.append('\t').append(samples);
    return sb.toString();
}

View File

@ -5,6 +5,8 @@ import org.broadinstitute.sting.utils.ReadBackedPileup;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pair;
import java.util.HashMap;
public class VECAlleleBalance extends RatioFilter {
private double lowThreshold = 0.25, highThreshold = 0.75;
@ -14,14 +16,13 @@ public class VECAlleleBalance extends RatioFilter {
super("Allele Balance Ratio", VECAlleleBalance.class, Tail.TwoTailed);
}
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
String[] argPieces = arguments.split(",");
lowThreshold = Double.valueOf(argPieces[0]);
highThreshold = Double.valueOf(argPieces[1]);
if ( argPieces.length > 2 )
minGenotypeConfidenceToTest = Double.valueOf(argPieces[2]);
}
public void initialize(HashMap<String,String> args) {
    // Any of "low", "high", "confidence" may be supplied; missing keys keep the defaults.
    String low = args.get("low");
    if ( low != null )
        lowThreshold = Double.valueOf(low);
    String high = args.get("high");
    if ( high != null )
        highThreshold = Double.valueOf(high);
    String confidence = args.get("confidence");
    if ( confidence != null )
        minGenotypeConfidenceToTest = Double.valueOf(confidence);
    // Push the (possibly updated) thresholds down into the RatioFilter machinery.
    setLowThreshold(lowThreshold);
    setHighThreshold(highThreshold);
}

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.VariantContext;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.util.HashMap;
public class VECClusteredSnps implements VariantExclusionCriterion {
@ -12,12 +13,11 @@ public class VECClusteredSnps implements VariantExclusionCriterion {
private boolean exclude;
private long distance;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
String[] argPieces = arguments.split(",");
window = Integer.valueOf(argPieces[0]);
snpThreshold = Integer.valueOf(argPieces[1]);
}
public void initialize(HashMap<String,String> args) {
if ( args.get("window") != null )
window = Integer.valueOf(args.get("window"));
if ( args.get("snps") != null )
snpThreshold = Integer.valueOf(args.get("snps"));
if ( window < 2 || snpThreshold < 2 )
throw new StingException("Window and threshold values need to be >= 2");

View File

@ -2,6 +2,8 @@ package org.broadinstitute.sting.gatk.walkers.filters;
import org.broadinstitute.sting.gatk.contexts.VariantContext;
import java.util.HashMap;
/**
* Created by IntelliJ IDEA.
@ -16,10 +18,9 @@ public class VECDepthOfCoverage implements VariantExclusionCriterion {
private boolean exclude = false;
private int depth;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
maximum = Integer.valueOf(arguments);
}
public void initialize(HashMap<String,String> args) {
    // Optional "max" argument overrides the default maximum depth.
    String max = args.get("max");
    if ( max != null )
        maximum = Integer.valueOf(max);
}
public void compute(VariantContextWindow contextWindow) {

View File

@ -8,6 +8,7 @@ import net.sf.samtools.SAMRecord;
import cern.jet.math.Arithmetic;
import java.util.List;
import java.util.HashMap;
public class VECFisherStrand implements VariantExclusionCriterion {
@ -15,10 +16,9 @@ public class VECFisherStrand implements VariantExclusionCriterion {
private double pValue;
private boolean exclude;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
pvalueLimit = Double.valueOf(arguments);
}
public void initialize(HashMap<String,String> args) {
    // Optional "pvalue" argument overrides the default p-value limit.
    String pvalue = args.get("pvalue");
    if ( pvalue != null )
        pvalueLimit = Double.valueOf(pvalue);
}
public void compute(VariantContextWindow contextWindow) {

View File

@ -7,6 +7,7 @@ import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import net.sf.picard.reference.ReferenceSequence;
@ -28,12 +29,11 @@ public class VECHomopolymer implements VariantExclusionCriterion {
private boolean exclude = false;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
String[] argPieces = arguments.split(",");
extent = Integer.valueOf(argPieces[0]);
frac = Float.valueOf(argPieces[1]);
}
public void initialize(HashMap<String,String> args) {
// Optional "extent"/"fraction" arguments override the defaults.
if ( args.get("extent") != null )
extent = Integer.valueOf(args.get("extent"));
if ( args.get("fraction") != null )
// Bug fix: frac is a fractional value (the pre-refactor code parsed it with
// Float.valueOf); Integer.valueOf throws NumberFormatException on inputs like "0.8".
frac = Float.valueOf(args.get("fraction"));
// NOTE(review): hard-coded reference path — consider wiring this to the engine's reference argument.
File refFile = new File ("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta");

View File

@ -3,15 +3,14 @@ package org.broadinstitute.sting.gatk.walkers.filters;
import org.broadinstitute.sting.gatk.contexts.VariantContext;
import org.broadinstitute.sting.gatk.refdata.*;
import java.util.HashMap;
public class VECIndelArtifact implements VariantExclusionCriterion {
private boolean exclude;
private String source = "N/A";
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
}
}
// No-op: this criterion takes no configuration arguments.
public void initialize(HashMap<String,String> arguments) {}
public void compute(VariantContextWindow contextWindow) {
VariantContext context = contextWindow.getContext();

View File

@ -2,15 +2,16 @@ package org.broadinstitute.sting.gatk.walkers.filters;
import org.broadinstitute.sting.gatk.contexts.VariantContext;
import java.util.HashMap;
public class VECLodThreshold implements VariantExclusionCriterion {
private double lodThreshold = 5.0;
private double lod;
private boolean exclude;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
lodThreshold = Double.valueOf(arguments);
}
public void initialize(HashMap<String,String> args) {
    // Optional "lod" argument overrides the default LOD threshold.
    String lod = args.get("lod");
    if ( lod != null )
        lodThreshold = Double.valueOf(lod);
}
public void compute(VariantContextWindow contextWindow) {

View File

@ -5,6 +5,7 @@ import org.broadinstitute.sting.utils.MathUtils;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.HashMap;
public class VECMappingQuality implements VariantExclusionCriterion {
@ -12,10 +13,9 @@ public class VECMappingQuality implements VariantExclusionCriterion {
private double rms;
private boolean exclude;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
minQuality = Double.valueOf(arguments);
}
public void initialize(HashMap<String,String> args) {
    // Optional "min" argument overrides the default minimum mapping quality.
    String min = args.get("min");
    if ( min != null )
        minQuality = Double.valueOf(min);
}
public void compute(VariantContextWindow contextWindow) {

View File

@ -4,6 +4,7 @@ import org.broadinstitute.sting.gatk.contexts.VariantContext;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.HashMap;
public class VECMappingQualityZero implements VariantExclusionCriterion {
@ -11,10 +12,9 @@ public class VECMappingQualityZero implements VariantExclusionCriterion {
private int mq0Count;
private boolean exclude;
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
maximum = Integer.valueOf(arguments);
}
public void initialize(HashMap<String,String> args) {
    // Optional "max" argument overrides the default MQ0 maximum.
    String max = args.get("max");
    if ( max != null )
        maximum = Integer.valueOf(max);
}
public void compute(VariantContextWindow contextWindow) {

View File

@ -1,11 +1,11 @@
package org.broadinstitute.sting.gatk.walkers.filters;
public class VECNull implements VariantExclusionCriterion {
public void initialize(String arguments) {
}
import java.util.HashMap;
public void compute(VariantContextWindow contextWindow) {
}
public class VECNull implements VariantExclusionCriterion {
// No-op: the null criterion takes no configuration arguments.
public void initialize(HashMap<String,String> arguments) {}
// No-op: nothing is computed per-site for this criterion.
public void compute(VariantContextWindow contextWindow) {}
public double inclusionProbability() {
// A hack for now until this filter is actually converted to an empirical filter

View File

@ -3,6 +3,8 @@ package org.broadinstitute.sting.gatk.walkers.filters;
import org.broadinstitute.sting.gatk.refdata.RodGeliText;
import org.broadinstitute.sting.utils.*;
import java.util.HashMap;
public class VECOnOffGenotypeRatio extends RatioFilter {
private double threshold = 0.8;
private double ratio;
@ -11,13 +13,11 @@ public class VECOnOffGenotypeRatio extends RatioFilter {
super("On/Off Genotype Ratio", VECOnOffGenotypeRatio.class, Tail.LeftTailed);
}
public void initialize(String arguments) {
if (arguments != null && !arguments.isEmpty()) {
String[] argPieces = arguments.split(",");
threshold = Double.valueOf(argPieces[0]);
if ( argPieces.length > 1 )
minGenotypeConfidenceToTest = Double.valueOf(argPieces[1]);
}
public void initialize(HashMap<String,String> args) {
    // Optional "threshold"/"confidence" arguments override the defaults.
    String thresholdArg = args.get("threshold");
    if ( thresholdArg != null )
        threshold = Double.valueOf(thresholdArg);
    String confidenceArg = args.get("confidence");
    if ( confidenceArg != null )
        minGenotypeConfidenceToTest = Double.valueOf(confidenceArg);
    // Hand the (possibly updated) threshold down to the RatioFilter machinery.
    setLowThreshold(threshold);
}

View File

@ -1,8 +1,9 @@
package org.broadinstitute.sting.gatk.walkers.filters;
public interface VariantExclusionCriterion {
public void initialize(String arguments);
import java.util.HashMap;
public interface VariantExclusionCriterion {
public void initialize(HashMap<String,String> arguments);
public void compute(VariantContextWindow contextWindow);
//public boolean isExcludable();

View File

@ -21,7 +21,7 @@ import java.util.*;
public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="variants_out_head", shortName="VOH", doc="File to which modified variants should be written") public String VARIANTS_OUT_HEAD;
@Argument(fullName="features", shortName="F", doc="Feature test (optionally with arguments) to apply to genotype posteriors. Syntax: 'testname[:arguments]'", required=false) public String[] FEATURES;
@Argument(fullName="exclusion_criterion", shortName="X", doc="Exclusion test (optionally with arguments) to apply to variant call. Syntax: 'testname[:arguments]'", required=false) public String[] EXCLUSIONS;
@Argument(fullName="exclusion_criterion", shortName="X", doc="Exclusion test (optionally with arguments) to apply to variant call. Syntax: 'testname[:key1=arg1,key2=arg2,...]'", required=false) public String[] EXCLUSIONS;
@Argument(fullName="inclusion_threshold", shortName="IT", doc="The product of the probability to include variants based on these filters must be greater than the value specified here in order to be included", required=false) public Double INCLUSION_THRESHOLD = 0.9;
@Argument(fullName="verbose", shortName="V", doc="Show how the variant likelihoods are changing with the application of each feature") public Boolean VERBOSE = false;
@Argument(fullName="list", shortName="ls", doc="List the available features and exclusion criteria and exit") public Boolean LIST = false;
@ -98,15 +98,23 @@ public class VariantFiltrationWalker extends LocusWalker<Integer, Integer> {
for (String requestedExclusionString : EXCLUSIONS) {
String[] requestedExclusionPieces = requestedExclusionString.split(":");
String requestedExclusionName = requestedExclusionPieces[0];
String requestedExclusionArgs = (requestedExclusionPieces.length == 2) ? requestedExclusionPieces[1] : "";
for ( Class exclusionClass : exclusionClasses ) {
String exclusionClassName = rationalizeClassName(exclusionClass);
if (requestedExclusionName.equalsIgnoreCase(exclusionClassName)) {
try {
HashMap<String,String> requestedArgs = new HashMap<String,String>();
if ( requestedExclusionPieces.length == 2 ) {
String[] argStrings = requestedExclusionPieces[1].split(",");
for (int i = 0; i < argStrings.length; i++ ) {
String[] arg = argStrings[i].split("=");
if ( arg.length == 2 )
requestedArgs.put(arg[0], arg[1]);
}
}
VariantExclusionCriterion vec = (VariantExclusionCriterion) exclusionClass.newInstance();
vec.initialize(requestedExclusionArgs);
vec.initialize(requestedArgs);
requestedExclusions.add(vec);
paramsWriter.print(vec.getStudyHeader() + "\t");

View File

@ -1,182 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
/**
* Split up two call sets into their various concordance sets
*/
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="callset1",type=AllelicVariant.class),@RMD(name="callset2",type=AllelicVariant.class)})
public class CallSetsConcordanceWalker extends RefWalker<Integer, Integer> {
// Output destinations. For each category an explicitly named file wins; otherwise,
// when PREFIX is set, a default "<PREFIX>.<category>.calls" file is used (see initialize()).
@Argument(fullName="outputPrefix", shortName="prefix", doc="File prefix for output; turns on ALL output options below", required=false)
private String PREFIX = null;
@Argument(fullName="bothConfidentSameVar", shortName="confSameVar", doc="File to which calls that are confidently called var (identically) in both sets should be written", required=false)
private String CONF_SAME_VAR = null;
@Argument(fullName="oneConfidentVar", shortName="oneConfVar", doc="File to which calls that are confidently called var in only one of the sets should be written", required=false)
private String CONF_ONE_VAR = null;
@Argument(fullName="bothConfidentSameAllele", shortName="confSameAllele", doc="File to which calls that are confidently called var, with different genotypes, but with the same alt allele should be written", required=false)
private String CONF_SAME_ALLELE = null;
@Argument(fullName="bothConfidentDifferentAllele", shortName="confDiffAllele", doc="File to which calls that are confidently called var, with different genotypes, and with different alt alleles should be written", required=false)
private String CONF_DIFF_ALLELE = null;
@Argument(fullName="confidentRefVsConfidentVar", shortName="confRefVar", doc="File to which calls that are confidently called var in one set and ref in the other should be written", required=false)
private String CONF_REF_VAR = null;
@Argument(fullName="confidentVarWhenCombined", shortName="confComboVar", doc="File to which calls that are not confidently called var in each set separately but are when combined should be written", required=false)
private String CONF_COMBO_VAR = null;
@Argument(fullName="noCoverageVar", shortName="coverageVar", doc="File to which calls that are confidently called var in one set and for which there is no coverage in the other set should be written", required=false)
private String NO_COVERAGE_VAR = null;
@Argument(fullName="LOD_threshold", shortName="LOD", doc="LOD score for determining confidence", required=false)
private Double LOD = 5.0;
// One writer per concordance category; a writer stays null when its category's
// output is disabled, and printVariant() silently skips null writers.
private PrintWriter sameVarWriter = null;
private PrintWriter oneVarWriter = null;
private PrintWriter sameAlleleWriter = null;
private PrintWriter diffAlleleWriter = null;
private PrintWriter refVsVarWriter = null;
private PrintWriter comboVarWriter = null;
private PrintWriter coverageVarWriter = null;
/**
* Open the output writers. For each category an explicitly named file wins;
* otherwise PREFIX (when given) supplies a default name; with neither, the
* writer remains null and that category is not emitted.
*
* @throws StingException (unchecked) if any output file cannot be opened
*/
public void initialize() {
try {
if ( CONF_SAME_VAR != null )
sameVarWriter = new PrintWriter(CONF_SAME_VAR);
else if ( PREFIX != null )
sameVarWriter = new PrintWriter(PREFIX + ".sameConfidentVariant.calls");
if ( CONF_ONE_VAR != null )
oneVarWriter = new PrintWriter(CONF_ONE_VAR);
else if ( PREFIX != null )
oneVarWriter = new PrintWriter(PREFIX + ".bothVariantOnlyOneIsConfident.calls");
if ( CONF_SAME_ALLELE != null )
sameAlleleWriter = new PrintWriter(CONF_SAME_ALLELE);
else if ( PREFIX != null )
sameAlleleWriter = new PrintWriter(PREFIX + ".sameVariantAlleleDifferentGenotype.calls");
if ( CONF_DIFF_ALLELE != null )
diffAlleleWriter = new PrintWriter(CONF_DIFF_ALLELE);
else if ( PREFIX != null )
diffAlleleWriter = new PrintWriter(PREFIX + ".differentVariantAllele.calls");
if ( CONF_REF_VAR != null )
refVsVarWriter = new PrintWriter(CONF_REF_VAR);
else if ( PREFIX != null )
refVsVarWriter = new PrintWriter(PREFIX + ".oneRefOneVariant.calls");
if ( CONF_COMBO_VAR != null )
comboVarWriter = new PrintWriter(CONF_COMBO_VAR);
else if ( PREFIX != null )
comboVarWriter = new PrintWriter(PREFIX + ".confidentVariantWhenCombined.calls");
if ( NO_COVERAGE_VAR != null )
coverageVarWriter = new PrintWriter(NO_COVERAGE_VAR);
else if ( PREFIX != null )
coverageVarWriter = new PrintWriter(PREFIX + ".oneConfidentVariantOneNoCoverage.calls");
} catch (FileNotFoundException e) {
throw new StingException(String.format("Could not open file(s) for writing"));
}
}
/**
* Initialize the number of loci processed to zero.
*
* @return 0
*/
public Integer reduceInit() { return 0; }
/**
* Classify the calls at this locus into one of the concordance categories and
* write the representative call to that category's file.
*
* @return 1 (each locus counts once toward the traversal total)
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
AllelicVariant call1 = (AllelicVariant)tracker.lookup("callset1", null);
AllelicVariant call2 = (AllelicVariant)tracker.lookup("callset2", null);
// A record missing from one set is treated as "no coverage" there: a confident
// SNP in the other set goes to the no-coverage category.
if ( call1 == null || call2 == null ) {
if ( call1 != null && call1.isSNP() && call1.getVariationConfidence() >= LOD )
printVariant(coverageVarWriter, call1);
else if ( call2 != null && call2.isSNP() && call2.getVariationConfidence() >= LOD )
printVariant(coverageVarWriter, call2);
return 1;
}
double bestVsRef1 = call1.getVariationConfidence();
double bestVsRef2 = call2.getVariationConfidence();
//double bestVsNext1 = call1.getConsensusConfidence();
//double bestVsNext2 = call2.getConsensusConfidence();
String genotype1 = call1.getGenotype().get(0);
String genotype2 = call2.getGenotype().get(0);
// are they both variants?
if ( call1.isSNP() && call2.isSNP() ) {
// are they both confident calls?
if ( bestVsRef1 >= LOD && bestVsRef2 >= LOD ) {
if ( genotype1.equals(genotype2) )
printVariant(sameVarWriter, call1);
else if ( sameVariantAllele(genotype1, genotype2, ref.getBase()) )
printVariant(sameAlleleWriter, call1);
else
printVariant(diffAlleleWriter, call1);
// Neither call is confident alone, but the combined evidence crosses the LOD bar.
} else if ( bestVsRef1 < LOD && bestVsRef2 < LOD && bestVsRef1 + bestVsRef2 >= LOD ) {
printVariant(comboVarWriter, call1);
} else if ( (bestVsRef1 < LOD && bestVsRef2 >= LOD) || (bestVsRef1 >= LOD && bestVsRef2 < LOD) ) {
printVariant(oneVarWriter, call1);
}
} else if ( (call1.isSNP() && call2.isReference() && bestVsRef1 >= LOD) ||
(call2.isSNP() && call1.isReference() && bestVsRef2 >= LOD) ) {
// NOTE(review): call1 is written even when call2 is the variant one — confirm intended.
printVariant(refVsVarWriter, call1);
}
return 1;
}
/**
* True when the two diploid genotype strings carry the same non-reference allele.
* Genotypes shorter than two characters fall back to plain string equality.
*/
private boolean sameVariantAllele(String genotype1, String genotype2, char ref) {
if ( genotype1.length() < 2 || genotype2.length() < 2 )
return genotype1.equals(genotype2);
char altAllele1 = genotype1.charAt(0) != ref ? genotype1.charAt(0) : genotype1.charAt(1);
char altAllele2 = genotype2.charAt(0) != ref ? genotype2.charAt(0) : genotype2.charAt(1);
return altAllele1 == altAllele2;
}
// Null-safe print: a no-op when the category is disabled (null writer) or there is no call.
static void printVariant(PrintWriter writer, AllelicVariant variant) {
if ( writer != null && variant != null )
writer.println(variant.toString());
}
/**
* Increment the number of loci processed.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return the new number of loci processed.
*/
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
/**
* Tell the user the number of loci processed and close out the new variants file.
*
* @param result the number of loci seen.
*/
public void onTraversalDone(Integer result) {
out.printf("Processed %d loci.\n", result);
if ( sameVarWriter != null )
sameVarWriter.close();
if ( oneVarWriter != null )
oneVarWriter.close();
if ( sameAlleleWriter != null )
sameAlleleWriter.close();
if ( diffAlleleWriter != null )
diffAlleleWriter.close();
if ( refVsVarWriter != null )
refVsVarWriter.close();
if ( comboVarWriter != null )
comboVarWriter.close();
if ( coverageVarWriter != null )
coverageVarWriter.close();
}
}

View File

@ -1,120 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
/**
* Split up two call sets into their various "Venn diagram" sets
*/
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="callset1",type=AllelicVariant.class),@RMD(name="callset2",type=AllelicVariant.class)})
public class VennCallSetsWalker extends RefWalker<Integer, Integer> {
// Output destinations. An explicitly named file wins; otherwise, when PREFIX is set,
// a default "<PREFIX>.<category>.calls" name is used (see initialize()).
@Argument(fullName="outputPrefix", shortName="prefix", doc="File prefix for output; turns on ALL output options below", required=false)
private String PREFIX = null;
@Argument(fullName="union_out", shortName="union", doc="File to which union of sets should be written", required=false)
String UNION_OUT = null;
@Argument(fullName="intersect_out", shortName="intersect", doc="File to which intersection of sets should be written", required=false)
String INTERSECTION_OUT = null;
@Argument(fullName="set1_unique_out", shortName="set1", doc="File to which calls unique to set1 should be written", required=false)
String SET1_OUT = null;
@Argument(fullName="set2_unique_out", shortName="set2", doc="File to which calls unique to set2 should be written", required=false)
String SET2_OUT = null;
// One writer per Venn region; null when that region's output is disabled.
private PrintWriter union_writer = null;
private PrintWriter intersect_writer = null;
private PrintWriter set1_writer = null;
private PrintWriter set2_writer = null;
/**
* Open the output writers: explicit file names take precedence, PREFIX-derived
* defaults are used otherwise; with neither, a writer stays null and its region
* is not emitted.
*
* @throws StingException (unchecked) if any output file cannot be opened
*/
public void initialize() {
try {
if ( UNION_OUT != null )
union_writer = new PrintWriter(UNION_OUT);
else if ( PREFIX != null )
union_writer = new PrintWriter(PREFIX + ".union.calls");
if ( INTERSECTION_OUT != null )
intersect_writer = new PrintWriter(INTERSECTION_OUT);
else if ( PREFIX != null )
intersect_writer = new PrintWriter(PREFIX + ".intersect.calls");
if ( SET1_OUT != null )
set1_writer = new PrintWriter(SET1_OUT);
else if ( PREFIX != null )
set1_writer = new PrintWriter(PREFIX + ".set1Only.calls");
if ( SET2_OUT != null )
set2_writer = new PrintWriter(SET2_OUT);
else if ( PREFIX != null )
set2_writer = new PrintWriter(PREFIX + ".set2Only.calls");
} catch (FileNotFoundException e) {
throw new StingException(String.format("Could not open file(s) for writing"));
}
}
/**
* Initialize the number of loci processed to zero.
*
* @return 0
*/
public Integer reduceInit() { return 0; }
/**
* Route the calls at this locus into union / intersection / set1-only / set2-only
* files. printVariant() ignores null calls, so passing a null second argument is
* how a region is skipped at a locus.
*
* @return 1 (each locus counts once toward the traversal total)
*/
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
AllelicVariant call1 = (AllelicVariant)tracker.lookup("callset1", null);
AllelicVariant call2 = (AllelicVariant)tracker.lookup("callset2", null);
// Ignore places where we don't have a variant or where the reference base is ambiguous.
if ( union_writer != null )
printVariant(union_writer, call1 != null ? call1 : call2);
if ( intersect_writer != null )
printVariant(intersect_writer, call1 != null && call2 != null ? call1 : null);
if ( set1_writer != null )
printVariant(set1_writer, call1 != null && call2 == null ? call1 : null);
if ( set2_writer != null )
printVariant(set2_writer, call1 == null && call2 != null ? call2 : null);
return 1;
}
// Print a call; a no-op for null (callers pass null to skip a region at this locus).
// Callers null-check the writer before calling, so no writer check is done here.
static void printVariant(PrintWriter writer, AllelicVariant variant) {
if ( variant != null )
writer.println(variant.toString());
}
/**
* Increment the number of loci processed.
*
* @param value result of the map.
* @param sum accumulator for the reduce.
* @return the new number of loci processed.
*/
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
/**
* Tell the user the number of loci processed and close out the new variants file.
*
* @param result the number of loci seen.
*/
public void onTraversalDone(Integer result) {
out.printf("Processed %d loci.\n", result);
if ( union_writer != null )
union_writer.close();
if ( intersect_writer != null )
intersect_writer.close();
if ( set1_writer != null )
set1_writer.close();
if ( set2_writer != null )
set2_writer.close();
}
}

View File

@ -0,0 +1,105 @@
package org.broadinstitute.sting.playground.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.*;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import java.util.*;
/**
* CallsetConcordanceWalker finds the concordance between multiple callsets (different tests are available).
*/
@Requires(value={DataSource.REFERENCE},
referenceMetaData={@RMD(name="callset1",type=AllelicVariant.class),
@RMD(name="callset2",type=AllelicVariant.class)})
@Reference(window=@Window(start=-20,stop=20))
public class CallsetConcordanceWalker extends RefWalker<Integer, Integer> {
@Argument(fullName="concordance_output_path", shortName="O", doc="File path to which split sets should be written", required=false)
private String OUTPUT_PATH = null;
@Argument(fullName="concordanceType", shortName="CT", doc="Concordance subset types to apply to given callsets. Syntax: 'type[:key1=arg1,key2=arg2,...]'", required=false)
private String[] TYPES = null;
@Argument(fullName="list", shortName="ls", doc="List the available concordance types and exit", required=false)
private Boolean LIST_ONLY = false;
// The concordance modules requested on the command line, initialized and ready to run.
private ArrayList<ConcordanceType> requestedTypes;
// Registry of all available concordance modules, keyed by the name users pass to -CT.
// Each module gets the output path prefix so it can open its own files.
private HashMap<String, ConcordanceType> initializeConcordanceTypes(String prefix) {
HashMap<String, ConcordanceType> types = new HashMap<String, ConcordanceType>();
//
// Add new concordance types here!
//
types.put("SNPGenotypeConcordance", new SNPGenotypeConcordance(prefix));
types.put("SimpleVenn", new SimpleVenn(prefix));
types.put("IndelSubsets", new IndelSubsets(prefix));
return types;
}
/**
* Parse the requested concordance types ('type[:key1=arg1,key2=arg2,...]'),
* look each up in the registry, pass along its key=value arguments, and
* collect the initialized modules. With --list, print the registry and exit.
*
* @throws StingException (unchecked) if no output path is given or a requested
*         type is not in the registry
*/
public void initialize() {
HashMap<String, ConcordanceType> concordanceTypes = initializeConcordanceTypes(OUTPUT_PATH);
if (LIST_ONLY) {
Iterator<String> types = concordanceTypes.keySet().iterator();
out.println("\nAvailable concordance types:");
while ( types.hasNext() )
out.println("\t" + types.next());
System.exit(0);
}
if ( OUTPUT_PATH == null )
throw new StingException("--concordance_output_path must be specified");
requestedTypes = new ArrayList<ConcordanceType>();
// initialize requested concordance types
if (TYPES != null) {
for ( String requestedTypeString : TYPES ) {
String[] requestedPieces = requestedTypeString.split(":");
String requestedType = requestedPieces[0];
ConcordanceType type = concordanceTypes.get(requestedType);
if ( type == null )
throw new StingException("The requested concordance type (" + requestedType + ") isn't a valid option");
// Parse the optional comma-separated key=value arguments after the ':'.
// Malformed pieces (no '=' or more than one) are silently dropped.
HashMap<String,String> requestedArgs = new HashMap<String,String>();
if ( requestedPieces.length == 2 ) {
String[] argStrings = requestedPieces[1].split(",");
for (int i = 0; i < argStrings.length; i++ ) {
String[] arg = argStrings[i].split("=");
if ( arg.length == 2 )
requestedArgs.put(arg[0], arg[1]);
}
}
type.initialize(requestedArgs);
requestedTypes.add(type);
}
}
}
// Start the locus count at zero.
public Integer reduceInit() { return 0; }
// Let every requested module examine this locus; each locus counts once.
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
for ( ConcordanceType type : requestedTypes )
type.computeConcordance(tracker, ref);
return 1;
}
// Accumulate the per-locus counts.
public Integer reduce(Integer value, Integer sum) {
return sum + value;
}
// Give each module a chance to flush/close its outputs, then report the total.
public void onTraversalDone(Integer result) {
for ( ConcordanceType type : requestedTypes )
type.cleanup();
out.printf("Processed %d loci.\n", result);
}
}

View File

@ -0,0 +1,14 @@
package org.broadinstitute.sting.playground.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import java.util.HashMap;
/**
 * A pluggable concordance module run by CallsetConcordanceWalker: it is initialized
 * with the user's key=value arguments, invoked once per locus, and cleaned up at the
 * end of the traversal.
 */
public interface ConcordanceType {
/** Configure the module from the user's key=value arguments (may be empty). */
public void initialize(HashMap<String,String> args);
/** Examine the rods/reference at the current locus and record concordance output. */
public void computeConcordance(RefMetaDataTracker tracker, ReferenceContext ref);
/** Release resources (e.g. close output writers) after the traversal finishes. */
public void cleanup();
}

View File

@ -0,0 +1,109 @@
package org.broadinstitute.sting.playground.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.StingException;
import java.io.PrintWriter;
import java.io.FileNotFoundException;
import java.util.HashMap;
/**
* filter an indel callset based on given criteria
*/
public class IndelSubsets implements ConcordanceType {
public final static int HOMOPOLYMER_MAX = 20;
// Hint: to disable either of these features, set the value to zero (via command-line arguments);
// then empty files will be output for the "below threshold" set, effectively disabling it
private int homopolymerCutoff = 1;
private int sizeCutoff = 2;
private PrintWriter[][][][] writers = new PrintWriter[2][2][2][2];
private String prefix;
public IndelSubsets(String prefix) {
this.prefix = prefix;
}
public void initialize(HashMap<String,String> args) {
if ( args.get("sizeCutoff") != null )
sizeCutoff = Integer.valueOf(args.get("sizeCutoff"));
if ( args.get("homopolymerCutoff") != null )
homopolymerCutoff = Integer.valueOf(args.get("homopolymerCutoff"));
try {
for (int i = 0; i < 2; i++) {
String name1 = i == 0 ? "set1" : "noSet1";
for (int j = 0; j < 2; j++) {
String name2 = j == 0 ? "set2" : "noSet2";
for (int k = 0; k < 2; k++) {
String name3 = "size" + (k == 0 ? Integer.toString(sizeCutoff)+"AndUnder" : "GreaterThan"+Integer.toString(sizeCutoff));
for (int m = 0; m < 2; m++) {
String name4 = "homopolymer" + (m == 0 ? Integer.toString(homopolymerCutoff)+"AndUnder" : "GreaterThan"+Integer.toString(homopolymerCutoff));
writers[i][j][k][m] = new PrintWriter(prefix + "." + name1 + "." + name2 + "." + name3 + "." + name4 + ".calls");
}
}
}
}
} catch (FileNotFoundException e) {
throw new StingException(String.format("Could not open file(s) for writing"));
}
}
public void computeConcordance(RefMetaDataTracker tracker, ReferenceContext ref) {
AllelicVariant indel1 = (AllelicVariant)tracker.lookup("callset1", null);
AllelicVariant indel2 = (AllelicVariant)tracker.lookup("callset2", null);
int set1 = ( indel1 != null && indel1.isIndel() ? 0 : 1 );
int set2 = ( indel2 != null && indel2.isIndel() ? 0 : 1 );
// skip this locus if they're both not valid indels
if ( set1 == 1 && set2 == 1 )
return;
// only deal with a valid indel
AllelicVariant indel = ( indel1 != null ? indel1 : indel2 );
int size = ( indel.length() <= sizeCutoff ? 0 : 1 );
int homopol = ( homopolymerRunSize(ref, indel) <= homopolymerCutoff ? 0 : 1 );
writers[set1][set2][size][homopol].println(indel.toString());
}
/**
 * Returns the length of the longer of the two homopolymer runs flanking the
 * indel: the run of identical reference bases extending left from the event
 * locus and the run extending right of the event.
 *
 * NOTE(review): assumes the reference window is wide enough to contain the
 * full run (the walker requests +/- 20 bases elsewhere) -- runs reaching the
 * window edge are silently truncated. Also assumes getAltBasesFWD() is
 * non-empty for insertions -- TODO confirm against the indel ROD contract.
 */
private int homopolymerRunSize(ReferenceContext ref, AllelicVariant indel) {
    char[] bases = ref.getBases();
    GenomeLoc window = ref.getWindow();
    GenomeLoc locus = ref.getLocus();
    // offset of the event locus within the reference window
    int refBasePos = (int)(locus.getStart() - window.getStart());
    // base to match scanning left: first deleted reference base for deletions,
    // first inserted base for insertions
    char indelBase = indel.isDeletion() ? bases[refBasePos+1] : indel.getAltBasesFWD().charAt(0);
    int leftRun = 0;
    for ( int i = refBasePos; i >= 0; i--) {
        if ( bases[i] != indelBase )
            break;
        leftRun++;
    }
    // base to match scanning right: last deleted reference base (clamped to the
    // window) for deletions, last inserted base for insertions
    indelBase = indel.isDeletion() ? bases[Math.min(refBasePos+indel.length(),bases.length-1)] : indel.getAltBasesFWD().charAt(indel.getAltBasesFWD().length()-1);
    int rightRun = 0;
    // start just past the event: after the deleted span for deletions,
    // immediately after the locus for insertions
    for ( int i = refBasePos + (indel.isDeletion() ? 1+indel.length() : 1); i < bases.length; i++) {
        if ( bases[i] != indelBase )
            break;
        rightRun++;
    }
    //System.out.println(String.valueOf(bases) + ": " + leftRun + " / " + rightRun);
    return Math.max(leftRun, rightRun);
}
/** Closes all 16 subset writers, flushing any buffered output. */
public void cleanup() {
    for ( PrintWriter[][][] bySet2 : writers )
        for ( PrintWriter[][] bySize : bySet2 )
            for ( PrintWriter[] byHomopolymer : bySize )
                for ( PrintWriter writer : byHomopolymer )
                    writer.close();
}
}

View File

@ -0,0 +1,131 @@
package org.broadinstitute.sting.playground.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.utils.StingException;
import java.util.HashMap;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
/**
* Split up two call sets into their various concordance sets
*/
/**
 * Splits two SNP call sets (tracks "callset1" and "callset2") into their
 * various concordance categories, writing each category to its own
 * "&lt;prefix&gt;.snp_concordance.&lt;category&gt;.calls" file.
 */
public class SNPGenotypeConcordance implements ConcordanceType {

    // minimum variation confidence for a call to count as "confident";
    // overridable via the "lod" module argument
    private double LOD = 5.0;

    private PrintWriter sameVarWriter = null;
    private PrintWriter oneVarWriter = null;
    private PrintWriter sameAlleleWriter = null;
    private PrintWriter diffAlleleWriter = null;
    private PrintWriter refVsVar1Writer = null;
    private PrintWriter refVsVar2Writer = null;
    private PrintWriter comboVarWriter = null;
    private PrintWriter coverageVar1Writer = null;
    private PrintWriter coverageVar2Writer = null;

    // file-name prefix shared by all output files
    private String prefix;

    public SNPGenotypeConcordance(String prefix) {
        this.prefix = prefix;
    }

    /**
     * Opens the nine category output files and reads the optional "lod"
     * argument (confidence threshold).
     *
     * @throws StingException if any output file cannot be opened for writing
     */
    public void initialize(HashMap<String,String> args) {
        try {
            sameVarWriter = new PrintWriter(prefix + ".snp_concordance.sameConfidentVariant.calls");
            oneVarWriter = new PrintWriter(prefix + ".snp_concordance.bothVariantOnlyOneIsConfident.calls");
            sameAlleleWriter = new PrintWriter(prefix + ".snp_concordance.sameVariantAlleleDifferentGenotype.calls");
            diffAlleleWriter = new PrintWriter(prefix + ".snp_concordance.differentVariantAllele.calls");
            refVsVar1Writer = new PrintWriter(prefix + ".snp_concordance.set1VariantSet2Ref.calls");
            refVsVar2Writer = new PrintWriter(prefix + ".snp_concordance.set1RefSet2Variant.calls");
            comboVarWriter = new PrintWriter(prefix + ".snp_concordance.confidentVariantWhenCombined.calls");
            coverageVar1Writer = new PrintWriter(prefix + ".snp_concordance.set1ConfidentVariantSet2NoCoverage.calls");
            coverageVar2Writer = new PrintWriter(prefix + ".snp_concordance.set1NoCoverageSet2ConfidentVariant.calls");
        } catch (FileNotFoundException e) {
            // String.format() with no arguments was a no-op; plain literal suffices
            throw new StingException("Could not open file(s) for writing");
        }

        if ( args.get("lod") != null )
            LOD = Double.valueOf(args.get("lod"));
    }

    /**
     * Classifies the calls at the current locus and writes the representative
     * call to the matching category file.
     */
    public void computeConcordance(RefMetaDataTracker tracker, ReferenceContext ref) {
        AllelicVariant call1 = (AllelicVariant)tracker.lookup("callset1", null);
        AllelicVariant call2 = (AllelicVariant)tracker.lookup("callset2", null);

        // the only reason they would be null is a lack of coverage
        if ( call1 == null || call2 == null ) {
            if ( call1 != null && call1.isSNP() && call1.getVariationConfidence() >= LOD )
                printVariant(coverageVar1Writer, call1);
            else if ( call2 != null && call2.isSNP() && call2.getVariationConfidence() >= LOD )
                printVariant(coverageVar2Writer, call2);
            return;
        }

        double bestVsRef1 = call1.getVariationConfidence();
        double bestVsRef2 = call2.getVariationConfidence();
        //double bestVsNext1 = call1.getConsensusConfidence();
        //double bestVsNext2 = call2.getConsensusConfidence();
        String genotype1 = call1.getGenotype().get(0);
        String genotype2 = call2.getGenotype().get(0);

        // are they both variant SNPs?
        if ( call1.isSNP() && call2.isSNP() ) {
            // are they both confident calls?
            if ( bestVsRef1 >= LOD && bestVsRef2 >= LOD ) {
                // same genotype
                if ( genotype1.equals(genotype2) )
                    printVariant(sameVarWriter, call1);
                // same allele, different genotype
                else if ( sameVariantAllele(genotype1, genotype2, ref.getBase()) )
                    printVariant(sameAlleleWriter, call1);
                // different variant allele
                else
                    printVariant(diffAlleleWriter, call1);
            }
            // confident only when combined
            else if ( bestVsRef1 < LOD && bestVsRef2 < LOD && bestVsRef1 + bestVsRef2 >= LOD ) {
                printVariant(comboVarWriter, call1);
            }
            // only one is confident variant
            else if ( (bestVsRef1 < LOD && bestVsRef2 >= LOD) || (bestVsRef1 >= LOD && bestVsRef2 < LOD) ) {
                printVariant(oneVarWriter, call1);
            }
        }
        // one is variant and the other is ref
        else if ( call1.isSNP() && call2.isReference() && bestVsRef1 >= LOD )
            printVariant(refVsVar1Writer, call1);
        else if ( call2.isSNP() && call1.isReference() && bestVsRef2 >= LOD )
            // BUG FIX: emit the set2 variant itself; the previous code wrote
            // call1, which is the *reference* call at this locus, into the
            // set1RefSet2Variant file
            printVariant(refVsVar2Writer, call2);
    }

    /**
     * Returns true when the two (diploid, two-character) genotypes share the
     * same non-reference allele. Falls back to exact equality for genotype
     * strings shorter than two characters.
     */
    private boolean sameVariantAllele(String genotype1, String genotype2, char ref) {
        if ( genotype1.length() < 2 || genotype2.length() < 2 )
            return genotype1.equals(genotype2);

        char altAllele1 = genotype1.charAt(0) != ref ? genotype1.charAt(0) : genotype1.charAt(1);
        char altAllele2 = genotype2.charAt(0) != ref ? genotype2.charAt(0) : genotype2.charAt(1);
        return altAllele1 == altAllele2;
    }

    // single place that controls how a variant record is serialized
    private static void printVariant(PrintWriter writer, AllelicVariant variant) {
        writer.println(variant.toString());
    }

    /** Closes all nine category writers, flushing any buffered output. */
    public void cleanup() {
        sameVarWriter.close();
        oneVarWriter.close();
        sameAlleleWriter.close();
        diffAlleleWriter.close();
        refVsVar1Writer.close();
        refVsVar2Writer.close();
        comboVarWriter.close();
        coverageVar1Writer.close();
        coverageVar2Writer.close();
    }
}

View File

@ -0,0 +1,77 @@
package org.broadinstitute.sting.playground.gatk.walkers.concordance;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.utils.StingException;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.HashMap;
/**
* Split up two call sets into their various "Venn diagram" sets
*/
/**
 * Partitions two call sets (tracks "callset1" and "callset2") into the cells
 * of a simple Venn diagram -- union, concordant intersection, discordant
 * intersection, set1-only, and set2-only -- writing each cell to its own
 * "&lt;prefix&gt;.venn.&lt;cell&gt;.calls" file.
 */
public class SimpleVenn implements ConcordanceType {

    private PrintWriter unionWriter = null;
    private PrintWriter intersectWriter = null;
    private PrintWriter discordWriter = null;
    private PrintWriter set1Writer = null;
    private PrintWriter set2Writer = null;

    // file-name prefix shared by all output files
    private String prefix;

    public SimpleVenn(String outputPrefix) {
        prefix = outputPrefix;
    }

    /**
     * Opens the five Venn-cell output files.
     *
     * @throws StingException if any output file cannot be opened for writing
     */
    public void initialize(HashMap<String,String> args) {
        try {
            unionWriter = openWriter("union");
            intersectWriter = openWriter("intersection");
            discordWriter = openWriter("discordant");
            set1Writer = openWriter("set1Only");
            set2Writer = openWriter("set2Only");
        } catch (FileNotFoundException e) {
            throw new StingException("Could not open file(s) for writing");
        }
    }

    // all cell files share the "<prefix>.venn.<cell>.calls" naming scheme
    private PrintWriter openWriter(String cell) throws FileNotFoundException {
        return new PrintWriter(prefix + ".venn." + cell + ".calls");
    }

    /**
     * Assigns the calls at the current locus to their Venn cells: every locus
     * with at least one call joins the union; the remaining cells are mutually
     * exclusive.
     */
    public void computeConcordance(RefMetaDataTracker tracker, ReferenceContext ref) {
        AllelicVariant firstCall = (AllelicVariant)tracker.lookup("callset1", null);
        AllelicVariant secondCall = (AllelicVariant)tracker.lookup("callset2", null);

        // nothing to record when neither track has a call here
        if ( firstCall == null && secondCall == null )
            return;

        // every surviving locus belongs to the union
        printVariant(unionWriter, firstCall != null ? firstCall : secondCall);

        if ( secondCall == null ) {
            // present only in set 1
            printVariant(set1Writer, firstCall);
        } else if ( firstCall == null ) {
            // present only in set 2
            printVariant(set2Writer, secondCall);
        } else if ( firstCall.getAltBasesFWD().equals(secondCall.getAltBasesFWD()) ) {
            // both sets agree on the alternate allele
            printVariant(intersectWriter, firstCall);
        } else {
            // both sets called the site, but with different alleles
            printVariant(discordWriter, firstCall);
        }
    }

    // single place that controls how a variant record is serialized
    private static void printVariant(PrintWriter writer, AllelicVariant variant) {
        writer.println(variant.toString());
    }

    /** Closes all five cell writers, flushing any buffered output. */
    public void cleanup() {
        unionWriter.close();
        intersectWriter.close();
        discordWriter.close();
        set1Writer.close();
        set2Writer.close();
    }
}

View File

@ -1,91 +0,0 @@
package org.broadinstitute.sting.playground.gatk.walkers.indels;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.*;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.cmdLine.Argument;
/**
* filter an indel callset based on given criteria
*/
/**
 * Filters an indel call set ("calls" track) on size, flanking-homopolymer
 * length, and optional presence/absence in a second indel set ("otherindels"
 * track), printing the surviving calls to the walker's output stream.
 */
@Requires(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="calls",type=AllelicVariant.class)})
@Allows(value={DataSource.REFERENCE},referenceMetaData={@RMD(name="calls",type=AllelicVariant.class),@RMD(name="otherindels",type=AllelicVariant.class)})
@Reference(window=@Window(start=-20,stop=20))
public class IndelFilterWalker extends RefWalker<Integer, Integer> {
    @Argument(fullName="homopolymerRunMax", shortName="homopolMax", doc="filter indels within homopolymer runs greater than the given length (max 20)", required=false)
    Integer HOMOPOLYMER_MAX = 20;
    @Argument(fullName="homopolymerRunMin", shortName="homopolMin", doc="filter indels within homopolymer runs less than the given length", required=false)
    Integer HOMOPOLYMER_MIN = 0;
    @Argument(fullName="sizeMax", shortName="sizeMax", doc="filter indels greater than a certain size", required=false)
    Integer SIZE_MAX = 100;
    @Argument(fullName="sizeMin", shortName="sizeMin", doc="filter indels less than a certain size", required=false)
    Integer SIZE_MIN = 0;
    @Argument(fullName="inOtherSet", shortName="inSet", doc="filter indels that DO NOT occur in the provided 2nd set", required=false)
    Boolean IN_OTHER_SET = false;
    @Argument(fullName="notInOtherSet", shortName="notInSet", doc="filter indels that DO occur in the provided 2nd set", required=false)
    Boolean NOT_IN_OTHER_SET = false;

    public void initialize() {
        // the reference window is only +/- 20 bases, so longer runs cannot be measured
        if ( HOMOPOLYMER_MAX > 20 )
            HOMOPOLYMER_MAX = 20;
    }

    public Integer reduceInit() { return 0; }

    /**
     * Applies the size, homopolymer, and other-set filters to the indel at the
     * current locus.
     *
     * @return 1 if the indel passed all filters and was printed, 0 otherwise
     */
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        AllelicVariant indel = (AllelicVariant)tracker.lookup("calls", null);
        if ( indel == null || !indel.isIndel() )
            return 0;

        if ( indel.length() < SIZE_MIN || indel.length() > SIZE_MAX )
            return 0;

        int homopol = homopolymerRunSize(ref, indel);
        if ( homopol < HOMOPOLYMER_MIN || homopol > HOMOPOLYMER_MAX )
            return 0;

        // BUG FIX: the track is bound as "otherindels" in the @Allows annotation
        // above; the previous lookup used "otherIndels", so the in-set/not-in-set
        // filters could never see the second track
        AllelicVariant other = (AllelicVariant)tracker.lookup("otherindels", null);
        if ( (IN_OTHER_SET && other == null) || (NOT_IN_OTHER_SET && other != null) )
            return 0;

        out.println(indel);
        return 1;
    }

    // reduce is a simple running count of emitted indels
    public Integer reduce(Integer value, Integer sum) {
        return sum + value;
    }

    public void onTraversalDone(Integer result) {
        out.printf("output %d indels.\n", result);
    }

    /**
     * Returns the length of the longer of the two homopolymer runs flanking the
     * indel within the reference window (runs reaching the window edge are
     * silently truncated).
     */
    private int homopolymerRunSize(ReferenceContext ref, AllelicVariant indel) {
        char[] bases = ref.getBases();
        GenomeLoc window = ref.getWindow();
        GenomeLoc locus = ref.getLocus();
        // offset of the event locus within the reference window
        int refBasePos = (int)(locus.getStart() - window.getStart());
        // base to match scanning left: first deleted reference base for
        // deletions, first inserted base for insertions
        char indelBase = indel.isDeletion() ? bases[refBasePos+1] : indel.getAltBasesFWD().charAt(0);
        int leftRun = 0;
        for ( int i = refBasePos; i >= 0; i--) {
            if ( bases[i] != indelBase )
                break;
            leftRun++;
        }
        // base to match scanning right: last deleted reference base (clamped to
        // the window) for deletions, last inserted base for insertions
        indelBase = indel.isDeletion() ? bases[Math.min(refBasePos+indel.length(),bases.length-1)] : indel.getAltBasesFWD().charAt(indel.getAltBasesFWD().length()-1);
        int rightRun = 0;
        for ( int i = refBasePos + (indel.isDeletion() ? 1+indel.length() : 1); i < bases.length; i++) {
            if ( bases[i] != indelBase )
                break;
            rightRun++;
        }
        //System.out.println(String.valueOf(bases) + ": " + leftRun + " / " + rightRun);
        return Math.max(leftRun, rightRun);
    }
}

View File

@ -47,33 +47,53 @@ if __name__ == "__main__":
snp_output = outputDir("calls/unfiltered_snps")
filter_output = outputDir("calls/filtered_snps")
indel_output = outputDir("calls/indels")
final_bam_dir = "/humgen/gsa-hphome1/projects/1kg_pilot2/useTheseBamsForAnalyses"
samples = ["NA12878","NA12891","NA12892","NA19238","NA19239","NA19240"]
DoCs = [76,88,66,56,68,86]
techs = ["SLX"]
chrs = range(1, 22) + ["X"]
DoCs = [82,91,70,56,68,86]
# Official genome-wide Depth of Coverage tables for pilot 2, freeze 5:
# NA12878 NA12891 NA12892 NA19238 NA19239 NA19240
# 454: 36 18
# SLX: 82 91 70 56 68 86
# SOLID: 37 64
# 454+SLD: 64 77
# ALL: xx xx
for sample, DoC in zip(samples, DoCs):
#
# Actually do some work here
#
myChrs = chrs
if sample in ["NA12891", "NA19239"]:
myChrs = chrs + ["Y"]
MQs = [100,5,5]
def finalBam(tech):
return os.path.join(final_bam_dir, "%s.%s.bam" % ( sample, tech ))
def outputFile(root, tech, name):
return os.path.join(root, "%s.%s.%s" % ( sample, tech, name ))
def badSnps( tech ):
return outputFile(cleaner_output, tech, "realigner.badsnps")
def indelsForFiltering( tech ):
return outputFile(indel_output, tech, "low.calls")
for chr in myChrs:
myTechs = techs
if sample in ["NA12878", "NA19240"]:
myTechs = techs + ["SOLID","454"]
def filepath(tech):
return "/broad/1KG/DCC/ftp/pilot_data/%s/alignment/%s.chrom%s.%s.SRP000032.2009_07.bam" % ( sample, sample, chr, tech )
for tech,mappingQuality in zip(myTechs,MQs):
myChrs = chrs
if sample in ["NA12891", "NA19239"]:
myChrs = chrs + ["Y"]
def badSnps( tech, chr ):
return os.path.join(cleaner_output, "%s.chr%s.%s.realigner.badsnps" % ( sample, chr, tech ))
for chr in myChrs:
bam = "/broad/1KG/DCC/ftp/pilot_data/%s/alignment/%s.chrom%s.%s.SRP000032.2009_07.bam" % ( sample, sample, chr, tech )
techs = ["SLX", "SOLID", "454"]
MQs = [100,5,5]
solexaBamFile = None
solidBamFile = None
for tech,mappingQuality in zip(techs,MQs):
bam = filepath(tech)
def outputFile(root, name):
return os.path.join(root, "%s.chr%s.%s.%s" % ( sample, chr, tech, name ))
def MismatchIntervals(bam, outputFile, intervals):
return config.gatkCmd('MismatchIntervals') + " -o " + outputFile + " -L " + intervals + " -I " + bam
def IndelIntervals(bam, outputFile, intervals):
@ -84,18 +104,10 @@ if __name__ == "__main__":
return config.gatkCmd('IntervalCleaner') + " -O " + outputFile + " -L " + intervals + " -I " + bam
def Injector(bam, outputFile, intervals, inputfile):
return config.gatkCmd('CleanedReadInjector') + " --output_bam " + outputFile + " -L " + intervals + " -I " + bam + " --cleaned_reads " + inputfile
def IndelCaller(bam, outputFile, intervals, fraction):
return config.gatkCmd('IndelGenotyper') + " -O " + outputFile + " -L " + intervals + " -I " + bam + " -minFraction " + fraction
def SnpCaller(bam, outputFile, intervals):
return config.gatkCmd('SingleSampleGenotyper') + " -o " + outputFile + " -L " + intervals + " -I " + bam
def VarFiltration(bam, outputHead, intervals, snpcalls, badsnps, indelcalls, depth, mq):
return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -L " + intervals + " -I " + bam + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSnp," + badsnps + ",indels,SimpleIndel," + indelcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq
def VarFiltration454(bam, outputHead, intervals, snpcalls, depth, mq):
return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -L " + intervals + " -I " + bam + " -B variant,Variants," + snpcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq
jobid = None
indelsFileLow = None
mergeid = None
bamToCallFrom = finalBam(tech)
if ( tech != "454" ):
mismatchIntervalsFile = outputFile(intervals_dir, "mismatches.intervals")
@ -111,7 +123,7 @@ if __name__ == "__main__":
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, mergedIntervalsFile, just_print_commands = OPTIONS.dry, waitID = jobid)
cleanedFile = outputFile(cleaner_output, "bam")
badsnpsFile = outputFile(cleaner_output, "realigner.badsnps")
badsnpsFile = badSnps(tech)
cmd = CleanIntervals(bam, cleanedFile, mergedIntervalsFile, badsnpsFile)
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, cleanedFile, just_print_commands = OPTIONS.dry, waitID = jobid)
@ -119,52 +131,73 @@ if __name__ == "__main__":
cmd = Injector(bam, injectedFile, str(chr), cleanedFile)
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, injectedFile, just_print_commands = OPTIONS.dry, waitID = jobid)
if ( tech == "SLX" ):
solexaBamFile = injectedFile
if ( tech == "SOLID" ):
solidBamFile = injectedFile
# HOW DO I CALL MergeBAMBatch FROM HERE?
# def MergeBams():
# return "MergeBAMBatch.py -d " + final_bam_dir + " -q " + OPTIONS.farmQueue + " -s '" + bamToCallFrom + "\t" + os.path.join(injector_output, "%s.chr*.%s.bam" % ( sample, tech )) + "'"
# cmd = MergeBams()
# jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, bamToCallFrom, just_print_commands = OPTIONS.dry, waitID = jobid)
mergeid = jobid
cmd = "samtools index " + injectedFile
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, injectedFile + ".bai", just_print_commands = OPTIONS.dry, waitID = jobid)
cmd = "cat "
for chr in myChrs:
cmd = cmd + " " + badSnps(tech, chr)
cmd = cmd + " > " + badSnps(tech)
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, badSnps(tech), just_print_commands = OPTIONS.dry, waitID = mergeid)
indelsFileLow = outputFile(indel_output, "low.calls")
cmd = IndelCaller(injectedFile, indelsFileLow, str(chr), "0.1")
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileLow, just_print_commands = OPTIONS.dry, waitID = jobid)
indelsFileHigh = outputFile(indel_output, "high.calls")
cmd = IndelCaller(injectedFile, indelsFileHigh, str(chr), "0.3")
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileHigh, just_print_commands = OPTIONS.dry, waitID = jobid)
snpsFile = outputFile(snp_output, "calls")
cmd = SnpCaller(injectedFile, snpsFile, str(chr))
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, snpsFile, just_print_commands = OPTIONS.dry, waitID = jobid)
varFiltFile = os.path.join(filter_output, "%s.chr%s.%s" % ( sample, chr, tech ))
cmd = VarFiltration(injectedFile, varFiltFile, str(chr), snpsFile, badsnpsFile, indelsFileLow, str(DoC), str(mappingQuality))
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, varFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid)
else:
snpsFile = outputFile(snp_output, "calls")
cmd = SnpCaller(bam, snpsFile, str(chr))
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, snpsFile, just_print_commands = OPTIONS.dry, waitID = jobid)
varFiltFile = os.path.join(filter_output, "%s.chr%s.%s" % ( sample, chr, tech ))
cmd = VarFiltration454(bam, varFiltFile, str(chr), snpsFile, str(DoC), str(mappingQuality))
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, varFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid)
def IndelCaller(bam, outputFile, fraction):
return config.gatkCmd('IndelGenotyper') + " -O " + outputFile + " -I " + bam + " -minFraction " + fraction
def SnpCaller(bam, outputFile):
return config.gatkCmd('SingleSampleGenotyper') + " -o " + outputFile + " -I " + bam
def VarFiltration(bam, outputHead, snpcalls, badsnps, indelcalls, depth, mq):
return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -I " + bam + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSnp," + badsnps + ",indels,SimpleIndel," + indelcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq
def VarFiltration454(bam, outputHead, snpcalls, depth, mq):
return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -I " + bam + " -B variant,Variants," + snpcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq
indelsFileHigh = outputFile(indel_output, tech, "high.calls")
cmd = IndelCaller(bamToCallFrom, indelsFileHigh, "0.3")
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileHigh, just_print_commands = OPTIONS.dry, waitID = mergeid)
def outputFile(root, techs, name):
return os.path.join(root, "%s.chr%s.%s.%s" % ( sample, chr, techs, name ))
def SnpCaller(bams, outputFile, intervals):
return config.gatkCmd('SingleSampleGenotyper') + " -o " + outputFile + " -L " + intervals + " ".join(map( lambda x: " -I " + x, bams ))
def VarFiltration(bams, outputHead, intervals, snpcalls, badsnps, indelcalls, depth, mq):
return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -L " + intervals + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSnp," + badsnps + ",indels,SimpleIndel," + indelcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq + " ".join(map( lambda x: " -I " + x, bams ))
indelsFileLow = indelsForFiltering(tech)
cmd = IndelCaller(bamToCallFrom, indelsFileLow, "0.1")
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, indelsFileLow, just_print_commands = OPTIONS.dry, waitID = mergeid)
solid454SnpsFile = outputFile(snp_output, "454-SOLID", "calls")
cmd = SnpCaller([solidBamFile,filepath("454")], solid454SnpsFile, str(chr))
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, solid454SnpsFile, just_print_commands = OPTIONS.dry, waitID = jobid)
snpsFile = outputFile(snp_output, tech, "calls")
cmd = SnpCaller(bamToCallFrom, snpsFile)
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, snpsFile, just_print_commands = OPTIONS.dry, waitID = jobid) # wait on the low indel calls
varFiltFile = os.path.join(filter_output, "%s.%s" % ( sample, tech ))
if ( tech != "454" ):
cmd = VarFiltration(bamToCallFrom, varFiltFile, snpsFile, badsnpsFile, indelsFileLow, str(DoC), str(mappingQuality))
else:
cmd = VarFiltration454(bamToCallFrom, varFiltFile, snpsFile, str(DoC), str(mappingQuality))
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, varFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid)
def SnpCaller(bams, outputFile):
return config.gatkCmd('SingleSampleGenotyper') + " -o " + outputFile + " ".join(map( lambda x: " -I " + x, bams ))
def VarFiltration(bams, outputHead, snpcalls, badsnps, indelcalls, depth, mq):
return config.gatkCmd('VariantFiltration') + " -VOH " + outputHead + " -B variant,Variants," + snpcalls + ",cleaned,CleanedOutSnp," + badsnps + ",indels,SimpleIndel," + indelcalls + " -X DepthOfCoverage:" + depth + " -X MappingQualityZero:" + mq + " ".join(map( lambda x: " -I " + x, bams ))
#
# HOW DO I MAKE THESE JOBS DEPEND ON THE MERGE IDS OF THE INDIVIDUAL SAMPLES???
# (Or until everything else is done?)
#
solid454SnpsFile = outputFile(snp_output, "454-SOLID", "calls")
cmd = SnpCaller([finalBam("SOLID"),finalBam("454")], solid454SnpsFile)
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, solid454SnpsFile, just_print_commands = OPTIONS.dry, waitID = allMergeIds)
solid454VarFiltFile = os.path.join(filter_output, "%s.454-SOLID" % ( sample ))
cmd = VarFiltration([finalBam("SOLID"),finalBam("454")], solid454VarFiltFile, solid454SnpsFile, badSnps("SOLID"), indelsForFiltering("SOLID"), str(DoC), str(mappingQuality))
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allVarFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid)
allSnpsFile = outputFile(snp_output, "allTechs", "calls")
cmd = SnpCaller([finalBam("SLX"),finalBam("SOLID"),finalBam("454")], solid454SnpsFile)
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allSnpsFile, just_print_commands = OPTIONS.dry, waitID = allMergeIds)
allVarFiltFile = os.path.join(filter_output, "%s.allTechs" % ( sample ))
cmd = VarFiltration([finalBam("SLX"),finalBam("SOLID"),finalBam("454")], varFiltFile, allSnpsFile, badSnps("SLX"), indelsForFiltering("SLX"), str(DoC), str(mappingQuality))
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allVarFiltFile, just_print_commands = OPTIONS.dry, waitID = jobid)
#
# HOW DO I COMBINE ALL OF THE MQs AND DOCs?
#
allSnpsFile = outputFile(snp_output, "allTechs", "calls")
cmd = SnpCaller([solexaBamFile,solidBamFile,filepath("454")], allSnpsFile, str(chr))
jobid = farm_commands.cmd(cmd, OPTIONS.farmQueue, allSnpsFile, just_print_commands = OPTIONS.dry, waitID = jobid)