Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
c405a75f54
|
|
@ -69,7 +69,7 @@ import java.util.*;
|
|||
* <p>
|
||||
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
|
||||
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
|
||||
* or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
|
||||
* or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
|
||||
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
|
||||
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
|
||||
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,
|
||||
|
|
|
|||
|
|
@ -23,7 +23,10 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.phasing;
|
||||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
|
|
@ -49,16 +52,46 @@ import java.util.*;
|
|||
|
||||
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
|
||||
|
||||
|
||||
/**
|
||||
* Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads).
|
||||
*
|
||||
* <p>
|
||||
* Performs physical phasing of SNP calls, based on sequencing reads.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* VCF file of SNP calls, BAM file of sequence reads.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* Phased VCF file.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T ReadBackedPhasing
|
||||
* -R reference.fasta
|
||||
* -I reads.bam
|
||||
* --variant:vcf SNPs.vcf
|
||||
* -BTI variant
|
||||
* -BTIMR INTERSECTION
|
||||
* -o phased_SNPs.vcf
|
||||
* --phaseQualityThresh 20.0
|
||||
* </pre>
|
||||
*
|
||||
* @author Menachem Fromer
|
||||
* @since July 2010
|
||||
*/
|
||||
@Allows(value = {DataSource.READS, DataSource.REFERENCE})
|
||||
@Requires(value = {DataSource.READS, DataSource.REFERENCE})
|
||||
@By(DataSource.READS)
|
||||
|
||||
@ReadFilters({MappingQualityZeroReadFilter.class})
|
||||
// Filter out all reads with zero mapping quality
|
||||
@ReadFilters({MappingQualityZeroReadFilter.class})
|
||||
|
||||
public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, PhasingStats> {
|
||||
private static final boolean DEBUG = false;
|
||||
|
|
@ -73,13 +106,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
@Output(doc = "File to which variants should be written", required = true)
|
||||
protected VCFWriter writer = null;
|
||||
|
||||
@Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads; [default:20000]", required = false)
|
||||
@Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads for the phasing procedure", required = false)
|
||||
protected Integer cacheWindow = 20000;
|
||||
|
||||
@Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm; [default:10]", required = false)
|
||||
@Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm", required = false)
|
||||
protected Integer maxPhaseSites = 10; // 2^10 == 10^3 diploid haplotypes
|
||||
|
||||
@Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing; [default:10.0]", required = false)
|
||||
@Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing", required = false)
|
||||
protected Double phaseQualityThresh = 10.0; // PQ = 10.0 <=> P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9
|
||||
|
||||
@Hidden
|
||||
|
|
@ -87,10 +120,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
protected String variantStatsFilePrefix = null;
|
||||
private PhasingQualityStatsWriter statsWriter = null;
|
||||
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing [default: 17]", required = false)
|
||||
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing", required = false)
|
||||
public int MIN_BASE_QUALITY_SCORE = 17;
|
||||
|
||||
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing [default: 20]", required = false)
|
||||
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing", required = false)
|
||||
public int MIN_MAPPING_QUALITY_SCORE = 20;
|
||||
|
||||
@Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false)
|
||||
|
|
@ -111,10 +144,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
|
|||
|
||||
public static final String PHASING_INCONSISTENT_KEY = "PhasingInconsistent";
|
||||
|
||||
@Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records [default:false]", required = false)
|
||||
@Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records", required = false)
|
||||
protected boolean enableMergePhasedSegregatingPolymorphismsToMNP = false;
|
||||
|
||||
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
|
||||
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record", required = false)
|
||||
protected int maxGenomicDistanceForMNP = 1;
|
||||
|
||||
@Hidden
|
||||
|
|
|
|||
|
|
@ -40,95 +40,105 @@ import java.io.PrintStream;
|
|||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Emits specific fields as dictated by the user from one or more VCF files.
|
||||
* Emits specific fields from a VCF file to a tab-deliminated table
|
||||
*
|
||||
* <p>
|
||||
* This walker accepts a single VCF file and writes out user-selected fields from the
|
||||
* VCF as a header-containing, tab-deliminated file. The user specifies one or more
|
||||
* fields to print with the -F NAME, each of which appears as a single column in
|
||||
* the output file, with a header named NAME, and the value of this field in the VCF
|
||||
* one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding
|
||||
* in the INFO field (AC=10). Note that this tool does not support capturing any
|
||||
* GENOTYPE field values. If a VCF record is missing a value, then the tool by
|
||||
* default throws an error, but the special value NA can be emitted instead with
|
||||
* appropriate tool arguments.
|
||||
*
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>A VCF file</li>
|
||||
* <li>A list of -F fields to write</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A table deliminated file containing the values of the requested fields in the VCF file
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* -T $WalkerName \
|
||||
* -V file.vcf \
|
||||
* -F CHROM -F POS -F ID -F QUAL -F AC \
|
||||
* -o results.table
|
||||
*
|
||||
* would produce a file that looks like:
|
||||
*
|
||||
* CHROM POS ID QUAL AC
|
||||
* 1 10 . 50 1
|
||||
* 1 20 rs10 99 10
|
||||
* et cetera...
|
||||
* </pre>
|
||||
*
|
||||
* @author Mark DePristo
|
||||
* @since 2010
|
||||
*/
|
||||
public class VariantsToTable extends RodWalker<Integer, Integer> {
|
||||
|
||||
@ArgumentCollection
|
||||
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
|
||||
|
||||
@Output(doc="File to which results should be written",required=true)
|
||||
protected PrintStream out;
|
||||
|
||||
@Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true)
|
||||
public ArrayList<String> fieldsToTake = new ArrayList<String>();
|
||||
/**
|
||||
* -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10).
|
||||
* Note that this tool does not support capturing any GENOTYPE field values. Note this argument
|
||||
* accepts any number of inputs. So -F CHROM -F POS is allowed.
|
||||
*/
|
||||
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
|
||||
public List<String> fieldsToTake = new ArrayList<String>();
|
||||
|
||||
@Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records")
|
||||
/**
|
||||
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
|
||||
* Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
|
||||
*/
|
||||
@Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
|
||||
public boolean showFiltered = false;
|
||||
|
||||
@Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false)
|
||||
/**
|
||||
* If provided, then this tool will exit with success after this number of records have been emitted to the file.
|
||||
*/
|
||||
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
|
||||
public int MAX_RECORDS = -1;
|
||||
int nRecords = 0;
|
||||
|
||||
/**
|
||||
* By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then
|
||||
* VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this
|
||||
* can make your resulting file unreadable and malformated according to tools like R, as the representation of
|
||||
* multi-allelic INFO field values can be lists of values.
|
||||
*/
|
||||
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
|
||||
public boolean keepMultiAllelic = false;
|
||||
|
||||
/**
|
||||
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
|
||||
* is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being
|
||||
* found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such
|
||||
* fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
|
||||
* will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
|
||||
*/
|
||||
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
|
||||
public boolean ALLOW_MISSING_DATA = false;
|
||||
|
||||
public void initialize() {
|
||||
// print out the header
|
||||
out.println(Utils.join("\t", fieldsToTake));
|
||||
}
|
||||
|
||||
public static abstract class Getter { public abstract String get(VariantContext vc); }
|
||||
public static Map<String, Getter> getters = new HashMap<String, Getter>();
|
||||
|
||||
static {
|
||||
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
|
||||
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
|
||||
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
|
||||
getters.put("REF", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
String x = "";
|
||||
if ( vc.hasReferenceBaseForIndel() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x=x+new String(new byte[]{refByte});
|
||||
}
|
||||
return x+vc.getReference().getDisplayString();
|
||||
}
|
||||
});
|
||||
getters.put("ALT", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
StringBuilder x = new StringBuilder();
|
||||
int n = vc.getAlternateAlleles().size();
|
||||
if ( n == 0 ) return ".";
|
||||
if ( vc.hasReferenceBaseForIndel() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x.append(new String(new byte[]{refByte}));
|
||||
}
|
||||
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
if ( i != 0 ) x.append(",");
|
||||
x.append(vc.getAlternateAllele(i).getDisplayString());
|
||||
}
|
||||
return x.toString();
|
||||
}
|
||||
});
|
||||
getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
|
||||
getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.isSNP() && vc.isBiallelic() )
|
||||
return VariantContextUtils.isTransition(vc) ? "1" : "0";
|
||||
else
|
||||
return "-1";
|
||||
}});
|
||||
getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
|
||||
return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
|
||||
});
|
||||
|
||||
getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
|
||||
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
|
||||
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
|
||||
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
|
||||
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
|
||||
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
|
||||
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
|
||||
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
|
||||
return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
|
||||
}});
|
||||
}
|
||||
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null ) // RodWalkers can make funky map calls
|
||||
return 0;
|
||||
|
|
@ -155,6 +165,15 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
return s.endsWith("*");
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility function that returns the list of values for each field in fields from vc.
|
||||
*
|
||||
* @param vc the VariantContext whose field values we can to capture
|
||||
* @param fields a non-null list of fields to capture from VC
|
||||
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
|
||||
* provides a value of NA
|
||||
* @return
|
||||
*/
|
||||
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
|
||||
List<String> vals = new ArrayList<String>();
|
||||
|
||||
|
|
@ -213,13 +232,75 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
return vals;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduce(Integer counter, Integer sum) {
|
||||
return counter + sum;
|
||||
}
|
||||
|
||||
//
|
||||
// default reduce -- doesn't do anything at all
|
||||
//
|
||||
public Integer reduceInit() { return 0; }
|
||||
public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
|
||||
public void onTraversalDone(Integer sum) {}
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// static system for getting values from VC by name.
|
||||
//
|
||||
// ----------------------------------------------------------------------------------------------------
|
||||
|
||||
public static abstract class Getter { public abstract String get(VariantContext vc); }
|
||||
public static Map<String, Getter> getters = new HashMap<String, Getter>();
|
||||
|
||||
static {
|
||||
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
|
||||
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
|
||||
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
|
||||
getters.put("REF", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
String x = "";
|
||||
if ( vc.hasReferenceBaseForIndel() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x=x+new String(new byte[]{refByte});
|
||||
}
|
||||
return x+vc.getReference().getDisplayString();
|
||||
}
|
||||
});
|
||||
getters.put("ALT", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
StringBuilder x = new StringBuilder();
|
||||
int n = vc.getAlternateAlleles().size();
|
||||
if ( n == 0 ) return ".";
|
||||
if ( vc.hasReferenceBaseForIndel() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x.append(new String(new byte[]{refByte}));
|
||||
}
|
||||
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
if ( i != 0 ) x.append(",");
|
||||
x.append(vc.getAlternateAllele(i).getDisplayString());
|
||||
}
|
||||
return x.toString();
|
||||
}
|
||||
});
|
||||
getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
|
||||
getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.isSNP() && vc.isBiallelic() )
|
||||
return VariantContextUtils.isTransition(vc) ? "1" : "0";
|
||||
else
|
||||
return "-1";
|
||||
}});
|
||||
getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
|
||||
return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
|
||||
});
|
||||
|
||||
getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
|
||||
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
|
||||
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
|
||||
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
|
||||
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
|
||||
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
|
||||
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
|
||||
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
|
||||
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
|
||||
return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
|
||||
}});
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue