Merge branch 'master' of ssh://gsa2/humgen/gsa-scr1/chartl/dev/git
This commit is contained in:
commit
edd39da2ea
|
|
@ -55,7 +55,7 @@ public @interface Output {
|
|||
* --help argument is specified.
|
||||
* @return Doc string associated with this command-line argument.
|
||||
*/
|
||||
String doc() default "An output file presented to the walker. Will overwrite contents if file exists.";
|
||||
String doc() default "An output file created by the walker. Will overwrite contents if file exists";
|
||||
|
||||
/**
|
||||
* Is this argument required. If true, the command-line argument system will
|
||||
|
|
|
|||
|
|
@ -65,10 +65,53 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* Performs local realignment of reads based on misalignments due to the presence of indels.
|
||||
* Unlike most mappers, this walker uses the full alignment context to determine whether an
|
||||
* appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly.
|
||||
*
|
||||
* <p>
|
||||
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
|
||||
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
|
||||
* or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
|
||||
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
|
||||
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
|
||||
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,
|
||||
* also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus
|
||||
* indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an
|
||||
* appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and
|
||||
* specifically identify indels.
|
||||
* <p>
|
||||
* <ol>There are 2 steps to the realignment process:
|
||||
* <li>Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)</li>
|
||||
* <li>Running the realigner over those intervals (IndelRealigner)</li>
|
||||
* </ol>
|
||||
* <p>
|
||||
* An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step.
|
||||
* <p>
|
||||
* Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
|
||||
* (or with reads from similar technologies).
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more aligned BAM files and optionally one or more lists of known indels.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A realigned version of your input BAM file(s).
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -I input.bam \
|
||||
* -R ref.fasta \
|
||||
* -T IndelRealigner \
|
||||
* -targetIntervals intervalListFromRTC.intervals \
|
||||
* -o realignedBam.bam \
|
||||
* [--known /path/to/indels.vcf] \
|
||||
* [-compress 0] (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
|
||||
* </pre>
|
||||
*
|
||||
* @author ebanks
|
||||
*/
|
||||
//Reference(window=@Window(start=-30,stop=30))
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
|
||||
public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
||||
|
||||
|
|
@ -77,88 +120,136 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
|
|||
public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner";
|
||||
|
||||
public enum ConsensusDeterminationModel {
|
||||
/**
|
||||
* Uses only indels from a provided ROD of known indels.
|
||||
*/
|
||||
KNOWNS_ONLY,
|
||||
/**
|
||||
* Additionally uses indels already present in the original alignments of the reads.
|
||||
*/
|
||||
USE_READS,
|
||||
/**
|
||||
* Additionally uses 'Smith-Waterman' to generate alternate consenses.
|
||||
*/
|
||||
USE_SW
|
||||
}
|
||||
|
||||
@Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false)
|
||||
/**
|
||||
* Any number of VCF files representing known indels to be used for constructing alternate consenses.
|
||||
* Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored.
|
||||
*/
|
||||
@Input(fullName="known", shortName = "known", doc="Input VCF file(s) with known indels", required=false)
|
||||
public List<RodBinding<VariantContext>> known = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s).
|
||||
*/
|
||||
@Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
|
||||
protected String intervalsFile = null;
|
||||
|
||||
/**
|
||||
* This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number
|
||||
* should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency,
|
||||
* this number should be smaller.
|
||||
*/
|
||||
@Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
|
||||
protected double LOD_THRESHOLD = 5.0;
|
||||
|
||||
@Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
|
||||
protected double MISMATCH_THRESHOLD = 0.15;
|
||||
|
||||
/**
|
||||
* The realigned bam file.
|
||||
*/
|
||||
@Output(required=false, doc="Output bam")
|
||||
protected StingSAMFileWriter writer = null;
|
||||
protected ConstrainedMateFixingManager manager = null;
|
||||
protected SAMFileWriter writerToUse = null;
|
||||
|
||||
@Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false)
|
||||
/**
|
||||
* We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner;
|
||||
* Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data).
|
||||
*/
|
||||
@Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false)
|
||||
public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS;
|
||||
|
||||
|
||||
// ADVANCED OPTIONS FOLLOW
|
||||
|
||||
@Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
|
||||
"Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).", required=false)
|
||||
/**
|
||||
* For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner
|
||||
* will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply
|
||||
* push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set.
|
||||
*/
|
||||
@Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
|
||||
protected double MISMATCH_THRESHOLD = 0.15;
|
||||
|
||||
/**
|
||||
* For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage;
|
||||
* and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value.
|
||||
*/
|
||||
@Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false)
|
||||
protected int MAX_RECORDS_IN_MEMORY = 150000;
|
||||
|
||||
/**
|
||||
* For expert users only!
|
||||
*/
|
||||
@Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false)
|
||||
protected int MAX_ISIZE_FOR_MOVEMENT = 3000;
|
||||
|
||||
/**
|
||||
* For expert users only!
|
||||
*/
|
||||
@Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="maximum positional move in basepairs that a read can be adjusted during realignment", required=false)
|
||||
protected int MAX_POS_MOVE_ALLOWED = 200;
|
||||
|
||||
/**
|
||||
* For expert users only! If you need to find the optimal solution regardless of running time, use a higher number.
|
||||
*/
|
||||
@Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false)
|
||||
protected int MAX_CONSENSUSES = 30;
|
||||
|
||||
/**
|
||||
* For expert users only! If you need to find the optimal solution regardless of running time, use a higher number.
|
||||
*/
|
||||
@Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false)
|
||||
protected int MAX_READS_FOR_CONSENSUSES = 120;
|
||||
|
||||
@Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment; "+
|
||||
"if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is", required=false)
|
||||
/**
|
||||
* For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is.
|
||||
* If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number.
|
||||
*/
|
||||
@Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false)
|
||||
protected int MAX_READS = 20000;
|
||||
|
||||
@Argument(fullName="noPGTag", shortName="noPG", required=false,
|
||||
doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. "+
|
||||
"This option is required in order to pass integration tests.")
|
||||
protected boolean NO_PG_TAG = false;
|
||||
|
||||
@Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false,
|
||||
doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.")
|
||||
@Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam")
|
||||
protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false;
|
||||
|
||||
@Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false,
|
||||
doc="This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, "+
|
||||
"it will throw an exception. Use this argument when your interval list is not sorted to instruct "+"" +
|
||||
"the Realigner to first sort it in memory.")
|
||||
/**
|
||||
* For expert users only! This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception.
|
||||
* Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory.
|
||||
*/
|
||||
@Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="The target intervals are not sorted")
|
||||
protected boolean TARGET_NOT_SORTED = false;
|
||||
|
||||
//NWay output: testing, not ready for the prime time, hence hidden:
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="nWayOut", shortName="nWayOut", required=false,
|
||||
doc="Generate one output file for each input (-I) bam file. Reads from all input files "+
|
||||
"will be realigned together, but then each read will be saved in the output file corresponding to "+
|
||||
"the input file the read came from. There are two ways to generate output bam file names: 1) if the "+
|
||||
"value of this argument is a general string (e.g. '.cleaned.bam'), then "+
|
||||
"extensions (\".bam\" or \".sam\") will be stripped from the input file names and the provided string value "+
|
||||
"will be pasted on instead; 2) if the value ends with a '.map' (e.g. input_output.map), then " +
|
||||
"the two-column tab-separated file with the specified name must exist and list unique output file name (2nd column)" +
|
||||
"for each input file name (1st column).")
|
||||
/**
|
||||
* Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that
|
||||
* the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'),
|
||||
* then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the
|
||||
* value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output
|
||||
* file name (2nd column) for each input file name (1st column).
|
||||
*/
|
||||
@Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file")
|
||||
protected String N_WAY_OUT = null;
|
||||
|
||||
|
||||
// DEBUGGING OPTIONS FOLLOW
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses")
|
||||
protected boolean CHECKEARLY = false;
|
||||
|
||||
|
||||
// DEBUGGING OPTIONS FOLLOW
|
||||
@Hidden
|
||||
@Argument(fullName="noPGTag", shortName="noPG", required=false,
|
||||
doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
protected boolean NO_PG_TAG = false;
|
||||
|
||||
@Hidden
|
||||
@Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY")
|
||||
|
|
|
|||
|
|
@ -52,7 +52,51 @@ import java.util.Collections;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string.
|
||||
* Emits intervals for the Local Indel Realigner to target for realignment.
|
||||
*
|
||||
* <p>
|
||||
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
|
||||
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
|
||||
* or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
|
||||
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
|
||||
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
|
||||
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,
|
||||
* also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus
|
||||
* indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an
|
||||
* appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and
|
||||
* specifically identify indels.
|
||||
* <p>
|
||||
* <ol>There are 2 steps to the realignment process:
|
||||
* <li>Determining (small) suspicious intervals which are likely in need of realignment (RealignerTargetCreator)</li>
|
||||
* <li>Running the realigner over those intervals (see the IndelRealigner tool)</li>
|
||||
* </ol>
|
||||
* <p>
|
||||
* An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.
|
||||
* <p>
|
||||
* Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
|
||||
* (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* One or more aligned BAM files and optionally one or more lists of known indels.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A list of target intervals to pass to the Indel Realigner.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -I input.bam \
|
||||
* -R ref.fasta \
|
||||
* -T RealignerTargetCreator \
|
||||
* -o forIndelRealigner.intervals \
|
||||
* [--known /path/to/indels.vcf]
|
||||
* </pre>
|
||||
*
|
||||
* @author ebanks
|
||||
*/
|
||||
@ReadFilters({Platform454Filter.class, MappingQualityZeroReadFilter.class, BadCigarFilter.class})
|
||||
@Reference(window=@Window(start=-1,stop=50))
|
||||
|
|
@ -61,29 +105,41 @@ import java.util.List;
|
|||
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
|
||||
public class RealignerTargetCreator extends RodWalker<RealignerTargetCreator.Event, RealignerTargetCreator.Event> {
|
||||
|
||||
/**
|
||||
* The target intervals for realignment.
|
||||
*/
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
||||
/**
|
||||
* Any number of VCF files representing known SNPs and/or indels. Could be e.g. dbSNP and/or official 1000 Genomes indel calls.
|
||||
* SNPs in these files will be ignored unless the --mismatchFraction argument is used.
|
||||
*/
|
||||
@Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false)
|
||||
public List<RodBinding<VariantContext>> known = Collections.emptyList();
|
||||
|
||||
// mismatch/entropy/SNP arguments
|
||||
/**
|
||||
* Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart.
|
||||
*/
|
||||
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false)
|
||||
protected int windowSize = 10;
|
||||
|
||||
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false)
|
||||
/**
|
||||
* To disable this behavior, set this value to <= 0 or > 1. This feature is really only necessary when using an ungapped aligner
|
||||
* (e.g. MAQ in the case of single-end read data) and should be used in conjunction with '--model USE_SW' in the IndelRealigner.
|
||||
*/
|
||||
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy", required=false)
|
||||
protected double mismatchThreshold = 0.0;
|
||||
|
||||
@Argument(fullName="minReadsAtLocus", shortName="minReads", doc="minimum reads at a locus to enable using the entropy calculation", required=false)
|
||||
protected int minReadsAtLocus = 4;
|
||||
|
||||
// interval merging arguments
|
||||
/**
|
||||
* Because the realignment algorithm is N^2, allowing too large an interval might take too long to completely realign.
|
||||
*/
|
||||
@Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="maximum interval size", required=false)
|
||||
protected int maxIntervalSize = 500;
|
||||
|
||||
@Deprecated
|
||||
@Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false)
|
||||
protected boolean DEPRECATED_REALIGN_MATES = false;
|
||||
|
||||
@Override
|
||||
public boolean generateExtendedEvents() { return true; }
|
||||
|
|
|
|||
|
|
@ -96,8 +96,8 @@ public class GATKDoclet {
|
|||
//logger.debug("Considering " + doc);
|
||||
Class clazz = getClassForClassDoc(doc);
|
||||
|
||||
if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance"))
|
||||
logger.debug("foo");
|
||||
//if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance"))
|
||||
// logger.debug("foo");
|
||||
|
||||
DocumentedGATKFeature feature = getFeatureForClassDoc(doc);
|
||||
DocumentedGATKFeatureHandler handler = createHandler(doc, feature);
|
||||
|
|
|
|||
Loading…
Reference in New Issue