From ef9216011ea1f7e82a40145dbdc16edf008cda87 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 16 Aug 2011 12:24:53 -0400 Subject: [PATCH 2/5] Added docs to IR --- .../gatk/walkers/indels/IndelRealigner.java | 163 ++++++++++++++---- 1 file changed, 126 insertions(+), 37 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index fa3991694..426c10604 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -65,10 +65,51 @@ import java.util.*; /** * Performs local realignment of reads based on misalignments due to the presence of indels. - * Unlike most mappers, this walker uses the full alignment context to determine whether an - * appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly. + * + *

+ * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion + * or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, + * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are + * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, + * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus + * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and + * specifically identify indels. + *

+ *

    There are 2 steps to the realignment process: + *
  1. Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
  2. + *
  3. Running the realigner over those intervals (IndelRealigner)
  4. + *
+ *

+ * An important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). + * + *

Input

+ *

+ * One or more aligned BAM files and optionally one or more lists of known indels. + *

+ * + *

Output

+ *

+ * A realigned version of your input BAM file(s). + *

+ * + *

Examples

+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ *   -I  \
+ *   -R  \
+ *   -T IndelRealigner \
+ *   -targetIntervals  \
+ *   -o  \
+ *   [--known /path/to/indels.vcf] \
+ *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
+ * 
+ * + * @author ebanks */ -//Reference(window=@Window(start=-30,stop=30)) @BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) public class IndelRealigner extends ReadWalker { @@ -77,88 +118,136 @@ public class IndelRealigner extends ReadWalker { public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner"; public enum ConsensusDeterminationModel { + /** + * Uses only indels from a provided ROD of known indels. + */ KNOWNS_ONLY, + /** + * Additionally uses indels already present in the original alignments of the reads. + */ USE_READS, + /** + * Additionally uses 'Smith-Waterman' to generate alternate consenses. + */ USE_SW } - @Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false) + /** + * Any number of VCF files representing known indels to be used for constructing alternate consenses. + * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored. + */ + @Input(fullName="known", shortName = "known", doc="Input VCF file(s) with known indels", required=false) public List> known = Collections.emptyList(); + /** + * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). + */ @Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) protected String intervalsFile = null; + /** + * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number + * should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency, + * this number should be smaller. + */ @Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false) protected double LOD_THRESHOLD = 5.0; - @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) - protected double MISMATCH_THRESHOLD = 0.15; - + /** + * The realigned bam file. + */ @Output(required=false, doc="Output bam") protected StingSAMFileWriter writer = null; protected ConstrainedMateFixingManager manager = null; protected SAMFileWriter writerToUse = null; - @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false) + /** + * We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner; + * Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data). + */ + @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false) public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; // ADVANCED OPTIONS FOLLOW - @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+ - "Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).", required=false) + /** + * For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner + * will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply + * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set. + */ + @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false) + protected double MISMATCH_THRESHOLD = 0.15; + + /** + * For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage; + * and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value. + */ + @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false) protected int MAX_RECORDS_IN_MEMORY = 150000; + /** + * For expert users only! + */ @Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false) protected int MAX_ISIZE_FOR_MOVEMENT = 3000; + /** + * For expert users only! + */ @Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="maximum positional move in basepairs that a read can be adjusted during realignment", required=false) protected int MAX_POS_MOVE_ALLOWED = 200; + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. + */ @Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false) protected int MAX_CONSENSUSES = 30; + /** + * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number. + */ @Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false) protected int MAX_READS_FOR_CONSENSUSES = 120; - @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment; "+ - "if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is", required=false) + /** + * For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is. + * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number. + */ + @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false) protected int MAX_READS = 20000; - @Argument(fullName="noPGTag", shortName="noPG", required=false, - doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. "+ - "This option is required in order to pass integration tests.") - protected boolean NO_PG_TAG = false; - - @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, - doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.") + @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam") protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false; - @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, - doc="This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, "+ - "it will throw an exception. Use this argument when your interval list is not sorted to instruct "+"" + - "the Realigner to first sort it in memory.") + /** + * For expert users only! This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception. + * Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory. + */ + @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="The target intervals are not sorted") protected boolean TARGET_NOT_SORTED = false; - //NWay output: testing, not ready for the prime time, hence hidden: - - @Hidden - @Argument(fullName="nWayOut", shortName="nWayOut", required=false, - doc="Generate one output file for each input (-I) bam file. Reads from all input files "+ - "will be realigned together, but then each read will be saved in the output file corresponding to "+ - "the input file the read came from. There are two ways to generate output bam file names: 1) if the "+ - "value of this argument is a general string (e.g. '.cleaned.bam'), then "+ - "extensions (\".bam\" or \".sam\") will be stripped from the input file names and the provided string value "+ - "will be pasted on instead; 2) if the value ends with a '.map' (e.g. input_output.map), then " + - "the two-column tab-separated file with the specified name must exist and list unique output file name (2nd column)" + - "for each input file name (1st column).") + /** + * Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that + * the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'), + * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the + * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output + * file name (2nd column) for each input file name (1st column). + */ + @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file") protected String N_WAY_OUT = null; + + + // DEBUGGING OPTIONS FOLLOW + @Hidden @Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses") protected boolean CHECKEARLY = false; - - // DEBUGGING OPTIONS FOLLOW + @Hidden + @Argument(fullName="noPGTag", shortName="noPG", required=false, + doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") + protected boolean NO_PG_TAG = false; @Hidden @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") From 125ad0bcfabbb7459f5ba54413681e318d93df57 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 16 Aug 2011 12:46:48 -0400 Subject: [PATCH 3/5] Added docs to RTC --- .../gatk/walkers/indels/IndelRealigner.java | 4 +- .../indels/RealignerTargetCreator.java | 70 +++++++++++++++++-- .../sting/utils/help/GATKDoclet.java | 4 +- 3 files changed, 68 insertions(+), 10 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 426c10604..0f1c0dd3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -83,7 +83,9 @@ import java.util.*; *
  • Running the realigner over those intervals (IndelRealigner)
  • * *

    - * An important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. + * + * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them * (or with reads from similar technologies). * *

    Input

    diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index fbb62f17e..7b654e3f8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -52,7 +52,51 @@ import java.util.Collections; import java.util.List; /** - * Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string. + * Emits intervals for the Local Indel Realigner to target for cleaning. + * + *

    + * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases + * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion + * or deletion (indels) in the individualÕs genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching + * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, + * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are + * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, + * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus + * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an + * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and + * specifically identify indels. + *

    + *

      There are 2 steps to the realignment process: + *
    1. Determining (small) suspicious intervals which are likely in need of realignment (RealignerTargetCreator)
    2. + *
    3. Running the realigner over those intervals (see the IndelRealigner tool)
    4. + *
    + *

    + * An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. + * + * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them + * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. + * + *

    Input

    + *

    + * One or more aligned BAM files and optionally one or more lists of known indels. + *

    + * + *

    Output

    + *

    + * A list of target intervals to pass to the Indel Realigner. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -I  \
    + *   -R  \
    + *   -T RealignerTargetCreator \
    + *   -o  \
    + *   [--known /path/to/indels.vcf]
    + * 
    + * + * @author ebanks */ @ReadFilters({Platform454Filter.class, MappingQualityZeroReadFilter.class, BadCigarFilter.class}) @Reference(window=@Window(start=-1,stop=50)) @@ -61,29 +105,41 @@ import java.util.List; @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) public class RealignerTargetCreator extends RodWalker { + /** + * The target intervals for realignment. + */ @Output protected PrintStream out; + /** + * Any number of VCF files representing known SNPs and/or indels. Could be e.g. dbSNP and/or official 1000 Genomes indel calls. + * SNPs in these files will be ignored unless the --mismatchFraction argument is used. + */ @Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false) public List> known = Collections.emptyList(); - // mismatch/entropy/SNP arguments + /** + * Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart. + */ @Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false) protected int windowSize = 10; - @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false) + /** + * To disable this behavior, set this value to <= 0 or > 1. This feature is really only necessary when using an ungapped aligner + * (e.g. MAQ in the case of single-end read data) and should be used in conjunction with '--model USE_SW' in the IndelRealigner. + */ + @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy", required=false) protected double mismatchThreshold = 0.0; @Argument(fullName="minReadsAtLocus", shortName="minReads", doc="minimum reads at a locus to enable using the entropy calculation", required=false) protected int minReadsAtLocus = 4; - // interval merging arguments + /** + * Because the realignment algorithm is N^2, allowing too large an interval might take too long to completely realign. + */ @Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="maximum interval size", required=false) protected int maxIntervalSize = 500; - @Deprecated - @Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false) - protected boolean DEPRECATED_REALIGN_MATES = false; @Override public boolean generateExtendedEvents() { return true; } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java index d071be105..8f3ec293a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDoclet.java @@ -96,8 +96,8 @@ public class GATKDoclet { //logger.debug("Considering " + doc); Class clazz = getClassForClassDoc(doc); - if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) - logger.debug("foo"); + //if ( clazz != null && clazz.getName().equals("org.broadinstitute.sting.gatk.walkers.annotator.AlleleBalance")) + // logger.debug("foo"); DocumentedGATKFeature feature = getFeatureForClassDoc(doc); DocumentedGATKFeatureHandler handler = createHandler(doc, feature); From ab0b56ed11611c1c5dc9292f2184170573bf1f44 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Tue, 16 Aug 2011 12:55:45 -0400 Subject: [PATCH 4/5] Minor doc fixes --- .../org/broadinstitute/sting/commandline/Output.java | 2 +- .../sting/gatk/walkers/indels/IndelRealigner.java | 10 +++++----- .../gatk/walkers/indels/RealignerTargetCreator.java | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/commandline/Output.java b/public/java/src/org/broadinstitute/sting/commandline/Output.java index 22565dbf5..f8aef0355 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/Output.java +++ b/public/java/src/org/broadinstitute/sting/commandline/Output.java @@ -55,7 +55,7 @@ public @interface Output { * --help argument is specified. * @return Doc string associated with this command-line argument. */ - String doc() default "An output file presented to the walker. Will overwrite contents if file exists."; + String doc() default "An output file created by the walker. Will overwrite contents if file exists"; /** * Is this argument required. If true, the command-line argument system will diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 0f1c0dd3b..029b6deaf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -84,7 +84,7 @@ import java.util.*; * *

    * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step. - * + *

    * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them * (or with reads from similar technologies). * @@ -101,11 +101,11 @@ import java.util.*; *

    Examples

    *
      * java -Xmx4g -jar GenomeAnalysisTK.jar \
    - *   -I  \
    - *   -R  \
    + *   -I input.bam \
    + *   -R ref.fasta \
      *   -T IndelRealigner \
    - *   -targetIntervals  \
    - *   -o  \
    + *   -targetIntervals intervalListFromRTC.intervals \
    + *   -o realignedBam.bam \
      *   [--known /path/to/indels.vcf] \
      *   [-compress 0]    (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
      * 
    diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index 7b654e3f8..08ed1af52 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -52,7 +52,7 @@ import java.util.Collections; import java.util.List; /** - * Emits intervals for the Local Indel Realigner to target for cleaning. + * Emits intervals for the Local Indel Realigner to target for realignment. * *

    * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases @@ -72,7 +72,7 @@ import java.util.List; * *

    * An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. - * + *

    * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. * @@ -89,10 +89,10 @@ import java.util.List; *

    Examples

    *
      * java -Xmx2g -jar GenomeAnalysisTK.jar \
    - *   -I  \
    - *   -R  \
    + *   -I input.bam \
    + *   -R ref.fasta \
      *   -T RealignerTargetCreator \
    - *   -o  \
    + *   -o forIndelRealigner.intervals \
      *   [--known /path/to/indels.vcf]
      * 
    *