diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java index 533c7be73..b6a3853f8 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CheckPileup.java @@ -49,20 +49,109 @@ import java.io.PrintStream; import java.util.Arrays; /** - * At every locus in the input set, compares the pileup data (reference base, aligned base from - * each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data - * should be specified using the command-line argument '-pileup:SAMPileup '. + * Compare GATK's internal pileup to a reference Samtools pileup + * + *

At every locus in the input set, compares the pileup data (reference base, aligned base from + * each overlapping read, and quality score) generated internally by GATK to a reference pileup data generated + * by Samtools. Note that the pileup program has been replaced in Samtools by mpileup, which produces a slightly + * different output format by default. + *

+ * + *

Format

+ *

There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old + * 10-column "consensus" format which could be obtained by using the -c argument, now deprecated.

+ *

Simple pileup: 6-column format

+ *

+ * Each line consists of chromosome, 1-based coordinate, reference base, the + * number of reads covering the site, read bases and base qualities. At the + * read base column, a dot stands for a match to the reference base on the + * forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch + * on the forward strand and `acgtn' for a mismatch on the reverse strand. + * A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between + * this reference position and the next reference position. The length of the + * insertion is given by the integer in the pattern, followed by the inserted sequence. + *

+ *
+ *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
+ *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
+ *     seq1 274 T 23  ,.$....,,.,.,...,,,.,...    7<7;<;<<<<<<<<<=<;<;<<6
+ *     seq1 275 A 23  ,$....,,.,.,...,,,.,...^l.  <+;9*<<<<<<<<<=<<:;<<<<
+ *     seq1 276 G 22  ...T,,.,.,...,,,.,....  33;+<<7=7<<7<&<<1;<<6<
+ *     seq1 277 T 22  ....,,.,.,.C.,,,.,..G.  +7<;<<<<<<<&<=<<:;<<&<
+ *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
+ *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
+ * 
+ *

+ * See the Pileup format documentation for more details. + *

+ * + *

Consensus pileup: 10/13-column format

+ *

The "consensus" or extended pileup consists of the following: + *

+ *

+ *

Example of consensus pileup for SNP or non-variant sites

+ *
+ *     seq1  60  T  T  66  0  99  13  ...........^~.^~.   9<<55<;<<<<<<
+ *     seq1  61  G  G  72  0  99  15  .............^~.^y. (;975&;<<<<<<<<
+ *     seq1  62  T  T  72  0  99  15  .$..............    <;;,55;<<<<<<<<
+ *     seq1  63  G  G  72  0  99  15  .$.............^~.  4;2;<7:+<<<<<<<
+ *     seq1  64  G  G  69  0  99  14  ..............  9+5<;;;<<<<<<<
+ *     seq1  65  A  A  69  0  99  14  .$............. <5-2<;;<<<<<<;
+ *     seq1  66  C  C  66  0  99  13  .............   &*<;;<<<<<<8<
+ *     seq1  67  C  C  69  0  99  14  .............^~.    ,75<.4<<<<<-<<
+ *     seq1  68  C  C  69  0  99  14  ..............  576<;7<<<<<8<< *
+ * 
+ * + *

Example of consensus pileup for indels

+ *
+ *     Escherichia_coli_K12	3995037	*	*\/*	430	0	37	144	*	+A	143	1	0
+ *     Escherichia_coli_K12	3995279	*	*\/*	202	0	36	68	*	+A	67	1	0
+ *     Escherichia_coli_K12	3995281	*	*\/*	239	0	36	67	*	-CG	66	1	0
+ * 
+ *

+ * See Consensus pileup format (deprecated) for more details. + *

+ * + *

Input

+ *

A BAM file conatining your aligned sequence data and a pileup file generated by Samtools covering the region you + * want to examine.

+ * + *

Output

+ *

A text file listing mismatches between the input pileup and the GATK's internal pileup. If there are no mismatches, the output file is empty.

+ * + *

Example

+ *
+ * java -jar GenomeAnalysisTK.jar \
+ *   -T CheckPileup \
+ *   -R ref.fasta \
+ *   -I your_data.bam \
+ *   --pileup:SAMPileup pileup_file.txt \
+ *   -L chr1:257-275 \
+ *   -o output_file_name
+ * 
*/ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @Requires(value={DataSource.READS,DataSource.REFERENCE}) public class CheckPileup extends LocusWalker implements TreeReducible { - @Input(fullName = "pileup", doc="The SAMPileup containing the expected output", required = true) + /** + * This is the existing pileup against which we'll compare GATK's internal pileup at each genome position in the desired interval. + */ + @Input(fullName = "pileup", shortName = "pileup", doc="Pileup generated by Samtools", required = true) RodBinding pileup; @Output private PrintStream out; - - @Argument(fullName="continue_after_error",doc="Continue after an error",required=false) + /** + * By default the program will quit if it encounters an error (such as missing truth data for a given position). + * Use this flag to override the default behavior; the program will then simply print an error message and move on + * to the next position. + */ + @Argument(fullName="continue_after_error",doc="Continue after encountering an error",required=false) public boolean CONTINUE_AFTER_AN_ERROR = false; public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -72,7 +161,7 @@ public class CheckPileup extends LocusWalker implemen if ( truePileup == null ) { out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar())); if ( ! CONTINUE_AFTER_AN_ERROR ) { - throw new UserException.CommandLineException(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases", + throw new UserException.BadInput(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases", context.getLocation(), new String(pileup.getBases()))); } } else { @@ -80,7 +169,7 @@ public class CheckPileup extends LocusWalker implemen if ( pileupDiff != null ) { out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString()); if ( ! CONTINUE_AFTER_AN_ERROR ) { - throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff)); + throw new UserException.BadInput(String.format("The input pileup doesn't match the GATK's internal pileup: %s", pileupDiff)); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java index 23bbf1460..48e21fdd0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/Pileup.java @@ -48,11 +48,17 @@ import java.util.List; /** * Emulates the samtools pileup command to print aligned reads * - *

Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, - * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. + *

Prints the alignment in something similar to the Samtools pileup format (see the + * Pileup format documentation for more details about + * the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference + * base, read bases, and read qualities. In addition to these default fields, additional information can be added to + * the output as extra columns; see options detailed below.

* - * Emulated command: - * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] + *

Emulated command:

+ *
+ *  samtools pileup -f in.ref.fasta -l in.site_list input.bam
+ * 
+ * *

Input

*

@@ -61,17 +67,32 @@ import java.util.List; * *

Output

*

- * Formatted pileup-style alignment of reads. + * Alignment of reads formatted in the Pileup style. *

* *

Example

*
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -T Pileup \
- *   -R ref.fasta \
- *   -I aligned_reads.bam \
+ *   -R exampleFASTA.fasta \
+ *   -I exampleBAM.bam \
+ *   -L chr1:257-267
  *   -o output.txt
  * 
+ *

Expected output

+ *
+ *     chr1 257 A CAA '&=
+ *     chr1 258 C TCC A:=
+ *     chr1 259 C CCC )A=
+ *     chr1 260 C ACC (=<
+ *     chr1 261 T TCT '44
+ *     chr1 262 A AAA '?:
+ *     chr1 263 A AGA 1'6
+ *     chr1 264 C TCC 987
+ *     chr1 265 C CCC (@(
+ *     chr1 266 C GCC ''=
+ *     chr1 267 T AAT 7%>
+ * 
* */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} ) @@ -83,18 +104,25 @@ public class Pileup extends LocusWalker implements TreeReducibl PrintStream out; /** - * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions, + * In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions, * and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per * read items are delimited with an '@' character. */ @Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false) public boolean SHOW_VERBOSE = false; - - @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) + /** + * This enables annotating the pileup to show overlaps with metadata from a ROD file. + * For example, if you provide a VCF and there is a SNP at a given location covered by the pileup, the pileup + * output at that position will be annotated with the corresponding source ROD identifier. + */ + @Input(fullName="metadata",shortName="metadata",doc="ROD file containing metadata", required=false) public List> rods = Collections.emptyList(); - + /** + * Adds the length of the insert each base comes from to the output pileup. Here, "insert" refers to the DNA insert + * produced during library generation before sequencing. + */ @Hidden - @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Add a column which contains the length of the insert each base comes from.",required=false) + @Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Output insert length",required=false) public boolean outputInsertLength=false; @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java index 34705c4c9..70241a6c4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupCodec.java @@ -37,13 +37,21 @@ import java.util.regex.Pattern; import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType; /** - * Decoder for SAM pileup data. For GATK validation purposes only + * Decoder for SAM pileup data. * *

- * Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute. - * It desribes the base-pair information at each chromosomal position. This format - * facilitates SNP/indel calling and brief alignment viewing by eyes. + * From the SAMTools project documentation: *

+ *

The Pileup format was first used by Tony Cox and Zemin Ning at + * the Sanger Institute. It describes the base-pair information at each chromosomal position. This format + * facilitates SNP/indel calling and brief alignment viewing by eye. Note that the pileup program has been replaced + * in Samtools by mpileup, which produces a slightly different output format by default. + *

+ + *

Format

+ *

There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old + * 10/13-column "consensus" format which could be obtained by using the -c argument, now deprecated.

+ *

Simple pileup: 6-column format

*

* Each line consists of chromosome, 1-based coordinate, reference base, the * number of reads covering the site, read bases and base qualities. At the @@ -54,13 +62,6 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V * this reference position and the next reference position. The length of the * insertion is given by the integer in the pattern, followed by the inserted sequence. *

- * - *

- *
See also: @see SAMTools project
- *
See also: @see Pileup format
- *

- * - *

File format example

*
  *     seq1 272 T 24  ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
  *     seq1 273 T 23  ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
@@ -71,15 +72,55 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
  *     seq1 278 G 23  ....,,.,.,...,,,.,....^k.   %38*<<;<7<<7<=<<<;<<<<<
  *     seq1 279 C 23  A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
  * 
+ *

+ * See the Pileup format documentation for more details. + *

* - * @author Matt Hanna - * @since 2009 + *

Consensus pileup: 10/13-column format

+ *

The "consensus" or extended pileup consists of the following: + *

    + *
  • original 6 columns as described above
  • + *
  • 4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the + * reads covering the sites) for all sites, inserted before the bases and quality strings
  • + *
  • 3 extra columns indicating counts of reads supporting indels (just for indel sites)
  • + *
+ *

+ *

Example of consensus pileup for SNP or non-variant sites

+ *
+ *     seq1  60  T  T  66  0  99  13  ...........^~.^~.   9<<55<;<<<<<<
+ *     seq1  61  G  G  72  0  99  15  .............^~.^y. (;975&;<<<<<<<<
+ *     seq1  62  T  T  72  0  99  15  .$..............    <;;,55;<<<<<<<<
+ *     seq1  63  G  G  72  0  99  15  .$.............^~.  4;2;<7:+<<<<<<<
+ *     seq1  64  G  G  69  0  99  14  ..............  9+5<;;;<<<<<<<
+ *     seq1  65  A  A  69  0  99  14  .$............. <5-2<;;<<<<<<;
+ *     seq1  66  C  C  66  0  99  13  .............   &*<;;<<<<<<8<
+ *     seq1  67  C  C  69  0  99  14  .............^~.    ,75<.4<<<<<-<<
+ *     seq1  68  C  C  69  0  99  14  ..............  576<;7<<<<<8<< *
+ * 
+ * + *

Example of consensus pileup for indels

+ *
+ *     Escherichia_coli_K12	3995037	*	*\/*	430	0	37	144	*	+A	143	1	0
+ *     Escherichia_coli_K12	3995279	*	*\/*	202	0	36	68	*	+A	67	1	0
+ *     Escherichia_coli_K12	3995281	*	*\/*	239	0	36	67	*	-CG	66	1	0
+ * 
+ *

+ * See Consensus pileup format (deprecated) for more details. + *

+ * + *

Caveat

+ *

Handling of indels is questionable at the moment. Proceed with care.

+ * + * + * @author Matt Hanna, Geraldine VdAuwera + * @since 2014 */ public class SAMPileupCodec extends AsciiFeatureCodec { - // the number of tokens we expect to parse from a pileup line - private static final int expectedTokenCount = 10; + // number of tokens expected (6 or 10 are valid, anything else is wrong) + private static final int basicTokenCount = 6; + private static final int consensusSNPTokenCount = 10; + private static final int consensusIndelTokenCount = 13; private static final char fldDelim = '\t'; - // allocate once and don't ever bother creating them again: private static final String baseA = "A"; private static final String baseC = "C"; @@ -92,74 +133,110 @@ public class SAMPileupCodec extends AsciiFeatureCodec { } public SAMPileupFeature decode(String line) { -// 0 1 2 3 4 5 6 7 -//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) -//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 - String[] tokens = new String[expectedTokenCount]; + //+1 because we want to know if we have more than the max + String[] tokens = new String[consensusIndelTokenCount+1]; // split the line - int count = ParsingUtils.split(line,tokens,fldDelim); - - // check to see if we've parsed the string into the right number of tokens (expectedTokenCount) - if (count != expectedTokenCount) - throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " + - "(expected = " + expectedTokenCount + ", saw = " + count + " on " + - "line = " + line + ")"); + final int count = ParsingUtils.split(line,tokens,fldDelim); SAMPileupFeature feature = new SAMPileupFeature(); + /** + * Tokens 0, 1, 2 are the same for both formats so they will be interpreted without differentiation. + * The 10/13-format has 4 tokens inserted after token 2 compared to the 6-format, plus 3 more tokens added at + * the end for indels. We are currently not making any use of the extra indel tokens. + * + * Any token count other than basicTokenCount, consensusSNPTokenCount or consensusIndelTokenCount is wrong. + */ + final String observedString, bases, quals; + feature.setChr(tokens[0]); feature.setStart(Integer.parseInt(tokens[1])); - if(tokens[2].length() != 1) + if(tokens[2].length() != 1) { throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line); - feature.setRef(Character.toUpperCase(tokens[2].charAt(0))); - - String observedString = tokens[3].toUpperCase(); // field 3 - feature.setFWDAlleles(new ArrayList(2)); - - feature.setConsensusConfidence(Double.parseDouble(tokens[4])); - feature.setVariantConfidence(Double.parseDouble(tokens[5])); - - if ( feature.getRef() == '*' ) { - parseIndels(observedString,feature) ; - if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1); - else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! - } else { - parseBasesAndQuals(feature,tokens[8],tokens[9]); - // if the variant is a SNP or a reference base (i.e. no variant at all) - if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter"); - feature.setRefBases(tokens[2].toUpperCase()); - feature.setEnd(feature.getStart()); - - char ch = observedString.charAt(0); - - switch ( ch ) { - case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break; - case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break; - case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break; - case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break; - case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break; - case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break; - case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break; - case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break; - case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break; - case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break; - } - if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE); - else { - // we know that at least one allele is non-ref; - // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. - // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) - feature.setVariantType(VariantType.SNP); - if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() || - feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() || - feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1)) - ) feature.setNumNonRef(1); - else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... - } } + feature.setRef(tokens[2].charAt(0)); + switch (count) { + case basicTokenCount: + bases = tokens[4]; + quals = tokens[5]; + // parsing is pretty straightforward for 6-col format + if ( feature.getRef() == '*' ) { // this indicates an indel -- but it shouldn't occur with vanilla 6-col format + throw new CodecLineParsingException("Found an indel on line = " + line + " but it shouldn't happen in simple pileup format"); + } else { + parseBasesAndQuals(feature, bases, quals); + feature.setRefBases(tokens[2].toUpperCase()); + feature.setEnd(feature.getStart()); + } + break; + case consensusSNPTokenCount: // pileup called a SNP or a reference base + observedString = tokens[3].toUpperCase(); + feature.setFWDAlleles(new ArrayList(2)); + feature.setConsensusConfidence(Double.parseDouble(tokens[4])); + feature.setVariantConfidence(Double.parseDouble(tokens[5])); + bases = tokens[8]; + quals = tokens[9]; + // confirm that we have a non-variant, not a mis-parsed indel + if ( feature.getRef() == '*' ) { + throw new CodecLineParsingException("Line parsing of " + line + " says we have a SNP or non-variant but the ref base is '*', which indicates an indel"); + } + // Parse the SNP or non-variant + parseBasesAndQuals(feature, bases, quals); + if ( observedString.length() != 1 ) { + throw new CodecLineParsingException( "Line parsing of " + line + " says we have a SNP or non-variant but the genotype token is not a single letter: " + observedString); + } + feature.setRefBases(tokens[2].toUpperCase()); + feature.setEnd(feature.getStart()); + + char ch = observedString.charAt(0); + + switch ( ch ) { // record alleles (decompose ambiguous base codes) + case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break; + case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break; + case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break; + case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break; + case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break; + case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break; + case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break; + case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break; + case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break; + case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break; + } + if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE); + else { + // we know that at least one allele is non-ref; + // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. + // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) + feature.setVariantType(VariantType.SNP); + if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() || + feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() || + feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1)) + ) feature.setNumNonRef(1); + else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... + } + break; + case consensusIndelTokenCount: + observedString = tokens[3].toUpperCase(); + feature.setFWDAlleles(new ArrayList(2)); + feature.setConsensusConfidence(Double.parseDouble(tokens[4])); + feature.setVariantConfidence(Double.parseDouble(tokens[5])); + // confirm that we have an indel, not a mis-parsed SNP or non-variant + if ( feature.getRef() != '*' ) { + throw new CodecLineParsingException("Line parsing of " + line + " says we have an indel but the ref base is not '*'"); + } + // Parse the indel + parseIndels(observedString,feature) ; + if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1); + else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this has got to be an insertion; otherwise the state is inconsistent!!!! + break; + default: + throw new CodecLineParsingException("The SAM pileup line didn't have the expected number of tokens " + + "(expected = " + basicTokenCount + " (basic pileup), " + consensusSNPTokenCount + + " (consensus pileup for a SNP or non-variant site) or " + consensusIndelTokenCount + + " (consensus pileup for an indel); saw = " + count + " on line = " + line + ")"); + } return feature; } @@ -197,7 +274,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec { else feature.setVariantType(VariantType.DELETION); feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele" break; - default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); + default: throw new CodecLineParsingException("Can not interpret observed indel allele record: "+genotype); } feature.getFWDAlleles().add(varBases); feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!! @@ -224,7 +301,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec { { //System.out.printf("%s%n%s%n", bases, quals); - // needs to convert the base string with it's . and , to the ref base + // needs to convert the base string with its . and , to the ref base StringBuilder baseBuilder = new StringBuilder(); StringBuilder qualBuilder = new StringBuilder(); boolean done = false; @@ -254,7 +331,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec { Matcher match = regex.matcher(rest); if ( ! match.matches() ) { if ( feature.getRef() != '*' ) - throw new RuntimeException("Bad pileup format: " + bases + " at position " + i); + throw new CodecLineParsingException("Bad pileup format: " + bases + " at position " + i); done = true; } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java index a6fd996fd..287363601 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/sampileup/SAMPileupFeature.java @@ -33,6 +33,10 @@ import java.util.List; /** * A tribble feature representing a SAM pileup. * + * Allows intake of both simple (6-column) or extended/consensus (10/13-column) pileups. Simple pileup features will + * contain only basic information, no observed alleles or variant/genotype inferences, and so shouldn't be used as + * input for analysis that requires that information. + * * @author mhanna * @version 0.1 */ diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java index 4d3741228..0971cb90b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/qc/CheckPileupIntegrationTest.java @@ -33,18 +33,38 @@ import java.util.Collections; /** * Run validating pileup across a set of core data as proof of the integrity of the GATK core. * - * @author mhanna - * @version 0.1 + * Tests both types of old-school pileup formats (basic and consensus). + * + * @author mhanna, vdauwera + * @version 0.2 */ public class CheckPileupIntegrationTest extends WalkerTest { + /** + * This test runs on a consensus pileup containing 10-column lines for SNPs and 13-column lines for indels + */ @Test(enabled = true) - public void testEcoliThreaded() { + public void testEcoliConsensusPileup() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T CheckPileup" + " -I " + validationDataLocation + "MV1994.selected.bam" + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + " --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" + " -S SILENT -nt 8",0, Collections.emptyList()); - executeTest("testEcoliThreaded",spec); + executeTest("testEcoliConsensusPileup",spec); + } + + /** + * This test runs on a basic pileup containing 6-column lines for all variants TODO + */ + @Test + public void testEcoliBasicPileup() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CheckPileup" + + " -I " + validationDataLocation + "MV1994.selected.bam" + + " -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" + + " --pileup:SAMPileup "+ validationDataLocation + "MV1994.basic.pileup" + + " -L Escherichia_coli_K12:1-49" + + " -S SILENT -nt 8",0, Collections.emptyList()); + executeTest("testEcoliBasicPileup",spec); } }