Merge pull request #468 from broadinstitute/gg_fixSAMPileup

Updated SAMPileup codec and pileup-related docs
This commit is contained in:
Geraldine Van der Auwera 2014-01-14 06:30:04 -08:00
commit f67c33919b
5 changed files with 318 additions and 100 deletions

View File

@ -49,20 +49,109 @@ import java.io.PrintStream;
import java.util.Arrays;
/**
* At every locus in the input set, compares the pileup data (reference base, aligned base from
* each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data
* should be specified using the command-line argument '-pileup:SAMPileup <your sam pileup file>'.
* Compare GATK's internal pileup to a reference Samtools pileup
*
* <p>At every locus in the input set, compares the pileup data (reference base, aligned base from
* each overlapping read, and quality score) generated internally by GATK to a reference pileup data generated
* by Samtools. Note that the pileup program has been replaced in Samtools by mpileup, which produces a slightly
* different output format by default.
* </p>
*
* <h3>Format</h3>
* <p>There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
* 10-column "consensus" format which could be obtained by using the -c argument, now deprecated.</p>
* <h4>Simple pileup: 6-column format</h4>
* <p>
* Each line consists of chromosome, 1-based coordinate, reference base, the
* number of reads covering the site, read bases and base qualities. At the
* read base column, a dot stands for a match to the reference base on the
* forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch
* on the forward strand and `acgtn' for a mismatch on the reverse strand.
* A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between
* this reference position and the next reference position. The length of the
* insertion is given by the integer in the pattern, followed by the inserted sequence.
* </p>
* <pre>
* seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
* seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
* seq1 274 T 23 ,.$....,,.,.,...,,,.,... 7<7;<;<<<<<<<<<=<;<;<<6
* seq1 275 A 23 ,$....,,.,.,...,,,.,...^l. <+;9*<<<<<<<<<=<<:;<<<<
* seq1 276 G 22 ...T,,.,.,...,,,.,.... 33;+<<7=7<<7<&<<1;<<6<
* seq1 277 T 22 ....,,.,.,.C.,,,.,..G. +7<;<<<<<<<&<=<<:;<<&<
* seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
* seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
* </pre>
* <p>
* See the <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format documentation</a> for more details.
* </p>
*
* <h4>Consensus pileup: 10/13-column format</h4>
* <p>The "consensus" or extended pileup consists of the following:
* <ul>
* <li>original 6 columns as described above</li>
* <li>4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
* reads covering the sites) for all sites, inserted before the bases and quality strings</li>
* <li>3 extra columns indicating counts of reads supporting indels (just for indel sites)</li>
* </ul>
* </p>
* <h4>Example of consensus pileup for SNP or non-variant sites</h4>
* <pre>
* seq1 60 T T 66 0 99 13 ...........^~.^~. 9<<55<;<<<<<<
* seq1 61 G G 72 0 99 15 .............^~.^y. (;975&;<<<<<<<<
* seq1 62 T T 72 0 99 15 .$.............. <;;,55;<<<<<<<<
* seq1 63 G G 72 0 99 15 .$.............^~. 4;2;<7:+<<<<<<<
* seq1 64 G G 69 0 99 14 .............. 9+5<;;;<<<<<<<
* seq1 65 A A 69 0 99 14 .$............. <5-2<;;<<<<<<;
* seq1 66 C C 66 0 99 13 ............. &*<;;<<<<<<8<
* seq1 67 C C 69 0 99 14 .............^~. ,75<.4<<<<<-<<
* seq1 68 C C 69 0 99 14 .............. 576<;7<<<<<8<< *
* </pre>
*
* <h4>Example of consensus pileup for indels</h4>
* <pre>
* Escherichia_coli_K12 3995037 * *\/* 430 0 37 144 * +A 143 1 0
* Escherichia_coli_K12 3995279 * *\/* 202 0 36 68 * +A 67 1 0
* Escherichia_coli_K12 3995281 * *\/* 239 0 36 67 * -CG 66 1 0
* </pre>
* <p>
* See <a href="http://samtools.sourceforge.net/cns0.shtml/">Consensus pileup format (deprecated)</a> for more details.
* </p>
*
* <h3>Input</h3>
* <p>A BAM file conatining your aligned sequence data and a pileup file generated by Samtools covering the region you
* want to examine.</p>
*
* <h3>Output</h3>
* <p>A text file listing mismatches between the input pileup and the GATK's internal pileup. If there are no mismatches, the output file is empty.</p>
*
* <h3>Example</h3>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -T CheckPileup \
* -R ref.fasta \
* -I your_data.bam \
* --pileup:SAMPileup pileup_file.txt \
* -L chr1:257-275 \
* -o output_file_name
* </pre>
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@Requires(value={DataSource.READS,DataSource.REFERENCE})
public class CheckPileup extends LocusWalker<Integer, CheckPileupStats> implements TreeReducible<CheckPileupStats> {
@Input(fullName = "pileup", doc="The SAMPileup containing the expected output", required = true)
/**
* This is the existing pileup against which we'll compare GATK's internal pileup at each genome position in the desired interval.
*/
@Input(fullName = "pileup", shortName = "pileup", doc="Pileup generated by Samtools", required = true)
RodBinding<SAMPileupFeature> pileup;
@Output
private PrintStream out;
@Argument(fullName="continue_after_error",doc="Continue after an error",required=false)
/**
* By default the program will quit if it encounters an error (such as missing truth data for a given position).
* Use this flag to override the default behavior; the program will then simply print an error message and move on
* to the next position.
*/
@Argument(fullName="continue_after_error",doc="Continue after encountering an error",required=false)
public boolean CONTINUE_AFTER_AN_ERROR = false;
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
@ -72,7 +161,7 @@ public class CheckPileup extends LocusWalker<Integer, CheckPileupStats> implemen
if ( truePileup == null ) {
out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar()));
if ( ! CONTINUE_AFTER_AN_ERROR ) {
throw new UserException.CommandLineException(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
throw new UserException.BadInput(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
context.getLocation(), new String(pileup.getBases())));
}
} else {
@ -80,7 +169,7 @@ public class CheckPileup extends LocusWalker<Integer, CheckPileupStats> implemen
if ( pileupDiff != null ) {
out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString());
if ( ! CONTINUE_AFTER_AN_ERROR ) {
throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff));
throw new UserException.BadInput(String.format("The input pileup doesn't match the GATK's internal pileup: %s", pileupDiff));
}
}
}

View File

@ -48,11 +48,17 @@ import java.util.List;
/**
* Emulates the samtools pileup command to print aligned reads
*
* <p>Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position,
* consisting of chromosome name, coordinate, reference base, read bases, and read qualities.
* <p>Prints the alignment in something similar to the Samtools pileup format (see the
* <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format documentation</a> for more details about
* the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference
* base, read bases, and read qualities. In addition to these default fields, additional information can be added to
* the output as extra columns; see options detailed below.</p>
*
* Emulated command:
* samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] <in.alignment>
* <h4>Emulated command:</h4>
* <pre>
* samtools pileup -f in.ref.fasta -l in.site_list input.bam
* </pre>
*
* <h3>Input</h3>
* <p>
@ -61,17 +67,32 @@ import java.util.List;
*
* <h3>Output</h3>
* <p>
* Formatted pileup-style alignment of reads.
* Alignment of reads formatted in the Pileup style.
* </p>
*
* <h3>Example</h3>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -T Pileup \
* -R ref.fasta \
* -I aligned_reads.bam \
* -R exampleFASTA.fasta \
* -I exampleBAM.bam \
* -L chr1:257-267
* -o output.txt
* </pre>
* <h4>Expected output</h4>
* <pre>
* chr1 257 A CAA '&=
* chr1 258 C TCC A:=
* chr1 259 C CCC )A=
* chr1 260 C ACC (=<
* chr1 261 T TCT '44
* chr1 262 A AAA '?:
* chr1 263 A AGA 1'6
* chr1 264 C TCC 987
* chr1 265 C CCC (@(
* chr1 266 C GCC ''=
* chr1 267 T AAT 7%>
* </pre>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
@ -83,18 +104,25 @@ public class Pileup extends LocusWalker<String, Integer> implements TreeReducibl
PrintStream out;
/**
* In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions,
* In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions,
* and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per
* read items are delimited with an '@' character.
*/
@Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false)
public boolean SHOW_VERBOSE = false;
@Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
/**
* This enables annotating the pileup to show overlaps with metadata from a ROD file.
* For example, if you provide a VCF and there is a SNP at a given location covered by the pileup, the pileup
* output at that position will be annotated with the corresponding source ROD identifier.
*/
@Input(fullName="metadata",shortName="metadata",doc="ROD file containing metadata", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList();
/**
* Adds the length of the insert each base comes from to the output pileup. Here, "insert" refers to the DNA insert
* produced during library generation before sequencing.
*/
@Hidden
@Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Add a column which contains the length of the insert each base comes from.",required=false)
@Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Output insert length",required=false)
public boolean outputInsertLength=false;
@Override

View File

@ -37,13 +37,21 @@ import java.util.regex.Pattern;
import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType;
/**
* Decoder for SAM pileup data. For GATK validation purposes only
* Decoder for SAM pileup data.
*
* <p>
* Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute.
* It desribes the base-pair information at each chromosomal position. This format
* facilitates SNP/indel calling and brief alignment viewing by eyes.
* From the <a href="http://samtools.sourceforge.net/">SAMTools project documentation</a>:
* </p>
* <p>The Pileup format was first used by Tony Cox and Zemin Ning at
* the Sanger Institute. It describes the base-pair information at each chromosomal position. This format
* facilitates SNP/indel calling and brief alignment viewing by eye. Note that the pileup program has been replaced
* in Samtools by mpileup, which produces a slightly different output format by default.
* </p>
* <h3>Format</h3>
* <p>There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
* 10/13-column "consensus" format which could be obtained by using the -c argument, now deprecated. </p>
* <h4>Simple pileup: 6-column format</h4>
* <p>
* Each line consists of chromosome, 1-based coordinate, reference base, the
* number of reads covering the site, read bases and base qualities. At the
@ -54,13 +62,6 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
* this reference position and the next reference position. The length of the
* insertion is given by the integer in the pattern, followed by the inserted sequence.
* </p>
*
* <p>
* <br>See also: @see <a href="http://samtools.sourceforge.net/">SAMTools project</a></br>
* <br>See also: @see <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format</a></br>
* </p>
*
* <h2>File format example</h2>
* <pre>
* seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
* seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
@ -71,15 +72,55 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
* seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
* seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
* </pre>
* <p>
* See the <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format documentation</a> for more details.
* </p>
*
* @author Matt Hanna
* @since 2009
* <h4>Consensus pileup: 10/13-column format</h4>
* <p>The "consensus" or extended pileup consists of the following:
* <ul>
* <li>original 6 columns as described above</li>
* <li>4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
* reads covering the sites) for all sites, inserted before the bases and quality strings</li>
* <li>3 extra columns indicating counts of reads supporting indels (just for indel sites)</li>
* </ul>
* </p>
* <h4>Example of consensus pileup for SNP or non-variant sites</h4>
* <pre>
* seq1 60 T T 66 0 99 13 ...........^~.^~. 9<<55<;<<<<<<
* seq1 61 G G 72 0 99 15 .............^~.^y. (;975&;<<<<<<<<
* seq1 62 T T 72 0 99 15 .$.............. <;;,55;<<<<<<<<
* seq1 63 G G 72 0 99 15 .$.............^~. 4;2;<7:+<<<<<<<
* seq1 64 G G 69 0 99 14 .............. 9+5<;;;<<<<<<<
* seq1 65 A A 69 0 99 14 .$............. <5-2<;;<<<<<<;
* seq1 66 C C 66 0 99 13 ............. &*<;;<<<<<<8<
* seq1 67 C C 69 0 99 14 .............^~. ,75<.4<<<<<-<<
* seq1 68 C C 69 0 99 14 .............. 576<;7<<<<<8<< *
* </pre>
*
* <h4>Example of consensus pileup for indels</h4>
* <pre>
* Escherichia_coli_K12 3995037 * *\/* 430 0 37 144 * +A 143 1 0
* Escherichia_coli_K12 3995279 * *\/* 202 0 36 68 * +A 67 1 0
* Escherichia_coli_K12 3995281 * *\/* 239 0 36 67 * -CG 66 1 0
* </pre>
* <p>
* See <a href="http://samtools.sourceforge.net/cns0.shtml/">Consensus pileup format (deprecated)</a> for more details.
* </p>
*
* <h3>Caveat</h3>
* <p>Handling of indels is questionable at the moment. Proceed with care.</p>
*
*
* @author Matt Hanna, Geraldine VdAuwera
* @since 2014
*/
public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
// the number of tokens we expect to parse from a pileup line
private static final int expectedTokenCount = 10;
// number of tokens expected (6 or 10 are valid, anything else is wrong)
private static final int basicTokenCount = 6;
private static final int consensusSNPTokenCount = 10;
private static final int consensusIndelTokenCount = 13;
private static final char fldDelim = '\t';
// allocate once and don't ever bother creating them again:
private static final String baseA = "A";
private static final String baseC = "C";
@ -92,74 +133,110 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
}
public SAMPileupFeature decode(String line) {
// 0 1 2 3 4 5 6 7
//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
String[] tokens = new String[expectedTokenCount];
//+1 because we want to know if we have more than the max
String[] tokens = new String[consensusIndelTokenCount+1];
// split the line
int count = ParsingUtils.split(line,tokens,fldDelim);
// check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
if (count != expectedTokenCount)
throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " +
"(expected = " + expectedTokenCount + ", saw = " + count + " on " +
"line = " + line + ")");
final int count = ParsingUtils.split(line,tokens,fldDelim);
SAMPileupFeature feature = new SAMPileupFeature();
/**
* Tokens 0, 1, 2 are the same for both formats so they will be interpreted without differentiation.
* The 10/13-format has 4 tokens inserted after token 2 compared to the 6-format, plus 3 more tokens added at
* the end for indels. We are currently not making any use of the extra indel tokens.
*
* Any token count other than basicTokenCount, consensusSNPTokenCount or consensusIndelTokenCount is wrong.
*/
final String observedString, bases, quals;
feature.setChr(tokens[0]);
feature.setStart(Integer.parseInt(tokens[1]));
if(tokens[2].length() != 1)
if(tokens[2].length() != 1) {
throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line);
feature.setRef(Character.toUpperCase(tokens[2].charAt(0)));
String observedString = tokens[3].toUpperCase(); // field 3
feature.setFWDAlleles(new ArrayList<String>(2));
feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
feature.setVariantConfidence(Double.parseDouble(tokens[5]));
if ( feature.getRef() == '*' ) {
parseIndels(observedString,feature) ;
if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
} else {
parseBasesAndQuals(feature,tokens[8],tokens[9]);
// if the variant is a SNP or a reference base (i.e. no variant at all)
if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
feature.setRefBases(tokens[2].toUpperCase());
feature.setEnd(feature.getStart());
char ch = observedString.charAt(0);
switch ( ch ) {
case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
}
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
else {
// we know that at least one allele is non-ref;
// if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
// homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
feature.setVariantType(VariantType.SNP);
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
) feature.setNumNonRef(1);
else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
}
}
feature.setRef(tokens[2].charAt(0));
switch (count) {
case basicTokenCount:
bases = tokens[4];
quals = tokens[5];
// parsing is pretty straightforward for 6-col format
if ( feature.getRef() == '*' ) { // this indicates an indel -- but it shouldn't occur with vanilla 6-col format
throw new CodecLineParsingException("Found an indel on line = " + line + " but it shouldn't happen in simple pileup format");
} else {
parseBasesAndQuals(feature, bases, quals);
feature.setRefBases(tokens[2].toUpperCase());
feature.setEnd(feature.getStart());
}
break;
case consensusSNPTokenCount: // pileup called a SNP or a reference base
observedString = tokens[3].toUpperCase();
feature.setFWDAlleles(new ArrayList<String>(2));
feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
feature.setVariantConfidence(Double.parseDouble(tokens[5]));
bases = tokens[8];
quals = tokens[9];
// confirm that we have a non-variant, not a mis-parsed indel
if ( feature.getRef() == '*' ) {
throw new CodecLineParsingException("Line parsing of " + line + " says we have a SNP or non-variant but the ref base is '*', which indicates an indel");
}
// Parse the SNP or non-variant
parseBasesAndQuals(feature, bases, quals);
if ( observedString.length() != 1 ) {
throw new CodecLineParsingException( "Line parsing of " + line + " says we have a SNP or non-variant but the genotype token is not a single letter: " + observedString);
}
feature.setRefBases(tokens[2].toUpperCase());
feature.setEnd(feature.getStart());
char ch = observedString.charAt(0);
switch ( ch ) { // record alleles (decompose ambiguous base codes)
case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
}
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
else {
// we know that at least one allele is non-ref;
// if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
// homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
feature.setVariantType(VariantType.SNP);
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
) feature.setNumNonRef(1);
else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
}
break;
case consensusIndelTokenCount:
observedString = tokens[3].toUpperCase();
feature.setFWDAlleles(new ArrayList<String>(2));
feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
feature.setVariantConfidence(Double.parseDouble(tokens[5]));
// confirm that we have an indel, not a mis-parsed SNP or non-variant
if ( feature.getRef() != '*' ) {
throw new CodecLineParsingException("Line parsing of " + line + " says we have an indel but the ref base is not '*'");
}
// Parse the indel
parseIndels(observedString,feature) ;
if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this has got to be an insertion; otherwise the state is inconsistent!!!!
break;
default:
throw new CodecLineParsingException("The SAM pileup line didn't have the expected number of tokens " +
"(expected = " + basicTokenCount + " (basic pileup), " + consensusSNPTokenCount +
" (consensus pileup for a SNP or non-variant site) or " + consensusIndelTokenCount +
" (consensus pileup for an indel); saw = " + count + " on line = " + line + ")");
}
return feature;
}
@ -197,7 +274,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
else feature.setVariantType(VariantType.DELETION);
feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele"
break;
default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
default: throw new CodecLineParsingException("Can not interpret observed indel allele record: "+genotype);
}
feature.getFWDAlleles().add(varBases);
feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!!
@ -224,7 +301,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
{
//System.out.printf("%s%n%s%n", bases, quals);
// needs to convert the base string with it's . and , to the ref base
// needs to convert the base string with its . and , to the ref base
StringBuilder baseBuilder = new StringBuilder();
StringBuilder qualBuilder = new StringBuilder();
boolean done = false;
@ -254,7 +331,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
Matcher match = regex.matcher(rest);
if ( ! match.matches() ) {
if ( feature.getRef() != '*' )
throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
throw new CodecLineParsingException("Bad pileup format: " + bases + " at position " + i);
done = true;
}
else {

View File

@ -33,6 +33,10 @@ import java.util.List;
/**
* A tribble feature representing a SAM pileup.
*
* Allows intake of both simple (6-column) or extended/consensus (10/13-column) pileups. Simple pileup features will
* contain only basic information, no observed alleles or variant/genotype inferences, and so shouldn't be used as
* input for analysis that requires that information.
*
* @author mhanna
* @version 0.1
*/

View File

@ -33,18 +33,38 @@ import java.util.Collections;
/**
* Run validating pileup across a set of core data as proof of the integrity of the GATK core.
*
* @author mhanna
* @version 0.1
* Tests both types of old-school pileup formats (basic and consensus).
*
* @author mhanna, vdauwera
* @version 0.2
*/
public class CheckPileupIntegrationTest extends WalkerTest {
/**
* This test runs on a consensus pileup containing 10-column lines for SNPs and 13-column lines for indels
*/
@Test(enabled = true)
public void testEcoliThreaded() {
public void testEcoliConsensusPileup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CheckPileup" +
" -I " + validationDataLocation + "MV1994.selected.bam" +
" -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
" --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" +
" -S SILENT -nt 8",0, Collections.<String>emptyList());
executeTest("testEcoliThreaded",spec);
executeTest("testEcoliConsensusPileup",spec);
}
/**
* This test runs on a basic pileup containing 6-column lines for all variants TODO
*/
@Test
public void testEcoliBasicPileup() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T CheckPileup" +
" -I " + validationDataLocation + "MV1994.selected.bam" +
" -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
" --pileup:SAMPileup "+ validationDataLocation + "MV1994.basic.pileup" +
" -L Escherichia_coli_K12:1-49" +
" -S SILENT -nt 8",0, Collections.<String>emptyList());
executeTest("testEcoliBasicPileup",spec);
}
}