Merge pull request #468 from broadinstitute/gg_fixSAMPileup
Updated SAMPileup codec and pileup-related docs
This commit is contained in:
commit
f67c33919b
|
|
@ -49,20 +49,109 @@ import java.io.PrintStream;
|
|||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* At every locus in the input set, compares the pileup data (reference base, aligned base from
|
||||
* each overlapping read, and quality score) to the reference pileup data generated by samtools. Samtools' pileup data
|
||||
* should be specified using the command-line argument '-pileup:SAMPileup <your sam pileup file>'.
|
||||
* Compare GATK's internal pileup to a reference Samtools pileup
|
||||
*
|
||||
* <p>At every locus in the input set, compares the pileup data (reference base, aligned base from
|
||||
* each overlapping read, and quality score) generated internally by GATK to a reference pileup data generated
|
||||
* by Samtools. Note that the pileup program has been replaced in Samtools by mpileup, which produces a slightly
|
||||
* different output format by default.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Format</h3>
|
||||
* <p>There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
|
||||
* 10-column "consensus" format which could be obtained by using the -c argument, now deprecated.</p>
|
||||
* <h4>Simple pileup: 6-column format</h4>
|
||||
* <p>
|
||||
* Each line consists of chromosome, 1-based coordinate, reference base, the
|
||||
* number of reads covering the site, read bases and base qualities. At the
|
||||
* read base column, a dot stands for a match to the reference base on the
|
||||
* forward strand, a comma for a match on the reverse strand, `ACGTN' for a mismatch
|
||||
* on the forward strand and `acgtn' for a mismatch on the reverse strand.
|
||||
* A pattern `\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between
|
||||
* this reference position and the next reference position. The length of the
|
||||
* insertion is given by the integer in the pattern, followed by the inserted sequence.
|
||||
* </p>
|
||||
* <pre>
|
||||
* seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
|
||||
* seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
|
||||
* seq1 274 T 23 ,.$....,,.,.,...,,,.,... 7<7;<;<<<<<<<<<=<;<;<<6
|
||||
* seq1 275 A 23 ,$....,,.,.,...,,,.,...^l. <+;9*<<<<<<<<<=<<:;<<<<
|
||||
* seq1 276 G 22 ...T,,.,.,...,,,.,.... 33;+<<7=7<<7<&<<1;<<6<
|
||||
* seq1 277 T 22 ....,,.,.,.C.,,,.,..G. +7<;<<<<<<<&<=<<:;<<&<
|
||||
* seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
|
||||
* seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
|
||||
* </pre>
|
||||
* <p>
|
||||
* See the <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format documentation</a> for more details.
|
||||
* </p>
|
||||
*
|
||||
* <h4>Consensus pileup: 10/13-column format</h4>
|
||||
* <p>The "consensus" or extended pileup consists of the following:
|
||||
* <ul>
|
||||
* <li>original 6 columns as described above</li>
|
||||
* <li>4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
|
||||
* reads covering the sites) for all sites, inserted before the bases and quality strings</li>
|
||||
* <li>3 extra columns indicating counts of reads supporting indels (just for indel sites)</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* <h4>Example of consensus pileup for SNP or non-variant sites</h4>
|
||||
* <pre>
|
||||
* seq1 60 T T 66 0 99 13 ...........^~.^~. 9<<55<;<<<<<<
|
||||
* seq1 61 G G 72 0 99 15 .............^~.^y. (;975&;<<<<<<<<
|
||||
* seq1 62 T T 72 0 99 15 .$.............. <;;,55;<<<<<<<<
|
||||
* seq1 63 G G 72 0 99 15 .$.............^~. 4;2;<7:+<<<<<<<
|
||||
* seq1 64 G G 69 0 99 14 .............. 9+5<;;;<<<<<<<
|
||||
* seq1 65 A A 69 0 99 14 .$............. <5-2<;;<<<<<<;
|
||||
* seq1 66 C C 66 0 99 13 ............. &*<;;<<<<<<8<
|
||||
* seq1 67 C C 69 0 99 14 .............^~. ,75<.4<<<<<-<<
|
||||
* seq1 68 C C 69 0 99 14 .............. 576<;7<<<<<8<< *
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Example of consensus pileup for indels</h4>
|
||||
* <pre>
|
||||
* Escherichia_coli_K12 3995037 * *\/* 430 0 37 144 * +A 143 1 0
|
||||
* Escherichia_coli_K12 3995279 * *\/* 202 0 36 68 * +A 67 1 0
|
||||
* Escherichia_coli_K12 3995281 * *\/* 239 0 36 67 * -CG 66 1 0
|
||||
* </pre>
|
||||
* <p>
|
||||
* See <a href="http://samtools.sourceforge.net/cns0.shtml/">Consensus pileup format (deprecated)</a> for more details.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
 * <p>A BAM file containing your aligned sequence data and a pileup file generated by Samtools covering the region you
|
||||
* want to examine.</p>
|
||||
*
|
||||
* <h3>Output</h3>
|
||||
* <p>A text file listing mismatches between the input pileup and the GATK's internal pileup. If there are no mismatches, the output file is empty.</p>
|
||||
*
|
||||
* <h3>Example</h3>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T CheckPileup \
|
||||
* -R ref.fasta \
|
||||
* -I your_data.bam \
|
||||
* --pileup:SAMPileup pileup_file.txt \
|
||||
* -L chr1:257-275 \
|
||||
* -o output_file_name
|
||||
* </pre>
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
|
||||
@Requires(value={DataSource.READS,DataSource.REFERENCE})
|
||||
public class CheckPileup extends LocusWalker<Integer, CheckPileupStats> implements TreeReducible<CheckPileupStats> {
|
||||
@Input(fullName = "pileup", doc="The SAMPileup containing the expected output", required = true)
|
||||
/**
|
||||
* This is the existing pileup against which we'll compare GATK's internal pileup at each genome position in the desired interval.
|
||||
*/
|
||||
@Input(fullName = "pileup", shortName = "pileup", doc="Pileup generated by Samtools", required = true)
|
||||
RodBinding<SAMPileupFeature> pileup;
|
||||
|
||||
@Output
|
||||
private PrintStream out;
|
||||
|
||||
@Argument(fullName="continue_after_error",doc="Continue after an error",required=false)
|
||||
/**
|
||||
* By default the program will quit if it encounters an error (such as missing truth data for a given position).
|
||||
* Use this flag to override the default behavior; the program will then simply print an error message and move on
|
||||
* to the next position.
|
||||
*/
|
||||
@Argument(fullName="continue_after_error",doc="Continue after encountering an error",required=false)
|
||||
public boolean CONTINUE_AFTER_AN_ERROR = false;
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
|
|
@ -72,7 +161,7 @@ public class CheckPileup extends LocusWalker<Integer, CheckPileupStats> implemen
|
|||
if ( truePileup == null ) {
|
||||
out.printf("No truth pileup data available at %s%n", pileup.getPileupString(ref.getBaseAsChar()));
|
||||
if ( ! CONTINUE_AFTER_AN_ERROR ) {
|
||||
throw new UserException.CommandLineException(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
|
||||
throw new UserException.BadInput(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
|
||||
context.getLocation(), new String(pileup.getBases())));
|
||||
}
|
||||
} else {
|
||||
|
|
@ -80,7 +169,7 @@ public class CheckPileup extends LocusWalker<Integer, CheckPileupStats> implemen
|
|||
if ( pileupDiff != null ) {
|
||||
out.printf("%s vs. %s%n", pileup.getPileupString(ref.getBaseAsChar()), truePileup.getPileupString());
|
||||
if ( ! CONTINUE_AFTER_AN_ERROR ) {
|
||||
throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff));
|
||||
throw new UserException.BadInput(String.format("The input pileup doesn't match the GATK's internal pileup: %s", pileupDiff));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,11 +48,17 @@ import java.util.List;
|
|||
/**
|
||||
* Emulates the samtools pileup command to print aligned reads
|
||||
*
|
||||
* <p>Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position,
|
||||
* consisting of chromosome name, coordinate, reference base, read bases, and read qualities.
|
||||
* <p>Prints the alignment in something similar to the Samtools pileup format (see the
|
||||
* <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format documentation</a> for more details about
|
||||
* the original format). There is one line per genomic position, listing the chromosome name, coordinate, reference
|
||||
* base, read bases, and read qualities. In addition to these default fields, additional information can be added to
|
||||
* the output as extra columns; see options detailed below.</p>
|
||||
*
|
||||
* Emulated command:
|
||||
* samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] <in.alignment>
|
||||
* <h4>Emulated command:</h4>
|
||||
* <pre>
|
||||
* samtools pileup -f in.ref.fasta -l in.site_list input.bam
|
||||
* </pre>
|
||||
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
|
|
@ -61,17 +67,32 @@ import java.util.List;
|
|||
*
|
||||
* <h3>Output</h3>
|
||||
* <p>
|
||||
* Formatted pileup-style alignment of reads.
|
||||
* Alignment of reads formatted in the Pileup style.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Example</h3>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -T Pileup \
|
||||
* -R ref.fasta \
|
||||
* -I aligned_reads.bam \
|
||||
* -R exampleFASTA.fasta \
|
||||
* -I exampleBAM.bam \
|
||||
* -L chr1:257-267
|
||||
* -o output.txt
|
||||
* </pre>
|
||||
* <h4>Expected output</h4>
|
||||
* <pre>
|
||||
* chr1 257 A CAA '&=
|
||||
* chr1 258 C TCC A:=
|
||||
* chr1 259 C CCC )A=
|
||||
* chr1 260 C ACC (=<
|
||||
* chr1 261 T TCT '44
|
||||
* chr1 262 A AAA '?:
|
||||
* chr1 263 A AGA 1'6
|
||||
* chr1 264 C TCC 987
|
||||
* chr1 265 C CCC (@(
|
||||
* chr1 266 C GCC ''=
|
||||
* chr1 267 T AAT 7%>
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_QC, extraDocs = {CommandLineGATK.class} )
|
||||
|
|
@ -83,18 +104,25 @@ public class Pileup extends LocusWalker<String, Integer> implements TreeReducibl
|
|||
PrintStream out;
|
||||
|
||||
/**
|
||||
* In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions,
|
||||
* In addition to the standard pileup output, adds 'verbose' output too. The verbose output contains the number of spanning deletions,
|
||||
* and for each read in the pileup it has the read name, offset in the base string, read length, and read mapping quality. These per
|
||||
* read items are delimited with an '@' character.
|
||||
*/
|
||||
@Argument(fullName="showVerbose",shortName="verbose",doc="Add an extra verbose section to the pileup output", required=false)
|
||||
public boolean SHOW_VERBOSE = false;
|
||||
|
||||
@Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false)
|
||||
/**
|
||||
* This enables annotating the pileup to show overlaps with metadata from a ROD file.
|
||||
* For example, if you provide a VCF and there is a SNP at a given location covered by the pileup, the pileup
|
||||
* output at that position will be annotated with the corresponding source ROD identifier.
|
||||
*/
|
||||
@Input(fullName="metadata",shortName="metadata",doc="ROD file containing metadata", required=false)
|
||||
public List<RodBinding<Feature>> rods = Collections.emptyList();
|
||||
|
||||
/**
|
||||
* Adds the length of the insert each base comes from to the output pileup. Here, "insert" refers to the DNA insert
|
||||
* produced during library generation before sequencing.
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Add a column which contains the length of the insert each base comes from.",required=false)
|
||||
@Argument(fullName="outputInsertLength",shortName = "outputInsertLength",doc="Output insert length",required=false)
|
||||
public boolean outputInsertLength=false;
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -37,13 +37,21 @@ import java.util.regex.Pattern;
|
|||
import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.VariantType;
|
||||
|
||||
/**
|
||||
* Decoder for SAM pileup data. For GATK validation purposes only
|
||||
* Decoder for SAM pileup data.
|
||||
*
|
||||
* <p>
|
||||
* Pileup format is first used by Tony Cox and Zemin Ning at the Sanger Institute.
|
||||
 * It describes the base-pair information at each chromosomal position. This format
|
||||
* facilitates SNP/indel calling and brief alignment viewing by eyes.
|
||||
* From the <a href="http://samtools.sourceforge.net/">SAMTools project documentation</a>:
|
||||
* </p>
|
||||
* <p>The Pileup format was first used by Tony Cox and Zemin Ning at
|
||||
* the Sanger Institute. It describes the base-pair information at each chromosomal position. This format
|
||||
* facilitates SNP/indel calling and brief alignment viewing by eye. Note that the pileup program has been replaced
|
||||
* in Samtools by mpileup, which produces a slightly different output format by default.
|
||||
* </p>
|
||||
|
||||
* <h3>Format</h3>
|
||||
* <p>There are two versions of the original pileup format: the current 6-column format produced by Samtools, and the old
|
||||
* 10/13-column "consensus" format which could be obtained by using the -c argument, now deprecated. </p>
|
||||
* <h4>Simple pileup: 6-column format</h4>
|
||||
* <p>
|
||||
* Each line consists of chromosome, 1-based coordinate, reference base, the
|
||||
* number of reads covering the site, read bases and base qualities. At the
|
||||
|
|
@ -54,13 +62,6 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
|
|||
* this reference position and the next reference position. The length of the
|
||||
* insertion is given by the integer in the pattern, followed by the inserted sequence.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* <br>See also: @see <a href="http://samtools.sourceforge.net/">SAMTools project</a></br>
|
||||
* <br>See also: @see <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format</a></br>
|
||||
* </p>
|
||||
*
|
||||
* <h2>File format example</h2>
|
||||
* <pre>
|
||||
* seq1 272 T 24 ,.$.....,,.,.,...,,,.,..^+. <<<+;<<<<<<<<<<<=<;<;7<&
|
||||
* seq1 273 T 23 ,.....,,.,.,...,,,.,..A <<<;<<<<<<<<<3<=<<<;<<+
|
||||
|
|
@ -71,15 +72,55 @@ import static org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature.V
|
|||
* seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
|
||||
* seq1 279 C 23 A..T,,.,.,...,,,.,..... ;75&<<<<<<<<<=<<<9<<:<<
|
||||
* </pre>
|
||||
* <p>
|
||||
* See the <a href="http://samtools.sourceforge.net/pileup.shtml">Pileup format documentation</a> for more details.
|
||||
* </p>
|
||||
*
|
||||
* @author Matt Hanna
|
||||
* @since 2009
|
||||
* <h4>Consensus pileup: 10/13-column format</h4>
|
||||
* <p>The "consensus" or extended pileup consists of the following:
|
||||
* <ul>
|
||||
* <li>original 6 columns as described above</li>
|
||||
* <li>4 extra columns representing consensus values (consensus base, consensus quality, variant quality and maximum mapping quality of the
|
||||
* reads covering the sites) for all sites, inserted before the bases and quality strings</li>
|
||||
* <li>3 extra columns indicating counts of reads supporting indels (just for indel sites)</li>
|
||||
* </ul>
|
||||
* </p>
|
||||
* <h4>Example of consensus pileup for SNP or non-variant sites</h4>
|
||||
* <pre>
|
||||
* seq1 60 T T 66 0 99 13 ...........^~.^~. 9<<55<;<<<<<<
|
||||
* seq1 61 G G 72 0 99 15 .............^~.^y. (;975&;<<<<<<<<
|
||||
* seq1 62 T T 72 0 99 15 .$.............. <;;,55;<<<<<<<<
|
||||
* seq1 63 G G 72 0 99 15 .$.............^~. 4;2;<7:+<<<<<<<
|
||||
* seq1 64 G G 69 0 99 14 .............. 9+5<;;;<<<<<<<
|
||||
* seq1 65 A A 69 0 99 14 .$............. <5-2<;;<<<<<<;
|
||||
* seq1 66 C C 66 0 99 13 ............. &*<;;<<<<<<8<
|
||||
* seq1 67 C C 69 0 99 14 .............^~. ,75<.4<<<<<-<<
|
||||
* seq1 68 C C 69 0 99 14 .............. 576<;7<<<<<8<< *
|
||||
* </pre>
|
||||
*
|
||||
* <h4>Example of consensus pileup for indels</h4>
|
||||
* <pre>
|
||||
* Escherichia_coli_K12 3995037 * *\/* 430 0 37 144 * +A 143 1 0
|
||||
* Escherichia_coli_K12 3995279 * *\/* 202 0 36 68 * +A 67 1 0
|
||||
* Escherichia_coli_K12 3995281 * *\/* 239 0 36 67 * -CG 66 1 0
|
||||
* </pre>
|
||||
* <p>
|
||||
* See <a href="http://samtools.sourceforge.net/cns0.shtml/">Consensus pileup format (deprecated)</a> for more details.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Caveat</h3>
|
||||
* <p>Handling of indels is questionable at the moment. Proceed with care.</p>
|
||||
*
|
||||
*
|
||||
* @author Matt Hanna, Geraldine VdAuwera
|
||||
* @since 2014
|
||||
*/
|
||||
public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
|
||||
// the number of tokens we expect to parse from a pileup line
|
||||
private static final int expectedTokenCount = 10;
|
||||
// number of tokens expected (6 or 10 are valid, anything else is wrong)
|
||||
private static final int basicTokenCount = 6;
|
||||
private static final int consensusSNPTokenCount = 10;
|
||||
private static final int consensusIndelTokenCount = 13;
|
||||
private static final char fldDelim = '\t';
|
||||
|
||||
// allocate once and don't ever bother creating them again:
|
||||
private static final String baseA = "A";
|
||||
private static final String baseC = "C";
|
||||
|
|
@ -92,74 +133,110 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
|
|||
}
|
||||
|
||||
public SAMPileupFeature decode(String line) {
|
||||
// 0 1 2 3 4 5 6 7
|
||||
//* chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
|
||||
//* chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
|
||||
String[] tokens = new String[expectedTokenCount];
|
||||
//+1 because we want to know if we have more than the max
|
||||
String[] tokens = new String[consensusIndelTokenCount+1];
|
||||
|
||||
// split the line
|
||||
int count = ParsingUtils.split(line,tokens,fldDelim);
|
||||
|
||||
// check to see if we've parsed the string into the right number of tokens (expectedTokenCount)
|
||||
if (count != expectedTokenCount)
|
||||
throw new CodecLineParsingException("the SAM pileup line didn't have the expected number of tokens " +
|
||||
"(expected = " + expectedTokenCount + ", saw = " + count + " on " +
|
||||
"line = " + line + ")");
|
||||
final int count = ParsingUtils.split(line,tokens,fldDelim);
|
||||
|
||||
SAMPileupFeature feature = new SAMPileupFeature();
|
||||
|
||||
/**
|
||||
* Tokens 0, 1, 2 are the same for both formats so they will be interpreted without differentiation.
|
||||
* The 10/13-format has 4 tokens inserted after token 2 compared to the 6-format, plus 3 more tokens added at
|
||||
* the end for indels. We are currently not making any use of the extra indel tokens.
|
||||
*
|
||||
* Any token count other than basicTokenCount, consensusSNPTokenCount or consensusIndelTokenCount is wrong.
|
||||
*/
|
||||
final String observedString, bases, quals;
|
||||
|
||||
feature.setChr(tokens[0]);
|
||||
feature.setStart(Integer.parseInt(tokens[1]));
|
||||
|
||||
if(tokens[2].length() != 1)
|
||||
if(tokens[2].length() != 1) {
|
||||
throw new CodecLineParsingException("The SAM pileup line had unexpected base " + tokens[2] + " on line = " + line);
|
||||
feature.setRef(Character.toUpperCase(tokens[2].charAt(0)));
|
||||
|
||||
String observedString = tokens[3].toUpperCase(); // field 3
|
||||
feature.setFWDAlleles(new ArrayList<String>(2));
|
||||
|
||||
feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
|
||||
feature.setVariantConfidence(Double.parseDouble(tokens[5]));
|
||||
|
||||
if ( feature.getRef() == '*' ) {
|
||||
parseIndels(observedString,feature) ;
|
||||
if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
|
||||
else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
|
||||
} else {
|
||||
parseBasesAndQuals(feature,tokens[8],tokens[9]);
|
||||
// if the variant is a SNP or a reference base (i.e. no variant at all)
|
||||
if ( observedString.length() != 1 ) throw new RuntimeException( "point mutation genotype is expected to be represented by a single letter");
|
||||
feature.setRefBases(tokens[2].toUpperCase());
|
||||
feature.setEnd(feature.getStart());
|
||||
|
||||
char ch = observedString.charAt(0);
|
||||
|
||||
switch ( ch ) {
|
||||
case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
|
||||
case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
|
||||
case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
|
||||
case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
|
||||
case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
|
||||
case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
|
||||
case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
|
||||
case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
|
||||
case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
|
||||
case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
|
||||
}
|
||||
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
|
||||
else {
|
||||
// we know that at least one allele is non-ref;
|
||||
// if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
|
||||
// homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
|
||||
feature.setVariantType(VariantType.SNP);
|
||||
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
|
||||
feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
|
||||
feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
|
||||
) feature.setNumNonRef(1);
|
||||
else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
|
||||
}
|
||||
}
|
||||
feature.setRef(tokens[2].charAt(0));
|
||||
|
||||
switch (count) {
|
||||
case basicTokenCount:
|
||||
bases = tokens[4];
|
||||
quals = tokens[5];
|
||||
// parsing is pretty straightforward for 6-col format
|
||||
if ( feature.getRef() == '*' ) { // this indicates an indel -- but it shouldn't occur with vanilla 6-col format
|
||||
throw new CodecLineParsingException("Found an indel on line = " + line + " but it shouldn't happen in simple pileup format");
|
||||
} else {
|
||||
parseBasesAndQuals(feature, bases, quals);
|
||||
feature.setRefBases(tokens[2].toUpperCase());
|
||||
feature.setEnd(feature.getStart());
|
||||
}
|
||||
break;
|
||||
case consensusSNPTokenCount: // pileup called a SNP or a reference base
|
||||
observedString = tokens[3].toUpperCase();
|
||||
feature.setFWDAlleles(new ArrayList<String>(2));
|
||||
feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
|
||||
feature.setVariantConfidence(Double.parseDouble(tokens[5]));
|
||||
bases = tokens[8];
|
||||
quals = tokens[9];
|
||||
// confirm that we have a non-variant, not a mis-parsed indel
|
||||
if ( feature.getRef() == '*' ) {
|
||||
throw new CodecLineParsingException("Line parsing of " + line + " says we have a SNP or non-variant but the ref base is '*', which indicates an indel");
|
||||
}
|
||||
// Parse the SNP or non-variant
|
||||
parseBasesAndQuals(feature, bases, quals);
|
||||
if ( observedString.length() != 1 ) {
|
||||
throw new CodecLineParsingException( "Line parsing of " + line + " says we have a SNP or non-variant but the genotype token is not a single letter: " + observedString);
|
||||
}
|
||||
feature.setRefBases(tokens[2].toUpperCase());
|
||||
feature.setEnd(feature.getStart());
|
||||
|
||||
char ch = observedString.charAt(0);
|
||||
|
||||
switch ( ch ) { // record alleles (decompose ambiguous base codes)
|
||||
case 'A': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseA); break;
|
||||
case 'C': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseC); break;
|
||||
case 'G': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseG); break;
|
||||
case 'T': feature.getFWDAlleles().add(baseT); feature.getFWDAlleles().add(baseT); break;
|
||||
case 'M': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseC); break;
|
||||
case 'R': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseG); break;
|
||||
case 'W': feature.getFWDAlleles().add(baseA); feature.getFWDAlleles().add(baseT); break;
|
||||
case 'S': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseG); break;
|
||||
case 'Y': feature.getFWDAlleles().add(baseC); feature.getFWDAlleles().add(baseT); break;
|
||||
case 'K': feature.getFWDAlleles().add(baseG); feature.getFWDAlleles().add(baseT); break;
|
||||
}
|
||||
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() && feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ) feature.setVariantType(VariantType.NONE);
|
||||
else {
|
||||
// we know that at least one allele is non-ref;
|
||||
// if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
|
||||
// homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
|
||||
feature.setVariantType(VariantType.SNP);
|
||||
if ( feature.getFWDAlleles().get(0).charAt(0) == feature.getRef() ||
|
||||
feature.getFWDAlleles().get(1).charAt(0) == feature.getRef() ||
|
||||
feature.getFWDAlleles().get(0).equals(feature.getFWDAlleles().get(1))
|
||||
) feature.setNumNonRef(1);
|
||||
else feature.setNumNonRef(2); // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
|
||||
}
|
||||
break;
|
||||
case consensusIndelTokenCount:
|
||||
observedString = tokens[3].toUpperCase();
|
||||
feature.setFWDAlleles(new ArrayList<String>(2));
|
||||
feature.setConsensusConfidence(Double.parseDouble(tokens[4]));
|
||||
feature.setVariantConfidence(Double.parseDouble(tokens[5]));
|
||||
// confirm that we have an indel, not a mis-parsed SNP or non-variant
|
||||
if ( feature.getRef() != '*' ) {
|
||||
throw new CodecLineParsingException("Line parsing of " + line + " says we have an indel but the ref base is not '*'");
|
||||
}
|
||||
// Parse the indel
|
||||
parseIndels(observedString,feature) ;
|
||||
if ( feature.isDeletion() ) feature.setEnd(feature.getStart()+feature.length()-1);
|
||||
else feature.setEnd(feature.getStart()); // if it's not a deletion and we are biallelic, this has got to be an insertion; otherwise the state is inconsistent!!!!
|
||||
break;
|
||||
default:
|
||||
throw new CodecLineParsingException("The SAM pileup line didn't have the expected number of tokens " +
|
||||
"(expected = " + basicTokenCount + " (basic pileup), " + consensusSNPTokenCount +
|
||||
" (consensus pileup for a SNP or non-variant site) or " + consensusIndelTokenCount +
|
||||
" (consensus pileup for an indel); saw = " + count + " on line = " + line + ")");
|
||||
}
|
||||
return feature;
|
||||
}
|
||||
|
||||
|
|
@ -197,7 +274,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
|
|||
else feature.setVariantType(VariantType.DELETION);
|
||||
feature.setRefBases(varBases); // remember what was deleted, this will be saved as "reference allele"
|
||||
break;
|
||||
default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
|
||||
default: throw new CodecLineParsingException("Can not interpret observed indel allele record: "+genotype);
|
||||
}
|
||||
feature.getFWDAlleles().add(varBases);
|
||||
feature.setLength(obs[i].length()-1); // inconsistent for non-biallelic indels!!
|
||||
|
|
@ -224,7 +301,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
|
|||
{
|
||||
//System.out.printf("%s%n%s%n", bases, quals);
|
||||
|
||||
// needs to convert the base string with it's . and , to the ref base
|
||||
// needs to convert the base string with its . and , to the ref base
|
||||
StringBuilder baseBuilder = new StringBuilder();
|
||||
StringBuilder qualBuilder = new StringBuilder();
|
||||
boolean done = false;
|
||||
|
|
@ -254,7 +331,7 @@ public class SAMPileupCodec extends AsciiFeatureCodec<SAMPileupFeature> {
|
|||
Matcher match = regex.matcher(rest);
|
||||
if ( ! match.matches() ) {
|
||||
if ( feature.getRef() != '*' )
|
||||
throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
|
||||
throw new CodecLineParsingException("Bad pileup format: " + bases + " at position " + i);
|
||||
done = true;
|
||||
}
|
||||
else {
|
||||
|
|
|
|||
|
|
@ -33,6 +33,10 @@ import java.util.List;
|
|||
/**
|
||||
* A tribble feature representing a SAM pileup.
|
||||
*
|
||||
* Allows intake of both simple (6-column) or extended/consensus (10/13-column) pileups. Simple pileup features will
|
||||
* contain only basic information, no observed alleles or variant/genotype inferences, and so shouldn't be used as
|
||||
* input for analysis that requires that information.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -33,18 +33,38 @@ import java.util.Collections;
|
|||
/**
|
||||
* Run validating pileup across a set of core data as proof of the integrity of the GATK core.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
* Tests both types of old-school pileup formats (basic and consensus).
|
||||
*
|
||||
* @author mhanna, vdauwera
|
||||
* @version 0.2
|
||||
*/
|
||||
public class CheckPileupIntegrationTest extends WalkerTest {
|
||||
/**
|
||||
* This test runs on a consensus pileup containing 10-column lines for SNPs and 13-column lines for indels
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
public void testEcoliThreaded() {
|
||||
public void testEcoliConsensusPileup() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T CheckPileup" +
|
||||
" -I " + validationDataLocation + "MV1994.selected.bam" +
|
||||
" -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
|
||||
" --pileup:SAMPileup "+ validationDataLocation + "MV1994.selected.pileup" +
|
||||
" -S SILENT -nt 8",0, Collections.<String>emptyList());
|
||||
executeTest("testEcoliThreaded",spec);
|
||||
executeTest("testEcoliConsensusPileup",spec);
|
||||
}
|
||||
|
||||
/**
|
||||
* This test runs on a basic pileup containing 6-column lines for all variants TODO
|
||||
*/
|
||||
@Test
|
||||
public void testEcoliBasicPileup() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T CheckPileup" +
|
||||
" -I " + validationDataLocation + "MV1994.selected.bam" +
|
||||
" -R " + validationDataLocation + "Escherichia_coli_K12_MG1655.fasta" +
|
||||
" --pileup:SAMPileup "+ validationDataLocation + "MV1994.basic.pileup" +
|
||||
" -L Escherichia_coli_K12:1-49" +
|
||||
" -S SILENT -nt 8",0, Collections.<String>emptyList());
|
||||
executeTest("testEcoliBasicPileup",spec);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue