General purpose pileup code -- you can use these features to obtain detailed pileup data from reads and offsets. Useful for all pileup based walkers. Expanded support for rodSAMPileup to enable the new ValidatingPileupWalker, which takes a samtools pileup output and checks that GATK gives identical output as samtools on a per base and per qual pileup. It's going to be a very useful validation tool.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@418 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-04-14 22:13:10 +00:00
parent baae98c6d5
commit 72a3d84ed2
11 changed files with 650 additions and 328 deletions

View File

@ -176,8 +176,12 @@ public class GenomeAnalysisTK extends CommandLineProgram {
* Required main method implementation.
*/
public static void main(String[] argv) {
Instance = new GenomeAnalysisTK();
start(Instance, argv);
try {
Instance = new GenomeAnalysisTK();
start(Instance, argv);
} catch ( Exception e ) {
exitSystemWithError(e);
}
}
/**
@ -195,6 +199,30 @@ public class GenomeAnalysisTK extends CommandLineProgram {
ROD_BINDINGS.add(file);
}
private static void printExitSystemMsg(final String msg) {
System.out.printf("------------------------------------------------------------------------------------------%n");
System.out.printf("An error has occurred%n");
System.out.printf("It's possible (maybe even likely) that this is an input error on your part%n");
System.out.printf("But if it's a bug or something that should work, please report this to gsadevelopers@broad.mit.edu%n");
System.out.printf("%n");
System.out.printf("%s%n", msg);
}
public static void exitSystemWithError(final String msg) {
printExitSystemMsg(msg);
System.exit(1);
}
public static void exitSystemWithError(final String msg, Exception e ) {
e.printStackTrace();
printExitSystemMsg(msg);
System.exit(1);
}
public static void exitSystemWithError(Exception e ) {
exitSystemWithError(e.getMessage(), e);
}
protected int execute() {
final boolean TEST_ROD = false;
List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods = new ArrayList<ReferenceOrderedData<? extends ReferenceOrderedDatum> >();

View File

@ -2,15 +2,16 @@ package org.broadinstitute.sting.gatk.refdata;
import java.util.*;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
import org.broadinstitute.sting.utils.Pileup;
/**
* This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.<br>
*
*
* Example format:<br>
* for SNP:<br>
* [chr] [pos] [ref] [consensus allele(s)] [consensus confidence] [snp confidence] [max mapping qual] [num reads in the pile] <br>
@ -24,35 +25,38 @@ import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
* Time: 2:58:33 PM
* To change this template use File | Settings | File Templates.
*/
public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVariant {
private static final int NO_VARIANT = -1;
private static final int SNP_VARIANT = 0;
private static final int INSERTION_VARIANT = 1;
private static final int DELETION_VARIANT = 2;
private static final int INDEL_VARIANT = 3;
// allocate once and don't ever bother creating them again:
private static final String baseA = new String("A");
private static final String baseC = new String("C");
private static final String baseG = new String("G");
private static final String baseT = new String("T");
private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions
public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVariant, Pileup {
private static final int NO_VARIANT = -1;
private static final int SNP_VARIANT = 0;
private static final int INSERTION_VARIANT = 1;
private static final int DELETION_VARIANT = 2;
private static final int INDEL_VARIANT = 3;
// allocate once and don't ever bother creating them again:
private static final String baseA = new String("A");
private static final String baseC = new String("C");
private static final String baseG = new String("G");
private static final String baseT = new String("T");
private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions
protected GenomeLoc loc; // genome location of SNP
// Reference sequence chromosome or scaffold
// Start and stop positions in chrom
// Reference sequence chromosome or scaffold
// Start and stop positions in chrom
protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!)
protected String refBases; // the reference base according to NCBI, in the dbSNP file
protected String observedString; // store the actual string representation of observed alleles
protected String pileupQuals; // the read base qualities
protected String pileupBases; // the read bases themselves
protected List<String> observedAlleles = null; // The sequences of the observed alleles from rs-fasta files
protected int varType = NO_VARIANT;
protected int ploidy = 2; // how many allelic variants we get?
protected int nNonref = 0; // number of non-reference alleles
protected int eventLength = 0;
protected double consensusScore;
protected double variantScore;
// ----------------------------------------------------------------------
@ -69,127 +73,194 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
// 0 1 2 3 4 5 6 7
// * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
// * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
try {
try {
String contig = parts[0];
long start = Long.parseLong(parts[1]) ;
consensusScore = Double.parseDouble(parts[4]);
variantScore = Double.parseDouble(parts[5]);
refBaseChar = Character.toUpperCase(parts[2].charAt(0));
parts[3] = parts[3].toUpperCase();
observedString = parts[3];
parseBasesAndQuals(parts[8], parts[9]);
observedAlleles = new ArrayList<String>(2);
if ( refBaseChar == '*' ) {
parseIndels(parts[3]) ;
if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength);
else loc = new GenomeLoc(contig, start, start); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
parseIndels(parts[3]) ;
if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength);
else loc = new GenomeLoc(contig, start, start); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!!
}
else {
// if the variant is a SNP or a reference base (i.e. no variant at all)
assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter";
refBases = parts[2].toUpperCase();
eventLength = 1;
loc = new GenomeLoc(contig, start, start+1);
// if the variant is a SNP or a reference base (i.e. no variant at all)
assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter";
refBases = parts[2].toUpperCase();
eventLength = 1;
//loc = new GenomeLoc(contig, start, start+1);
loc = new GenomeLoc(contig, start, start);
char ch = parts[3].charAt(0);
switch ( ch ) {
case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
}
if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
else {
// we know that at least one allele is non-ref;
// if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
// homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
varType = SNP_VARIANT;
if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
observedAlleles.get(1).charAt(0) == refBaseChar ||
observedAlleles.get(0) == observedAlleles.get(1)
) nNonref = 1;
else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
}
char ch = parts[3].charAt(0);
switch ( ch ) {
case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break;
case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break;
case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break;
case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break;
case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break;
case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break;
case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break;
case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break;
case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break;
case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break;
}
if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT;
else {
// we know that at least one allele is non-ref;
// if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e.
// homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref)
varType = SNP_VARIANT;
if ( observedAlleles.get(0).charAt(0) == refBaseChar ||
observedAlleles.get(1).charAt(0) == refBaseChar ||
observedAlleles.get(0) == observedAlleles.get(1)
) nNonref = 1;
else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site...
}
}
} catch ( RuntimeException e ) {
System.out.printf(" Exception caught during parsing Pileup line: %s%n", Utils.join(" <=> ", parts));
System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts));
throw e;
}
if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString());
}
private void parseIndels(String genotype) {
String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit
// if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference;
// if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases
// from what we have recorded for the deletion allele (e.g. "-CAC")
boolean hasRefAllele = false;
for ( int i = 0 ; i < obs.length ; i++ ) {
if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) {
hasRefAllele = true;
continue; // we will fill reference allele later
}
String varBases = obs[i].substring(1).toUpperCase();
switch ( obs[i].charAt(0) ) {
case '+':
if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT;
else varType = INSERTION_VARIANT;
refBases = emptyStr;
break;
case '-' :
if ( varType != -1 && varType != DELETION_VARIANT ) varType = INDEL_VARIANT;
else varType = DELETION_VARIANT;
refBases = varBases; // remember what was deleted, this will be saved as "reference allele"
break;
default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
}
observedAlleles.add(varBases);
eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!!
}
if ( hasRefAllele ) {
// we got at least one ref. allele (out of two recorded)
if ( varType == NO_VARIANT ) { // both top theories are actually ref allele;
observedAlleles.add(emptyStr);
observedAlleles.add(emptyStr); // no inserted/deleted bases at the site!
nNonref = 0; // no observations of non-reference allele at all
} else {
nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left
// whether we have insertion or deletion, one allele (ref for insertion, or alt for deletion) is empty;
// the one that contain actual bases (alt for insertion or ref for deletion) was already filled above:
observedAlleles.add(emptyStr);
}
} else {
// we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we
// fail to set data in a consistent way.. (the check for INDEL_VARIANT ensures that recorded variants are indeed both insertions
// or both deletions as compared to +ACC/-ACC which would still have the same bases (no matter how crazy and improbable
// such event would be)
if ( observedAlleles.get(0).equals(observedAlleles.get(1)) && varType != INDEL_VARIANT ) nNonref = 1;
else nNonref = 2;
}
// DONE with indels
private void parseIndels(String genotype) {
String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit
// if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference;
// if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases
// from what we have recorded for the deletion allele (e.g. "-CAC")
boolean hasRefAllele = false;
for ( int i = 0 ; i < obs.length ; i++ ) {
if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) {
hasRefAllele = true;
continue; // we will fill reference allele later
}
String varBases = obs[i].substring(1).toUpperCase();
switch ( obs[i].charAt(0) ) {
case '+':
if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT;
else varType = INSERTION_VARIANT;
refBases = emptyStr;
break;
case '-' :
if ( varType != -1 && varType != DELETION_VARIANT ) varType = INDEL_VARIANT;
else varType = DELETION_VARIANT;
refBases = varBases; // remember what was deleted, this will be saved as "reference allele"
break;
default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype);
}
observedAlleles.add(varBases);
eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!!
}
if ( hasRefAllele ) {
// we got at least one ref. allele (out of two recorded)
if ( varType == NO_VARIANT ) { // both top theories are actually ref allele;
observedAlleles.add(emptyStr);
observedAlleles.add(emptyStr); // no inserted/deleted bases at the site!
nNonref = 0; // no observations of non-reference allele at all
} else {
nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left
// whether we have insertion or deletion, one allele (ref for insertion, or alt for deletion) is empty;
// the one that contain actual bases (alt for insertion or ref for deletion) was already filled above:
observedAlleles.add(emptyStr);
}
} else {
// we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we
// fail to set data in a consistent way.. (the check for INDEL_VARIANT ensures that recorded variants are indeed both insertions
// or both deletions as compared to +ACC/-ACC which would still have the same bases (no matter how crazy and improbable
// such event would be)
if ( observedAlleles.get(0).equals(observedAlleles.get(1)) && varType != INDEL_VARIANT ) nNonref = 1;
else nNonref = 2;
}
// DONE with indels
}
public GenomeLoc getLocation() { return loc; }
private void parseBasesAndQuals(final String bases, final String quals)
{
//System.out.printf("%s%n%s%n", bases, quals);
// needs to convert the base string with it's . and , to the ref base
StringBuilder baseBuilder = new StringBuilder();
StringBuilder qualBuilder = new StringBuilder();
boolean done = false;
for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) {
//System.out.printf("%d %d%n", i, j);
char c = (char)bases.charAt(i);
switch ( c ) {
case '.': // matches reference
case ',': // matches reference
baseBuilder.append(refBaseChar);
qualBuilder.append((char)quals.charAt(j++));
break;
case '$': // end of read
break;
case '*': // end of indel?
j++;
break;
case '^': // mapping quality
i++;
break;
case '+': // start of indel
case '-': // start of indel
final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1
final String rest = bases.substring(i+1);
//System.out.printf("sub is %s%n", rest);
Matcher match = regex.matcher(rest);
if ( ! match.matches() ) {
if ( refBaseChar != '*' )
throw new RuntimeException("Bad pileup format: " + bases + " at position " + i);
done = true;
}
else {
String g = match.group(1);
//System.out.printf("group is %d, match is %s%n", match.groupCount(), g);
int l = Integer.parseInt(g);
i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++)
//System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1));
}
break;
default: // non reference base
baseBuilder.append(c);
qualBuilder.append((char)quals.charAt(j++));
}
}
pileupBases = baseBuilder.toString();
pileupQuals = qualBuilder.toString();
}
public GenomeLoc getLocation() { return loc; }
public String getQuals() { return pileupQuals; }
public char getRef() { return refBaseChar; }
public int size() { return pileupQuals.length(); }
public String getBases() { return pileupBases; }
public String getPileupString()
{
return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals());
}
/** Returns bases in the reference allele as a String. String can be empty (as in insertion into
* the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters
@ -198,7 +269,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
* @return reference allele, forward strand
*/
public String getRefBasesFWD() {
return refBases;
return refBases;
}
/**
@ -243,94 +314,94 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian
}
public String toMediumString() {
String s = null;
if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString);
else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1));
if ( isSNP() ) s += ": SNP";
else {
if ( isInsertion() ) s += ": Insertion";
else {
if ( isDeletion() ) s+= ": Deletion";
else {
if ( isIndel() ) s+=": Indel";
else s+=": Reference";
}
}
if ( isInsertion() ) s += ": Insertion";
else {
if ( isDeletion() ) s+= ": Deletion";
else {
if ( isIndel() ) s+=": Indel";
else s+=": Reference";
}
}
}
return s;
return s;
}
public String repl() {
return String.format("REPL not implemented yet");
}
@Override
public String getAltBasesFWD() {
if ( ! isSNP() && ! isIndel() ) return emptyStr;
if ( isSNP() ) {
if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1);
else return observedAlleles.get(0);
}
if ( isInsertion() ) {
if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(1);
else return observedAlleles.get(0);
}
if ( isDeletion() ) {
if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(0);
else return observedAlleles.get(1);
}
System.out.printf("WARNING: unexpected variant type in pileup %s at %s%n",name,getLocation().toString());
return null;
}
@Override
public String getAltBasesFWD() {
if ( ! isSNP() && ! isIndel() ) return emptyStr;
@Override
public char getAltSnpFWD() throws IllegalStateException {
if ( ! isSNP() ) throw new IllegalStateException("Variant is not a SNP");
if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1).charAt(0);
else return observedAlleles.get(0).charAt(0);
if ( isSNP() ) {
if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1);
else return observedAlleles.get(0);
}
}
if ( isInsertion() ) {
if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(1);
else return observedAlleles.get(0);
}
@Override
public double getConsensusConfidence() {
return consensusScore;
}
if ( isDeletion() ) {
if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(0);
else return observedAlleles.get(1);
}
System.out.printf("WARNING: unexpected variant type in pileup %s at %s%n",name,getLocation().toString());
return null;
}
@Override
public char getAltSnpFWD() throws IllegalStateException {
if ( ! isSNP() ) throw new IllegalStateException("Variant is not a SNP");
if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1).charAt(0);
else return observedAlleles.get(0).charAt(0);
}
@Override
public double getConsensusConfidence() {
return consensusScore;
}
@Override
public double getMAF() {
if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: can not determine minor allele freq for multiallelic site");
if ( isSNP() || isIndel() ) {
if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) return 1.0;
else return 0.5;
}
return 0;
}
@Override
public double getMAF() {
if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: can not determine minor allele freq for multiallelic site");
if ( isSNP() || isIndel() ) {
if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) return 1.0;
else return 0.5;
}
return 0;
}
@Override
public int getPloidy() throws IllegalStateException {
return 2; // ???
}
@Override
public int getPloidy() throws IllegalStateException {
return 2; // ???
}
@Override
public double getVariationConfidence() {
return variantScore;
}
@Override
public double getVariationConfidence() {
return variantScore;
}
@Override
public boolean isGenotype() {
return true;
}
@Override
public boolean isGenotype() {
return true;
}
@Override
public boolean isBiallelic() {
return nNonref < 2;
}
@Override
public boolean isBiallelic() {
return nNonref < 2;
}
}

View File

@ -159,7 +159,7 @@ public class TraverseByLoci extends TraversalEngine {
sum = walker.reduce(x, sum);
}
printProgress("loci", locus.getLocation());
//printProgress("loci", locus.getLocation());
return sum;
}
}

View File

@ -6,6 +6,9 @@ import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.Pileup;
import org.broadinstitute.sting.utils.BasicPileup;
import org.broadinstitute.sting.utils.ReadBackedPileup;
import net.sf.samtools.SAMRecord;
import java.util.List;
@ -33,20 +36,20 @@ public class PileupWalker extends LocusWalker<Integer, Integer> {
}
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
String bases = Utils.basePileupAsString(reads, offsets);
String quals = Utils.qualPileupAsString(reads, offsets);
ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref, context);
String bases = pileup.getBases();
if ( bases.equals("") && FLAG_UNCOVERED_BASES ) {
bases = "*** UNCOVERED SITE ***";
}
String extras = "";
if ( VERBOSE ) {
String sqbases = Utils.secondaryBasePileupAsString(reads, offsets);
String sqquals = Utils.secondaryQualPileupAsString(reads, offsets);
extras += " BQ=" + pileup.getQualsAsInts();
extras += " MQ=" + pileup.getMappingQualsAsInts();
String sqbases = pileup.getSecondaryBasePileup();
String sqquals = pileup.getSecondaryQualPileup();
if (sqbases != null && sqquals != null) {
assert(sqbases.length() == sqquals.length());
@ -60,17 +63,17 @@ public class PileupWalker extends LocusWalker<Integer, Integer> {
}
}
}
extras += " BQ=" + Utils.join(",", Utils.qualPileup(reads, offsets));
extras += " MQ=" + Utils.join(",", Utils.mappingQualPileup(reads));
}
String rodString = "";
for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) {
if ( datum != null && ! (datum instanceof rodDbSNP)) {
//System.out.printf("rod = %s%n", datum.toSimpleString());
rodString += datum.toSimpleString();
//System.out.printf("Rod string %s%n", rodString);
}
}
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
if ( dbsnp != null )
rodString += dbsnp.toMediumString();
@ -79,7 +82,7 @@ public class PileupWalker extends LocusWalker<Integer, Integer> {
rodString = "[ROD: " + rodString + "]";
//if ( context.getLocation().getStart() % 1 == 0 ) {
out.printf("%s: %s %s %s%s %s%n", context.getLocation(), ref, bases, quals, extras, rodString);
out.printf("%s%s %s%n", pileup.getPileupString(), extras, rodString);
//}
return 1;

View File

@ -0,0 +1,58 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.LocusContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodSAMPileup;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.BasicPileup;
import org.broadinstitute.sting.utils.ReadBackedPileup;
/**
* Created by IntelliJ IDEA.
* User: mdepristo
* Date: Feb 22, 2009
* Time: 3:22:14 PM
* To change this template use File | Settings | File Templates.
*/
public class ValidatingPileupWalker extends LocusWalker<Integer, ValidationStats> {
@Argument(fullName="verbose",required=false,defaultValue="false")
public boolean VERBOSE;
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref, context);
rodSAMPileup truePileup = (rodSAMPileup)tracker.lookup("pileup", null);
if ( truePileup == null )
Utils.scareUser(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases",
context.getLocation(), pileup.getBases()));
String pileupDiff = BasicPileup.pileupDiff(pileup, truePileup);
if ( pileupDiff != null ) {
out.printf("%s vs. %s%n", pileup.getPileupString(), truePileup.getPileupString());
throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff));
}
return pileup.size();
}
// Given result of map function
public ValidationStats reduceInit() { return new ValidationStats(); }
public ValidationStats reduce(Integer value, ValidationStats sum) {
sum.nLoci++;
sum.nBases += value;
return sum;
}
}
class ValidationStats {
public long nLoci = 0;
public long nBases = 0;
public ValidationStats() {
}
public String toString() {
return String.format("Validated %d sites covered by %d bases%n", nLoci, nBases);
}
}

View File

@ -0,0 +1,178 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.LocusContext;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.ArrayList;
import java.util.Arrays;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Apr 14, 2009
* Time: 8:54:05 AM
* To change this template use File | Settings | File Templates.
*/
abstract public class BasicPileup implements Pileup {
public String getPileupString()
{
return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals());
}
public static String basePileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder bases = new StringBuilder();
for ( byte base : basePileup(reads, offsets)) {
bases.append((char)base);
}
return bases.toString();
}
public static ArrayList<Byte> basePileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> bases = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
bases.add(read.getReadBases()[offset]);
}
return bases;
}
public static ArrayList<Byte> qualPileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> quals = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte qual = (byte)read.getBaseQualities()[offset];
quals.add(qual);
}
return quals;
}
public static ArrayList<Byte> mappingQualPileup( List<SAMRecord> reads) {
ArrayList<Byte> quals = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
byte qual = (byte)read.getMappingQuality();
quals.add(qual);
}
return quals;
}
public static String qualPileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder quals = new StringBuilder();
for ( int qual : qualPileup(reads, offsets)) {
qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea
char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63
quals.append(qualChar);
}
return quals.toString();
}
public static ArrayList<Byte> secondaryBasePileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> bases2 = new ArrayList<Byte>(reads.size());
boolean hasAtLeastOneSQField = false;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte[] compressedQuals = (byte[]) read.getAttribute("SQ");
byte base2;
//byte qual2;
if (compressedQuals != null) {
base2 = (byte) BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(compressedQuals[offset]));
//qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset]));
hasAtLeastOneSQField = true;
} else {
base2 = (byte) '.';
}
bases2.add(base2);
}
return (hasAtLeastOneSQField ? bases2 : null);
}
public static String secondaryBasePileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder bases2 = new StringBuilder();
ArrayList<Byte> sqbases = secondaryBasePileup(reads, offsets);
if (sqbases == null) { return null; }
for (byte base2 : secondaryBasePileup(reads, offsets)) {
bases2.append((char) base2);
}
return bases2.toString();
}
public static ArrayList<Byte> secondaryQualPileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> quals2 = new ArrayList<Byte>(reads.size());
boolean hasAtLeastOneSQField = false;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte[] compressedQuals = (byte[]) read.getAttribute("SQ");
byte qual2;
if (compressedQuals != null) {
qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset]));
hasAtLeastOneSQField = true;
} else {
qual2 = 0;
}
quals2.add(qual2);
}
return (hasAtLeastOneSQField ? quals2 : null);
}
public static String secondaryQualPileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder quals2 = new StringBuilder();
ArrayList<Byte> sqquals = secondaryQualPileup(reads, offsets);
if (sqquals == null) { return null; }
for (byte qual2 : secondaryQualPileup(reads, offsets)) {
quals2.append(qual2);
}
return quals2.toString();
}
public static String pileupDiff(final Pileup a, final Pileup b)
{
return pileupDiff(a,b,true);
}
private static String maybeSorted( final String x, boolean sortMe )
{
if ( sortMe ) {
byte[] bytes = x.getBytes();
Arrays.sort(bytes);
return new String(bytes);
}
else
return x;
}
public static String pileupDiff(final Pileup a, final Pileup b, boolean orderDependent)
{
if ( a.size() != b.size() )
return "Sizes not equal";
if ( a.getLocation().compareTo(b.getLocation()) != 0 )
return "Locations not equal";
String aBases = maybeSorted(a.getBases(), ! orderDependent );
String bBases = maybeSorted(b.getBases(), ! orderDependent );
if ( ! aBases.toUpperCase().equals(bBases.toUpperCase()) )
return "Bases not equal";
String aQuals = maybeSorted(a.getQuals(), ! orderDependent );
String bQuals = maybeSorted(b.getQuals(), ! orderDependent );
if ( ! aQuals.equals(bQuals) )
return "Quals not equal";
return null;
}
}

View File

@ -167,9 +167,10 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
// 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000'
//System.out.printf("Parsing location '%s'%n", str);
final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1
final Pattern regex2 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)$"); // matches case 2
final Pattern regex3 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$");// matches case 3
final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1
final Pattern regex2 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)$"); // matches case 2
final Pattern regex3 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$"); // matches case 3
final Pattern regex4 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)\\+"); // matches case 4
String contig = null;
long start = 1;
@ -179,6 +180,7 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
Matcher match1 = regex1.matcher(str);
Matcher match2 = regex2.matcher(str);
Matcher match3 = regex3.matcher(str);
Matcher match4 = regex4.matcher(str);
try {
if ( match1.matches() ) {
@ -189,6 +191,10 @@ public class GenomeLoc implements Comparable<GenomeLoc> {
start = parsePosition(match2.group(2));
stop = start;
}
else if ( match4.matches() ) {
contig = match4.group(1);
start = parsePosition(match4.group(2));
}
else if ( match3.matches() ) {
contig = match3.group(1);
start = parsePosition(match3.group(2));

View File

@ -0,0 +1,17 @@
package org.broadinstitute.sting.utils;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Apr 14, 2009
* Time: 9:33:00 AM
* To change this template use File | Settings | File Templates.
*/
public interface Pileup {
GenomeLoc getLocation();
char getRef();
String getBases();
String getQuals();
String getPileupString();
int size();
}

View File

@ -0,0 +1,77 @@
package org.broadinstitute.sting.utils;
import org.broadinstitute.sting.gatk.LocusContext;
import net.sf.samtools.SAMRecord;
import java.util.List;
import java.util.ArrayList;
/**
* Created by IntelliJ IDEA.
* User: depristo
* Date: Apr 14, 2009
* Time: 8:54:05 AM
* To change this template use File | Settings | File Templates.
*/
public class ReadBackedPileup extends BasicPileup {
GenomeLoc loc;
char ref;
List<SAMRecord> reads;
List<Integer> offsets;
public ReadBackedPileup(GenomeLoc loc, char ref, LocusContext context ) {
this(loc, ref, context.getReads(), context.getOffsets());
}
public ReadBackedPileup(GenomeLoc loc, char ref, List<SAMRecord> reads, List<Integer> offsets ) {
assert reads != null;
assert offsets != null;
assert reads.size() == offsets.size();
this.loc = loc;
this.ref = ref;
this.reads = reads;
this.offsets = offsets;
}
public int size() { return reads.size(); }
public List<SAMRecord> getReads() { return reads; }
public List<Integer> getOffsets() { return offsets; }
public GenomeLoc getLocation() {
return loc;
}
public char getRef() {
return ref;
}
public String getBases() {
return basePileupAsString(reads, offsets);
}
public String getQuals() {
return qualPileupAsString(reads, offsets);
}
public String getQualsAsInts() {
return Utils.join(",", qualPileup(reads, offsets));
}
public String getMappingQualsAsInts() {
return Utils.join(",", mappingQualPileup(reads));
}
public String getSecondaryBasePileup() {
return secondaryBasePileupAsString(reads, offsets);
}
public String getSecondaryQualPileup() {
return secondaryQualPileupAsString(reads, offsets);
}
public String getPileupString()
{
return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals());
}
}

View File

@ -35,11 +35,11 @@ public class Utils {
}
public static void scareUser(final String msg) {
System.out.printf("********************************************************************************%n");
System.out.printf("* ERROR:%n");
System.out.printf("*%n");
System.out.printf("* %s%n", msg);
System.out.printf("********************************************************************************%n");
//System.out.printf("********************************************************************************%n");
//System.out.printf("* ERROR:%n");
//System.out.printf("*%n");
//System.out.printf("* %s%n", msg);
//System.out.printf("********************************************************************************%n");
logger.fatal(msg);
throw new RuntimeException(msg);
}
@ -119,126 +119,6 @@ public class Utils {
return c;
}
public static String basePileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder bases = new StringBuilder();
for ( byte base : basePileup(reads, offsets)) {
bases.append((char)base);
}
return bases.toString();
}
public static ArrayList<Byte> basePileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> bases = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
bases.add(read.getReadBases()[offset]);
}
return bases;
}
public static ArrayList<Byte> qualPileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> quals = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte qual = (byte)read.getBaseQualities()[offset];
quals.add(qual);
}
return quals;
}
public static ArrayList<Byte> mappingQualPileup( List<SAMRecord> reads) {
ArrayList<Byte> quals = new ArrayList(reads.size());
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
byte qual = (byte)read.getMappingQuality();
quals.add(qual);
}
return quals;
}
public static String qualPileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder quals = new StringBuilder();
for ( int qual : qualPileup(reads, offsets)) {
qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea
char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63
quals.append(qualChar);
}
return quals.toString();
}
public static ArrayList<Byte> secondaryBasePileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> bases2 = new ArrayList<Byte>(reads.size());
boolean hasAtLeastOneSQField = false;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte[] compressedQuals = (byte[]) read.getAttribute("SQ");
byte base2;
//byte qual2;
if (compressedQuals != null) {
base2 = (byte) BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(compressedQuals[offset]));
//qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset]));
hasAtLeastOneSQField = true;
} else {
base2 = (byte) '.';
}
bases2.add(base2);
}
return (hasAtLeastOneSQField ? bases2 : null);
}
public static String secondaryBasePileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder bases2 = new StringBuilder();
ArrayList<Byte> sqbases = secondaryBasePileup(reads, offsets);
if (sqbases == null) { return null; }
for (byte base2 : secondaryBasePileup(reads, offsets)) {
bases2.append((char) base2);
}
return bases2.toString();
}
public static ArrayList<Byte> secondaryQualPileup( List<SAMRecord> reads, List<Integer> offsets ) {
ArrayList<Byte> quals2 = new ArrayList<Byte>(reads.size());
boolean hasAtLeastOneSQField = false;
for ( int i = 0; i < reads.size(); i++ ) {
SAMRecord read = reads.get(i);
int offset = offsets.get(i);
byte[] compressedQuals = (byte[]) read.getAttribute("SQ");
byte qual2;
if (compressedQuals != null) {
qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset]));
hasAtLeastOneSQField = true;
} else {
qual2 = 0;
}
quals2.add(qual2);
}
return (hasAtLeastOneSQField ? quals2 : null);
}
public static String secondaryQualPileupAsString( List<SAMRecord> reads, List<Integer> offsets ) {
StringBuilder quals2 = new StringBuilder();
ArrayList<Byte> sqquals = secondaryQualPileup(reads, offsets);
if (sqquals == null) { return null; }
for (byte qual2 : secondaryQualPileup(reads, offsets)) {
quals2.append(qual2);
}
return quals2.toString();
}
public static ArrayList<Byte> subseq(byte[] fullArray) {
return subseq(fullArray, 0, fullArray.length);
}

View File

@ -50,6 +50,10 @@ if __name__ == "__main__":
head, lane_filename = os.path.split(lane)
filebase = os.path.splitext(lane_filename)[0]
# make the pileup
# 503 8:03 samtools view -b MV1994.bam Escherichia_coli_K12:10-1000 > sub.bam
# 504 8:03 samtools pileup -f Escherichia_coli_K12_MG1655.fasta sub.bam
# convert the fasta
for analysis in commandsList: