diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 99d9c98a9..a9a9334b5 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -176,8 +176,12 @@ public class GenomeAnalysisTK extends CommandLineProgram { * Required main method implementation. */ public static void main(String[] argv) { - Instance = new GenomeAnalysisTK(); - start(Instance, argv); + try { + Instance = new GenomeAnalysisTK(); + start(Instance, argv); + } catch ( Exception e ) { + exitSystemWithError(e); + } } /** @@ -195,6 +199,30 @@ public class GenomeAnalysisTK extends CommandLineProgram { ROD_BINDINGS.add(file); } + private static void printExitSystemMsg(final String msg) { + System.out.printf("------------------------------------------------------------------------------------------%n"); + System.out.printf("An error has occurred%n"); + System.out.printf("It's possible (maybe even likely) that this is an input error on your part%n"); + System.out.printf("But if it's a bug or something that should work, please report this to gsadevelopers@broad.mit.edu%n"); + System.out.printf("%n"); + System.out.printf("%s%n", msg); + } + + public static void exitSystemWithError(final String msg) { + printExitSystemMsg(msg); + System.exit(1); + } + + public static void exitSystemWithError(final String msg, Exception e ) { + e.printStackTrace(); + printExitSystemMsg(msg); + System.exit(1); + } + + public static void exitSystemWithError(Exception e ) { + exitSystemWithError(e.getMessage(), e); + } + protected int execute() { final boolean TEST_ROD = false; List > rods = new ArrayList >(); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java index 67b7db757..4c7e00522 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java @@ -2,15 +2,16 @@ package org.broadinstitute.sting.gatk.refdata; import java.util.*; +import java.util.regex.Pattern; +import java.util.regex.Matcher; -import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.gatk.refdata.AllelicVariant; +import org.broadinstitute.sting.utils.Pileup; /** * This class wraps Maq/samtools allele calls from pileup format and presents them as a ROD.
- * + * * Example format:
* for SNP:
* [chr] [pos] [ref] [consensus allele(s)] [consensus confidence] [snp confidence] [max mapping qual] [num reads in the pile]
@@ -24,35 +25,38 @@ import org.broadinstitute.sting.gatk.refdata.AllelicVariant; * Time: 2:58:33 PM * To change this template use File | Settings | File Templates. */ -public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVariant { - private static final int NO_VARIANT = -1; - private static final int SNP_VARIANT = 0; - private static final int INSERTION_VARIANT = 1; - private static final int DELETION_VARIANT = 2; - private static final int INDEL_VARIANT = 3; - - // allocate once and don't ever bother creating them again: - private static final String baseA = new String("A"); - private static final String baseC = new String("C"); - private static final String baseG = new String("G"); - private static final String baseT = new String("T"); - private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions +public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVariant, Pileup { + private static final int NO_VARIANT = -1; + private static final int SNP_VARIANT = 0; + private static final int INSERTION_VARIANT = 1; + private static final int DELETION_VARIANT = 2; + private static final int INDEL_VARIANT = 3; + + // allocate once and don't ever bother creating them again: + private static final String baseA = new String("A"); + private static final String baseC = new String("C"); + private static final String baseG = new String("G"); + private static final String baseT = new String("T"); + private static final String emptyStr = new String(); // we will use this for "reference" allele in insertions protected GenomeLoc loc; // genome location of SNP - // Reference sequence chromosome or scaffold - // Start and stop positions in chrom + // Reference sequence chromosome or scaffold + // Start and stop positions in chrom protected char refBaseChar; // what we have set for the reference base (is set to a '*' for indel!) protected String refBases; // the reference base according to NCBI, in the dbSNP file protected String observedString; // store the actual string representation of observed alleles + protected String pileupQuals; // the read base qualities + protected String pileupBases; // the read bases themselves + protected List observedAlleles = null; // The sequences of the observed alleles from rs-fasta files protected int varType = NO_VARIANT; protected int ploidy = 2; // how many allelic variants we get? protected int nNonref = 0; // number of non-reference alleles protected int eventLength = 0; - + protected double consensusScore; protected double variantScore; // ---------------------------------------------------------------------- @@ -69,127 +73,194 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian // 0 1 2 3 4 5 6 7 // * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) // * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 - try { - + try { + String contig = parts[0]; long start = Long.parseLong(parts[1]) ; - + consensusScore = Double.parseDouble(parts[4]); variantScore = Double.parseDouble(parts[5]); refBaseChar = Character.toUpperCase(parts[2].charAt(0)); - + parts[3] = parts[3].toUpperCase(); observedString = parts[3]; - + + parseBasesAndQuals(parts[8], parts[9]); + observedAlleles = new ArrayList(2); - + if ( refBaseChar == '*' ) { - parseIndels(parts[3]) ; - if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength); - else loc = new GenomeLoc(contig, start, start); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! + parseIndels(parts[3]) ; + if ( varType == DELETION_VARIANT ) loc = new GenomeLoc(contig, start, start+eventLength); + else loc = new GenomeLoc(contig, start, start); // if it's not a deletion and we are biallelic, this got to be an insertion; otherwise the state is inconsistent!!!! } else { - // if the variant is a SNP or a reference base (i.e. no variant at all) - assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter"; - refBases = parts[2].toUpperCase(); - eventLength = 1; - loc = new GenomeLoc(contig, start, start+1); + // if the variant is a SNP or a reference base (i.e. no variant at all) + assert parts[3].length() == 1 : "non-indel genotype is expected to be represented by a single letter"; + refBases = parts[2].toUpperCase(); + eventLength = 1; + //loc = new GenomeLoc(contig, start, start+1); + loc = new GenomeLoc(contig, start, start); - char ch = parts[3].charAt(0); - - switch ( ch ) { - case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break; - case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break; - case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break; - case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break; - case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break; - case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break; - case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break; - case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break; - case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break; - case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break; - } - if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT; - else { - // we know that at least one allele is non-ref; - // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. - // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) - varType = SNP_VARIANT; - if ( observedAlleles.get(0).charAt(0) == refBaseChar || - observedAlleles.get(1).charAt(0) == refBaseChar || - observedAlleles.get(0) == observedAlleles.get(1) - ) nNonref = 1; - else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... - } + char ch = parts[3].charAt(0); + + switch ( ch ) { + case 'A': observedAlleles.add(baseA); observedAlleles.add(baseA); break; + case 'C': observedAlleles.add(baseC); observedAlleles.add(baseC); break; + case 'G': observedAlleles.add(baseG); observedAlleles.add(baseG); break; + case 'T': observedAlleles.add(baseT); observedAlleles.add(baseT); break; + case 'M': observedAlleles.add(baseA); observedAlleles.add(baseC); break; + case 'R': observedAlleles.add(baseA); observedAlleles.add(baseG); break; + case 'W': observedAlleles.add(baseA); observedAlleles.add(baseT); break; + case 'S': observedAlleles.add(baseC); observedAlleles.add(baseG); break; + case 'Y': observedAlleles.add(baseC); observedAlleles.add(baseT); break; + case 'K': observedAlleles.add(baseG); observedAlleles.add(baseT); break; + } + if ( observedAlleles.get(0).charAt(0) == refBaseChar && observedAlleles.get(1).charAt(0) == refBaseChar ) varType = NO_VARIANT; + else { + // we know that at least one allele is non-ref; + // if one is ref and the other is non-ref, or if both are non ref but they are the same (i.e. + // homozygous non-ref), we still have 2 allelic variants at the site (e.g. one ref and one nonref) + varType = SNP_VARIANT; + if ( observedAlleles.get(0).charAt(0) == refBaseChar || + observedAlleles.get(1).charAt(0) == refBaseChar || + observedAlleles.get(0) == observedAlleles.get(1) + ) nNonref = 1; + else nNonref = 2; // if both observations differ from ref and they are not equal to one another, then we get multiallelic site... + } } } catch ( RuntimeException e ) { - System.out.printf(" Exception caught during parsing Pileup line: %s%n", Utils.join(" <=> ", parts)); + System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts)); throw e; } if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString()); } - - private void parseIndels(String genotype) { - String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit - - // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; - // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases - // from what we have recorded for the deletion allele (e.g. "-CAC") - boolean hasRefAllele = false; - - for ( int i = 0 ; i < obs.length ; i++ ) { - if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { - hasRefAllele = true; - continue; // we will fill reference allele later - } - - String varBases = obs[i].substring(1).toUpperCase(); - switch ( obs[i].charAt(0) ) { - case '+': - if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT; - else varType = INSERTION_VARIANT; - refBases = emptyStr; - break; - case '-' : - if ( varType != -1 && varType != DELETION_VARIANT ) varType = INDEL_VARIANT; - else varType = DELETION_VARIANT; - refBases = varBases; // remember what was deleted, this will be saved as "reference allele" - break; - default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); - } - observedAlleles.add(varBases); - eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!! - } - if ( hasRefAllele ) { - // we got at least one ref. allele (out of two recorded) - if ( varType == NO_VARIANT ) { // both top theories are actually ref allele; - observedAlleles.add(emptyStr); - observedAlleles.add(emptyStr); // no inserted/deleted bases at the site! - nNonref = 0; // no observations of non-reference allele at all - } else { - nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left - - // whether we have insertion or deletion, one allele (ref for insertion, or alt for deletion) is empty; - // the one that contain actual bases (alt for insertion or ref for deletion) was already filled above: - observedAlleles.add(emptyStr); - } - } else { - // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we - // fail to set data in a consistent way.. (the check for INDEL_VARIANT ensures that recorded variants are indeed both insertions - // or both deletions as compared to +ACC/-ACC which would still have the same bases (no matter how crazy and improbable - // such event would be) - if ( observedAlleles.get(0).equals(observedAlleles.get(1)) && varType != INDEL_VARIANT ) nNonref = 1; - else nNonref = 2; - } - // DONE with indels + private void parseIndels(String genotype) { + String [] obs = genotype.split("/"); // get observations, now need to tinker with them a bit + + // if reference allele is among the observed alleles, we will need to take special care of it since we do not have direct access to the reference; + // if we have an insertion, the "reference" allele is going to be empty; if it it is a deletion, we will deduce the "reference allele" bases + // from what we have recorded for the deletion allele (e.g. "-CAC") + boolean hasRefAllele = false; + + for ( int i = 0 ; i < obs.length ; i++ ) { + if ( obs[i].length() == 1 && obs[i].charAt(0) == '*' ) { + hasRefAllele = true; + continue; // we will fill reference allele later + } + + String varBases = obs[i].substring(1).toUpperCase(); + + switch ( obs[i].charAt(0) ) { + case '+': + if ( varType != NO_VARIANT && varType != INSERTION_VARIANT ) varType = INDEL_VARIANT; + else varType = INSERTION_VARIANT; + refBases = emptyStr; + break; + case '-' : + if ( varType != -1 && varType != DELETION_VARIANT ) varType = INDEL_VARIANT; + else varType = DELETION_VARIANT; + refBases = varBases; // remember what was deleted, this will be saved as "reference allele" + break; + default: throw new RuntimeException("Can not interpret observed indel allele record: "+genotype); + } + observedAlleles.add(varBases); + eventLength = obs[i].length() - 1; // inconsistent for non-biallelic indels!! + } + if ( hasRefAllele ) { + // we got at least one ref. allele (out of two recorded) + if ( varType == NO_VARIANT ) { // both top theories are actually ref allele; + observedAlleles.add(emptyStr); + observedAlleles.add(emptyStr); // no inserted/deleted bases at the site! + nNonref = 0; // no observations of non-reference allele at all + } else { + nNonref = 1; // hasRefAllele = true, so one allele was definitely ref, hence there is only one left + + // whether we have insertion or deletion, one allele (ref for insertion, or alt for deletion) is empty; + // the one that contain actual bases (alt for insertion or ref for deletion) was already filled above: + observedAlleles.add(emptyStr); + } + } else { + // we observe two non-ref alleles; they better be the same variant, otherwise the site is not bi-allelic and at the moment we + // fail to set data in a consistent way.. (the check for INDEL_VARIANT ensures that recorded variants are indeed both insertions + // or both deletions as compared to +ACC/-ACC which would still have the same bases (no matter how crazy and improbable + // such event would be) + if ( observedAlleles.get(0).equals(observedAlleles.get(1)) && varType != INDEL_VARIANT ) nNonref = 1; + else nNonref = 2; + } + // DONE with indels } - - - public GenomeLoc getLocation() { return loc; } + + private void parseBasesAndQuals(final String bases, final String quals) + { + //System.out.printf("%s%n%s%n", bases, quals); + + // needs to convert the base string with it's . and , to the ref base + StringBuilder baseBuilder = new StringBuilder(); + StringBuilder qualBuilder = new StringBuilder(); + boolean done = false; + for ( int i = 0, j = 0; i < bases.length() && ! done; i++ ) { + //System.out.printf("%d %d%n", i, j); + char c = (char)bases.charAt(i); + + switch ( c ) { + case '.': // matches reference + case ',': // matches reference + baseBuilder.append(refBaseChar); + qualBuilder.append((char)quals.charAt(j++)); + break; + case '$': // end of read + break; + case '*': // end of indel? + j++; + break; + case '^': // mapping quality + i++; + break; + case '+': // start of indel + case '-': // start of indel + final Pattern regex = Pattern.compile("([0-9]+).*"); // matches case 1 + final String rest = bases.substring(i+1); + //System.out.printf("sub is %s%n", rest); + Matcher match = regex.matcher(rest); + if ( ! match.matches() ) { + if ( refBaseChar != '*' ) + throw new RuntimeException("Bad pileup format: " + bases + " at position " + i); + done = true; + } + else { + String g = match.group(1); + //System.out.printf("group is %d, match is %s%n", match.groupCount(), g); + int l = Integer.parseInt(g); + i += l + g.length(); // length of number + that many bases + +/- at the start (included in the next i++) + //System.out.printf("remaining is %d => %s%n", l, bases.substring(i+1)); + } + break; + default: // non reference base + baseBuilder.append(c); + qualBuilder.append((char)quals.charAt(j++)); + } + } + + pileupBases = baseBuilder.toString(); + pileupQuals = qualBuilder.toString(); + } + + public GenomeLoc getLocation() { return loc; } + public String getQuals() { return pileupQuals; } + public char getRef() { return refBaseChar; } + public int size() { return pileupQuals.length(); } + public String getBases() { return pileupBases; } + + public String getPileupString() + { + return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals()); + } + /** Returns bases in the reference allele as a String. String can be empty (as in insertion into * the reference), can contain a single character (as in SNP or one-base deletion), or multiple characters @@ -198,7 +269,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian * @return reference allele, forward strand */ public String getRefBasesFWD() { - return refBases; + return refBases; } /** @@ -243,94 +314,94 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements AllelicVarian } public String toMediumString() { - + String s = null; if ( refBaseChar == '*' ) s = String.format("%s:%s:%s", getLocation().toString(), name,observedString); else s = String.format("%s:%s:%s/%s", getLocation().toString(), name, observedAlleles.get(0),observedAlleles.get(1)); if ( isSNP() ) s += ": SNP"; else { - if ( isInsertion() ) s += ": Insertion"; - else { - if ( isDeletion() ) s+= ": Deletion"; - else { - if ( isIndel() ) s+=": Indel"; - else s+=": Reference"; - } - } + if ( isInsertion() ) s += ": Insertion"; + else { + if ( isDeletion() ) s+= ": Deletion"; + else { + if ( isIndel() ) s+=": Indel"; + else s+=": Reference"; + } + } } - return s; + return s; } public String repl() { return String.format("REPL not implemented yet"); } - - @Override - public String getAltBasesFWD() { - if ( ! isSNP() && ! isIndel() ) return emptyStr; - if ( isSNP() ) { - if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1); - else return observedAlleles.get(0); - } - - if ( isInsertion() ) { - if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(1); - else return observedAlleles.get(0); - } - - if ( isDeletion() ) { - if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(0); - else return observedAlleles.get(1); - - } - - System.out.printf("WARNING: unexpected variant type in pileup %s at %s%n",name,getLocation().toString()); - return null; - } + @Override + public String getAltBasesFWD() { + if ( ! isSNP() && ! isIndel() ) return emptyStr; - @Override - public char getAltSnpFWD() throws IllegalStateException { - if ( ! isSNP() ) throw new IllegalStateException("Variant is not a SNP"); - if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1).charAt(0); - else return observedAlleles.get(0).charAt(0); + if ( isSNP() ) { + if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1); + else return observedAlleles.get(0); + } - } + if ( isInsertion() ) { + if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(1); + else return observedAlleles.get(0); + } - @Override - public double getConsensusConfidence() { - return consensusScore; - } + if ( isDeletion() ) { + if ( observedAlleles.get(0) == emptyStr ) return observedAlleles.get(0); + else return observedAlleles.get(1); + + } + + System.out.printf("WARNING: unexpected variant type in pileup %s at %s%n",name,getLocation().toString()); + return null; + } + + @Override + public char getAltSnpFWD() throws IllegalStateException { + if ( ! isSNP() ) throw new IllegalStateException("Variant is not a SNP"); + if ( observedAlleles.get(0).charAt(0) == refBaseChar ) return observedAlleles.get(1).charAt(0); + else return observedAlleles.get(0).charAt(0); + + } + + @Override + public double getConsensusConfidence() { + return consensusScore; + } - @Override - public double getMAF() { - if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: can not determine minor allele freq for multiallelic site"); - if ( isSNP() || isIndel() ) { - if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) return 1.0; - else return 0.5; - } - return 0; - } + @Override + public double getMAF() { + if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: can not determine minor allele freq for multiallelic site"); + if ( isSNP() || isIndel() ) { + if ( observedAlleles.get(0).equals(observedAlleles.get(1)) ) return 1.0; + else return 0.5; + } + return 0; + } - @Override - public int getPloidy() throws IllegalStateException { - return 2; // ??? - } + @Override + public int getPloidy() throws IllegalStateException { + return 2; // ??? + } - @Override - public double getVariationConfidence() { - return variantScore; - } + @Override + public double getVariationConfidence() { + return variantScore; + } - @Override - public boolean isGenotype() { - return true; - } + @Override + public boolean isGenotype() { + return true; + } - @Override - public boolean isBiallelic() { - return nNonref < 2; - } + @Override + public boolean isBiallelic() { + return nNonref < 2; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java index 92dbf381c..2a31b8538 100644 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraverseByLoci.java @@ -159,7 +159,7 @@ public class TraverseByLoci extends TraversalEngine { sum = walker.reduce(x, sum); } - printProgress("loci", locus.getLocation()); + //printProgress("loci", locus.getLocation()); return sum; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java index de9423c0d..befed2ab7 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java @@ -6,6 +6,9 @@ import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.cmdLine.Argument; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.Pileup; +import org.broadinstitute.sting.utils.BasicPileup; +import org.broadinstitute.sting.utils.ReadBackedPileup; import net.sf.samtools.SAMRecord; import java.util.List; @@ -33,20 +36,20 @@ public class PileupWalker extends LocusWalker { } public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { - List reads = context.getReads(); - List offsets = context.getOffsets(); - String bases = Utils.basePileupAsString(reads, offsets); - String quals = Utils.qualPileupAsString(reads, offsets); - + ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref, context); + String bases = pileup.getBases(); + if ( bases.equals("") && FLAG_UNCOVERED_BASES ) { bases = "*** UNCOVERED SITE ***"; } String extras = ""; if ( VERBOSE ) { - String sqbases = Utils.secondaryBasePileupAsString(reads, offsets); - String sqquals = Utils.secondaryQualPileupAsString(reads, offsets); + extras += " BQ=" + pileup.getQualsAsInts(); + extras += " MQ=" + pileup.getMappingQualsAsInts(); + String sqbases = pileup.getSecondaryBasePileup(); + String sqquals = pileup.getSecondaryQualPileup(); if (sqbases != null && sqquals != null) { assert(sqbases.length() == sqquals.length()); @@ -60,17 +63,17 @@ public class PileupWalker extends LocusWalker { } } } - - extras += " BQ=" + Utils.join(",", Utils.qualPileup(reads, offsets)); - extras += " MQ=" + Utils.join(",", Utils.mappingQualPileup(reads)); } String rodString = ""; for ( ReferenceOrderedDatum datum : tracker.getAllRods() ) { if ( datum != null && ! (datum instanceof rodDbSNP)) { + //System.out.printf("rod = %s%n", datum.toSimpleString()); rodString += datum.toSimpleString(); + //System.out.printf("Rod string %s%n", rodString); } } + rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null); if ( dbsnp != null ) rodString += dbsnp.toMediumString(); @@ -79,7 +82,7 @@ public class PileupWalker extends LocusWalker { rodString = "[ROD: " + rodString + "]"; //if ( context.getLocation().getStart() % 1 == 0 ) { - out.printf("%s: %s %s %s%s %s%n", context.getLocation(), ref, bases, quals, extras, rodString); + out.printf("%s%s %s%n", pileup.getPileupString(), extras, rodString); //} return 1; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/ValidatingPileupWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/ValidatingPileupWalker.java new file mode 100755 index 000000000..684fa937e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/ValidatingPileupWalker.java @@ -0,0 +1,58 @@ +package org.broadinstitute.sting.gatk.walkers; + +import org.broadinstitute.sting.gatk.LocusContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.rodSAMPileup; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.BasicPileup; +import org.broadinstitute.sting.utils.ReadBackedPileup; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 22, 2009 + * Time: 3:22:14 PM + * To change this template use File | Settings | File Templates. + */ +public class ValidatingPileupWalker extends LocusWalker { + @Argument(fullName="verbose",required=false,defaultValue="false") + public boolean VERBOSE; + + public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { + ReadBackedPileup pileup = new ReadBackedPileup(context.getLocation(), ref, context); + + rodSAMPileup truePileup = (rodSAMPileup)tracker.lookup("pileup", null); + if ( truePileup == null ) + Utils.scareUser(String.format("No pileup data available at %s given GATK's output of %s -- this walker requires samtools pileup data over all bases", + context.getLocation(), pileup.getBases())); + + String pileupDiff = BasicPileup.pileupDiff(pileup, truePileup); + if ( pileupDiff != null ) { + out.printf("%s vs. %s%n", pileup.getPileupString(), truePileup.getPileupString()); + throw new RuntimeException(String.format("Pileups aren't equal: %s", pileupDiff)); + } + + return pileup.size(); + } + + // Given result of map function + public ValidationStats reduceInit() { return new ValidationStats(); } + public ValidationStats reduce(Integer value, ValidationStats sum) { + sum.nLoci++; + sum.nBases += value; + return sum; + } +} + +class ValidationStats { + public long nLoci = 0; + public long nBases = 0; + + public ValidationStats() { + } + + public String toString() { + return String.format("Validated %d sites covered by %d bases%n", nLoci, nBases); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/BasicPileup.java b/java/src/org/broadinstitute/sting/utils/BasicPileup.java new file mode 100755 index 000000000..a86cedf3a --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/BasicPileup.java @@ -0,0 +1,178 @@ +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.gatk.LocusContext; +import net.sf.samtools.SAMRecord; + +import java.util.List; +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Apr 14, 2009 + * Time: 8:54:05 AM + * To change this template use File | Settings | File Templates. + */ +abstract public class BasicPileup implements Pileup { + public String getPileupString() + { + return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals()); + } + + public static String basePileupAsString( List reads, List offsets ) { + StringBuilder bases = new StringBuilder(); + for ( byte base : basePileup(reads, offsets)) { + bases.append((char)base); + } + return bases.toString(); + } + + public static ArrayList basePileup( List reads, List offsets ) { + ArrayList bases = new ArrayList(reads.size()); + for ( int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + bases.add(read.getReadBases()[offset]); + } + return bases; + } + + public static ArrayList qualPileup( List reads, List offsets ) { + ArrayList quals = new ArrayList(reads.size()); + for ( int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + byte qual = (byte)read.getBaseQualities()[offset]; + quals.add(qual); + } + return quals; + } + + public static ArrayList mappingQualPileup( List reads) { + ArrayList quals = new ArrayList(reads.size()); + for ( int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + byte qual = (byte)read.getMappingQuality(); + quals.add(qual); + } + return quals; + } + + public static String qualPileupAsString( List reads, List offsets ) { + StringBuilder quals = new StringBuilder(); + for ( int qual : qualPileup(reads, offsets)) { + qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea + char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 + quals.append(qualChar); + } + + return quals.toString(); + } + + public static ArrayList secondaryBasePileup( List reads, List offsets ) { + ArrayList bases2 = new ArrayList(reads.size()); + boolean hasAtLeastOneSQField = false; + + for ( int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + + byte[] compressedQuals = (byte[]) read.getAttribute("SQ"); + byte base2; + //byte qual2; + if (compressedQuals != null) { + base2 = (byte) BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(compressedQuals[offset])); + //qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset])); + hasAtLeastOneSQField = true; + } else { + base2 = (byte) '.'; + } + bases2.add(base2); + } + return (hasAtLeastOneSQField ? bases2 : null); + } + + public static String secondaryBasePileupAsString( List reads, List offsets ) { + StringBuilder bases2 = new StringBuilder(); + ArrayList sqbases = secondaryBasePileup(reads, offsets); + + if (sqbases == null) { return null; } + + for (byte base2 : secondaryBasePileup(reads, offsets)) { + bases2.append((char) base2); + } + + return bases2.toString(); + } + + public static ArrayList secondaryQualPileup( List reads, List offsets ) { + ArrayList quals2 = new ArrayList(reads.size()); + boolean hasAtLeastOneSQField = false; + + for ( int i = 0; i < reads.size(); i++ ) { + SAMRecord read = reads.get(i); + int offset = offsets.get(i); + + byte[] compressedQuals = (byte[]) read.getAttribute("SQ"); + byte qual2; + if (compressedQuals != null) { + qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset])); + hasAtLeastOneSQField = true; + } else { + qual2 = 0; + } + quals2.add(qual2); + } + return (hasAtLeastOneSQField ? quals2 : null); + } + + public static String secondaryQualPileupAsString( List reads, List offsets ) { + StringBuilder quals2 = new StringBuilder(); + ArrayList sqquals = secondaryQualPileup(reads, offsets); + + if (sqquals == null) { return null; } + + for (byte qual2 : secondaryQualPileup(reads, offsets)) { + quals2.append(qual2); + } + + return quals2.toString(); + } + + public static String pileupDiff(final Pileup a, final Pileup b) + { + return pileupDiff(a,b,true); + } + + private static String maybeSorted( final String x, boolean sortMe ) + { + if ( sortMe ) { + byte[] bytes = x.getBytes(); + Arrays.sort(bytes); + return new String(bytes); + } + else + return x; + } + + public static String pileupDiff(final Pileup a, final Pileup b, boolean orderDependent) + { + if ( a.size() != b.size() ) + return "Sizes not equal"; + if ( a.getLocation().compareTo(b.getLocation()) != 0 ) + return "Locations not equal"; + + String aBases = maybeSorted(a.getBases(), ! orderDependent ); + String bBases = maybeSorted(b.getBases(), ! orderDependent ); + if ( ! aBases.toUpperCase().equals(bBases.toUpperCase()) ) + return "Bases not equal"; + + String aQuals = maybeSorted(a.getQuals(), ! orderDependent ); + String bQuals = maybeSorted(b.getQuals(), ! orderDependent ); + if ( ! aQuals.equals(bQuals) ) + return "Quals not equal"; + + return null; + } +} diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 24961e8b0..ebed243e1 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -167,9 +167,10 @@ public class GenomeLoc implements Comparable { // 'chr2', 'chr2:1000000' or 'chr2:1,000,000-2,000,000' //System.out.printf("Parsing location '%s'%n", str); - final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1 - final Pattern regex2 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)$"); // matches case 2 - final Pattern regex3 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$");// matches case 3 + final Pattern regex1 = Pattern.compile("([\\w&&[^:]]+)$"); // matches case 1 + final Pattern regex2 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)$"); // matches case 2 + final Pattern regex3 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)-([\\d,]+)$"); // matches case 3 + final Pattern regex4 = Pattern.compile("([\\w&&[^:]]+):([\\d,]+)\\+"); // matches case 4 String contig = null; long start = 1; @@ -179,6 +180,7 @@ public class GenomeLoc implements Comparable { Matcher match1 = regex1.matcher(str); Matcher match2 = regex2.matcher(str); Matcher match3 = regex3.matcher(str); + Matcher match4 = regex4.matcher(str); try { if ( match1.matches() ) { @@ -189,6 +191,10 @@ public class GenomeLoc implements Comparable { start = parsePosition(match2.group(2)); stop = start; } + else if ( match4.matches() ) { + contig = match4.group(1); + start = parsePosition(match4.group(2)); + } else if ( match3.matches() ) { contig = match3.group(1); start = parsePosition(match3.group(2)); diff --git a/java/src/org/broadinstitute/sting/utils/Pileup.java b/java/src/org/broadinstitute/sting/utils/Pileup.java new file mode 100755 index 000000000..588c1ffa2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/Pileup.java @@ -0,0 +1,17 @@ +package org.broadinstitute.sting.utils; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Apr 14, 2009 + * Time: 9:33:00 AM + * To change this template use File | Settings | File Templates. + */ +public interface Pileup { + GenomeLoc getLocation(); + char getRef(); + String getBases(); + String getQuals(); + String getPileupString(); + int size(); +} diff --git a/java/src/org/broadinstitute/sting/utils/ReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/ReadBackedPileup.java new file mode 100755 index 000000000..feeb97faf --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/ReadBackedPileup.java @@ -0,0 +1,77 @@ +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.gatk.LocusContext; +import net.sf.samtools.SAMRecord; + +import java.util.List; +import java.util.ArrayList; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Apr 14, 2009 + * Time: 8:54:05 AM + * To change this template use File | Settings | File Templates. + */ +public class ReadBackedPileup extends BasicPileup { + GenomeLoc loc; + char ref; + List reads; + List offsets; + + public ReadBackedPileup(GenomeLoc loc, char ref, LocusContext context ) { + this(loc, ref, context.getReads(), context.getOffsets()); + } + + public ReadBackedPileup(GenomeLoc loc, char ref, List reads, List offsets ) { + assert reads != null; + assert offsets != null; + assert reads.size() == offsets.size(); + + this.loc = loc; + this.ref = ref; + this.reads = reads; + this.offsets = offsets; + } + + public int size() { return reads.size(); } + public List getReads() { return reads; } + public List getOffsets() { return offsets; } + + public GenomeLoc getLocation() { + return loc; + } + + public char getRef() { + return ref; + } + + public String getBases() { + return basePileupAsString(reads, offsets); + } + + public String getQuals() { + return qualPileupAsString(reads, offsets); + } + + public String getQualsAsInts() { + return Utils.join(",", qualPileup(reads, offsets)); + } + + public String getMappingQualsAsInts() { + return Utils.join(",", mappingQualPileup(reads)); + } + + public String getSecondaryBasePileup() { + return secondaryBasePileupAsString(reads, offsets); + } + + public String getSecondaryQualPileup() { + return secondaryQualPileupAsString(reads, offsets); + } + + public String getPileupString() + { + return String.format("%s: %s %s %s", getLocation(), getRef(), getBases(), getQuals()); + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 39ea6f16b..90d9eea1c 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -35,11 +35,11 @@ public class Utils { } public static void scareUser(final String msg) { - System.out.printf("********************************************************************************%n"); - System.out.printf("* ERROR:%n"); - System.out.printf("*%n"); - System.out.printf("* %s%n", msg); - System.out.printf("********************************************************************************%n"); + //System.out.printf("********************************************************************************%n"); + //System.out.printf("* ERROR:%n"); + //System.out.printf("*%n"); + //System.out.printf("* %s%n", msg); + //System.out.printf("********************************************************************************%n"); logger.fatal(msg); throw new RuntimeException(msg); } @@ -119,126 +119,6 @@ public class Utils { return c; } - public static String basePileupAsString( List reads, List offsets ) { - StringBuilder bases = new StringBuilder(); - for ( byte base : basePileup(reads, offsets)) { - bases.append((char)base); - } - return bases.toString(); - } - - public static ArrayList basePileup( List reads, List offsets ) { - ArrayList bases = new ArrayList(reads.size()); - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - bases.add(read.getReadBases()[offset]); - } - return bases; - } - - public static ArrayList qualPileup( List reads, List offsets ) { - ArrayList quals = new ArrayList(reads.size()); - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - byte qual = (byte)read.getBaseQualities()[offset]; - quals.add(qual); - } - return quals; - } - - public static ArrayList mappingQualPileup( List reads) { - ArrayList quals = new ArrayList(reads.size()); - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - byte qual = (byte)read.getMappingQuality(); - quals.add(qual); - } - return quals; - } - - public static String qualPileupAsString( List reads, List offsets ) { - StringBuilder quals = new StringBuilder(); - for ( int qual : qualPileup(reads, offsets)) { - qual = Math.min(qual, 63); // todo: fixme, this isn't a good idea - char qualChar = (char) (33 + qual); // todo: warning, this is illegal for qual > 63 - quals.append(qualChar); - } - - return quals.toString(); - } - - public static ArrayList secondaryBasePileup( List reads, List offsets ) { - ArrayList bases2 = new ArrayList(reads.size()); - boolean hasAtLeastOneSQField = false; - - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - byte[] compressedQuals = (byte[]) read.getAttribute("SQ"); - byte base2; - //byte qual2; - if (compressedQuals != null) { - base2 = (byte) BaseUtils.baseIndexToSimpleBase(QualityUtils.compressedQualityToBaseIndex(compressedQuals[offset])); - //qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset])); - hasAtLeastOneSQField = true; - } else { - base2 = (byte) '.'; - } - bases2.add(base2); - } - return (hasAtLeastOneSQField ? bases2 : null); - } - - public static String secondaryBasePileupAsString( List reads, List offsets ) { - StringBuilder bases2 = new StringBuilder(); - ArrayList sqbases = secondaryBasePileup(reads, offsets); - - if (sqbases == null) { return null; } - - for (byte base2 : secondaryBasePileup(reads, offsets)) { - bases2.append((char) base2); - } - - return bases2.toString(); - } - - public static ArrayList secondaryQualPileup( List reads, List offsets ) { - ArrayList quals2 = new ArrayList(reads.size()); - boolean hasAtLeastOneSQField = false; - - for ( int i = 0; i < reads.size(); i++ ) { - SAMRecord read = reads.get(i); - int offset = offsets.get(i); - - byte[] compressedQuals = (byte[]) read.getAttribute("SQ"); - byte qual2; - if (compressedQuals != null) { - qual2 = QualityUtils.probToQual(QualityUtils.compressedQualityToProb(compressedQuals[offset])); - hasAtLeastOneSQField = true; - } else { - qual2 = 0; - } - quals2.add(qual2); - } - return (hasAtLeastOneSQField ? quals2 : null); - } - - public static String secondaryQualPileupAsString( List reads, List offsets ) { - StringBuilder quals2 = new StringBuilder(); - ArrayList sqquals = secondaryQualPileup(reads, offsets); - - if (sqquals == null) { return null; } - - for (byte qual2 : secondaryQualPileup(reads, offsets)) { - quals2.append(qual2); - } - - return quals2.toString(); - } - public static ArrayList subseq(byte[] fullArray) { return subseq(fullArray, 0, fullArray.length); } diff --git a/python/StressTestGATK.py b/python/StressTestGATK.py index bdb7f1e83..9c7e0e41d 100755 --- a/python/StressTestGATK.py +++ b/python/StressTestGATK.py @@ -50,6 +50,10 @@ if __name__ == "__main__": head, lane_filename = os.path.split(lane) filebase = os.path.splitext(lane_filename)[0] + + # make the pileup + # 503 8:03 samtools view -b MV1994.bam Escherichia_coli_K12:10-1000 > sub.bam + # 504 8:03 samtools pileup -f Escherichia_coli_K12_MG1655.fasta sub.bam # convert the fasta for analysis in commandsList: