From 5289230eb8417e37050e52f4ff00eacb7957e44b Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 25 Jun 2009 22:50:55 +0000 Subject: [PATCH] Version 0.2.1 (released) of the TableRecalibrator git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1108 348d0f76-0448-11de-a6fe-93d51630548a --- R/plot_q_emp_stated_hst.R | 11 +- R/plot_qual_diff_v_cycle.R | 6 +- .../gatk/refdata/ReferenceOrderedData.java | 1 - .../recalibration/CovariateCounter.java | 128 ++++++++++ .../recalibration/CovariateCounterWalker.java | 232 +++++++----------- .../gatk/walkers/recalibration/RecalData.java | 13 +- .../TableRecalibrationWalker.java | 80 +++--- .../gatk/walkers/BaseQualityHistoWalker.java | 3 + .../sting/utils/GenomeLocParser.java | 2 +- .../sting/utils/QualityUtils.java | 10 +- .../sting/utils/sam/ArtificialSAMUtils.java | 2 +- .../recalibration/CovariateCounterTest.java | 183 ++++++++++++++ packages/GATKResources.xml | 14 ++ packages/ReadQualityRecalibrator.xml | 15 +- python/Gelis2PopSNPs.py | 12 +- python/analyzeRecalQuals.py | 22 +- python/picard_utils.py | 4 +- shell/makeResources.sh | 2 + 18 files changed, 530 insertions(+), 210 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounter.java create mode 100755 java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java create mode 100755 packages/GATKResources.xml create mode 100755 shell/makeResources.sh diff --git a/R/plot_q_emp_stated_hst.R b/R/plot_q_emp_stated_hst.R index ec6f1e0af..468b619d0 100755 --- a/R/plot_q_emp_stated_hst.R +++ b/R/plot_q_emp_stated_hst.R @@ -12,8 +12,13 @@ t=read.table(input, header=T) #png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446) outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") pdf(outfile, height=7, width=7) -plot(t$Qreported, t$Qempirical, type="p", col="blue", xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores") -abline(0,1) +d.good <- t[t$nMismatches >= 1000,] +d.100 <- t[t$nMismatches < 100,] +d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,] +plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,45), ylim=c(0,45), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores") +points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16) +points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16) +abline(0,1, lty=2) dev.off() #outfile = paste(input, ".quality_emp_hist.png", sep="") @@ -21,6 +26,6 @@ dev.off() outfile = paste(input, ".quality_emp_hist.pdf", sep="") pdf(outfile, height=7, width=7) hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0) -plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,40), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n") +plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,45), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n") axis(2,axTicks(2), format(axTicks(2), scientific=F)) dev.off() diff --git a/R/plot_qual_diff_v_cycle.R b/R/plot_qual_diff_v_cycle.R index 862569c22..a9d4f5e46 100755 --- a/R/plot_qual_diff_v_cycle.R +++ b/R/plot_qual_diff_v_cycle.R @@ -12,5 +12,9 @@ outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="") pdf(outfile, height=7, width=7) par(cex=1.1) c <- read.table(input, header=T) -plot(c$Cycle, c$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", ylim=c(-10, 10)) +d.good <- c[c$nMismatches >= 100,] +d.100 <- c[c$nMismatches < 100,] +plot(d.good$Cycle, d.good$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", col="blue", ylim=c(-10, 10)) +points(d.100$Cycle, d.100$Qempirical_Qreported, type="p", col="lightblue", pch=3) +#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16) diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index 29308fe67..6f1c77c43 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -85,7 +85,6 @@ public class ReferenceOrderedData implements public static void parseBindings(Logger logger, ArrayList bindings, List > rods) { // Loop over triplets - System.out.printf("Binding is %s%n", Utils.join(" XXX ", bindings)); for( String bindingSets: bindings ) { String[] bindingTokens = bindingSets.split(","); if( bindingTokens.length % 3 != 0 ) diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounter.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounter.java new file mode 100755 index 000000000..da9ea62de --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounter.java @@ -0,0 +1,128 @@ +package org.broadinstitute.sting.gatk.walkers.recalibration; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.BaseUtils; + +import javax.management.RuntimeErrorException; +import java.util.*; + +public class CovariateCounter { + private boolean collapsePos = false; + private boolean collapseDinuc = false; + + private HashMap data = new HashMap(); + + public CovariateCounter( Set readGroups, boolean collapsePos, boolean collapseDinuc ) { + this.collapsePos = collapsePos; + this.collapseDinuc = collapseDinuc; + + for (String readGroup : readGroups ) { + RecalDataManager manager = new RecalDataManager(readGroup, ! collapsePos, ! collapseDinuc ); + data.put(readGroup, manager); + } + } + + /** + * Returns the set of readGroup names we are counting covariates for + * @return + */ + public Set getReadGroups() { + return data.keySet(); + } + + public boolean isCollapseDinuc() { + return collapseDinuc; + } + + public boolean isCollapsePos() { + return collapsePos; + } + + /** + * Returns the number of read groups being managed + * @return + */ + public int getNReadGroups() { + return data.size(); + } + + /** + * Get the particular RecalData datum associated with readGroup, at machine pos, with reported + * quality qual, and with the dinuc context of prevBase, base. If an example of such a + * base has been seen before, returns the associated RecalData. If not, it creates one, places it in the + * system so that subsequent requests will return that object, and returns it. + * + * @param readGroup + * @param pos + * @param qual + * @param prevBase + * @param base + * @return + */ + public RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) { + byte[] cs = {(byte)prevBase, (byte)base}; + String s = new String(cs); + return data.get(readGroup).expandingGetRecalData(pos, qual, s, true); + } + + /** + * Get a list of all of the RecalData associated with readGroup + * + * @param readGroup + * @return + */ + public List getRecalData(String readGroup) { + return data.get(readGroup).getAll(); + } + + /** + * Updates the recalibration data for the base at offset in the read, associated with readGroup rg. + * Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read + * but by implied machine cycle associated with the offset. + * + * TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based + * TODO: and offset is 0-based. How very annoying. + * + * @param rg + * @param read + * @param offset + * @param ref + * @return + */ + public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) { + if ( offset == 0 ) + throw new RuntimeException("Illegal read offset " + offset + " in read " + read.getReadName()); + + int cycle = offset; + byte[] bases = read.getReadBases(); + byte[] quals = read.getBaseQualities(); + + char base = (char)bases[offset]; + char prevBase = (char)bases[offset - 1]; + + if (read.getReadNegativeStrandFlag()) { + ref = (char)BaseUtils.simpleComplement(ref); + base = (char)BaseUtils.simpleComplement(base); + prevBase = (char)BaseUtils.simpleComplement((char)bases[offset+1]); + cycle = read.getReadLength() - (offset + 1); + } + + int qual = quals[offset]; + if ( qual > 0 ) { + RecalData datum = getRecalData(rg, cycle, qual, prevBase, base); + if (datum != null) datum.inc(base,ref); + return 1; + } else { + return 0; + } + } + + public void printState() { + for ( String readGroup : getReadGroups() ) { + for ( RecalData datum : getRecalData(readGroup) ) { + if ( datum.N > 0 ) + System.out.println(datum); + } + } + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java index 47a4d721b..100972c66 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterWalker.java @@ -7,18 +7,15 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.rodDbSNP; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.WalkerName; - import org.broadinstitute.sting.utils.cmdLine.Argument; -import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.BaseUtils; import java.util.*; import java.io.PrintStream; import java.io.FileNotFoundException; @WalkerName("CountCovariates") -public class CovariateCounterWalker extends LocusWalker { +public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false) public int buggyMaxReadLen = 100000; @@ -28,83 +25,60 @@ public class CovariateCounterWalker extends LocusWalker { @Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score") public int MIN_MAPPING_QUALITY = 1; - @Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)") - public String READ_GROUP = "none"; - - //@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.") - //public int MAX_READ_GROUPS = 100; - @Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)") public List platforms = Collections.singletonList("*"); //public List platforms = Collections.singletonList("ILLUMINA"); - @Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="") + //@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="") public boolean collapsePos = false; - @Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="") + //@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="") public boolean collapseDinuc = false; - HashMap data = new HashMap(); + private CovariateCounter covariateCounter = null; - long counted_sites = 0; // number of sites used to count covariates - long counted_bases = 0; // number of bases used to count covariates - long skipped_sites = 0; // number of sites skipped because of a dbSNP entry + private long counted_sites = 0; // number of sites used to count covariates + private long counted_bases = 0; // number of bases used to count covariates + private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry - PrintStream recalTableOut = null; + /** + * Initialize the system. Setup the data CovariateCountry for the read groups in our header + */ public void initialize() { - try { - recalTableOut = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" ); - } catch ( FileNotFoundException e ) { - throw new RuntimeException("Couldn't open output file", e); - } - + Set readGroups = new HashSet(); for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { if( readGroup.getAttribute("PL") == null ) Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId())); if( !isSupportedReadGroup(readGroup) ) continue; - String rg = readGroup.getReadGroupId(); - //RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc ); - RecalDataManager manager = new RecalDataManager(rg, ! collapsePos, ! collapseDinuc ); - data.put(rg, manager); + readGroups.add(readGroup.getReadGroupId()); } - out.printf("Created recalibration data collectors for %d read group(s)%n", data.size()); + + covariateCounter = new CovariateCounter(readGroups, collapsePos, collapseDinuc); + logger.info(String.format("Created recalibration data collectors for %d read group(s)", covariateCounter.getNReadGroups())); } - /** - * Get the particular RecalData datum associated with readGroup, at machine pos, with reported - * quality qual, and with the dinuc context of prevBase, base. If an example of such a - * base has been seen before, returns the associated RecalData. If not, it creates one, places it in the - * system so that subsequent requests will return that object, and returns it. - * - * @param readGroup - * @param pos - * @param qual - * @param prevBase - * @param base - * @return - */ - private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) { - byte[] cs = {(byte)prevBase, (byte)base}; - String s = new String(cs); - return data.get(readGroup).expandingGetRecalData(pos, qual, s, true); - } + // -------------------------------------------------------------------------------------------------------------- + // + // map + // + // -------------------------------------------------------------------------------------------------------------- /** - * Get a list of all of the RecalData associated with readGroup + * Walk over each read in the locus pileup and update the covariate counts based on these bases and their + * matching (or not) with ref. dbSNP aware, so avoids sites that are known as SNPs in DBSNP. * - * @param readGroup + * @param tracker + * @param ref + * @param context * @return */ - private List getRecalData(String readGroup) { - return data.get(readGroup).getAll(); - } - public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) { - //System.out.printf("%s %c%n", context.getLocation(), ref); rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null); if ( dbsnp == null || !dbsnp.isSNP() ) { + // We aren't at a dbSNP position that's a SNP, so update the read + List reads = context.getReads(); List offsets = context.getOffsets(); for (int i =0; i < reads.size(); i++ ) { @@ -115,73 +89,71 @@ public class CovariateCounterWalker extends LocusWalker { } SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG")); - if ( isSupportedReadGroup(readGroup) && - (READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) && - (read.getMappingQuality() >= MIN_MAPPING_QUALITY)) { + if ((read.getMappingQuality() >= MIN_MAPPING_QUALITY && isSupportedReadGroup(readGroup) )) { int offset = offsets.get(i); - int numBases = read.getReadLength(); - if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count - counted_bases += updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref); + if ( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they suck and they don't have a dinuc count + counted_bases += covariateCounter.updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref); } } } counted_sites += 1; } else { skipped_sites += 1; - //System.out.println(dbsnp.toSimpleString()+" "+new ReadBackedPileup(ref, context).getPileupString()); } return 1; } + /** - * Updates the recalibration data for the base at offset in the read, associated with readGroup rg. - * Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read - * but by implied machine cycle associated with the offset. + * Check to see whether this read group should be processed. Returns true if the + * read group is in the list of platforms to process or the platform == *, indicating + * that all platforms should be processed. * - * TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based - * TODO: and offset is 0-based. How very annoying. - * - * @param rg - * @param read - * @param offset - * @param ref + * @param readGroup * @return */ - private int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) { - int cycle = offset; - byte[] bases = read.getReadBases(); - byte[] quals = read.getBaseQualities(); - - char base = (char)bases[offset]; - char prevBase = (char)bases[offset - 1]; - - if (read.getReadNegativeStrandFlag()) { - ref = (char)BaseUtils.simpleComplement(ref); - base = (char)BaseUtils.simpleComplement(base); - prevBase = (char)BaseUtils.simpleComplement((char)bases[offset+1]); - cycle = read.getReadLength() - (offset + 1); + private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) { + for( String platform: platforms ) { + platform = platform.trim(); + if( readGroup.getAttribute("PL") == null || + platform.equals("*") || + readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) ) + return true; } - int qual = quals[offset]; - if ( qual > 0 ) { - RecalData datum = getRecalData(rg, cycle, qual, prevBase, base); - if (datum != null) datum.inc(base,ref); - return 1; - } else { - return 0; + return false; + } + + // -------------------------------------------------------------------------------------------------------------- + // + // Reduce + // + // -------------------------------------------------------------------------------------------------------------- + + /** + * Provide an initial value for reduce computations. + * @return Initial value of reduce. + */ + public PrintStream reduceInit() { + try { + return new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" ); + } catch ( FileNotFoundException e ) { + throw new RuntimeException("Couldn't open output file", e); } } - public void onTraversalDone(Integer result) { + public void onTraversalDone(PrintStream recalTableStream) { printInfo(out); - out.printf("Writing raw recalibration data%n"); - writeRecalTable(); + out.printf("Writing raw recalibration data..."); out.flush(); + writeRecalTable(recalTableStream); out.printf("...done%n"); //out.printf("Writing logistic recalibration data%n"); //writeLogisticRecalibrationTable(); //out.printf("...done%n"); + + recalTableStream.close(); } /** @@ -189,13 +161,13 @@ public class CovariateCounterWalker extends LocusWalker { * @param out */ private void printInfo(PrintStream out) { - out.printf("# date %s%n", new Date()); - out.printf("# collapsed_pos %b%n", collapsePos); - out.printf("# collapsed_dinuc %b%n", collapseDinuc); - out.printf("# counted_sites %d%n", counted_sites); - out.printf("# counted_bases %d%n", counted_bases); - out.printf("# skipped_sites %d%n", skipped_sites); - out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites); + out.printf("# date \"%s\"%n", new Date()); + out.printf("# collapsed_pos %b%n", collapsePos); + out.printf("# collapsed_dinuc %b%n", collapseDinuc); + out.printf("# counted_sites %d%n", counted_sites); + out.printf("# counted_bases %d%n", counted_bases); + out.printf("# skipped_sites %d%n", skipped_sites); + out.printf("# fraction_skipped 1 / %.0f bp%n", (double)counted_sites / skipped_sites); } @Deprecated @@ -204,14 +176,14 @@ public class CovariateCounterWalker extends LocusWalker { try { dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv"); dinuc_out.println("rg,dn,logitQ,pos,indicator,count"); - for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { + for (String readGroup : covariateCounter.getReadGroups()) { for ( int dinuc_index=0; dinuc_index 0) - dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B); + dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B); if (datum.B > 0) - dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B); + dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B); } } } @@ -229,56 +201,26 @@ public class CovariateCounterWalker extends LocusWalker { * Writes out the key recalibration data collected from the reads. Dumps this recalibration data * as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file. */ - private void writeRecalTable() { - printInfo(recalTableOut); - recalTableOut.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp"); - for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) { - // TODO: should sort the data coming out of getRecalData here for easier processing - for ( RecalData datum: RecalData.sort(getRecalData(readGroup.getReadGroupId())) ) { + private void writeRecalTable(PrintStream recalTableStream) { + printInfo(recalTableStream); + + recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp"); + for (String readGroup : covariateCounter.getReadGroups()) { + for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) { if ( datum.N > 0 ) - recalTableOut.format("%s%n", datum.toCSVString(collapsePos)); + recalTableStream.format("%s%n", datum.toCSVString(collapsePos)); } } - recalTableOut.close(); - } - - /** - * Check to see whether this read group should be processed. Returns true if the - * read group is in the list of platforms to process or the platform == *, indicating - * that all platforms should be processed. - * - * @param readGroup - * @return - */ - private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) { - for( String platform: platforms ) { - platform = platform.trim(); - if( readGroup.getAttribute("PL") == null || - platform.equals("*") || - readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) ) - return true; - } - - return false; - } - - /** - * No initialization routines - * - * @return - */ - public Integer reduceInit() { - return 0; } /** * Doesn't do anything * - * @param a - * @param b + * @param empty + * @param recalTableStream * @return */ - public Integer reduce(Integer a, Integer b) { - return 0; + public PrintStream reduce(Integer empty, PrintStream recalTableStream) { + return recalTableStream; } } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java index d93574946..dabf68b83 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalData.java @@ -167,7 +167,7 @@ public class RecalData implements Comparable { * @param s * @return */ - public static RecalData fromCSVString(String s) { + public static RecalData fromCSVString(String s) throws NumberFormatException { String[] vals = s.split(","); String rg = vals[0]; int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]); @@ -178,6 +178,13 @@ public class RecalData implements Comparable { RecalData datum = new RecalData(pos, qual, rg, dinuc); datum.B = B; datum.N = N; + + // Checking for badness + if ( pos < 0 ) throw new NumberFormatException("Illegal position detected: " + pos); + if ( B < 0 ) throw new NumberFormatException("Illegal mismatch count detected: " + B); + if ( N < 0 ) throw new NumberFormatException("Illegal base count detected: " + N); + if ( qual < 0 || qual > QualityUtils.MAX_QUAL_SCORE ) throw new NumberFormatException("Illegal qual detected: " + qual); + return datum; } @@ -216,8 +223,8 @@ public class RecalData implements Comparable { } double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases); - System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n", - sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q); + //System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n", + // sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q); return q; } } \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 8b50ae1d6..ca1aff4b6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -54,6 +54,8 @@ public class TableRecalibrationWalker extends ReadWalker [prevBase x base -> [cycle, qual, new qual]] @@ -77,9 +79,12 @@ public class TableRecalibrationWalker extends ReadWalker data = new ArrayList(); boolean collapsedPos = false; boolean collapsedDinuc = false; - List lines = new xReadLines(new File(paramsFile)).readLines(); - for ( String line : lines ) { + //List lines = new xReadLines(new File(paramsFile)).readLines(); + for ( String line : new xReadLines(new File(paramsFile)) ) { lineNumber++; if ( HEADER_PATTERN.matcher(line).matches() ) continue; @@ -159,12 +164,14 @@ public class TableRecalibrationWalker extends ReadWalker QualityUtils.MAX_REASONABLE_Q_SCORE ) + throw new StingException(String.format("Bug found -- assigning bad quality score %d x %d => %d", cycle, qual, newQual)); + recalQuals[cycle] = newQual; //System.out.printf("Mapping %d => %d%n", qual, newQual); } @@ -322,17 +346,12 @@ class CombinatorialRecalMapping implements RecalMapping { int pos = manager.canonicalPos(datum.pos); if ( table[pos][datum.qual] != 0 ) throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); - //table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte()); table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical); //System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical)); } } public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) { - //String dinuc = String.format("%c%c", (char)prevBase, (char)base); - //if ( qual == 2 ) - // System.out.printf("Qual = 2%n"); - int pos = manager.canonicalPos(cycle); int index = this.manager.getDinucIndex(prevBase, base); byte[][] dataTable = index == -1 ? null : cache.get(index); @@ -340,13 +359,7 @@ class CombinatorialRecalMapping implements RecalMapping { if ( dataTable == null && prevBase != 'N' && base != 'N' ) throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base)); - byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual; - - //if ( result == 2 ) - // System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n", - // readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result); - - return result; + return dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual; } } @@ -368,13 +381,13 @@ class SerialRecalMapping implements RecalMapping { private double globalDeltaQ = 0.0; private double[][] deltaQPosMap, deltaQDinucMap; double [] deltaQualMap; - RecalData [][] qPosSupports, qDinucSupports; + RecalData [][] qPosSupports = null, qDinucSupports = null; CombinatorialRecalMapping combiMap; RecalDataManager manager; String dinucToLookAt = null; // "CC"; - int posToLookAt = 0; + int posToLookAt = -1; int qualToLookAt = 25; public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical, @@ -387,7 +400,7 @@ class SerialRecalMapping implements RecalMapping { RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll()); double aggregrateQreported = RecalData.combinedQreported(manager.getAll()); globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported; - System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ); + //System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ); } for ( RecalData datum : manager.getAll() ) { @@ -399,12 +412,12 @@ class SerialRecalMapping implements RecalMapping { deltaQualMap = new double[maxQReported+1]; for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) { deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ; - System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]); + //System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]); } // calculate the delta Q pos array deltaQPosMap = new double[maxPos+1][maxQReported+1]; - qPosSupports = new RecalData[maxPos+1][maxQReported+1]; + //qPosSupports = new RecalData[maxPos+1][maxQReported+1]; for ( RecalData datumAtPosQual : manager.combineDinucs() ) { double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual]; updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset); @@ -412,7 +425,7 @@ class SerialRecalMapping implements RecalMapping { // calculate the delta Q dinuc array deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1]; - qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1]; + //qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1]; for ( RecalData datumAtDinucQual : manager.combineCycles() ) { double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual]; updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset); @@ -429,7 +442,7 @@ class SerialRecalMapping implements RecalMapping { for ( int j = 0; j < maxQReported; j++ ) { if ( printStateP(i, null, j) ) System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n", - i, j, deltaQPosMap[i][j], qPosSupports[i][j]); + i, j, deltaQPosMap[i][j], qPosSupports != null ? qPosSupports[i][j] : null); } } @@ -438,7 +451,7 @@ class SerialRecalMapping implements RecalMapping { String dinuc = RecalData.dinucIndex2bases(i); if ( printStateP(0, dinuc, j ) ) System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n", - dinuc, j, deltaQDinucMap[i][j], qDinucSupports[i][j]); + dinuc, j, deltaQDinucMap[i][j], qDinucSupports != null ? qDinucSupports[i][j] : null); } } @@ -457,7 +470,8 @@ class SerialRecalMapping implements RecalMapping { throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum)); double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ; table[i][j] = deltaQ; - supports[i][j] = datum; + if ( supports != null ) + supports[i][j] = datum; } private boolean printStateP( int cycle, String dinuc, int qual ) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseQualityHistoWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseQualityHistoWalker.java index 370d3ffa4..7dd66a51b 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseQualityHistoWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/BaseQualityHistoWalker.java @@ -30,6 +30,9 @@ public class BaseQualityHistoWalker extends ReadWalker { // Map over the org.broadinstitute.sting.gatk.LocusContext public Integer map(char[] ref, SAMRecord read) { for ( byte qual : read.getBaseQualities() ) { + if ( qual < 0 || qual > 100 ) { + throw new RuntimeException(String.format("Invalid base quality detected -- %d at %s%n", qual, read.getReadName())); + } //System.out.println(qual); this.qualCounts[qual]++; } diff --git a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 3572f8257..5f16de7d0 100644 --- a/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -244,7 +244,7 @@ public class GenomeLocParser { * @return the list of merged locations */ public static List mergeOverlappingLocations(final List raw) { - logger.debug(" Raw locations are:\n" + Utils.join("\n", raw)); + logger.debug(" Raw locations are: " + Utils.join(", ", raw)); if (raw.size() <= 1) return raw; else { diff --git a/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 3111612b4..fdf7fbbfd 100755 --- a/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -85,8 +85,16 @@ public class QualityUtils { return boundQual(qual, MAX_QUAL_SCORE); } + /** + * Returns an integer quality score bounded by 1 - maxQual. + * + * @param qual + * @param maxQual + * @return + */ static public byte boundQual(int qual, byte maxQual) { - return (byte) Math.min(qual, maxQual); + //return (byte) Math.min(qual, maxQual); + return (byte) Math.max(Math.min(qual, maxQual), 1); } /** diff --git a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index d805aa4dd..ae2b5d1a9 100755 --- a/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -191,7 +191,7 @@ public class ArtificialSAMUtils { } SAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length); rec.setReadBases(bases); - rec.setBaseQualities(bases); + rec.setBaseQualities(qual); return rec; } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java new file mode 100755 index 000000000..45d11b14b --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/walkers/recalibration/CovariateCounterTest.java @@ -0,0 +1,183 @@ +// our package +package org.broadinstitute.sting.gatk.walkers.recalibration; + +import java.util.*; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMFileHeader; + +// the imports for unit testing. +import org.junit.Assert; +import org.junit.Test; +import org.junit.Before; +import org.junit.BeforeClass; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.playground.gatk.walkers.indels.CleanedReadInjector; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMFileReader; +import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; + +import java.util.HashSet; +import java.io.FileNotFoundException; +import java.io.File; + +/** + * Basic unit test for RecalData + */ +public class CovariateCounterTest extends BaseTest { + String readGroup1 = "rg1"; + String readGroup2 = "rg2"; + Set readGroups = new HashSet(); + + SAMFileHeader header; + + SAMRecord read1, read2, read3; + + byte bases1[] = {'a', 't', 'c', 'g', 'a'}; + byte quals1[] = {1, 2, 3, 4, 5}; + byte quals3[] = {1, 2, 5, 5, 5}; + byte bases2[] = {'t', 'c', 'g', 'a', 't'}; + byte quals2[] = {2, 2, 4, 5, 2}; + + /* + public CovariateCounter( Set readGroups, boolean collapsePos, boolean collapseDinuc ) { + public Set getReadGroups() { + public boolean isCollapseDinuc() { + public boolean isCollapsePos() { + public int getNReadGroups() { + private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) { + public List getRecalData(String readGroup) { + public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) { + */ + + /** + * The fasta, for comparison. + */ + protected static IndexedFastaSequenceFile sequenceFile = null; + + CovariateCounter c; + + /** + * Initialize the fasta. + */ + @BeforeClass + public static void initialize() throws FileNotFoundException { + sequenceFile = new IndexedFastaSequenceFile( new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") ); + GenomeLocParser.setupRefContigOrdering(sequenceFile); + + } + + @Before + public void initializeBefore() { + header = ArtificialSAMUtils.createArtificialSamHeader(2,0,247249719); + readGroups.addAll(Arrays.asList(readGroup1, readGroup2)); + ArtificialSAMUtils.createDefaultReadGroup( header, readGroup1, "sample1" ); + ArtificialSAMUtils.createDefaultReadGroup( header, readGroup2, "sample2" ); + c = new CovariateCounter( readGroups, false, false ); + + read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases1, quals1); + read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",1,1, bases2, quals2); + read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",1,1, bases1, quals3); + } + + @Test + public void testCovariateCounterSetup() { + Assert.assertEquals("Number of read groups is wrong", c.getNReadGroups(), 2); + Assert.assertEquals("Read group identities are wrong", c.getReadGroups(), readGroups); + Assert.assertEquals("Incorrectly collapsed counter", c.isCollapseDinuc(), false); + Assert.assertEquals("Incorrectly collapsed counter", c.isCollapsePos(), false); + } + + @Test + public void testOneRead() { + for ( int i = 1; i < read1.getReadBases().length; i++ ) + c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]); + c.printState(); + + Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0); + for ( int i = 1; i < bases1.length; i++ ) { + RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]); + System.out.printf("%s%n", datum); + Assert.assertNotNull("Incorrect mapping to recal bin", datum); + Assert.assertEquals("Bad mismatch count", datum.B, 0); + Assert.assertEquals("Bad base count", datum.N, 1); + Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]); + Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]); + Assert.assertEquals("Qual is bad", datum.qual, quals1[i]); + } + } + + @Test + public void testTwoReads() { + for ( int i = 1; i < read1.getReadBases().length; i++ ) + c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]); + for ( int i = 1; i < read2.getReadBases().length; i++ ) + c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i]); + c.printState(); + + Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0); + for ( int i = 1; i < bases1.length; i++ ) { + RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]); + System.out.printf("%s%n", datum); + Assert.assertNotNull("Incorrect mapping to recal bin", datum); + Assert.assertEquals("Bad mismatch count", datum.B, 0); + Assert.assertEquals("Bad base count", datum.N, 1); + Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]); + Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]); + Assert.assertEquals("Qual is bad", datum.qual, quals1[i]); + } + } + + @Test + public void testTwoReadsSameGroup() { + for ( int i = 1; i < read1.getReadBases().length; i++ ) + c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]); + for ( int i = 1; i < read2.getReadBases().length; i++ ) + c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]); + c.printState(); + + for ( int i = 1; i < bases1.length; i++ ) { + RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]); + System.out.printf("%s%n", datum); + Assert.assertNotNull("Incorrect mapping to recal bin", datum); + Assert.assertEquals("Bad mismatch count", datum.B, 0); + Assert.assertEquals("Bad base count", datum.N, 2); + Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]); + Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]); + Assert.assertEquals("Qual is bad", datum.qual, quals1[i]); + } + } + + @Test + public void testTwoReadsSameGroupNotIdentical() { + for ( int i = 1; i < read1.getReadBases().length; i++ ) + c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]); + for ( int i = 1; i < read3.getReadBases().length; i++ ) + c.updateDataFromRead(readGroup1, read3, i, (char)read3.getReadBases()[i]); + c.printState(); + + for ( int i = 1; i < bases1.length; i++ ) { + RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]); + System.out.printf("%s%n", datum); + Assert.assertNotNull("Incorrect mapping to recal bin", datum); + Assert.assertEquals("Bad mismatch count", datum.B, 0); + Assert.assertEquals("Bad base count", datum.N, quals1[i] == quals3[i] ? 2 : 1); + Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]); + Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]); + Assert.assertEquals("Qual is bad", datum.qual, quals1[i]); + } + } + + @Test (expected = RuntimeException.class) + public void testBadReadOffset() { + byte bases[] = {'a', 't', 'c', 'g', 'a'}; + byte quals[] = {1, 2, 3, 4, 5}; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases, quals); + + c.updateDataFromRead(readGroup1, read, 0, (char)bases[0]); + } +} \ No newline at end of file diff --git a/packages/GATKResources.xml b/packages/GATKResources.xml new file mode 100755 index 000000000..c95a0e7a3 --- /dev/null +++ b/packages/GATKResources.xml @@ -0,0 +1,14 @@ + + + GATKResources + + /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod + /broad/1KG/reference/human_b36_both.fasta + /broad/1KG/reference/human_b36_both.dict + /broad/1KG/reference/human_b36_both.fasta.fai + /humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod + /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta + /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict + /seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai + + diff --git a/packages/ReadQualityRecalibrator.xml b/packages/ReadQualityRecalibrator.xml index 02b23face..c2d368500 100644 --- a/packages/ReadQualityRecalibrator.xml +++ b/packages/ReadQualityRecalibrator.xml @@ -3,18 +3,7 @@ ReadQualityRecalibrator org.broadinstitute.sting.gatk.CommandLineGATK - org.broadinstitute.sting.playground.gatk.walkers.CovariateCounterWalker - org.broadinstitute.sting.playground.gatk.walkers.LogisticRecalibrationWalker + org.broadinstitute.sting.gatk.walkers.recalibration.CovariateCounterWalker + org.broadinstitute.sting.gatk.walkers.recalibration.TableRecalibrationWalker - - python/RecalQual.py - python/LogisticRegressionByReadGroup.py - - - R/logistic_regression.R - /humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod - /broad/1KG/reference/human_b36_both.fasta - /broad/1KG/reference/human_b36_both.dict - /broad/1KG/reference/human_b36_both.fasta.fai - diff --git a/python/Gelis2PopSNPs.py b/python/Gelis2PopSNPs.py index 871bf44b4..89246ddad 100755 --- a/python/Gelis2PopSNPs.py +++ b/python/Gelis2PopSNPs.py @@ -44,6 +44,16 @@ def bams2geli(bams): calls = map(call1, bams) return map(lambda x: x[0], calls), map(lambda x: x[1], calls) +def gelis2gelisText( gelis ): + def geli2geliText( maybeGeli ): + if os.path.splitext(maybeGeli)[1] == ".calls" : + return maybeGeli + else: + return os.path.split(geli)[1] + '.calls' + + return map( geli2geliText, gelis) + + def main(): global OPTIONS, ROOT @@ -124,7 +134,7 @@ def main(): # convert the geli's to text jobid = None - variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis) + variantsOut = gelis2gelisText( gelis ) for geli, variantOut in zip(gelis, variantsOut): name = os.path.split(geli)[1] if not os.path.exists(variantOut): diff --git a/python/analyzeRecalQuals.py b/python/analyzeRecalQuals.py index f75d8c414..d466ba0fa 100755 --- a/python/analyzeRecalQuals.py +++ b/python/analyzeRecalQuals.py @@ -13,6 +13,7 @@ import operator MAX_QUAL_SCORE = 50 def phredQScore( nMismatches, nBases ): + """Calculates a phred-scaled score for nMismatches in nBases""" #print 'phredQScore', nMismatches, nBases if nMismatches == 0: return MAX_QUAL_SCORE @@ -24,10 +25,12 @@ def phredQScore( nMismatches, nBases ): def phredScore2ErrorProp(qual): + """Converts a phred-scaled quality score to an error probability""" #print 'phredScore2ErrorProp', qual return math.pow(10.0, float(qual) / -10.0) def tryByInt(s): + """Try to cast something to an int, or return it as a string""" try: return int(s) except: @@ -36,11 +39,13 @@ def tryByInt(s): expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',') defaultValues = '0,0,0,**,0,0,0'.split(',') class RecalData(dict): - + """Basic recalibration data -- corresponds exactly to the Java version in GATK""" def __init__(self): self.parse(expectedHeader, defaultValues) def parse(self, header, data): + """Parse the comma-separated data line with corresponding header. Throws an error + if the header doesn't correspond to the expectedHeader""" # rg,pos,Qrep,dn,NBases,MMismatches,Qemp types = [str, tryByInt, int, str, int, int, int] for head, expected, datum, type in zip(header, expectedHeader, data, types): @@ -57,8 +62,11 @@ class RecalData(dict): def __getattr__(self, name): return self[name] - - # rg,dn,Qrep,pos,NBases,MMismatches,Qemp + + + # + # Trivial accessor functions + # def readGroup(self): return self.rg def dinuc(self): return self.dn def qReported(self): return self.Qrep @@ -204,8 +212,9 @@ def lsamplestdev (inlist, counts, mean): for item, count in zip(inlist, counts): diff = item - mean inc = count * diff * diff - #print item, count, mean, diff, diff*diff, inc + #print "%3d" % int(item), count, mean, diff, diff*diff, inc, sum sum += inc + #print sum, n, sum / float(n-1), math.sqrt(sum / float(n-1)) return math.sqrt(sum / float(n-1)) def rmse(reportedList, empiricalList, counts): @@ -320,7 +329,7 @@ def analyzeFiles(files): for file in files: print 'Analyzing file', file plotter = getPlotterForFile(file) - if plotter <> None: + if plotter <> None and not OPTIONS.noplots: cmd = ' '.join([Rscript, plotter, file]) farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry) @@ -341,6 +350,9 @@ def main(): parser.add_option("-s", "--stdout", dest="toStdout", action='store_true', default=False, help="If provided, writes output to standard output, not to files") + parser.add_option("", "--no_plots", dest="noplots", + action='store_true', default=False, + help="If provided, no plots will be generated") parser.add_option("", "--dry", dest="dry", action='store_true', default=False, help="If provided, nothing actually gets run, just a dry run") diff --git a/python/picard_utils.py b/python/picard_utils.py index 3377b782f..56fadd162 100755 --- a/python/picard_utils.py +++ b/python/picard_utils.py @@ -153,7 +153,7 @@ def aggregateGeliCalls( sortedGeliCalls ): #return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] -def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m' ): +def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m', compression_level = 1 ): if useSamtools: return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles) else: @@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, MSDStr = '' if MSD: MSDStr = 'MSD=true' - return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) + return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + compression_level + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) #return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) def getPicardPath(lane, picardRoot = '/seq/picard/'): diff --git a/shell/makeResources.sh b/shell/makeResources.sh new file mode 100755 index 000000000..2337f4858 --- /dev/null +++ b/shell/makeResources.sh @@ -0,0 +1,2 @@ +cd dist/packages/GATKResources +tar cvhzf gatk_resources_062309.tgz resources