Version 0.2.1 (released) of the TableRecalibrator

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1108 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-06-25 22:50:55 +00:00
parent 73caf5db15
commit 5289230eb8
18 changed files with 530 additions and 210 deletions

View File

@ -12,8 +12,13 @@ t=read.table(input, header=T)
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
pdf(outfile, height=7, width=7)
plot(t$Qreported, t$Qempirical, type="p", col="blue", xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
abline(0,1)
d.good <- t[t$nMismatches >= 1000,]
d.100 <- t[t$nMismatches < 100,]
d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,]
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,45), ylim=c(0,45), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16)
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16)
abline(0,1, lty=2)
dev.off()
#outfile = paste(input, ".quality_emp_hist.png", sep="")
@ -21,6 +26,6 @@ dev.off()
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
pdf(outfile, height=7, width=7)
hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0)
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,40), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,45), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
axis(2,axTicks(2), format(axTicks(2), scientific=F))
dev.off()

View File

@ -12,5 +12,9 @@ outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="")
pdf(outfile, height=7, width=7)
par(cex=1.1)
c <- read.table(input, header=T)
plot(c$Cycle, c$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", ylim=c(-10, 10))
d.good <- c[c$nMismatches >= 100,]
d.100 <- c[c$nMismatches < 100,]
plot(d.good$Cycle, d.good$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", col="blue", ylim=c(-10, 10))
points(d.100$Cycle, d.100$Qempirical_Qreported, type="p", col="lightblue", pch=3)
#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)

View File

@ -85,7 +85,6 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods)
{
// Loop over triplets
System.out.printf("Binding is %s%n", Utils.join(" XXX ", bindings));
for( String bindingSets: bindings ) {
String[] bindingTokens = bindingSets.split(",");
if( bindingTokens.length % 3 != 0 )

View File

@ -0,0 +1,128 @@
package org.broadinstitute.sting.gatk.walkers.recalibration;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.BaseUtils;
import javax.management.RuntimeErrorException;
import java.util.*;
/**
 * Accumulates base-quality recalibration covariate counts (machine cycle, reported
 * quality, dinucleotide context) per read group. Each read group gets its own
 * RecalDataManager; counts are added one read base at a time via updateDataFromRead().
 */
public class CovariateCounter {
    // When true, the machine-cycle (position) covariate is collapsed away.
    private boolean collapsePos = false;
    // When true, the dinucleotide-context covariate is collapsed away.
    private boolean collapseDinuc = false;

    // Per-read-group recalibration data, keyed by read group ID.
    private HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();

    /**
     * Creates a counter covering exactly the given read groups.
     *
     * @param readGroups     read group IDs to track; unknown groups passed later will NPE
     * @param collapsePos    if true, do not track the position (cycle) covariate
     * @param collapseDinuc  if true, do not track the dinucleotide covariate
     */
    public CovariateCounter( Set<String> readGroups, boolean collapsePos, boolean collapseDinuc ) {
        this.collapsePos = collapsePos;
        this.collapseDinuc = collapseDinuc;
        for (String readGroup : readGroups ) {
            // RecalDataManager takes "track this covariate" flags, hence the negation.
            RecalDataManager manager = new RecalDataManager(readGroup, ! collapsePos, ! collapseDinuc );
            data.put(readGroup, manager);
        }
    }

    /**
     * Returns the set of readGroup names we are counting covariates for
     * @return the tracked read group IDs
     */
    public Set<String> getReadGroups() {
        return data.keySet();
    }

    /** @return true if the dinucleotide covariate is collapsed */
    public boolean isCollapseDinuc() {
        return collapseDinuc;
    }

    /** @return true if the position (cycle) covariate is collapsed */
    public boolean isCollapsePos() {
        return collapsePos;
    }

    /**
     * Returns the number of read groups being managed
     * @return count of tracked read groups
     */
    public int getNReadGroups() {
        return data.size();
    }

    /**
     * Get the particular RecalData datum associated with readGroup, at machine pos, with reported
     * quality qual, and with the dinuc context of prevBase, base. If an example of such a
     * base has been seen before, returns the associated RecalData. If not, it creates one, places it in the
     * system so that subsequent requests will return that object, and returns it.
     *
     * @param readGroup read group ID; must be one of the groups this counter was built with
     * @param pos       machine cycle (0-based)
     * @param qual      reported quality score
     * @param prevBase  previous base in machine order (dinuc context)
     * @param base      current base
     * @return the (possibly freshly created) RecalData for this covariate combination
     */
    public RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
        byte[] cs = {(byte)prevBase, (byte)base};
        String s = new String(cs);
        return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
    }

    /**
     * Get a list of all of the RecalData associated with readGroup
     *
     * @param readGroup read group ID
     * @return all accumulated RecalData for that group
     */
    public List<RecalData> getRecalData(String readGroup) {
        return data.get(readGroup).getAll();
    }

    /**
     * Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
     * Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read
     * but by implied machine cycle associated with the offset.
     *
     * TODO: this whole system is 0-based and therefore inconsistent with the rest of the GATK, where pos is 1-based
     * TODO: and offset is 0-based. How very annoying.
     *
     * @param rg     read group ID
     * @param read   the read containing the base
     * @param offset 0-based offset of the base in the read; must not be the first base,
     *               nor (for negative-strand reads) the last base, since both lack a dinuc neighbor
     * @param ref    the reference base at this position
     * @return 1 if the base was counted, 0 if it was skipped (reported quality 0)
     */
    public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
        if ( offset == 0 )
            throw new RuntimeException("Illegal read offset " + offset + " in read " + read.getReadName());

        int cycle = offset;
        byte[] bases = read.getReadBases();
        byte[] quals = read.getBaseQualities();
        char base = (char)bases[offset];
        char prevBase = (char)bases[offset - 1];

        if (read.getReadNegativeStrandFlag()) {
            // On the negative strand the machine-order neighbor is offset+1, so the
            // last base has no dinuc context. Fail loudly with context instead of
            // letting the array access below throw a bare ArrayIndexOutOfBoundsException.
            if ( offset + 1 >= bases.length )
                throw new RuntimeException("Illegal read offset " + offset + " for negative strand read " + read.getReadName());
            // Machine saw the complement, in reverse cycle order.
            ref = (char)BaseUtils.simpleComplement(ref);
            base = (char)BaseUtils.simpleComplement(base);
            prevBase = (char)BaseUtils.simpleComplement((char)bases[offset+1]);
            cycle = read.getReadLength() - (offset + 1);
        }

        int qual = quals[offset];
        if ( qual > 0 ) { // quality 0 carries no information; skip it
            RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
            if (datum != null) datum.inc(base,ref);
            return 1;
        } else {
            return 0;
        }
    }

    /** Prints every non-empty RecalData datum across all read groups to stdout. */
    public void printState() {
        for ( String readGroup : getReadGroups() ) {
            for ( RecalData datum : getRecalData(readGroup) ) {
                if ( datum.N > 0 )
                    System.out.println(datum);
            }
        }
    }
}

View File

@ -7,18 +7,15 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.BaseUtils;
import java.util.*;
import java.io.PrintStream;
import java.io.FileNotFoundException;
@WalkerName("CountCovariates")
public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
public int buggyMaxReadLen = 100000;
@ -28,83 +25,60 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
public int MIN_MAPPING_QUALITY = 1;
@Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)")
public String READ_GROUP = "none";
//@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
//public int MAX_READ_GROUPS = 100;
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
public List<String> platforms = Collections.singletonList("*");
//public List<String> platforms = Collections.singletonList("ILLUMINA");
@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
//@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
public boolean collapsePos = false;
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
//@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
public boolean collapseDinuc = false;
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
private CovariateCounter covariateCounter = null;
long counted_sites = 0; // number of sites used to count covariates
long counted_bases = 0; // number of bases used to count covariates
long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
private long counted_sites = 0; // number of sites used to count covariates
private long counted_bases = 0; // number of bases used to count covariates
private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
PrintStream recalTableOut = null;
/**
* Initialize the system. Setup the data CovariateCountry for the read groups in our header
*/
public void initialize() {
try {
recalTableOut = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
} catch ( FileNotFoundException e ) {
throw new RuntimeException("Couldn't open output file", e);
}
Set<String> readGroups = new HashSet<String>();
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
if( readGroup.getAttribute("PL") == null )
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
if( !isSupportedReadGroup(readGroup) )
continue;
String rg = readGroup.getReadGroupId();
//RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc );
RecalDataManager manager = new RecalDataManager(rg, ! collapsePos, ! collapseDinuc );
data.put(rg, manager);
readGroups.add(readGroup.getReadGroupId());
}
out.printf("Created recalibration data collectors for %d read group(s)%n", data.size());
covariateCounter = new CovariateCounter(readGroups, collapsePos, collapseDinuc);
logger.info(String.format("Created recalibration data collectors for %d read group(s)", covariateCounter.getNReadGroups()));
}
/**
* Get the particular RecalData datum associated with readGroup, at machine pos, with reported
* quality qual, and with the dinuc context of prevBase, base. If an example of such a
* base has been seen before, returns the associated RecalData. If not, it creates one, places it in the
* system so that subsequent requests will return that object, and returns it.
*
* @param readGroup
* @param pos
* @param qual
* @param prevBase
* @param base
* @return
*/
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
byte[] cs = {(byte)prevBase, (byte)base};
String s = new String(cs);
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
}
// --------------------------------------------------------------------------------------------------------------
//
// map
//
// --------------------------------------------------------------------------------------------------------------
/**
* Get a list of all of the RecalData associated with readGroup
* Walk over each read in the locus pileup and update the covariate counts based on these bases and their
* matching (or not) with ref. dbSNP aware, so avoids sites that are known as SNPs in DBSNP.
*
* @param readGroup
* @param tracker
* @param ref
* @param context
* @return
*/
private List<RecalData> getRecalData(String readGroup) {
return data.get(readGroup).getAll();
}
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
//System.out.printf("%s %c%n", context.getLocation(), ref);
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
if ( dbsnp == null || !dbsnp.isSNP() ) {
// We aren't at a dbSNP position that's a SNP, so update the read
List<SAMRecord> reads = context.getReads();
List<Integer> offsets = context.getOffsets();
for (int i =0; i < reads.size(); i++ ) {
@ -115,73 +89,71 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
}
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
if ( isSupportedReadGroup(readGroup) &&
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
if ((read.getMappingQuality() >= MIN_MAPPING_QUALITY && isSupportedReadGroup(readGroup) )) {
int offset = offsets.get(i);
int numBases = read.getReadLength();
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
counted_bases += updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref);
if ( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they suck and they don't have a dinuc count
counted_bases += covariateCounter.updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref);
}
}
}
counted_sites += 1;
} else {
skipped_sites += 1;
//System.out.println(dbsnp.toSimpleString()+" "+new ReadBackedPileup(ref, context).getPileupString());
}
return 1;
}
/**
* Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
* Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read
* but by implied machine cycle associated with the offset.
* Check to see whether this read group should be processed. Returns true if the
* read group is in the list of platforms to process or the platform == *, indicating
* that all platforms should be processed.
*
* TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based
* TODO: and offset is 0-based. How very annoying.
*
* @param rg
* @param read
* @param offset
* @param ref
* @param readGroup
* @return
*/
private int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
int cycle = offset;
byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities();
char base = (char)bases[offset];
char prevBase = (char)bases[offset - 1];
if (read.getReadNegativeStrandFlag()) {
ref = (char)BaseUtils.simpleComplement(ref);
base = (char)BaseUtils.simpleComplement(base);
prevBase = (char)BaseUtils.simpleComplement((char)bases[offset+1]);
cycle = read.getReadLength() - (offset + 1);
private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
for( String platform: platforms ) {
platform = platform.trim();
if( readGroup.getAttribute("PL") == null ||
platform.equals("*") ||
readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) )
return true;
}
int qual = quals[offset];
if ( qual > 0 ) {
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
if (datum != null) datum.inc(base,ref);
return 1;
} else {
return 0;
return false;
}
// --------------------------------------------------------------------------------------------------------------
//
// Reduce
//
// --------------------------------------------------------------------------------------------------------------
/**
* Provide an initial value for reduce computations.
* @return Initial value of reduce.
*/
public PrintStream reduceInit() {
try {
return new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
} catch ( FileNotFoundException e ) {
throw new RuntimeException("Couldn't open output file", e);
}
}
public void onTraversalDone(Integer result) {
public void onTraversalDone(PrintStream recalTableStream) {
printInfo(out);
out.printf("Writing raw recalibration data%n");
writeRecalTable();
out.printf("Writing raw recalibration data..."); out.flush();
writeRecalTable(recalTableStream);
out.printf("...done%n");
//out.printf("Writing logistic recalibration data%n");
//writeLogisticRecalibrationTable();
//out.printf("...done%n");
recalTableStream.close();
}
/**
@ -189,13 +161,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
* @param out
*/
private void printInfo(PrintStream out) {
out.printf("# date %s%n", new Date());
out.printf("# collapsed_pos %b%n", collapsePos);
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
out.printf("# counted_sites %d%n", counted_sites);
out.printf("# counted_bases %d%n", counted_bases);
out.printf("# skipped_sites %d%n", skipped_sites);
out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
out.printf("# date \"%s\"%n", new Date());
out.printf("# collapsed_pos %b%n", collapsePos);
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
out.printf("# counted_sites %d%n", counted_sites);
out.printf("# counted_bases %d%n", counted_bases);
out.printf("# skipped_sites %d%n", skipped_sites);
out.printf("# fraction_skipped 1 / %.0f bp%n", (double)counted_sites / skipped_sites);
}
@Deprecated
@ -204,14 +176,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
try {
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
for (String readGroup : covariateCounter.getReadGroups()) {
for ( int dinuc_index=0; dinuc_index<RecalData.NDINUCS; dinuc_index++) {
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
for ( RecalData datum: covariateCounter.getRecalData(readGroup) ) {
if ( RecalData.dinucIndex(datum.dinuc) == dinuc_index ) {
if ((datum.N - datum.B) > 0)
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
if (datum.B > 0)
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B);
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B);
}
}
}
@ -229,56 +201,26 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.
*/
private void writeRecalTable() {
printInfo(recalTableOut);
recalTableOut.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
// TODO: should sort the data coming out of getRecalData here for easier processing
for ( RecalData datum: RecalData.sort(getRecalData(readGroup.getReadGroupId())) ) {
private void writeRecalTable(PrintStream recalTableStream) {
printInfo(recalTableStream);
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
for (String readGroup : covariateCounter.getReadGroups()) {
for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) {
if ( datum.N > 0 )
recalTableOut.format("%s%n", datum.toCSVString(collapsePos));
recalTableStream.format("%s%n", datum.toCSVString(collapsePos));
}
}
recalTableOut.close();
}
/**
* Check to see whether this read group should be processed. Returns true if the
* read group is in the list of platforms to process or the platform == *, indicating
* that all platforms should be processed.
*
* @param readGroup
* @return
*/
private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
for( String platform: platforms ) {
platform = platform.trim();
if( readGroup.getAttribute("PL") == null ||
platform.equals("*") ||
readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) )
return true;
}
return false;
}
/**
* No initialization routines
*
* @return
*/
public Integer reduceInit() {
return 0;
}
/**
* Doesn't do anything
*
* @param a
* @param b
* @param empty
* @param recalTableStream
* @return
*/
public Integer reduce(Integer a, Integer b) {
return 0;
public PrintStream reduce(Integer empty, PrintStream recalTableStream) {
return recalTableStream;
}
}

View File

@ -167,7 +167,7 @@ public class RecalData implements Comparable<RecalData> {
* @param s
* @return
*/
public static RecalData fromCSVString(String s) {
public static RecalData fromCSVString(String s) throws NumberFormatException {
String[] vals = s.split(",");
String rg = vals[0];
int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]);
@ -178,6 +178,13 @@ public class RecalData implements Comparable<RecalData> {
RecalData datum = new RecalData(pos, qual, rg, dinuc);
datum.B = B;
datum.N = N;
// Checking for badness
if ( pos < 0 ) throw new NumberFormatException("Illegal position detected: " + pos);
if ( B < 0 ) throw new NumberFormatException("Illegal mismatch count detected: " + B);
if ( N < 0 ) throw new NumberFormatException("Illegal base count detected: " + N);
if ( qual < 0 || qual > QualityUtils.MAX_QUAL_SCORE ) throw new NumberFormatException("Illegal qual detected: " + qual);
return datum;
}
@ -216,8 +223,8 @@ public class RecalData implements Comparable<RecalData> {
}
double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases);
System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n",
sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q);
//System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n",
// sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q);
return q;
}
}

View File

@ -54,6 +54,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
private static String VERSION = "0.2.1";
private final static boolean DEBUG = false;
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
@ -77,9 +79,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
//private static boolean DEBUG_ME = true;
public void initialize() {
logger.info("TableRecalibrator version: " + VERSION);
//
// crap hack until Enum arg types are supported
// crappy hack until Enum arg types are supported
//
for ( RecalibrationMode potential : RecalibrationMode.values() ) {
if ( potential.toString().equals(modeString)) {
@ -96,13 +101,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
//
int lineNumber = 0;
try {
System.out.printf("Reading data...%n");
logger.info(String.format("Reading data..."));
List<RecalData> data = new ArrayList<RecalData>();
boolean collapsedPos = false;
boolean collapsedDinuc = false;
List<String> lines = new xReadLines(new File(paramsFile)).readLines();
for ( String line : lines ) {
//List<String> lines = new xReadLines(new File(paramsFile)).readLines();
for ( String line : new xReadLines(new File(paramsFile)) ) {
lineNumber++;
if ( HEADER_PATTERN.matcher(line).matches() )
continue;
@ -159,12 +164,14 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
if ( collapsedPos )
throw new RuntimeException(String.format("Cannot perform position_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
collapsedPos = true;
break;
throw new RuntimeException("Unsupported mode requested, sorry");
//break;
case BY_DINUC_ONLY:
if ( collapsedDinuc )
throw new RuntimeException(String.format("Cannot perform dinuc_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
collapsedDinuc = true;
break;
throw new RuntimeException("Unsupported mode requested, sorry");
//break;
case COMBINATORIAL:
if ( collapsedPos || collapsedDinuc )
throw new RuntimeException(String.format("Cannot perform combinatorial recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
@ -231,12 +238,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
quals = BaseUtils.reverse(quals);
}
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
try {
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
recalQuals = BaseUtils.reverse(recalQuals);
read.setBaseQualities(recalQuals);
return read;
//if ( read.getReadName().equals("IL12_395:7:215:171:693") ) {
// for ( int i = 0; i < quals.length; i++ ) {
// System.out.printf("READ found: %s is now %s%n", quals[i], recalQuals[i]);
// }
//}
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
recalQuals = BaseUtils.reverse(recalQuals);
read.setBaseQualities(recalQuals);
return read;
} catch ( StingException e ) {
throw new RuntimeException(String.format("Bug found while processing read %s: %s", read.format(), e.getMessage()));
}
}
/**
@ -248,14 +265,21 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
* @param quals
* @return
*/
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) {
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) throws StingException {
byte[] recalQuals = new byte[quals.length];
RecalMapping mapper = cache.get(readGroup);
//if ( mapper == null && DEBUG_ME )
// return recalQuals;
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
byte qual = quals[cycle];
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
if ( newQual <= 0 || newQual > QualityUtils.MAX_REASONABLE_Q_SCORE )
throw new StingException(String.format("Bug found -- assigning bad quality score %d x %d => %d", cycle, qual, newQual));
recalQuals[cycle] = newQual;
//System.out.printf("Mapping %d => %d%n", qual, newQual);
}
@ -322,17 +346,12 @@ class CombinatorialRecalMapping implements RecalMapping {
int pos = manager.canonicalPos(datum.pos);
if ( table[pos][datum.qual] != 0 )
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical);
//System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical));
}
}
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
//if ( qual == 2 )
// System.out.printf("Qual = 2%n");
int pos = manager.canonicalPos(cycle);
int index = this.manager.getDinucIndex(prevBase, base);
byte[][] dataTable = index == -1 ? null : cache.get(index);
@ -340,13 +359,7 @@ class CombinatorialRecalMapping implements RecalMapping {
if ( dataTable == null && prevBase != 'N' && base != 'N' )
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base));
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
//if ( result == 2 )
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
return result;
return dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
}
}
@ -368,13 +381,13 @@ class SerialRecalMapping implements RecalMapping {
private double globalDeltaQ = 0.0;
private double[][] deltaQPosMap, deltaQDinucMap;
double [] deltaQualMap;
RecalData [][] qPosSupports, qDinucSupports;
RecalData [][] qPosSupports = null, qDinucSupports = null;
CombinatorialRecalMapping combiMap;
RecalDataManager manager;
String dinucToLookAt = null; // "CC";
int posToLookAt = 0;
int posToLookAt = -1;
int qualToLookAt = 25;
public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
@ -387,7 +400,7 @@ class SerialRecalMapping implements RecalMapping {
RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll());
double aggregrateQreported = RecalData.combinedQreported(manager.getAll());
globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported;
System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
//System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
}
for ( RecalData datum : manager.getAll() ) {
@ -399,12 +412,12 @@ class SerialRecalMapping implements RecalMapping {
deltaQualMap = new double[maxQReported+1];
for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) {
deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ;
System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
//System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
}
// calculate the delta Q pos array
deltaQPosMap = new double[maxPos+1][maxQReported+1];
qPosSupports = new RecalData[maxPos+1][maxQReported+1];
//qPosSupports = new RecalData[maxPos+1][maxQReported+1];
for ( RecalData datumAtPosQual : manager.combineDinucs() ) {
double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual];
updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset);
@ -412,7 +425,7 @@ class SerialRecalMapping implements RecalMapping {
// calculate the delta Q dinuc array
deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1];
qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
//qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
for ( RecalData datumAtDinucQual : manager.combineCycles() ) {
double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual];
updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset);
@ -429,7 +442,7 @@ class SerialRecalMapping implements RecalMapping {
for ( int j = 0; j < maxQReported; j++ ) {
if ( printStateP(i, null, j) )
System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n",
i, j, deltaQPosMap[i][j], qPosSupports[i][j]);
i, j, deltaQPosMap[i][j], qPosSupports != null ? qPosSupports[i][j] : null);
}
}
@ -438,7 +451,7 @@ class SerialRecalMapping implements RecalMapping {
String dinuc = RecalData.dinucIndex2bases(i);
if ( printStateP(0, dinuc, j ) )
System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n",
dinuc, j, deltaQDinucMap[i][j], qDinucSupports[i][j]);
dinuc, j, deltaQDinucMap[i][j], qDinucSupports != null ? qDinucSupports[i][j] : null);
}
}
@ -457,7 +470,8 @@ class SerialRecalMapping implements RecalMapping {
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ;
table[i][j] = deltaQ;
supports[i][j] = datum;
if ( supports != null )
supports[i][j] = datum;
}
private boolean printStateP( int cycle, String dinuc, int qual ) {

View File

@ -30,6 +30,9 @@ public class BaseQualityHistoWalker extends ReadWalker<Integer, Integer> {
// Map over the org.broadinstitute.sting.gatk.LocusContext
public Integer map(char[] ref, SAMRecord read) {
for ( byte qual : read.getBaseQualities() ) {
if ( qual < 0 || qual > 100 ) {
throw new RuntimeException(String.format("Invalid base quality detected -- %d at %s%n", qual, read.getReadName()));
}
//System.out.println(qual);
this.qualCounts[qual]++;
}

View File

@ -244,7 +244,7 @@ public class GenomeLocParser {
* @return the list of merged locations
*/
public static List<GenomeLoc> mergeOverlappingLocations(final List<GenomeLoc> raw) {
logger.debug(" Raw locations are:\n" + Utils.join("\n", raw));
logger.debug(" Raw locations are: " + Utils.join(", ", raw));
if (raw.size() <= 1)
return raw;
else {

View File

@ -85,8 +85,16 @@ public class QualityUtils {
return boundQual(qual, MAX_QUAL_SCORE);
}
/**
 * Bounds an integer quality score to the closed range [1, maxQual].
 *
 * @param qual    the raw quality score, possibly out of range
 * @param maxQual the maximum quality score allowed
 * @return qual clamped so it is no greater than maxQual and no less than 1
 */
static public byte boundQual(int qual, byte maxQual) {
    // Lower bound of 1: this revision changed the old pure Math.min so a
    // quality can never collapse to 0 (treated as invalid downstream).
    return (byte) Math.max(Math.min(qual, maxQual), 1);
}
/**

View File

@ -191,7 +191,7 @@ public class ArtificialSAMUtils {
}
SAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length);
rec.setReadBases(bases);
rec.setBaseQualities(bases);
rec.setBaseQualities(qual);
return rec;
}

View File

@ -0,0 +1,183 @@
// our package
package org.broadinstitute.sting.gatk.walkers.recalibration;
import java.util.*;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
// the imports for unit testing.
import org.junit.Assert;
import org.junit.Test;
import org.junit.Before;
import org.junit.BeforeClass;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.playground.gatk.walkers.indels.CleanedReadInjector;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileReader;
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
import java.util.HashSet;
import java.io.FileNotFoundException;
import java.io.File;
/**
* Basic unit test for RecalData
*/
/**
 * Unit tests for CovariateCounter: verifies that per-base observations from
 * artificial reads land in the expected (read group, pos, qual, dinuc) recal
 * bins with the right counts.
 *
 * Fix: JUnit's contract is assertEquals(message, expected, actual); the
 * previous version passed (message, actual, expected) throughout, which
 * produced inverted failure messages. The repeated per-offset assertion
 * block is also factored into a private helper.
 */
public class CovariateCounterTest extends BaseTest {
    String readGroup1 = "rg1";
    String readGroup2 = "rg2";
    Set<String> readGroups = new HashSet<String>();

    SAMFileHeader header;
    SAMRecord read1, read2, read3;

    // read1/read3 share bases1 but differ in quals at offsets 2-4
    byte bases1[] = {'a', 't', 'c', 'g', 'a'};
    byte quals1[] = {1, 2, 3, 4, 5};
    byte quals3[] = {1, 2, 5, 5, 5};
    byte bases2[] = {'t', 'c', 'g', 'a', 't'};
    byte quals2[] = {2, 2, 4, 5, 2};

    /**
     * The fasta, for comparison.
     */
    protected static IndexedFastaSequenceFile sequenceFile = null;

    CovariateCounter c;

    /**
     * Initialize the fasta.
     */
    @BeforeClass
    public static void initialize() throws FileNotFoundException {
        sequenceFile = new IndexedFastaSequenceFile( new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") );
        GenomeLocParser.setupRefContigOrdering(sequenceFile);
    }

    /**
     * Fresh header, read groups, counter, and reads before every test.
     */
    @Before
    public void initializeBefore() {
        header = ArtificialSAMUtils.createArtificialSamHeader(2,0,247249719);
        readGroups.addAll(Arrays.asList(readGroup1, readGroup2));
        ArtificialSAMUtils.createDefaultReadGroup( header, readGroup1, "sample1" );
        ArtificialSAMUtils.createDefaultReadGroup( header, readGroup2, "sample2" );
        c = new CovariateCounter( readGroups, false, false );
        read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases1, quals1);
        read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",1,1, bases2, quals2);
        read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",1,1, bases1, quals3);
    }

    /**
     * Feeds every offset of read (except 0, which has no previous base for a
     * dinuc) into the counter under the given read group.
     */
    private void addRead(String readGroup, SAMRecord read) {
        for ( int i = 1; i < read.getReadBases().length; i++ )
            c.updateDataFromRead(readGroup, read, i, (char)read.getReadBases()[i]);
    }

    /**
     * Asserts that the recal bin for offset i of (bases, quals) under
     * readGroup1 exists, has zero mismatches, the expected observation count,
     * and the correct dinuc/qual identity.
     */
    private void assertDatum(byte[] bases, byte[] quals, int i, int expectedN) {
        RecalData datum = c.getRecalData(readGroup1, i, quals[i], (char)bases[i-1], (char)bases[i]);
        System.out.printf("%s%n", datum);
        Assert.assertNotNull("Incorrect mapping to recal bin", datum);
        Assert.assertEquals("Bad mismatch count", 0, datum.B);
        Assert.assertEquals("Bad base count", expectedN, datum.N);
        Assert.assertEquals("Prevbase is bad", bases[i-1], datum.dinuc.charAt(0));
        Assert.assertEquals("Base is bad", bases[i], datum.dinuc.charAt(1));
        Assert.assertEquals("Qual is bad", quals[i], datum.qual);
    }

    @Test
    public void testCovariateCounterSetup() {
        Assert.assertEquals("Number of read groups is wrong", 2, c.getNReadGroups());
        Assert.assertEquals("Read group identities are wrong", readGroups, c.getReadGroups());
        Assert.assertEquals("Incorrectly collapsed counter", false, c.isCollapseDinuc());
        Assert.assertEquals("Incorrectly collapsed counter", false, c.isCollapsePos());
    }

    @Test
    public void testOneRead() {
        addRead(readGroup1, read1);
        c.printState();
        // offset 0 has no preceding base, so nothing may have been counted there
        Assert.assertEquals("Incorrect mapping to recal bin", 0, c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N);
        for ( int i = 1; i < bases1.length; i++ )
            assertDatum(bases1, quals1, i, 1);
    }

    @Test
    public void testTwoReads() {
        addRead(readGroup1, read1);
        addRead(readGroup2, read2);
        c.printState();
        // read2 went into a different read group, so readGroup1's bins still see N=1
        Assert.assertEquals("Incorrect mapping to recal bin", 0, c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N);
        for ( int i = 1; i < bases1.length; i++ )
            assertDatum(bases1, quals1, i, 1);
    }

    @Test
    public void testTwoReadsSameGroup() {
        // the same read twice into one read group: every bin's count doubles
        addRead(readGroup1, read1);
        addRead(readGroup1, read1);
        c.printState();
        for ( int i = 1; i < bases1.length; i++ )
            assertDatum(bases1, quals1, i, 2);
    }

    @Test
    public void testTwoReadsSameGroupNotIdentical() {
        addRead(readGroup1, read1);
        addRead(readGroup1, read3);
        c.printState();
        // read3 shares bases with read1 but differs in quals, so only the
        // offsets with matching quals accumulate into the same bin
        for ( int i = 1; i < bases1.length; i++ )
            assertDatum(bases1, quals1, i, quals1[i] == quals3[i] ? 2 : 1);
    }

    @Test (expected = RuntimeException.class)
    public void testBadReadOffset() {
        byte bases[] = {'a', 't', 'c', 'g', 'a'};
        byte quals[] = {1, 2, 3, 4, 5};
        SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases, quals);
        // offset 0 is invalid (no previous base) and must be rejected
        c.updateDataFromRead(readGroup1, read, 0, (char)bases[0]);
    }
}

View File

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<package>
<name>GATKResources</name>
<resources>
<file>/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod</file>
<file>/broad/1KG/reference/human_b36_both.fasta</file>
<file>/broad/1KG/reference/human_b36_both.dict</file>
<file>/broad/1KG/reference/human_b36_both.fasta.fai</file>
<file>/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod</file>
<file>/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta</file>
<file>/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict</file>
<file>/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai</file>
</resources>
</package>

View File

@ -3,18 +3,7 @@
<name>ReadQualityRecalibrator</name>
<main-class>org.broadinstitute.sting.gatk.CommandLineGATK</main-class>
<dependencies>
<class>org.broadinstitute.sting.playground.gatk.walkers.CovariateCounterWalker</class>
<class>org.broadinstitute.sting.playground.gatk.walkers.LogisticRecalibrationWalker</class>
<class>org.broadinstitute.sting.gatk.walkers.recalibration.CovariateCounterWalker</class>
<class>org.broadinstitute.sting.gatk.walkers.recalibration.TableRecalibrationWalker</class>
</dependencies>
<scripts>
<file>python/RecalQual.py</file>
<file>python/LogisticRegressionByReadGroup.py</file>
</scripts>
<resources>
<file>R/logistic_regression.R</file>
<file>/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod</file>
<file>/broad/1KG/reference/human_b36_both.fasta</file>
<file>/broad/1KG/reference/human_b36_both.dict</file>
<file>/broad/1KG/reference/human_b36_both.fasta.fai</file>
</resources>
</package>

View File

@ -44,6 +44,16 @@ def bams2geli(bams):
calls = map(call1, bams)
return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
def gelis2gelisText( gelis ):
    """Map each geli file path to its corresponding text calls filename.

    A path already ending in '.calls' is passed through unchanged; anything
    else is mapped to its basename with '.calls' appended.
    """
    def geli2geliText( maybeGeli ):
        # already a text calls file -- leave it alone
        if os.path.splitext(maybeGeli)[1] == ".calls" :
            return maybeGeli
        else:
            # BUG FIX: previously referenced the undefined name 'geli',
            # raising NameError for any non-.calls input
            return os.path.split(maybeGeli)[1] + '.calls'
    return map( geli2geliText, gelis )
def main():
global OPTIONS, ROOT
@ -124,7 +134,7 @@ def main():
# convert the geli's to text
jobid = None
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
variantsOut = gelis2gelisText( gelis )
for geli, variantOut in zip(gelis, variantsOut):
name = os.path.split(geli)[1]
if not os.path.exists(variantOut):

View File

@ -13,6 +13,7 @@ import operator
MAX_QUAL_SCORE = 50
def phredQScore( nMismatches, nBases ):
"""Calculates a phred-scaled score for nMismatches in nBases"""
#print 'phredQScore', nMismatches, nBases
if nMismatches == 0:
return MAX_QUAL_SCORE
@ -24,10 +25,12 @@ def phredQScore( nMismatches, nBases ):
def phredScore2ErrorProp(qual):
    """Convert a phred-scaled quality score into its error probability."""
    # standard phred relation: error = 10^(-Q/10)
    return 10.0 ** (float(qual) / -10.0)
def tryByInt(s):
"""Try to cast something to an int, or return it as a string"""
try:
return int(s)
except:
@ -36,11 +39,13 @@ def tryByInt(s):
expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',')
defaultValues = '0,0,0,**,0,0,0'.split(',')
class RecalData(dict):
"""Basic recalibration data -- corresponds exactly to the Java version in GATK"""
def __init__(self):
self.parse(expectedHeader, defaultValues)
def parse(self, header, data):
"""Parse the comma-separated data line with corresponding header. Throws an error
if the header doesn't correspond to the expectedHeader"""
# rg,pos,Qrep,dn,NBases,MMismatches,Qemp
types = [str, tryByInt, int, str, int, int, int]
for head, expected, datum, type in zip(header, expectedHeader, data, types):
@ -57,8 +62,11 @@ class RecalData(dict):
def __getattr__(self, name):
return self[name]
# rg,dn,Qrep,pos,NBases,MMismatches,Qemp
#
# Trivial accessor functions
#
def readGroup(self): return self.rg
def dinuc(self): return self.dn
def qReported(self): return self.Qrep
@ -204,8 +212,9 @@ def lsamplestdev (inlist, counts, mean):
for item, count in zip(inlist, counts):
diff = item - mean
inc = count * diff * diff
#print item, count, mean, diff, diff*diff, inc
#print "%3d" % int(item), count, mean, diff, diff*diff, inc, sum
sum += inc
#print sum, n, sum / float(n-1), math.sqrt(sum / float(n-1))
return math.sqrt(sum / float(n-1))
def rmse(reportedList, empiricalList, counts):
@ -320,7 +329,7 @@ def analyzeFiles(files):
for file in files:
print 'Analyzing file', file
plotter = getPlotterForFile(file)
if plotter <> None:
if plotter <> None and not OPTIONS.noplots:
cmd = ' '.join([Rscript, plotter, file])
farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry)
@ -341,6 +350,9 @@ def main():
parser.add_option("-s", "--stdout", dest="toStdout",
action='store_true', default=False,
help="If provided, writes output to standard output, not to files")
parser.add_option("", "--no_plots", dest="noplots",
action='store_true', default=False,
help="If provided, no plots will be generated")
parser.add_option("", "--dry", dest="dry",
action='store_true', default=False,
help="If provided, nothing actually gets run, just a dry run")

View File

@ -153,7 +153,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m' ):
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m', compression_level = 1 ):
if useSamtools:
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
else:
@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
MSDStr = ''
if MSD: MSDStr = 'MSD=true'
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + compression_level + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
def getPicardPath(lane, picardRoot = '/seq/picard/'):

View File

@ -0,0 +1,2 @@
#!/bin/sh
# Package the GATK resource bundle into a dated tarball.
# Guard the cd (ShellCheck SC2164): without it, a missing directory would
# silently tar whatever 'resources' exists in the current directory.
cd dist/packages/GATKResources || exit 1
# 'h' dereferences symlinks so the tarball carries real file contents.
tar cvhzf gatk_resources_062309.tgz resources