Version 0.2.1 (released) of the TableRecalibrator
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1108 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
73caf5db15
commit
5289230eb8
|
|
@ -12,8 +12,13 @@ t=read.table(input, header=T)
|
|||
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
|
||||
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
plot(t$Qreported, t$Qempirical, type="p", col="blue", xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
|
||||
abline(0,1)
|
||||
d.good <- t[t$nMismatches >= 1000,]
|
||||
d.100 <- t[t$nMismatches < 100,]
|
||||
d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,]
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,45), ylim=c(0,45), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
|
||||
points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16)
|
||||
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16)
|
||||
abline(0,1, lty=2)
|
||||
dev.off()
|
||||
|
||||
#outfile = paste(input, ".quality_emp_hist.png", sep="")
|
||||
|
|
@ -21,6 +26,6 @@ dev.off()
|
|||
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0)
|
||||
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,40), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
|
||||
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,45), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
|
|
|||
|
|
@ -12,5 +12,9 @@ outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="")
|
|||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
c <- read.table(input, header=T)
|
||||
plot(c$Cycle, c$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", ylim=c(-10, 10))
|
||||
d.good <- c[c$nMismatches >= 100,]
|
||||
d.100 <- c[c$nMismatches < 100,]
|
||||
plot(d.good$Cycle, d.good$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", col="blue", ylim=c(-10, 10))
|
||||
points(d.100$Cycle, d.100$Qempirical_Qreported, type="p", col="lightblue", pch=3)
|
||||
#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)
|
||||
|
||||
|
|
|
|||
|
|
@ -85,7 +85,6 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods)
|
||||
{
|
||||
// Loop over triplets
|
||||
System.out.printf("Binding is %s%n", Utils.join(" XXX ", bindings));
|
||||
for( String bindingSets: bindings ) {
|
||||
String[] bindingTokens = bindingSets.split(",");
|
||||
if( bindingTokens.length % 3 != 0 )
|
||||
|
|
|
|||
|
|
package org.broadinstitute.sting.gatk.walkers.recalibration;

import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.BaseUtils;

import java.util.*;

/**
 * Collects base-quality recalibration covariates (machine cycle, reported quality,
 * dinucleotide context) per read group.  One RecalDataManager is kept per read group;
 * observations from individual read bases are routed into the matching RecalData bin.
 */
public class CovariateCounter {
    // If true, the position (machine-cycle) covariate is collapsed rather than tracked.
    private final boolean collapsePos;
    // If true, the dinucleotide-context covariate is collapsed rather than tracked.
    private final boolean collapseDinuc;

    // Maps read-group id -> recalibration data manager for that read group.
    private final HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();

    /**
     * Creates a covariate counter tracking the given read groups.
     *
     * @param readGroups    read-group ids to allocate collectors for
     * @param collapsePos   if true, do not track the position covariate
     * @param collapseDinuc if true, do not track the dinucleotide covariate
     */
    public CovariateCounter( Set<String> readGroups, boolean collapsePos, boolean collapseDinuc ) {
        this.collapsePos = collapsePos;
        this.collapseDinuc = collapseDinuc;

        for ( String readGroup : readGroups ) {
            // The manager tracks a covariate exactly when it is NOT collapsed.
            RecalDataManager manager = new RecalDataManager(readGroup, ! collapsePos, ! collapseDinuc );
            data.put(readGroup, manager);
        }
    }

    /**
     * Returns the set of readGroup names we are counting covariates for.
     *
     * @return the read-group ids being tracked
     */
    public Set<String> getReadGroups() {
        return data.keySet();
    }

    /** @return true if the dinucleotide covariate is collapsed */
    public boolean isCollapseDinuc() {
        return collapseDinuc;
    }

    /** @return true if the position covariate is collapsed */
    public boolean isCollapsePos() {
        return collapsePos;
    }

    /**
     * Returns the number of read groups being managed.
     *
     * @return count of read groups with collectors
     */
    public int getNReadGroups() {
        return data.size();
    }

    /**
     * Get the particular RecalData datum associated with readGroup, at machine pos, with reported
     * quality qual, and with the dinuc context of prevBase, base.  If an example of such a
     * base has been seen before, returns the associated RecalData.  If not, it creates one, places it in the
     * system so that subsequent requests will return that object, and returns it.
     *
     * @param readGroup read group the base belongs to
     * @param pos       machine cycle of the base
     * @param qual      reported quality of the base
     * @param prevBase  base preceding this one in machine order
     * @param base      the observed base itself
     * @return the (possibly newly created) RecalData bin for this covariate combination
     */
    public RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
        byte[] cs = {(byte)prevBase, (byte)base};
        String s = new String(cs);
        return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
    }

    /**
     * Get a list of all of the RecalData associated with readGroup.
     *
     * @param readGroup read group to fetch data for
     * @return all RecalData bins collected for that read group
     */
    public List<RecalData> getRecalData(String readGroup) {
        return data.get(readGroup).getAll();
    }

    /**
     * Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
     * Correctly handles machine orientation of the read.  I.e., it adds data not by offset in the read
     * but by implied machine cycle associated with the offset.
     *
     * TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based
     * TODO: and offset is 0-based.  How very annoying.
     *
     * @param rg     read-group id of the read
     * @param read   the read containing the base
     * @param offset 0-based offset of the base within the read; must be > 0, and for
     *               negative-strand reads the caller must also ensure offset < readLength - 1
     *               (bases[offset+1] is read below) -- callers skip first and last bases
     * @param ref    reference base at this locus
     * @return 1 if the base was counted, 0 if it was skipped (qual == 0)
     */
    public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
        // offset 0 has no preceding base, so no dinucleotide context exists.
        if ( offset == 0 )
            throw new RuntimeException("Illegal read offset " + offset + " in read " + read.getReadName());

        int cycle = offset;
        byte[] bases = read.getReadBases();
        byte[] quals = read.getBaseQualities();

        char base = (char)bases[offset];
        char prevBase = (char)bases[offset - 1];

        if (read.getReadNegativeStrandFlag()) {
            // Negative-strand read: complement the bases, and the base preceding this one in
            // machine order is the NEXT base in read coordinates (hence offset + 1).
            ref = (char)BaseUtils.simpleComplement(ref);
            base = (char)BaseUtils.simpleComplement(base);
            prevBase = (char)BaseUtils.simpleComplement((char)bases[offset+1]);
            cycle = read.getReadLength() - (offset + 1);
        }

        int qual = quals[offset];
        if ( qual > 0 ) {
            RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
            if (datum != null) datum.inc(base,ref);
            return 1;
        } else {
            // Quality-0 bases carry no usable signal; skip them.
            return 0;
        }
    }

    /** Prints every non-empty RecalData bin across all read groups to stdout. */
    public void printState() {
        for ( String readGroup : getReadGroups() ) {
            for ( RecalData datum : getRecalData(readGroup) ) {
                if ( datum.N > 0 )
                    System.out.println(datum);
            }
        }
    }
}
|
||||
|
|
@ -7,18 +7,15 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
|||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
@WalkerName("CountCovariates")
|
||||
public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
|
||||
public int buggyMaxReadLen = 100000;
|
||||
|
||||
|
|
@ -28,83 +25,60 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
|
||||
public int MIN_MAPPING_QUALITY = 1;
|
||||
|
||||
@Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)")
|
||||
public String READ_GROUP = "none";
|
||||
|
||||
//@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
|
||||
//public int MAX_READ_GROUPS = 100;
|
||||
|
||||
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
|
||||
public List<String> platforms = Collections.singletonList("*");
|
||||
//public List<String> platforms = Collections.singletonList("ILLUMINA");
|
||||
|
||||
@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
|
||||
//@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
|
||||
public boolean collapsePos = false;
|
||||
|
||||
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
||||
//@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
||||
public boolean collapseDinuc = false;
|
||||
|
||||
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
|
||||
private CovariateCounter covariateCounter = null;
|
||||
|
||||
long counted_sites = 0; // number of sites used to count covariates
|
||||
long counted_bases = 0; // number of bases used to count covariates
|
||||
long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
||||
private long counted_sites = 0; // number of sites used to count covariates
|
||||
private long counted_bases = 0; // number of bases used to count covariates
|
||||
private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
||||
|
||||
PrintStream recalTableOut = null;
|
||||
|
||||
/**
|
||||
* Initialize the system. Setup the data CovariateCountry for the read groups in our header
|
||||
*/
|
||||
public void initialize() {
|
||||
try {
|
||||
recalTableOut = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException("Couldn't open output file", e);
|
||||
}
|
||||
|
||||
Set<String> readGroups = new HashSet<String>();
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
if( readGroup.getAttribute("PL") == null )
|
||||
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
|
||||
if( !isSupportedReadGroup(readGroup) )
|
||||
continue;
|
||||
String rg = readGroup.getReadGroupId();
|
||||
//RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc );
|
||||
RecalDataManager manager = new RecalDataManager(rg, ! collapsePos, ! collapseDinuc );
|
||||
data.put(rg, manager);
|
||||
readGroups.add(readGroup.getReadGroupId());
|
||||
}
|
||||
out.printf("Created recalibration data collectors for %d read group(s)%n", data.size());
|
||||
|
||||
covariateCounter = new CovariateCounter(readGroups, collapsePos, collapseDinuc);
|
||||
logger.info(String.format("Created recalibration data collectors for %d read group(s)", covariateCounter.getNReadGroups()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the particular RecalData datum associated with readGroup, at machine pos, with reported
|
||||
* quality qual, and with the dinuc context of prevBase, base. If an example of such a
|
||||
* base has been seen before, returns the associated RecalData. If not, it creates one, places it in the
|
||||
* system so that subsequent requests will return that object, and returns it.
|
||||
*
|
||||
* @param readGroup
|
||||
* @param pos
|
||||
* @param qual
|
||||
* @param prevBase
|
||||
* @param base
|
||||
* @return
|
||||
*/
|
||||
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
||||
byte[] cs = {(byte)prevBase, (byte)base};
|
||||
String s = new String(cs);
|
||||
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
|
||||
}
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Get a list of all of the RecalData associated with readGroup
|
||||
* Walk over each read in the locus pileup and update the covariate counts based on these bases and their
|
||||
* matching (or not) with ref. dbSNP aware, so avoids sites that are known as SNPs in DBSNP.
|
||||
*
|
||||
* @param readGroup
|
||||
* @param tracker
|
||||
* @param ref
|
||||
* @param context
|
||||
* @return
|
||||
*/
|
||||
private List<RecalData> getRecalData(String readGroup) {
|
||||
return data.get(readGroup).getAll();
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||
//System.out.printf("%s %c%n", context.getLocation(), ref);
|
||||
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
|
||||
if ( dbsnp == null || !dbsnp.isSNP() ) {
|
||||
// We aren't at a dbSNP position that's a SNP, so update the read
|
||||
|
||||
List<SAMRecord> reads = context.getReads();
|
||||
List<Integer> offsets = context.getOffsets();
|
||||
for (int i =0; i < reads.size(); i++ ) {
|
||||
|
|
@ -115,73 +89,71 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
}
|
||||
|
||||
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
|
||||
if ( isSupportedReadGroup(readGroup) &&
|
||||
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
||||
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
|
||||
if ((read.getMappingQuality() >= MIN_MAPPING_QUALITY && isSupportedReadGroup(readGroup) )) {
|
||||
int offset = offsets.get(i);
|
||||
int numBases = read.getReadLength();
|
||||
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
||||
counted_bases += updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref);
|
||||
if ( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
||||
counted_bases += covariateCounter.updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref);
|
||||
}
|
||||
}
|
||||
}
|
||||
counted_sites += 1;
|
||||
} else {
|
||||
skipped_sites += 1;
|
||||
//System.out.println(dbsnp.toSimpleString()+" "+new ReadBackedPileup(ref, context).getPileupString());
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
|
||||
* Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read
|
||||
* but by implied machine cycle associated with the offset.
|
||||
* Check to see whether this read group should be processed. Returns true if the
|
||||
* read group is in the list of platforms to process or the platform == *, indicating
|
||||
* that all platforms should be processed.
|
||||
*
|
||||
* TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based
|
||||
* TODO: and offset is 0-based. How very annoying.
|
||||
*
|
||||
* @param rg
|
||||
* @param read
|
||||
* @param offset
|
||||
* @param ref
|
||||
* @param readGroup
|
||||
* @return
|
||||
*/
|
||||
private int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
|
||||
int cycle = offset;
|
||||
byte[] bases = read.getReadBases();
|
||||
byte[] quals = read.getBaseQualities();
|
||||
|
||||
char base = (char)bases[offset];
|
||||
char prevBase = (char)bases[offset - 1];
|
||||
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
ref = (char)BaseUtils.simpleComplement(ref);
|
||||
base = (char)BaseUtils.simpleComplement(base);
|
||||
prevBase = (char)BaseUtils.simpleComplement((char)bases[offset+1]);
|
||||
cycle = read.getReadLength() - (offset + 1);
|
||||
private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
|
||||
for( String platform: platforms ) {
|
||||
platform = platform.trim();
|
||||
if( readGroup.getAttribute("PL") == null ||
|
||||
platform.equals("*") ||
|
||||
readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) )
|
||||
return true;
|
||||
}
|
||||
|
||||
int qual = quals[offset];
|
||||
if ( qual > 0 ) {
|
||||
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
|
||||
if (datum != null) datum.inc(base,ref);
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Reduce
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations.
|
||||
* @return Initial value of reduce.
|
||||
*/
|
||||
public PrintStream reduceInit() {
|
||||
try {
|
||||
return new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException("Couldn't open output file", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
public void onTraversalDone(PrintStream recalTableStream) {
|
||||
printInfo(out);
|
||||
|
||||
out.printf("Writing raw recalibration data%n");
|
||||
writeRecalTable();
|
||||
out.printf("Writing raw recalibration data..."); out.flush();
|
||||
writeRecalTable(recalTableStream);
|
||||
out.printf("...done%n");
|
||||
|
||||
//out.printf("Writing logistic recalibration data%n");
|
||||
//writeLogisticRecalibrationTable();
|
||||
//out.printf("...done%n");
|
||||
|
||||
recalTableStream.close();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -189,13 +161,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
* @param out
|
||||
*/
|
||||
private void printInfo(PrintStream out) {
|
||||
out.printf("# date %s%n", new Date());
|
||||
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||
out.printf("# counted_sites %d%n", counted_sites);
|
||||
out.printf("# counted_bases %d%n", counted_bases);
|
||||
out.printf("# skipped_sites %d%n", skipped_sites);
|
||||
out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
|
||||
out.printf("# date \"%s\"%n", new Date());
|
||||
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||
out.printf("# counted_sites %d%n", counted_sites);
|
||||
out.printf("# counted_bases %d%n", counted_bases);
|
||||
out.printf("# skipped_sites %d%n", skipped_sites);
|
||||
out.printf("# fraction_skipped 1 / %.0f bp%n", (double)counted_sites / skipped_sites);
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
|
|
@ -204,14 +176,14 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
try {
|
||||
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
|
||||
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
for (String readGroup : covariateCounter.getReadGroups()) {
|
||||
for ( int dinuc_index=0; dinuc_index<RecalData.NDINUCS; dinuc_index++) {
|
||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
||||
for ( RecalData datum: covariateCounter.getRecalData(readGroup) ) {
|
||||
if ( RecalData.dinucIndex(datum.dinuc) == dinuc_index ) {
|
||||
if ((datum.N - datum.B) > 0)
|
||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
|
||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
|
||||
if (datum.B > 0)
|
||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B);
|
||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -229,56 +201,26 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
|||
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
|
||||
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.
|
||||
*/
|
||||
private void writeRecalTable() {
|
||||
printInfo(recalTableOut);
|
||||
recalTableOut.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||
// TODO: should sort the data coming out of getRecalData here for easier processing
|
||||
for ( RecalData datum: RecalData.sort(getRecalData(readGroup.getReadGroupId())) ) {
|
||||
private void writeRecalTable(PrintStream recalTableStream) {
|
||||
printInfo(recalTableStream);
|
||||
|
||||
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
||||
for (String readGroup : covariateCounter.getReadGroups()) {
|
||||
for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) {
|
||||
if ( datum.N > 0 )
|
||||
recalTableOut.format("%s%n", datum.toCSVString(collapsePos));
|
||||
recalTableStream.format("%s%n", datum.toCSVString(collapsePos));
|
||||
}
|
||||
}
|
||||
recalTableOut.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Check to see whether this read group should be processed. Returns true if the
|
||||
* read group is in the list of platforms to process or the platform == *, indicating
|
||||
* that all platforms should be processed.
|
||||
*
|
||||
* @param readGroup
|
||||
* @return
|
||||
*/
|
||||
private boolean isSupportedReadGroup( SAMReadGroupRecord readGroup ) {
|
||||
for( String platform: platforms ) {
|
||||
platform = platform.trim();
|
||||
if( readGroup.getAttribute("PL") == null ||
|
||||
platform.equals("*") ||
|
||||
readGroup.getAttribute("PL").toString().equalsIgnoreCase(platform) )
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* No initialization routines
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Doesn't do anything
|
||||
*
|
||||
* @param a
|
||||
* @param b
|
||||
* @param empty
|
||||
* @param recalTableStream
|
||||
* @return
|
||||
*/
|
||||
public Integer reduce(Integer a, Integer b) {
|
||||
return 0;
|
||||
public PrintStream reduce(Integer empty, PrintStream recalTableStream) {
|
||||
return recalTableStream;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -167,7 +167,7 @@ public class RecalData implements Comparable<RecalData> {
|
|||
* @param s
|
||||
* @return
|
||||
*/
|
||||
public static RecalData fromCSVString(String s) {
|
||||
public static RecalData fromCSVString(String s) throws NumberFormatException {
|
||||
String[] vals = s.split(",");
|
||||
String rg = vals[0];
|
||||
int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]);
|
||||
|
|
@ -178,6 +178,13 @@ public class RecalData implements Comparable<RecalData> {
|
|||
RecalData datum = new RecalData(pos, qual, rg, dinuc);
|
||||
datum.B = B;
|
||||
datum.N = N;
|
||||
|
||||
// Checking for badness
|
||||
if ( pos < 0 ) throw new NumberFormatException("Illegal position detected: " + pos);
|
||||
if ( B < 0 ) throw new NumberFormatException("Illegal mismatch count detected: " + B);
|
||||
if ( N < 0 ) throw new NumberFormatException("Illegal base count detected: " + N);
|
||||
if ( qual < 0 || qual > QualityUtils.MAX_QUAL_SCORE ) throw new NumberFormatException("Illegal qual detected: " + qual);
|
||||
|
||||
return datum;
|
||||
}
|
||||
|
||||
|
|
@ -216,8 +223,8 @@ public class RecalData implements Comparable<RecalData> {
|
|||
}
|
||||
|
||||
double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases);
|
||||
System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n",
|
||||
sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q);
|
||||
//System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n",
|
||||
// sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q);
|
||||
return q;
|
||||
}
|
||||
}
|
||||
|
|
@ -54,6 +54,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
|
||||
|
||||
private static String VERSION = "0.2.1";
|
||||
|
||||
private final static boolean DEBUG = false;
|
||||
|
||||
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
||||
|
|
@ -77,9 +79,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
|
||||
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
|
||||
|
||||
//private static boolean DEBUG_ME = true;
|
||||
|
||||
public void initialize() {
|
||||
logger.info("TableRecalibrator version: " + VERSION);
|
||||
//
|
||||
// crap hack until Enum arg types are supported
|
||||
// crappy hack until Enum arg types are supported
|
||||
//
|
||||
for ( RecalibrationMode potential : RecalibrationMode.values() ) {
|
||||
if ( potential.toString().equals(modeString)) {
|
||||
|
|
@ -96,13 +101,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
//
|
||||
int lineNumber = 0;
|
||||
try {
|
||||
System.out.printf("Reading data...%n");
|
||||
logger.info(String.format("Reading data..."));
|
||||
List<RecalData> data = new ArrayList<RecalData>();
|
||||
boolean collapsedPos = false;
|
||||
boolean collapsedDinuc = false;
|
||||
|
||||
List<String> lines = new xReadLines(new File(paramsFile)).readLines();
|
||||
for ( String line : lines ) {
|
||||
//List<String> lines = new xReadLines(new File(paramsFile)).readLines();
|
||||
for ( String line : new xReadLines(new File(paramsFile)) ) {
|
||||
lineNumber++;
|
||||
if ( HEADER_PATTERN.matcher(line).matches() )
|
||||
continue;
|
||||
|
|
@ -159,12 +164,14 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
if ( collapsedPos )
|
||||
throw new RuntimeException(String.format("Cannot perform position_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
collapsedPos = true;
|
||||
break;
|
||||
throw new RuntimeException("Unsupported mode requested, sorry");
|
||||
//break;
|
||||
case BY_DINUC_ONLY:
|
||||
if ( collapsedDinuc )
|
||||
throw new RuntimeException(String.format("Cannot perform dinuc_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
collapsedDinuc = true;
|
||||
break;
|
||||
throw new RuntimeException("Unsupported mode requested, sorry");
|
||||
//break;
|
||||
case COMBINATORIAL:
|
||||
if ( collapsedPos || collapsedDinuc )
|
||||
throw new RuntimeException(String.format("Cannot perform combinatorial recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||
|
|
@ -231,12 +238,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
quals = BaseUtils.reverse(quals);
|
||||
}
|
||||
|
||||
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
|
||||
try {
|
||||
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
|
||||
|
||||
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
|
||||
recalQuals = BaseUtils.reverse(recalQuals);
|
||||
read.setBaseQualities(recalQuals);
|
||||
return read;
|
||||
//if ( read.getReadName().equals("IL12_395:7:215:171:693") ) {
|
||||
// for ( int i = 0; i < quals.length; i++ ) {
|
||||
// System.out.printf("READ found: %s is now %s%n", quals[i], recalQuals[i]);
|
||||
// }
|
||||
//}
|
||||
|
||||
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
|
||||
recalQuals = BaseUtils.reverse(recalQuals);
|
||||
read.setBaseQualities(recalQuals);
|
||||
return read;
|
||||
} catch ( StingException e ) {
|
||||
throw new RuntimeException(String.format("Bug found while processing read %s: %s", read.format(), e.getMessage()));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -248,14 +265,21 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* @param quals
|
||||
* @return
|
||||
*/
|
||||
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) {
|
||||
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) throws StingException {
|
||||
byte[] recalQuals = new byte[quals.length];
|
||||
RecalMapping mapper = cache.get(readGroup);
|
||||
|
||||
//if ( mapper == null && DEBUG_ME )
|
||||
// return recalQuals;
|
||||
|
||||
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
||||
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
|
||||
byte qual = quals[cycle];
|
||||
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
|
||||
|
||||
if ( newQual <= 0 || newQual > QualityUtils.MAX_REASONABLE_Q_SCORE )
|
||||
throw new StingException(String.format("Bug found -- assigning bad quality score %d x %d => %d", cycle, qual, newQual));
|
||||
|
||||
recalQuals[cycle] = newQual;
|
||||
//System.out.printf("Mapping %d => %d%n", qual, newQual);
|
||||
}
|
||||
|
|
@ -322,17 +346,12 @@ class CombinatorialRecalMapping implements RecalMapping {
|
|||
int pos = manager.canonicalPos(datum.pos);
|
||||
if ( table[pos][datum.qual] != 0 )
|
||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
|
||||
table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical);
|
||||
//System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical));
|
||||
}
|
||||
}
|
||||
|
||||
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
||||
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
|
||||
//if ( qual == 2 )
|
||||
// System.out.printf("Qual = 2%n");
|
||||
|
||||
int pos = manager.canonicalPos(cycle);
|
||||
int index = this.manager.getDinucIndex(prevBase, base);
|
||||
byte[][] dataTable = index == -1 ? null : cache.get(index);
|
||||
|
|
@ -340,13 +359,7 @@ class CombinatorialRecalMapping implements RecalMapping {
|
|||
if ( dataTable == null && prevBase != 'N' && base != 'N' )
|
||||
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base));
|
||||
|
||||
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
|
||||
|
||||
//if ( result == 2 )
|
||||
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
|
||||
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
|
||||
|
||||
return result;
|
||||
return dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -368,13 +381,13 @@ class SerialRecalMapping implements RecalMapping {
|
|||
private double globalDeltaQ = 0.0;
|
||||
private double[][] deltaQPosMap, deltaQDinucMap;
|
||||
double [] deltaQualMap;
|
||||
RecalData [][] qPosSupports, qDinucSupports;
|
||||
RecalData [][] qPosSupports = null, qDinucSupports = null;
|
||||
|
||||
CombinatorialRecalMapping combiMap;
|
||||
RecalDataManager manager;
|
||||
|
||||
String dinucToLookAt = null; // "CC";
|
||||
int posToLookAt = 0;
|
||||
int posToLookAt = -1;
|
||||
int qualToLookAt = 25;
|
||||
|
||||
public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
|
||||
|
|
@ -387,7 +400,7 @@ class SerialRecalMapping implements RecalMapping {
|
|||
RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll());
|
||||
double aggregrateQreported = RecalData.combinedQreported(manager.getAll());
|
||||
globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported;
|
||||
System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
|
||||
//System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
|
||||
}
|
||||
|
||||
for ( RecalData datum : manager.getAll() ) {
|
||||
|
|
@ -399,12 +412,12 @@ class SerialRecalMapping implements RecalMapping {
|
|||
deltaQualMap = new double[maxQReported+1];
|
||||
for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) {
|
||||
deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ;
|
||||
System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
|
||||
//System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
|
||||
}
|
||||
|
||||
// calculate the delta Q pos array
|
||||
deltaQPosMap = new double[maxPos+1][maxQReported+1];
|
||||
qPosSupports = new RecalData[maxPos+1][maxQReported+1];
|
||||
//qPosSupports = new RecalData[maxPos+1][maxQReported+1];
|
||||
for ( RecalData datumAtPosQual : manager.combineDinucs() ) {
|
||||
double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual];
|
||||
updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset);
|
||||
|
|
@ -412,7 +425,7 @@ class SerialRecalMapping implements RecalMapping {
|
|||
|
||||
// calculate the delta Q dinuc array
|
||||
deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1];
|
||||
qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
|
||||
//qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
|
||||
for ( RecalData datumAtDinucQual : manager.combineCycles() ) {
|
||||
double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual];
|
||||
updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset);
|
||||
|
|
@ -429,7 +442,7 @@ class SerialRecalMapping implements RecalMapping {
|
|||
for ( int j = 0; j < maxQReported; j++ ) {
|
||||
if ( printStateP(i, null, j) )
|
||||
System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n",
|
||||
i, j, deltaQPosMap[i][j], qPosSupports[i][j]);
|
||||
i, j, deltaQPosMap[i][j], qPosSupports != null ? qPosSupports[i][j] : null);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -438,7 +451,7 @@ class SerialRecalMapping implements RecalMapping {
|
|||
String dinuc = RecalData.dinucIndex2bases(i);
|
||||
if ( printStateP(0, dinuc, j ) )
|
||||
System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n",
|
||||
dinuc, j, deltaQDinucMap[i][j], qDinucSupports[i][j]);
|
||||
dinuc, j, deltaQDinucMap[i][j], qDinucSupports != null ? qDinucSupports[i][j] : null);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -457,7 +470,8 @@ class SerialRecalMapping implements RecalMapping {
|
|||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||
double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ;
|
||||
table[i][j] = deltaQ;
|
||||
supports[i][j] = datum;
|
||||
if ( supports != null )
|
||||
supports[i][j] = datum;
|
||||
}
|
||||
|
||||
private boolean printStateP( int cycle, String dinuc, int qual ) {
|
||||
|
|
|
|||
|
|
@ -30,6 +30,9 @@ public class BaseQualityHistoWalker extends ReadWalker<Integer, Integer> {
|
|||
// Map over the org.broadinstitute.sting.gatk.LocusContext
|
||||
public Integer map(char[] ref, SAMRecord read) {
|
||||
for ( byte qual : read.getBaseQualities() ) {
|
||||
if ( qual < 0 || qual > 100 ) {
|
||||
throw new RuntimeException(String.format("Invalid base quality detected -- %d at %s%n", qual, read.getReadName()));
|
||||
}
|
||||
//System.out.println(qual);
|
||||
this.qualCounts[qual]++;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -244,7 +244,7 @@ public class GenomeLocParser {
|
|||
* @return the list of merged locations
|
||||
*/
|
||||
public static List<GenomeLoc> mergeOverlappingLocations(final List<GenomeLoc> raw) {
|
||||
logger.debug(" Raw locations are:\n" + Utils.join("\n", raw));
|
||||
logger.debug(" Raw locations are: " + Utils.join(", ", raw));
|
||||
if (raw.size() <= 1)
|
||||
return raw;
|
||||
else {
|
||||
|
|
|
|||
|
|
@ -85,8 +85,16 @@ public class QualityUtils {
|
|||
return boundQual(qual, MAX_QUAL_SCORE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an integer quality score bounded by 1 - maxQual.
|
||||
*
|
||||
* @param qual
|
||||
* @param maxQual
|
||||
* @return
|
||||
*/
|
||||
static public byte boundQual(int qual, byte maxQual) {
|
||||
return (byte) Math.min(qual, maxQual);
|
||||
//return (byte) Math.min(qual, maxQual);
|
||||
return (byte) Math.max(Math.min(qual, maxQual), 1);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -191,7 +191,7 @@ public class ArtificialSAMUtils {
|
|||
}
|
||||
SAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length);
|
||||
rec.setReadBases(bases);
|
||||
rec.setBaseQualities(bases);
|
||||
rec.setBaseQualities(qual);
|
||||
return rec;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,183 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import java.util.*;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
|
||||
// the imports for unit testing.
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.playground.gatk.walkers.indels.CleanedReadInjector;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileReader;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
|
||||
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Basic unit test for RecalData
|
||||
*/
|
||||
public class CovariateCounterTest extends BaseTest {
|
||||
String readGroup1 = "rg1";
|
||||
String readGroup2 = "rg2";
|
||||
Set<String> readGroups = new HashSet<String>();
|
||||
|
||||
SAMFileHeader header;
|
||||
|
||||
SAMRecord read1, read2, read3;
|
||||
|
||||
byte bases1[] = {'a', 't', 'c', 'g', 'a'};
|
||||
byte quals1[] = {1, 2, 3, 4, 5};
|
||||
byte quals3[] = {1, 2, 5, 5, 5};
|
||||
byte bases2[] = {'t', 'c', 'g', 'a', 't'};
|
||||
byte quals2[] = {2, 2, 4, 5, 2};
|
||||
|
||||
/*
|
||||
public CovariateCounter( Set<String> readGroups, boolean collapsePos, boolean collapseDinuc ) {
|
||||
public Set<String> getReadGroups() {
|
||||
public boolean isCollapseDinuc() {
|
||||
public boolean isCollapsePos() {
|
||||
public int getNReadGroups() {
|
||||
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
||||
public List<RecalData> getRecalData(String readGroup) {
|
||||
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
|
||||
*/
|
||||
|
||||
/**
|
||||
* The fasta, for comparison.
|
||||
*/
|
||||
protected static IndexedFastaSequenceFile sequenceFile = null;
|
||||
|
||||
CovariateCounter c;
|
||||
|
||||
/**
|
||||
* Initialize the fasta.
|
||||
*/
|
||||
@BeforeClass
|
||||
public static void initialize() throws FileNotFoundException {
|
||||
sequenceFile = new IndexedFastaSequenceFile( new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") );
|
||||
GenomeLocParser.setupRefContigOrdering(sequenceFile);
|
||||
|
||||
}
|
||||
|
||||
@Before
|
||||
public void initializeBefore() {
|
||||
header = ArtificialSAMUtils.createArtificialSamHeader(2,0,247249719);
|
||||
readGroups.addAll(Arrays.asList(readGroup1, readGroup2));
|
||||
ArtificialSAMUtils.createDefaultReadGroup( header, readGroup1, "sample1" );
|
||||
ArtificialSAMUtils.createDefaultReadGroup( header, readGroup2, "sample2" );
|
||||
c = new CovariateCounter( readGroups, false, false );
|
||||
|
||||
read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases1, quals1);
|
||||
read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",1,1, bases2, quals2);
|
||||
read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",1,1, bases1, quals3);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCovariateCounterSetup() {
|
||||
Assert.assertEquals("Number of read groups is wrong", c.getNReadGroups(), 2);
|
||||
Assert.assertEquals("Read group identities are wrong", c.getReadGroups(), readGroups);
|
||||
Assert.assertEquals("Incorrectly collapsed counter", c.isCollapseDinuc(), false);
|
||||
Assert.assertEquals("Incorrectly collapsed counter", c.isCollapsePos(), false);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOneRead() {
|
||||
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
c.printState();
|
||||
|
||||
Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
|
||||
for ( int i = 1; i < bases1.length; i++ ) {
|
||||
RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
|
||||
System.out.printf("%s%n", datum);
|
||||
Assert.assertNotNull("Incorrect mapping to recal bin", datum);
|
||||
Assert.assertEquals("Bad mismatch count", datum.B, 0);
|
||||
Assert.assertEquals("Bad base count", datum.N, 1);
|
||||
Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
|
||||
Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
|
||||
Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTwoReads() {
|
||||
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
for ( int i = 1; i < read2.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i]);
|
||||
c.printState();
|
||||
|
||||
Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
|
||||
for ( int i = 1; i < bases1.length; i++ ) {
|
||||
RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
|
||||
System.out.printf("%s%n", datum);
|
||||
Assert.assertNotNull("Incorrect mapping to recal bin", datum);
|
||||
Assert.assertEquals("Bad mismatch count", datum.B, 0);
|
||||
Assert.assertEquals("Bad base count", datum.N, 1);
|
||||
Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
|
||||
Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
|
||||
Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTwoReadsSameGroup() {
|
||||
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
for ( int i = 1; i < read2.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
c.printState();
|
||||
|
||||
for ( int i = 1; i < bases1.length; i++ ) {
|
||||
RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
|
||||
System.out.printf("%s%n", datum);
|
||||
Assert.assertNotNull("Incorrect mapping to recal bin", datum);
|
||||
Assert.assertEquals("Bad mismatch count", datum.B, 0);
|
||||
Assert.assertEquals("Bad base count", datum.N, 2);
|
||||
Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
|
||||
Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
|
||||
Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTwoReadsSameGroupNotIdentical() {
|
||||
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||
for ( int i = 1; i < read3.getReadBases().length; i++ )
|
||||
c.updateDataFromRead(readGroup1, read3, i, (char)read3.getReadBases()[i]);
|
||||
c.printState();
|
||||
|
||||
for ( int i = 1; i < bases1.length; i++ ) {
|
||||
RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
|
||||
System.out.printf("%s%n", datum);
|
||||
Assert.assertNotNull("Incorrect mapping to recal bin", datum);
|
||||
Assert.assertEquals("Bad mismatch count", datum.B, 0);
|
||||
Assert.assertEquals("Bad base count", datum.N, quals1[i] == quals3[i] ? 2 : 1);
|
||||
Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
|
||||
Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
|
||||
Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@Test (expected = RuntimeException.class)
|
||||
public void testBadReadOffset() {
|
||||
byte bases[] = {'a', 't', 'c', 'g', 'a'};
|
||||
byte quals[] = {1, 2, 3, 4, 5};
|
||||
|
||||
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases, quals);
|
||||
|
||||
c.updateDataFromRead(readGroup1, read, 0, (char)bases[0]);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<package>
|
||||
<name>GATKResources</name>
|
||||
<resources>
|
||||
<file>/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod</file>
|
||||
<file>/broad/1KG/reference/human_b36_both.fasta</file>
|
||||
<file>/broad/1KG/reference/human_b36_both.dict</file>
|
||||
<file>/broad/1KG/reference/human_b36_both.fasta.fai</file>
|
||||
<file>/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod</file>
|
||||
<file>/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta</file>
|
||||
<file>/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict</file>
|
||||
<file>/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai</file>
|
||||
</resources>
|
||||
</package>
|
||||
|
|
@ -3,18 +3,7 @@
|
|||
<name>ReadQualityRecalibrator</name>
|
||||
<main-class>org.broadinstitute.sting.gatk.CommandLineGATK</main-class>
|
||||
<dependencies>
|
||||
<class>org.broadinstitute.sting.playground.gatk.walkers.CovariateCounterWalker</class>
|
||||
<class>org.broadinstitute.sting.playground.gatk.walkers.LogisticRecalibrationWalker</class>
|
||||
<class>org.broadinstitute.sting.gatk.walkers.recalibration.CovariateCounterWalker</class>
|
||||
<class>org.broadinstitute.sting.gatk.walkers.recalibration.TableRecalibrationWalker</class>
|
||||
</dependencies>
|
||||
<scripts>
|
||||
<file>python/RecalQual.py</file>
|
||||
<file>python/LogisticRegressionByReadGroup.py</file>
|
||||
</scripts>
|
||||
<resources>
|
||||
<file>R/logistic_regression.R</file>
|
||||
<file>/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod</file>
|
||||
<file>/broad/1KG/reference/human_b36_both.fasta</file>
|
||||
<file>/broad/1KG/reference/human_b36_both.dict</file>
|
||||
<file>/broad/1KG/reference/human_b36_both.fasta.fai</file>
|
||||
</resources>
|
||||
</package>
|
||||
|
|
|
|||
|
|
@ -44,6 +44,16 @@ def bams2geli(bams):
|
|||
calls = map(call1, bams)
|
||||
return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
|
||||
|
||||
def gelis2gelisText( gelis ):
|
||||
def geli2geliText( maybeGeli ):
|
||||
if os.path.splitext(maybeGeli)[1] == ".calls" :
|
||||
return maybeGeli
|
||||
else:
|
||||
return os.path.split(geli)[1] + '.calls'
|
||||
|
||||
return map( geli2geliText, gelis)
|
||||
|
||||
|
||||
def main():
|
||||
global OPTIONS, ROOT
|
||||
|
||||
|
|
@ -124,7 +134,7 @@ def main():
|
|||
|
||||
# convert the geli's to text
|
||||
jobid = None
|
||||
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
|
||||
variantsOut = gelis2gelisText( gelis )
|
||||
for geli, variantOut in zip(gelis, variantsOut):
|
||||
name = os.path.split(geli)[1]
|
||||
if not os.path.exists(variantOut):
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ import operator
|
|||
MAX_QUAL_SCORE = 50
|
||||
|
||||
def phredQScore( nMismatches, nBases ):
|
||||
"""Calculates a phred-scaled score for nMismatches in nBases"""
|
||||
#print 'phredQScore', nMismatches, nBases
|
||||
if nMismatches == 0:
|
||||
return MAX_QUAL_SCORE
|
||||
|
|
@ -24,10 +25,12 @@ def phredQScore( nMismatches, nBases ):
|
|||
|
||||
|
||||
def phredScore2ErrorProp(qual):
|
||||
"""Converts a phred-scaled quality score to an error probability"""
|
||||
#print 'phredScore2ErrorProp', qual
|
||||
return math.pow(10.0, float(qual) / -10.0)
|
||||
|
||||
def tryByInt(s):
|
||||
"""Try to cast something to an int, or return it as a string"""
|
||||
try:
|
||||
return int(s)
|
||||
except:
|
||||
|
|
@ -36,11 +39,13 @@ def tryByInt(s):
|
|||
expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',')
|
||||
defaultValues = '0,0,0,**,0,0,0'.split(',')
|
||||
class RecalData(dict):
|
||||
|
||||
"""Basic recalibration data -- corresponds exactly to the Java version in GATK"""
|
||||
def __init__(self):
|
||||
self.parse(expectedHeader, defaultValues)
|
||||
|
||||
def parse(self, header, data):
|
||||
"""Parse the comma-separated data line with corresponding header. Throws an error
|
||||
if the header doesn't correspond to the expectedHeader"""
|
||||
# rg,pos,Qrep,dn,NBases,MMismatches,Qemp
|
||||
types = [str, tryByInt, int, str, int, int, int]
|
||||
for head, expected, datum, type in zip(header, expectedHeader, data, types):
|
||||
|
|
@ -57,8 +62,11 @@ class RecalData(dict):
|
|||
|
||||
def __getattr__(self, name):
|
||||
return self[name]
|
||||
|
||||
# rg,dn,Qrep,pos,NBases,MMismatches,Qemp
|
||||
|
||||
|
||||
#
|
||||
# Trivial accessor functions
|
||||
#
|
||||
def readGroup(self): return self.rg
|
||||
def dinuc(self): return self.dn
|
||||
def qReported(self): return self.Qrep
|
||||
|
|
@ -204,8 +212,9 @@ def lsamplestdev (inlist, counts, mean):
|
|||
for item, count in zip(inlist, counts):
|
||||
diff = item - mean
|
||||
inc = count * diff * diff
|
||||
#print item, count, mean, diff, diff*diff, inc
|
||||
#print "%3d" % int(item), count, mean, diff, diff*diff, inc, sum
|
||||
sum += inc
|
||||
#print sum, n, sum / float(n-1), math.sqrt(sum / float(n-1))
|
||||
return math.sqrt(sum / float(n-1))
|
||||
|
||||
def rmse(reportedList, empiricalList, counts):
|
||||
|
|
@ -320,7 +329,7 @@ def analyzeFiles(files):
|
|||
for file in files:
|
||||
print 'Analyzing file', file
|
||||
plotter = getPlotterForFile(file)
|
||||
if plotter <> None:
|
||||
if plotter <> None and not OPTIONS.noplots:
|
||||
cmd = ' '.join([Rscript, plotter, file])
|
||||
farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry)
|
||||
|
||||
|
|
@ -341,6 +350,9 @@ def main():
|
|||
parser.add_option("-s", "--stdout", dest="toStdout",
|
||||
action='store_true', default=False,
|
||||
help="If provided, writes output to standard output, not to files")
|
||||
parser.add_option("", "--no_plots", dest="noplots",
|
||||
action='store_true', default=False,
|
||||
help="If provided, no plots will be generated")
|
||||
parser.add_option("", "--dry", dest="dry",
|
||||
action='store_true', default=False,
|
||||
help="If provided, nothing actually gets run, just a dry run")
|
||||
|
|
|
|||
|
|
@ -153,7 +153,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
|
|||
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||
|
||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m' ):
|
||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m', compression_level = 1 ):
|
||||
if useSamtools:
|
||||
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
|
||||
else:
|
||||
|
|
@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
|
|||
MSDStr = ''
|
||||
if MSD: MSDStr = 'MSD=true'
|
||||
|
||||
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + compression_level + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||
|
||||
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
||||
|
|
|
|||
|
|
@ -0,0 +1,2 @@
|
|||
cd dist/packages/GATKResources
|
||||
tar cvhzf gatk_resources_062309.tgz resources
|
||||
Loading…
Reference in New Issue