Version 0.2.1 (released) of the TableRecalibrator
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1108 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
73caf5db15
commit
5289230eb8
|
|
@ -12,8 +12,13 @@ t=read.table(input, header=T)
|
||||||
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
|
#png(outfile, height=7, width=7, units="in", res=72) # height=1000, width=446)
|
||||||
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
|
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
|
||||||
pdf(outfile, height=7, width=7)
|
pdf(outfile, height=7, width=7)
|
||||||
plot(t$Qreported, t$Qempirical, type="p", col="blue", xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
|
d.good <- t[t$nMismatches >= 1000,]
|
||||||
abline(0,1)
|
d.100 <- t[t$nMismatches < 100,]
|
||||||
|
d.1000 <- t[t$nMismatches < 1000 & t$nMismatches >= 100,]
|
||||||
|
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", xlim=c(0,45), ylim=c(0,45), pch=16, xlab="Reported quality score", ylab="Empirical quality score", main="Reported vs. empirical quality scores")
|
||||||
|
points(d.100$Qreported, d.100$Qempirical, type="p", col="lightblue", pch=16)
|
||||||
|
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="cornflowerblue", pch=16)
|
||||||
|
abline(0,1, lty=2)
|
||||||
dev.off()
|
dev.off()
|
||||||
|
|
||||||
#outfile = paste(input, ".quality_emp_hist.png", sep="")
|
#outfile = paste(input, ".quality_emp_hist.png", sep="")
|
||||||
|
|
@ -21,6 +26,6 @@ dev.off()
|
||||||
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
||||||
pdf(outfile, height=7, width=7)
|
pdf(outfile, height=7, width=7)
|
||||||
hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0)
|
hst=subset(data.frame(t$Qempirical, t$nBases), t.nBases != 0)
|
||||||
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,40), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
|
plot(hst$t.Qempirical, hst$t.nBases, type="h", lwd=3, xlim=c(0,45), main="Reported quality score histogram", xlab="Empirical quality score", ylab="Count", yaxt="n")
|
||||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||||
dev.off()
|
dev.off()
|
||||||
|
|
|
||||||
|
|
@ -12,5 +12,9 @@ outfile = paste(input, ".qual_diff_v_cycle.pdf", sep="")
|
||||||
pdf(outfile, height=7, width=7)
|
pdf(outfile, height=7, width=7)
|
||||||
par(cex=1.1)
|
par(cex=1.1)
|
||||||
c <- read.table(input, header=T)
|
c <- read.table(input, header=T)
|
||||||
plot(c$Cycle, c$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", ylim=c(-10, 10))
|
d.good <- c[c$nMismatches >= 100,]
|
||||||
|
d.100 <- c[c$nMismatches < 100,]
|
||||||
|
plot(d.good$Cycle, d.good$Qempirical_Qreported, type="l", ylab="Empirical - Reported Quality", xlab="Cycle", col="blue", ylim=c(-10, 10))
|
||||||
|
points(d.100$Cycle, d.100$Qempirical_Qreported, type="p", col="lightblue", pch=3)
|
||||||
|
#points(d.1000$Cycle, d.1000$Qempirical_Qreported, type="p", col="cornflowerblue", pch=16)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -85,7 +85,6 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods)
|
public static void parseBindings(Logger logger, ArrayList<String> bindings, List<ReferenceOrderedData<? extends ReferenceOrderedDatum> > rods)
|
||||||
{
|
{
|
||||||
// Loop over triplets
|
// Loop over triplets
|
||||||
System.out.printf("Binding is %s%n", Utils.join(" XXX ", bindings));
|
|
||||||
for( String bindingSets: bindings ) {
|
for( String bindingSets: bindings ) {
|
||||||
String[] bindingTokens = bindingSets.split(",");
|
String[] bindingTokens = bindingSets.split(",");
|
||||||
if( bindingTokens.length % 3 != 0 )
|
if( bindingTokens.length % 3 != 0 )
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,128 @@
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
|
||||||
|
import javax.management.RuntimeErrorException;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
public class CovariateCounter {
|
||||||
|
private boolean collapsePos = false;
|
||||||
|
private boolean collapseDinuc = false;
|
||||||
|
|
||||||
|
private HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
|
||||||
|
|
||||||
|
public CovariateCounter( Set<String> readGroups, boolean collapsePos, boolean collapseDinuc ) {
|
||||||
|
this.collapsePos = collapsePos;
|
||||||
|
this.collapseDinuc = collapseDinuc;
|
||||||
|
|
||||||
|
for (String readGroup : readGroups ) {
|
||||||
|
RecalDataManager manager = new RecalDataManager(readGroup, ! collapsePos, ! collapseDinuc );
|
||||||
|
data.put(readGroup, manager);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the set of readGroup names we are counting covariates for
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public Set<String> getReadGroups() {
|
||||||
|
return data.keySet();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isCollapseDinuc() {
|
||||||
|
return collapseDinuc;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isCollapsePos() {
|
||||||
|
return collapsePos;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns the number of read groups being managed
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public int getNReadGroups() {
|
||||||
|
return data.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the particular RecalData datum associated with readGroup, at machine pos, with reported
|
||||||
|
* quality qual, and with the dinuc context of prevBase, base. If an example of such a
|
||||||
|
* base has been seen before, returns the associated RecalData. If not, it creates one, places it in the
|
||||||
|
* system so that subsequent requests will return that object, and returns it.
|
||||||
|
*
|
||||||
|
* @param readGroup
|
||||||
|
* @param pos
|
||||||
|
* @param qual
|
||||||
|
* @param prevBase
|
||||||
|
* @param base
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
||||||
|
byte[] cs = {(byte)prevBase, (byte)base};
|
||||||
|
String s = new String(cs);
|
||||||
|
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get a list of all of the RecalData associated with readGroup
|
||||||
|
*
|
||||||
|
* @param readGroup
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public List<RecalData> getRecalData(String readGroup) {
|
||||||
|
return data.get(readGroup).getAll();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
|
||||||
|
* Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read
|
||||||
|
* but by implied machine cycle associated with the offset.
|
||||||
|
*
|
||||||
|
* TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based
|
||||||
|
* TODO: and offset is 0-based. How very annoying.
|
||||||
|
*
|
||||||
|
* @param rg
|
||||||
|
* @param read
|
||||||
|
* @param offset
|
||||||
|
* @param ref
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
|
||||||
|
if ( offset == 0 )
|
||||||
|
throw new RuntimeException("Illegal read offset " + offset + " in read " + read.getReadName());
|
||||||
|
|
||||||
|
int cycle = offset;
|
||||||
|
byte[] bases = read.getReadBases();
|
||||||
|
byte[] quals = read.getBaseQualities();
|
||||||
|
|
||||||
|
char base = (char)bases[offset];
|
||||||
|
char prevBase = (char)bases[offset - 1];
|
||||||
|
|
||||||
|
if (read.getReadNegativeStrandFlag()) {
|
||||||
|
ref = (char)BaseUtils.simpleComplement(ref);
|
||||||
|
base = (char)BaseUtils.simpleComplement(base);
|
||||||
|
prevBase = (char)BaseUtils.simpleComplement((char)bases[offset+1]);
|
||||||
|
cycle = read.getReadLength() - (offset + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
int qual = quals[offset];
|
||||||
|
if ( qual > 0 ) {
|
||||||
|
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
|
||||||
|
if (datum != null) datum.inc(base,ref);
|
||||||
|
return 1;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void printState() {
|
||||||
|
for ( String readGroup : getReadGroups() ) {
|
||||||
|
for ( RecalData datum : getRecalData(readGroup) ) {
|
||||||
|
if ( datum.N > 0 )
|
||||||
|
System.out.println(datum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -7,18 +7,15 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||||
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
import org.broadinstitute.sting.gatk.refdata.rodDbSNP;
|
||||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||||
import org.broadinstitute.sting.utils.QualityUtils;
|
|
||||||
import org.broadinstitute.sting.utils.Utils;
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
import org.broadinstitute.sting.utils.BaseUtils;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
|
|
||||||
@WalkerName("CountCovariates")
|
@WalkerName("CountCovariates")
|
||||||
public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
||||||
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
|
@Argument(fullName="buggyMaxReadLen", doc="If we see a read longer than this, we assume there's a bug and abort", required=false)
|
||||||
public int buggyMaxReadLen = 100000;
|
public int buggyMaxReadLen = 100000;
|
||||||
|
|
||||||
|
|
@ -28,83 +25,60 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
|
@Argument(fullName="MIN_MAPPING_QUALITY", shortName="minmap", required=false, doc="Only use reads with at least this quality score")
|
||||||
public int MIN_MAPPING_QUALITY = 1;
|
public int MIN_MAPPING_QUALITY = 1;
|
||||||
|
|
||||||
@Argument(fullName="READ_GROUP", shortName="rg", required=false, doc="Only use reads with this read group (@RG)")
|
|
||||||
public String READ_GROUP = "none";
|
|
||||||
|
|
||||||
//@Argument(fullName="MAX_READ_GROUPS", shortName="mrg", required=false, doc="Abort if number of read groups in input file exceeeds this count.")
|
|
||||||
//public int MAX_READ_GROUPS = 100;
|
|
||||||
|
|
||||||
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
|
@Argument(fullName="PLATFORM", shortName="pl", required=false, doc="Only calibrate read groups generated from the given platform (default = * for all platforms)")
|
||||||
public List<String> platforms = Collections.singletonList("*");
|
public List<String> platforms = Collections.singletonList("*");
|
||||||
//public List<String> platforms = Collections.singletonList("ILLUMINA");
|
//public List<String> platforms = Collections.singletonList("ILLUMINA");
|
||||||
|
|
||||||
@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
|
//@Argument(fullName="collapsePos", shortName="collapsePos", required=false, doc="")
|
||||||
public boolean collapsePos = false;
|
public boolean collapsePos = false;
|
||||||
|
|
||||||
@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
//@Argument(fullName="collapseDinuc", shortName="collapseDinuc", required=false, doc="")
|
||||||
public boolean collapseDinuc = false;
|
public boolean collapseDinuc = false;
|
||||||
|
|
||||||
HashMap<String, RecalDataManager> data = new HashMap<String, RecalDataManager>();
|
private CovariateCounter covariateCounter = null;
|
||||||
|
|
||||||
long counted_sites = 0; // number of sites used to count covariates
|
private long counted_sites = 0; // number of sites used to count covariates
|
||||||
long counted_bases = 0; // number of bases used to count covariates
|
private long counted_bases = 0; // number of bases used to count covariates
|
||||||
long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
private long skipped_sites = 0; // number of sites skipped because of a dbSNP entry
|
||||||
|
|
||||||
PrintStream recalTableOut = null;
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize the system. Setup the data CovariateCountry for the read groups in our header
|
||||||
|
*/
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
try {
|
Set<String> readGroups = new HashSet<String>();
|
||||||
recalTableOut = new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
|
|
||||||
} catch ( FileNotFoundException e ) {
|
|
||||||
throw new RuntimeException("Couldn't open output file", e);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
||||||
if( readGroup.getAttribute("PL") == null )
|
if( readGroup.getAttribute("PL") == null )
|
||||||
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
|
Utils.warnUser(String.format("PL attribute for read group %s is unset; assuming all reads are supported",readGroup.getReadGroupId()));
|
||||||
if( !isSupportedReadGroup(readGroup) )
|
if( !isSupportedReadGroup(readGroup) )
|
||||||
continue;
|
continue;
|
||||||
String rg = readGroup.getReadGroupId();
|
readGroups.add(readGroup.getReadGroupId());
|
||||||
//RecalDataManager manager = new RecalDataManager(rg, maxReadLen, QualityUtils.MAX_QUAL_SCORE+1, RecalData.NDINUCS, ! collapsePos, ! collapseDinuc );
|
|
||||||
RecalDataManager manager = new RecalDataManager(rg, ! collapsePos, ! collapseDinuc );
|
|
||||||
data.put(rg, manager);
|
|
||||||
}
|
}
|
||||||
out.printf("Created recalibration data collectors for %d read group(s)%n", data.size());
|
|
||||||
|
covariateCounter = new CovariateCounter(readGroups, collapsePos, collapseDinuc);
|
||||||
|
logger.info(String.format("Created recalibration data collectors for %d read group(s)", covariateCounter.getNReadGroups()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// map
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the particular RecalData datum associated with readGroup, at machine pos, with reported
|
* Walk over each read in the locus pileup and update the covariate counts based on these bases and their
|
||||||
* quality qual, and with the dinuc context of prevBase, base. If an example of such a
|
* matching (or not) with ref. dbSNP aware, so avoids sites that are known as SNPs in DBSNP.
|
||||||
* base has been seen before, returns the associated RecalData. If not, it creates one, places it in the
|
|
||||||
* system so that subsequent requests will return that object, and returns it.
|
|
||||||
*
|
*
|
||||||
* @param readGroup
|
* @param tracker
|
||||||
* @param pos
|
* @param ref
|
||||||
* @param qual
|
* @param context
|
||||||
* @param prevBase
|
|
||||||
* @param base
|
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
|
||||||
byte[] cs = {(byte)prevBase, (byte)base};
|
|
||||||
String s = new String(cs);
|
|
||||||
return data.get(readGroup).expandingGetRecalData(pos, qual, s, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Get a list of all of the RecalData associated with readGroup
|
|
||||||
*
|
|
||||||
* @param readGroup
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
private List<RecalData> getRecalData(String readGroup) {
|
|
||||||
return data.get(readGroup).getAll();
|
|
||||||
}
|
|
||||||
|
|
||||||
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
public Integer map(RefMetaDataTracker tracker, char ref, LocusContext context) {
|
||||||
//System.out.printf("%s %c%n", context.getLocation(), ref);
|
|
||||||
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
|
rodDbSNP dbsnp = (rodDbSNP)tracker.lookup("dbSNP", null);
|
||||||
if ( dbsnp == null || !dbsnp.isSNP() ) {
|
if ( dbsnp == null || !dbsnp.isSNP() ) {
|
||||||
|
// We aren't at a dbSNP position that's a SNP, so update the read
|
||||||
|
|
||||||
List<SAMRecord> reads = context.getReads();
|
List<SAMRecord> reads = context.getReads();
|
||||||
List<Integer> offsets = context.getOffsets();
|
List<Integer> offsets = context.getOffsets();
|
||||||
for (int i =0; i < reads.size(); i++ ) {
|
for (int i =0; i < reads.size(); i++ ) {
|
||||||
|
|
@ -115,132 +89,20 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
}
|
}
|
||||||
|
|
||||||
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
|
SAMReadGroupRecord readGroup = read.getHeader().getReadGroup((String)read.getAttribute("RG"));
|
||||||
if ( isSupportedReadGroup(readGroup) &&
|
if ((read.getMappingQuality() >= MIN_MAPPING_QUALITY && isSupportedReadGroup(readGroup) )) {
|
||||||
(READ_GROUP.equals("none") || read.getAttribute("RG") != null && read.getAttribute("RG").equals(READ_GROUP)) &&
|
|
||||||
(read.getMappingQuality() >= MIN_MAPPING_QUALITY)) {
|
|
||||||
int offset = offsets.get(i);
|
int offset = offsets.get(i);
|
||||||
int numBases = read.getReadLength();
|
if ( offset > 0 && offset < (read.getReadLength() - 1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
||||||
if ( offset > 0 && offset < (numBases-1) ) { // skip first and last bases because they suck and they don't have a dinuc count
|
counted_bases += covariateCounter.updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref);
|
||||||
counted_bases += updateDataFromRead(readGroup.getReadGroupId(), read, offset, ref);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
counted_sites += 1;
|
counted_sites += 1;
|
||||||
} else {
|
} else {
|
||||||
skipped_sites += 1;
|
skipped_sites += 1;
|
||||||
//System.out.println(dbsnp.toSimpleString()+" "+new ReadBackedPileup(ref, context).getPileupString());
|
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Updates the recalibration data for the base at offset in the read, associated with readGroup rg.
|
|
||||||
* Correctly handles machine orientation of the read. I.e., it adds data not by offset in the read
|
|
||||||
* but by implied machine cycle associated with the offset.
|
|
||||||
*
|
|
||||||
* TODO: this whole system is 0-based and therefore inconsisent with the rest of the GATK, where pos is 1-based
|
|
||||||
* TODO: and offset is 0-based. How very annoying.
|
|
||||||
*
|
|
||||||
* @param rg
|
|
||||||
* @param read
|
|
||||||
* @param offset
|
|
||||||
* @param ref
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
private int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
|
|
||||||
int cycle = offset;
|
|
||||||
byte[] bases = read.getReadBases();
|
|
||||||
byte[] quals = read.getBaseQualities();
|
|
||||||
|
|
||||||
char base = (char)bases[offset];
|
|
||||||
char prevBase = (char)bases[offset - 1];
|
|
||||||
|
|
||||||
if (read.getReadNegativeStrandFlag()) {
|
|
||||||
ref = (char)BaseUtils.simpleComplement(ref);
|
|
||||||
base = (char)BaseUtils.simpleComplement(base);
|
|
||||||
prevBase = (char)BaseUtils.simpleComplement((char)bases[offset+1]);
|
|
||||||
cycle = read.getReadLength() - (offset + 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
int qual = quals[offset];
|
|
||||||
if ( qual > 0 ) {
|
|
||||||
RecalData datum = getRecalData(rg, cycle, qual, prevBase, base);
|
|
||||||
if (datum != null) datum.inc(base,ref);
|
|
||||||
return 1;
|
|
||||||
} else {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void onTraversalDone(Integer result) {
|
|
||||||
printInfo(out);
|
|
||||||
|
|
||||||
out.printf("Writing raw recalibration data%n");
|
|
||||||
writeRecalTable();
|
|
||||||
out.printf("...done%n");
|
|
||||||
|
|
||||||
//out.printf("Writing logistic recalibration data%n");
|
|
||||||
//writeLogisticRecalibrationTable();
|
|
||||||
//out.printf("...done%n");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Prints some basic information about the CountCovariates run to the output stream out
|
|
||||||
* @param out
|
|
||||||
*/
|
|
||||||
private void printInfo(PrintStream out) {
|
|
||||||
out.printf("# date %s%n", new Date());
|
|
||||||
out.printf("# collapsed_pos %b%n", collapsePos);
|
|
||||||
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
|
||||||
out.printf("# counted_sites %d%n", counted_sites);
|
|
||||||
out.printf("# counted_bases %d%n", counted_bases);
|
|
||||||
out.printf("# skipped_sites %d%n", skipped_sites);
|
|
||||||
out.printf("# fraction_skipped 1/%.0f%n", (double)counted_sites / skipped_sites);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Deprecated
|
|
||||||
private void writeLogisticRecalibrationTable() {
|
|
||||||
PrintStream dinuc_out = null;
|
|
||||||
try {
|
|
||||||
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
|
|
||||||
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
|
|
||||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
|
||||||
for ( int dinuc_index=0; dinuc_index<RecalData.NDINUCS; dinuc_index++) {
|
|
||||||
for ( RecalData datum: getRecalData(readGroup.getReadGroupId()) ) {
|
|
||||||
if ( RecalData.dinucIndex(datum.dinuc) == dinuc_index ) {
|
|
||||||
if ((datum.N - datum.B) > 0)
|
|
||||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
|
|
||||||
if (datum.B > 0)
|
|
||||||
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup.getReadGroupId(), RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (FileNotFoundException e) {
|
|
||||||
System.err.println("FileNotFoundException: " + e.getMessage());
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
if (dinuc_out != null) dinuc_out.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
|
|
||||||
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.
|
|
||||||
*/
|
|
||||||
private void writeRecalTable() {
|
|
||||||
printInfo(recalTableOut);
|
|
||||||
recalTableOut.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
|
||||||
for (SAMReadGroupRecord readGroup : this.getToolkit().getEngine().getSAMHeader().getReadGroups()) {
|
|
||||||
// TODO: should sort the data coming out of getRecalData here for easier processing
|
|
||||||
for ( RecalData datum: RecalData.sort(getRecalData(readGroup.getReadGroupId())) ) {
|
|
||||||
if ( datum.N > 0 )
|
|
||||||
recalTableOut.format("%s%n", datum.toCSVString(collapsePos));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
recalTableOut.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check to see whether this read group should be processed. Returns true if the
|
* Check to see whether this read group should be processed. Returns true if the
|
||||||
|
|
@ -262,23 +124,103 @@ public class CovariateCounterWalker extends LocusWalker<Integer, Integer> {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Reduce
|
||||||
|
//
|
||||||
|
// --------------------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* No initialization routines
|
* Provide an initial value for reduce computations.
|
||||||
*
|
* @return Initial value of reduce.
|
||||||
* @return
|
|
||||||
*/
|
*/
|
||||||
public Integer reduceInit() {
|
public PrintStream reduceInit() {
|
||||||
return 0;
|
try {
|
||||||
|
return new PrintStream( OUTPUT_FILEROOT+".recal_data.csv" );
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
throw new RuntimeException("Couldn't open output file", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void onTraversalDone(PrintStream recalTableStream) {
|
||||||
|
printInfo(out);
|
||||||
|
|
||||||
|
out.printf("Writing raw recalibration data..."); out.flush();
|
||||||
|
writeRecalTable(recalTableStream);
|
||||||
|
out.printf("...done%n");
|
||||||
|
|
||||||
|
//out.printf("Writing logistic recalibration data%n");
|
||||||
|
//writeLogisticRecalibrationTable();
|
||||||
|
//out.printf("...done%n");
|
||||||
|
|
||||||
|
recalTableStream.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prints some basic information about the CountCovariates run to the output stream out
|
||||||
|
* @param out
|
||||||
|
*/
|
||||||
|
private void printInfo(PrintStream out) {
|
||||||
|
out.printf("# date \"%s\"%n", new Date());
|
||||||
|
out.printf("# collapsed_pos %b%n", collapsePos);
|
||||||
|
out.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||||
|
out.printf("# counted_sites %d%n", counted_sites);
|
||||||
|
out.printf("# counted_bases %d%n", counted_bases);
|
||||||
|
out.printf("# skipped_sites %d%n", skipped_sites);
|
||||||
|
out.printf("# fraction_skipped 1 / %.0f bp%n", (double)counted_sites / skipped_sites);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Deprecated
|
||||||
|
private void writeLogisticRecalibrationTable() {
|
||||||
|
PrintStream dinuc_out = null;
|
||||||
|
try {
|
||||||
|
dinuc_out = new PrintStream( OUTPUT_FILEROOT+".covariate_counts.csv");
|
||||||
|
dinuc_out.println("rg,dn,logitQ,pos,indicator,count");
|
||||||
|
for (String readGroup : covariateCounter.getReadGroups()) {
|
||||||
|
for ( int dinuc_index=0; dinuc_index<RecalData.NDINUCS; dinuc_index++) {
|
||||||
|
for ( RecalData datum: covariateCounter.getRecalData(readGroup) ) {
|
||||||
|
if ( RecalData.dinucIndex(datum.dinuc) == dinuc_index ) {
|
||||||
|
if ((datum.N - datum.B) > 0)
|
||||||
|
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 0, datum.N - datum.B);
|
||||||
|
if (datum.B > 0)
|
||||||
|
dinuc_out.format("%s,%s,%d,%d,%d,%d%n", readGroup, RecalData.dinucIndex2bases(dinuc_index), datum.qual, datum.pos, 1, datum.B);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (FileNotFoundException e) {
|
||||||
|
System.err.println("FileNotFoundException: " + e.getMessage());
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
if (dinuc_out != null) dinuc_out.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Writes out the key recalibration data collected from the reads. Dumps this recalibration data
|
||||||
|
* as a CVS string to the recalTableOut PrintStream. Emits the data for all read groups into this file.
|
||||||
|
*/
|
||||||
|
private void writeRecalTable(PrintStream recalTableStream) {
|
||||||
|
printInfo(recalTableStream);
|
||||||
|
|
||||||
|
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
||||||
|
for (String readGroup : covariateCounter.getReadGroups()) {
|
||||||
|
for ( RecalData datum: RecalData.sort(covariateCounter.getRecalData(readGroup)) ) {
|
||||||
|
if ( datum.N > 0 )
|
||||||
|
recalTableStream.format("%s%n", datum.toCSVString(collapsePos));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Doesn't do anything
|
* Doesn't do anything
|
||||||
*
|
*
|
||||||
* @param a
|
* @param empty
|
||||||
* @param b
|
* @param recalTableStream
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public Integer reduce(Integer a, Integer b) {
|
public PrintStream reduce(Integer empty, PrintStream recalTableStream) {
|
||||||
return 0;
|
return recalTableStream;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -167,7 +167,7 @@ public class RecalData implements Comparable<RecalData> {
|
||||||
* @param s
|
* @param s
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public static RecalData fromCSVString(String s) {
|
public static RecalData fromCSVString(String s) throws NumberFormatException {
|
||||||
String[] vals = s.split(",");
|
String[] vals = s.split(",");
|
||||||
String rg = vals[0];
|
String rg = vals[0];
|
||||||
int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]);
|
int pos = vals[1].equals("*") ? 0 : Integer.parseInt(vals[1]);
|
||||||
|
|
@ -178,6 +178,13 @@ public class RecalData implements Comparable<RecalData> {
|
||||||
RecalData datum = new RecalData(pos, qual, rg, dinuc);
|
RecalData datum = new RecalData(pos, qual, rg, dinuc);
|
||||||
datum.B = B;
|
datum.B = B;
|
||||||
datum.N = N;
|
datum.N = N;
|
||||||
|
|
||||||
|
// Checking for badness
|
||||||
|
if ( pos < 0 ) throw new NumberFormatException("Illegal position detected: " + pos);
|
||||||
|
if ( B < 0 ) throw new NumberFormatException("Illegal mismatch count detected: " + B);
|
||||||
|
if ( N < 0 ) throw new NumberFormatException("Illegal base count detected: " + N);
|
||||||
|
if ( qual < 0 || qual > QualityUtils.MAX_QUAL_SCORE ) throw new NumberFormatException("Illegal qual detected: " + qual);
|
||||||
|
|
||||||
return datum;
|
return datum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -216,8 +223,8 @@ public class RecalData implements Comparable<RecalData> {
|
||||||
}
|
}
|
||||||
|
|
||||||
double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases);
|
double q = QualityUtils.phredScaleErrorRate(sumExpectedErrors / nBases);
|
||||||
System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n",
|
//System.out.printf("expected errors=%f, nBases = %d, rate=%f, qual=%f%n",
|
||||||
sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q);
|
// sumExpectedErrors, nBases, 1 - sumExpectedErrors / nBases, q);
|
||||||
return q;
|
return q;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -54,6 +54,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
|
|
||||||
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
|
private static Logger logger = Logger.getLogger(TableRecalibrationWalker.class);
|
||||||
|
|
||||||
|
private static String VERSION = "0.2.1";
|
||||||
|
|
||||||
private final static boolean DEBUG = false;
|
private final static boolean DEBUG = false;
|
||||||
|
|
||||||
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
// maps from [readGroup] -> [prevBase x base -> [cycle, qual, new qual]]
|
||||||
|
|
@ -77,9 +79,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
|
private static Pattern COLLAPSED_DINUC_PATTERN = Pattern.compile("^#\\s+collapsed_dinuc\\s+(\\w+)");
|
||||||
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
|
private static Pattern HEADER_PATTERN = Pattern.compile("^rg.*");
|
||||||
|
|
||||||
|
//private static boolean DEBUG_ME = true;
|
||||||
|
|
||||||
public void initialize() {
|
public void initialize() {
|
||||||
|
logger.info("TableRecalibrator version: " + VERSION);
|
||||||
//
|
//
|
||||||
// crap hack until Enum arg types are supported
|
// crappy hack until Enum arg types are supported
|
||||||
//
|
//
|
||||||
for ( RecalibrationMode potential : RecalibrationMode.values() ) {
|
for ( RecalibrationMode potential : RecalibrationMode.values() ) {
|
||||||
if ( potential.toString().equals(modeString)) {
|
if ( potential.toString().equals(modeString)) {
|
||||||
|
|
@ -96,13 +101,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
//
|
//
|
||||||
int lineNumber = 0;
|
int lineNumber = 0;
|
||||||
try {
|
try {
|
||||||
System.out.printf("Reading data...%n");
|
logger.info(String.format("Reading data..."));
|
||||||
List<RecalData> data = new ArrayList<RecalData>();
|
List<RecalData> data = new ArrayList<RecalData>();
|
||||||
boolean collapsedPos = false;
|
boolean collapsedPos = false;
|
||||||
boolean collapsedDinuc = false;
|
boolean collapsedDinuc = false;
|
||||||
|
|
||||||
List<String> lines = new xReadLines(new File(paramsFile)).readLines();
|
//List<String> lines = new xReadLines(new File(paramsFile)).readLines();
|
||||||
for ( String line : lines ) {
|
for ( String line : new xReadLines(new File(paramsFile)) ) {
|
||||||
lineNumber++;
|
lineNumber++;
|
||||||
if ( HEADER_PATTERN.matcher(line).matches() )
|
if ( HEADER_PATTERN.matcher(line).matches() )
|
||||||
continue;
|
continue;
|
||||||
|
|
@ -159,12 +164,14 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
if ( collapsedPos )
|
if ( collapsedPos )
|
||||||
throw new RuntimeException(String.format("Cannot perform position_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
throw new RuntimeException(String.format("Cannot perform position_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||||
collapsedPos = true;
|
collapsedPos = true;
|
||||||
break;
|
throw new RuntimeException("Unsupported mode requested, sorry");
|
||||||
|
//break;
|
||||||
case BY_DINUC_ONLY:
|
case BY_DINUC_ONLY:
|
||||||
if ( collapsedDinuc )
|
if ( collapsedDinuc )
|
||||||
throw new RuntimeException(String.format("Cannot perform dinuc_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
throw new RuntimeException(String.format("Cannot perform dinuc_only recalibration -- data is already partially collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||||
collapsedDinuc = true;
|
collapsedDinuc = true;
|
||||||
break;
|
throw new RuntimeException("Unsupported mode requested, sorry");
|
||||||
|
//break;
|
||||||
case COMBINATORIAL:
|
case COMBINATORIAL:
|
||||||
if ( collapsedPos || collapsedDinuc )
|
if ( collapsedPos || collapsedDinuc )
|
||||||
throw new RuntimeException(String.format("Cannot perform combinatorial recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
throw new RuntimeException(String.format("Cannot perform combinatorial recalibration -- data is already collapsed by pos=%b and dinuc=%b", collapsedPos, collapsedDinuc));
|
||||||
|
|
@ -231,12 +238,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
quals = BaseUtils.reverse(quals);
|
quals = BaseUtils.reverse(quals);
|
||||||
}
|
}
|
||||||
|
|
||||||
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
|
try {
|
||||||
|
byte[] recalQuals = recalibrateBasesAndQuals(read.getAttribute("RG").toString(), bases, quals);
|
||||||
|
|
||||||
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
|
//if ( read.getReadName().equals("IL12_395:7:215:171:693") ) {
|
||||||
recalQuals = BaseUtils.reverse(recalQuals);
|
// for ( int i = 0; i < quals.length; i++ ) {
|
||||||
read.setBaseQualities(recalQuals);
|
// System.out.printf("READ found: %s is now %s%n", quals[i], recalQuals[i]);
|
||||||
return read;
|
// }
|
||||||
|
//}
|
||||||
|
|
||||||
|
if (read.getReadNegativeStrandFlag()) // reverse the quals for the neg strand read
|
||||||
|
recalQuals = BaseUtils.reverse(recalQuals);
|
||||||
|
read.setBaseQualities(recalQuals);
|
||||||
|
return read;
|
||||||
|
} catch ( StingException e ) {
|
||||||
|
throw new RuntimeException(String.format("Bug found while processing read %s: %s", read.format(), e.getMessage()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -248,14 +265,21 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
||||||
* @param quals
|
* @param quals
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) {
|
public byte[] recalibrateBasesAndQuals(final String readGroup, byte[] bases, byte[] quals) throws StingException {
|
||||||
byte[] recalQuals = new byte[quals.length];
|
byte[] recalQuals = new byte[quals.length];
|
||||||
RecalMapping mapper = cache.get(readGroup);
|
RecalMapping mapper = cache.get(readGroup);
|
||||||
|
|
||||||
|
//if ( mapper == null && DEBUG_ME )
|
||||||
|
// return recalQuals;
|
||||||
|
|
||||||
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
recalQuals[0] = quals[0]; // can't change the first -- no dinuc
|
||||||
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
|
for ( int cycle = 1; cycle < bases.length; cycle++ ) { // skip first and last base, qual already set because no dinuc
|
||||||
byte qual = quals[cycle];
|
byte qual = quals[cycle];
|
||||||
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
|
byte newQual = mapper.getNewQual(readGroup, bases[cycle - 1], bases[cycle], cycle, qual);
|
||||||
|
|
||||||
|
if ( newQual <= 0 || newQual > QualityUtils.MAX_REASONABLE_Q_SCORE )
|
||||||
|
throw new StingException(String.format("Bug found -- assigning bad quality score %d x %d => %d", cycle, qual, newQual));
|
||||||
|
|
||||||
recalQuals[cycle] = newQual;
|
recalQuals[cycle] = newQual;
|
||||||
//System.out.printf("Mapping %d => %d%n", qual, newQual);
|
//System.out.printf("Mapping %d => %d%n", qual, newQual);
|
||||||
}
|
}
|
||||||
|
|
@ -322,17 +346,12 @@ class CombinatorialRecalMapping implements RecalMapping {
|
||||||
int pos = manager.canonicalPos(datum.pos);
|
int pos = manager.canonicalPos(datum.pos);
|
||||||
if ( table[pos][datum.qual] != 0 )
|
if ( table[pos][datum.qual] != 0 )
|
||||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||||
//table[datum.pos][datum.qual] = (byte)(1 + datum.empiricalQualByte());
|
|
||||||
table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical);
|
table[pos][datum.qual] = datum.empiricalQualByte(useRawQempirical);
|
||||||
//System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical));
|
//System.out.printf("Binding %d %d => %d%n", pos, datum.qual, datum.empiricalQualByte(useRawQempirical));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
public byte getNewQual(final String readGroup, byte prevBase, byte base, int cycle, byte qual) {
|
||||||
//String dinuc = String.format("%c%c", (char)prevBase, (char)base);
|
|
||||||
//if ( qual == 2 )
|
|
||||||
// System.out.printf("Qual = 2%n");
|
|
||||||
|
|
||||||
int pos = manager.canonicalPos(cycle);
|
int pos = manager.canonicalPos(cycle);
|
||||||
int index = this.manager.getDinucIndex(prevBase, base);
|
int index = this.manager.getDinucIndex(prevBase, base);
|
||||||
byte[][] dataTable = index == -1 ? null : cache.get(index);
|
byte[][] dataTable = index == -1 ? null : cache.get(index);
|
||||||
|
|
@ -340,13 +359,7 @@ class CombinatorialRecalMapping implements RecalMapping {
|
||||||
if ( dataTable == null && prevBase != 'N' && base != 'N' )
|
if ( dataTable == null && prevBase != 'N' && base != 'N' )
|
||||||
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base));
|
throw new RuntimeException(String.format("Unmapped data table at %s %c%c", readGroup, (char)prevBase, (char)base));
|
||||||
|
|
||||||
byte result = dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
|
return dataTable != null && pos < dataTable.length ? dataTable[pos][qual] : qual;
|
||||||
|
|
||||||
//if ( result == 2 )
|
|
||||||
// System.out.printf("Lookup RG=%s dinuc=%s cycle=%d pos=%d qual=%d datatable=%s / %d => %d%n",
|
|
||||||
// readGroup, dinuc, cycle, pos, qual, dataTable, dataTable.length, result);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -368,13 +381,13 @@ class SerialRecalMapping implements RecalMapping {
|
||||||
private double globalDeltaQ = 0.0;
|
private double globalDeltaQ = 0.0;
|
||||||
private double[][] deltaQPosMap, deltaQDinucMap;
|
private double[][] deltaQPosMap, deltaQDinucMap;
|
||||||
double [] deltaQualMap;
|
double [] deltaQualMap;
|
||||||
RecalData [][] qPosSupports, qDinucSupports;
|
RecalData [][] qPosSupports = null, qDinucSupports = null;
|
||||||
|
|
||||||
CombinatorialRecalMapping combiMap;
|
CombinatorialRecalMapping combiMap;
|
||||||
RecalDataManager manager;
|
RecalDataManager manager;
|
||||||
|
|
||||||
String dinucToLookAt = null; // "CC";
|
String dinucToLookAt = null; // "CC";
|
||||||
int posToLookAt = 0;
|
int posToLookAt = -1;
|
||||||
int qualToLookAt = 25;
|
int qualToLookAt = 25;
|
||||||
|
|
||||||
public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
|
public SerialRecalMapping(RecalDataManager manager, final boolean useRawQempirical,
|
||||||
|
|
@ -387,7 +400,7 @@ class SerialRecalMapping implements RecalMapping {
|
||||||
RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll());
|
RecalData datum = new RecalData(0, 0, manager.readGroup, "**").inc(manager.getAll());
|
||||||
double aggregrateQreported = RecalData.combinedQreported(manager.getAll());
|
double aggregrateQreported = RecalData.combinedQreported(manager.getAll());
|
||||||
globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported;
|
globalDeltaQ = datum.empiricalQualDouble(useRawQempirical) - aggregrateQreported;
|
||||||
System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
|
//System.out.printf("Global quality score shift is %.2f - %.2f = %.2f%n", datum.empiricalQualDouble(useRawQempirical), aggregrateQreported, globalDeltaQ);
|
||||||
}
|
}
|
||||||
|
|
||||||
for ( RecalData datum : manager.getAll() ) {
|
for ( RecalData datum : manager.getAll() ) {
|
||||||
|
|
@ -399,12 +412,12 @@ class SerialRecalMapping implements RecalMapping {
|
||||||
deltaQualMap = new double[maxQReported+1];
|
deltaQualMap = new double[maxQReported+1];
|
||||||
for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) {
|
for ( RecalData datum : RecalData.sort(manager.combine(true, false, true)) ) {
|
||||||
deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ;
|
deltaQualMap[datum.qual] = datum.empiricalQualDouble(useRawQempirical) - datum.qual - globalDeltaQ;
|
||||||
System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
|
//System.out.printf("%s => %s%n", datum, deltaQualMap[datum.qual]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// calculate the delta Q pos array
|
// calculate the delta Q pos array
|
||||||
deltaQPosMap = new double[maxPos+1][maxQReported+1];
|
deltaQPosMap = new double[maxPos+1][maxQReported+1];
|
||||||
qPosSupports = new RecalData[maxPos+1][maxQReported+1];
|
//qPosSupports = new RecalData[maxPos+1][maxQReported+1];
|
||||||
for ( RecalData datumAtPosQual : manager.combineDinucs() ) {
|
for ( RecalData datumAtPosQual : manager.combineDinucs() ) {
|
||||||
double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual];
|
double offset = globalDeltaQ + deltaQualMap[datumAtPosQual.qual];
|
||||||
updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset);
|
updateCache(qPosSupports, datumAtPosQual, useRawQempirical, deltaQPosMap, datumAtPosQual.pos, datumAtPosQual.qual, offset);
|
||||||
|
|
@ -412,7 +425,7 @@ class SerialRecalMapping implements RecalMapping {
|
||||||
|
|
||||||
// calculate the delta Q dinuc array
|
// calculate the delta Q dinuc array
|
||||||
deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1];
|
deltaQDinucMap = new double[dinucs.size()+1][maxQReported+1];
|
||||||
qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
|
//qDinucSupports = new RecalData[dinucs.size()+1][maxQReported+1];
|
||||||
for ( RecalData datumAtDinucQual : manager.combineCycles() ) {
|
for ( RecalData datumAtDinucQual : manager.combineCycles() ) {
|
||||||
double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual];
|
double offset = globalDeltaQ + deltaQualMap[datumAtDinucQual.qual];
|
||||||
updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset);
|
updateCache(qDinucSupports, datumAtDinucQual, useRawQempirical, deltaQDinucMap, datumAtDinucQual.getDinucIndex(), datumAtDinucQual.qual, offset);
|
||||||
|
|
@ -429,7 +442,7 @@ class SerialRecalMapping implements RecalMapping {
|
||||||
for ( int j = 0; j < maxQReported; j++ ) {
|
for ( int j = 0; j < maxQReported; j++ ) {
|
||||||
if ( printStateP(i, null, j) )
|
if ( printStateP(i, null, j) )
|
||||||
System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n",
|
System.out.printf("Mapping: pos=%d qual=%2d delta=%.2f based on %s%n",
|
||||||
i, j, deltaQPosMap[i][j], qPosSupports[i][j]);
|
i, j, deltaQPosMap[i][j], qPosSupports != null ? qPosSupports[i][j] : null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -438,7 +451,7 @@ class SerialRecalMapping implements RecalMapping {
|
||||||
String dinuc = RecalData.dinucIndex2bases(i);
|
String dinuc = RecalData.dinucIndex2bases(i);
|
||||||
if ( printStateP(0, dinuc, j ) )
|
if ( printStateP(0, dinuc, j ) )
|
||||||
System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n",
|
System.out.printf("Mapping: dinuc=%s qual=%2d delta=%.2f based on %s%n",
|
||||||
dinuc, j, deltaQDinucMap[i][j], qDinucSupports[i][j]);
|
dinuc, j, deltaQDinucMap[i][j], qDinucSupports != null ? qDinucSupports[i][j] : null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -457,7 +470,8 @@ class SerialRecalMapping implements RecalMapping {
|
||||||
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
throw new RuntimeException(String.format("Duplicate entry discovered: %s", datum));
|
||||||
double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ;
|
double deltaQ = datum.empiricalQualDouble(useRawQempirical) - datum.qual - meanQ;
|
||||||
table[i][j] = deltaQ;
|
table[i][j] = deltaQ;
|
||||||
supports[i][j] = datum;
|
if ( supports != null )
|
||||||
|
supports[i][j] = datum;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean printStateP( int cycle, String dinuc, int qual ) {
|
private boolean printStateP( int cycle, String dinuc, int qual ) {
|
||||||
|
|
|
||||||
|
|
@ -30,6 +30,9 @@ public class BaseQualityHistoWalker extends ReadWalker<Integer, Integer> {
|
||||||
// Map over the org.broadinstitute.sting.gatk.LocusContext
|
// Map over the org.broadinstitute.sting.gatk.LocusContext
|
||||||
public Integer map(char[] ref, SAMRecord read) {
|
public Integer map(char[] ref, SAMRecord read) {
|
||||||
for ( byte qual : read.getBaseQualities() ) {
|
for ( byte qual : read.getBaseQualities() ) {
|
||||||
|
if ( qual < 0 || qual > 100 ) {
|
||||||
|
throw new RuntimeException(String.format("Invalid base quality detected -- %d at %s%n", qual, read.getReadName()));
|
||||||
|
}
|
||||||
//System.out.println(qual);
|
//System.out.println(qual);
|
||||||
this.qualCounts[qual]++;
|
this.qualCounts[qual]++;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -244,7 +244,7 @@ public class GenomeLocParser {
|
||||||
* @return the list of merged locations
|
* @return the list of merged locations
|
||||||
*/
|
*/
|
||||||
public static List<GenomeLoc> mergeOverlappingLocations(final List<GenomeLoc> raw) {
|
public static List<GenomeLoc> mergeOverlappingLocations(final List<GenomeLoc> raw) {
|
||||||
logger.debug(" Raw locations are:\n" + Utils.join("\n", raw));
|
logger.debug(" Raw locations are: " + Utils.join(", ", raw));
|
||||||
if (raw.size() <= 1)
|
if (raw.size() <= 1)
|
||||||
return raw;
|
return raw;
|
||||||
else {
|
else {
|
||||||
|
|
|
||||||
|
|
@ -85,8 +85,16 @@ public class QualityUtils {
|
||||||
return boundQual(qual, MAX_QUAL_SCORE);
|
return boundQual(qual, MAX_QUAL_SCORE);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns an integer quality score bounded by 1 - maxQual.
|
||||||
|
*
|
||||||
|
* @param qual
|
||||||
|
* @param maxQual
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
static public byte boundQual(int qual, byte maxQual) {
|
static public byte boundQual(int qual, byte maxQual) {
|
||||||
return (byte) Math.min(qual, maxQual);
|
//return (byte) Math.min(qual, maxQual);
|
||||||
|
return (byte) Math.max(Math.min(qual, maxQual), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -191,7 +191,7 @@ public class ArtificialSAMUtils {
|
||||||
}
|
}
|
||||||
SAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length);
|
SAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length);
|
||||||
rec.setReadBases(bases);
|
rec.setReadBases(bases);
|
||||||
rec.setBaseQualities(bases);
|
rec.setBaseQualities(qual);
|
||||||
return rec;
|
return rec;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,183 @@
|
||||||
|
// our package
|
||||||
|
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
|
||||||
|
// the imports for unit testing.
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.playground.gatk.walkers.indels.CleanedReadInjector;
|
||||||
|
import org.broadinstitute.sting.utils.QualityUtils;
|
||||||
|
import org.broadinstitute.sting.utils.BaseUtils;
|
||||||
|
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileReader;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter;
|
||||||
|
import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.File;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Basic unit test for RecalData
|
||||||
|
*/
|
||||||
|
public class CovariateCounterTest extends BaseTest {
|
||||||
|
String readGroup1 = "rg1";
|
||||||
|
String readGroup2 = "rg2";
|
||||||
|
Set<String> readGroups = new HashSet<String>();
|
||||||
|
|
||||||
|
SAMFileHeader header;
|
||||||
|
|
||||||
|
SAMRecord read1, read2, read3;
|
||||||
|
|
||||||
|
byte bases1[] = {'a', 't', 'c', 'g', 'a'};
|
||||||
|
byte quals1[] = {1, 2, 3, 4, 5};
|
||||||
|
byte quals3[] = {1, 2, 5, 5, 5};
|
||||||
|
byte bases2[] = {'t', 'c', 'g', 'a', 't'};
|
||||||
|
byte quals2[] = {2, 2, 4, 5, 2};
|
||||||
|
|
||||||
|
/*
|
||||||
|
public CovariateCounter( Set<String> readGroups, boolean collapsePos, boolean collapseDinuc ) {
|
||||||
|
public Set<String> getReadGroups() {
|
||||||
|
public boolean isCollapseDinuc() {
|
||||||
|
public boolean isCollapsePos() {
|
||||||
|
public int getNReadGroups() {
|
||||||
|
private RecalData getRecalData(String readGroup, int pos, int qual, char prevBase, char base) {
|
||||||
|
public List<RecalData> getRecalData(String readGroup) {
|
||||||
|
public int updateDataFromRead( String rg, SAMRecord read, int offset, char ref ) {
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The fasta, for comparison.
|
||||||
|
*/
|
||||||
|
protected static IndexedFastaSequenceFile sequenceFile = null;
|
||||||
|
|
||||||
|
CovariateCounter c;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize the fasta.
|
||||||
|
*/
|
||||||
|
@BeforeClass
|
||||||
|
public static void initialize() throws FileNotFoundException {
|
||||||
|
sequenceFile = new IndexedFastaSequenceFile( new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") );
|
||||||
|
GenomeLocParser.setupRefContigOrdering(sequenceFile);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void initializeBefore() {
|
||||||
|
header = ArtificialSAMUtils.createArtificialSamHeader(2,0,247249719);
|
||||||
|
readGroups.addAll(Arrays.asList(readGroup1, readGroup2));
|
||||||
|
ArtificialSAMUtils.createDefaultReadGroup( header, readGroup1, "sample1" );
|
||||||
|
ArtificialSAMUtils.createDefaultReadGroup( header, readGroup2, "sample2" );
|
||||||
|
c = new CovariateCounter( readGroups, false, false );
|
||||||
|
|
||||||
|
read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases1, quals1);
|
||||||
|
read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",1,1, bases2, quals2);
|
||||||
|
read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",1,1, bases1, quals3);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCovariateCounterSetup() {
|
||||||
|
Assert.assertEquals("Number of read groups is wrong", c.getNReadGroups(), 2);
|
||||||
|
Assert.assertEquals("Read group identities are wrong", c.getReadGroups(), readGroups);
|
||||||
|
Assert.assertEquals("Incorrectly collapsed counter", c.isCollapseDinuc(), false);
|
||||||
|
Assert.assertEquals("Incorrectly collapsed counter", c.isCollapsePos(), false);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testOneRead() {
|
||||||
|
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||||
|
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||||
|
c.printState();
|
||||||
|
|
||||||
|
Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
|
||||||
|
for ( int i = 1; i < bases1.length; i++ ) {
|
||||||
|
RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
|
||||||
|
System.out.printf("%s%n", datum);
|
||||||
|
Assert.assertNotNull("Incorrect mapping to recal bin", datum);
|
||||||
|
Assert.assertEquals("Bad mismatch count", datum.B, 0);
|
||||||
|
Assert.assertEquals("Bad base count", datum.N, 1);
|
||||||
|
Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
|
||||||
|
Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
|
||||||
|
Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTwoReads() {
|
||||||
|
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||||
|
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||||
|
for ( int i = 1; i < read2.getReadBases().length; i++ )
|
||||||
|
c.updateDataFromRead(readGroup2, read2, i, (char)read2.getReadBases()[i]);
|
||||||
|
c.printState();
|
||||||
|
|
||||||
|
Assert.assertEquals("Incorrect mapping to recal bin", c.getRecalData(readGroup1, 0, quals1[0], 'A', (char)bases1[0]).N, 0);
|
||||||
|
for ( int i = 1; i < bases1.length; i++ ) {
|
||||||
|
RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
|
||||||
|
System.out.printf("%s%n", datum);
|
||||||
|
Assert.assertNotNull("Incorrect mapping to recal bin", datum);
|
||||||
|
Assert.assertEquals("Bad mismatch count", datum.B, 0);
|
||||||
|
Assert.assertEquals("Bad base count", datum.N, 1);
|
||||||
|
Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
|
||||||
|
Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
|
||||||
|
Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTwoReadsSameGroup() {
|
||||||
|
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||||
|
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||||
|
for ( int i = 1; i < read2.getReadBases().length; i++ )
|
||||||
|
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||||
|
c.printState();
|
||||||
|
|
||||||
|
for ( int i = 1; i < bases1.length; i++ ) {
|
||||||
|
RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
|
||||||
|
System.out.printf("%s%n", datum);
|
||||||
|
Assert.assertNotNull("Incorrect mapping to recal bin", datum);
|
||||||
|
Assert.assertEquals("Bad mismatch count", datum.B, 0);
|
||||||
|
Assert.assertEquals("Bad base count", datum.N, 2);
|
||||||
|
Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
|
||||||
|
Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
|
||||||
|
Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testTwoReadsSameGroupNotIdentical() {
|
||||||
|
for ( int i = 1; i < read1.getReadBases().length; i++ )
|
||||||
|
c.updateDataFromRead(readGroup1, read1, i, (char)read1.getReadBases()[i]);
|
||||||
|
for ( int i = 1; i < read3.getReadBases().length; i++ )
|
||||||
|
c.updateDataFromRead(readGroup1, read3, i, (char)read3.getReadBases()[i]);
|
||||||
|
c.printState();
|
||||||
|
|
||||||
|
for ( int i = 1; i < bases1.length; i++ ) {
|
||||||
|
RecalData datum = c.getRecalData(readGroup1, i, quals1[i], (char)bases1[i-1], (char)bases1[i]);
|
||||||
|
System.out.printf("%s%n", datum);
|
||||||
|
Assert.assertNotNull("Incorrect mapping to recal bin", datum);
|
||||||
|
Assert.assertEquals("Bad mismatch count", datum.B, 0);
|
||||||
|
Assert.assertEquals("Bad base count", datum.N, quals1[i] == quals3[i] ? 2 : 1);
|
||||||
|
Assert.assertEquals("Prevbase is bad", datum.dinuc.charAt(0), bases1[i-1]);
|
||||||
|
Assert.assertEquals("Base is bad", datum.dinuc.charAt(1), bases1[i]);
|
||||||
|
Assert.assertEquals("Qual is bad", datum.qual, quals1[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test (expected = RuntimeException.class)
|
||||||
|
public void testBadReadOffset() {
|
||||||
|
byte bases[] = {'a', 't', 'c', 'g', 'a'};
|
||||||
|
byte quals[] = {1, 2, 3, 4, 5};
|
||||||
|
|
||||||
|
SAMRecord read = ArtificialSAMUtils.createArtificialRead(header,"read1",1,1, bases, quals);
|
||||||
|
|
||||||
|
c.updateDataFromRead(readGroup1, read, 0, (char)bases[0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,14 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<package>
|
||||||
|
<name>GATKResources</name>
|
||||||
|
<resources>
|
||||||
|
<file>/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod</file>
|
||||||
|
<file>/broad/1KG/reference/human_b36_both.fasta</file>
|
||||||
|
<file>/broad/1KG/reference/human_b36_both.dict</file>
|
||||||
|
<file>/broad/1KG/reference/human_b36_both.fasta.fai</file>
|
||||||
|
<file>/humgen/gsa-scr1/GATK_Data/dbsnp_129_hg18.rod</file>
|
||||||
|
<file>/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta</file>
|
||||||
|
<file>/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.dict</file>
|
||||||
|
<file>/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta.fai</file>
|
||||||
|
</resources>
|
||||||
|
</package>
|
||||||
|
|
@ -3,18 +3,7 @@
|
||||||
<name>ReadQualityRecalibrator</name>
|
<name>ReadQualityRecalibrator</name>
|
||||||
<main-class>org.broadinstitute.sting.gatk.CommandLineGATK</main-class>
|
<main-class>org.broadinstitute.sting.gatk.CommandLineGATK</main-class>
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<class>org.broadinstitute.sting.playground.gatk.walkers.CovariateCounterWalker</class>
|
<class>org.broadinstitute.sting.gatk.walkers.recalibration.CovariateCounterWalker</class>
|
||||||
<class>org.broadinstitute.sting.playground.gatk.walkers.LogisticRecalibrationWalker</class>
|
<class>org.broadinstitute.sting.gatk.walkers.recalibration.TableRecalibrationWalker</class>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
<scripts>
|
|
||||||
<file>python/RecalQual.py</file>
|
|
||||||
<file>python/LogisticRegressionByReadGroup.py</file>
|
|
||||||
</scripts>
|
|
||||||
<resources>
|
|
||||||
<file>R/logistic_regression.R</file>
|
|
||||||
<file>/humgen/gsa-scr1/GATK_Data/dbsnp_129_b36.rod</file>
|
|
||||||
<file>/broad/1KG/reference/human_b36_both.fasta</file>
|
|
||||||
<file>/broad/1KG/reference/human_b36_both.dict</file>
|
|
||||||
<file>/broad/1KG/reference/human_b36_both.fasta.fai</file>
|
|
||||||
</resources>
|
|
||||||
</package>
|
</package>
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,16 @@ def bams2geli(bams):
|
||||||
calls = map(call1, bams)
|
calls = map(call1, bams)
|
||||||
return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
|
return map(lambda x: x[0], calls), map(lambda x: x[1], calls)
|
||||||
|
|
||||||
|
def gelis2gelisText( gelis ):
    """Maps each geli path to the name of its text (.calls) file.

    A path whose extension is already '.calls' is returned unchanged; any other
    path is reduced to its file name (directory part dropped) with '.calls'
    appended.  The result has one entry per input, in the same order.
    """
    def geli2geliText( maybeGeli ):
        if os.path.splitext(maybeGeli)[1] == ".calls" :
            return maybeGeli
        else:
            # must use this function's own argument; the name 'geli' is not bound here
            return os.path.split(maybeGeli)[1] + '.calls'

    return map( geli2geliText, gelis)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
global OPTIONS, ROOT
|
global OPTIONS, ROOT
|
||||||
|
|
||||||
|
|
@ -124,7 +134,7 @@ def main():
|
||||||
|
|
||||||
# convert the geli's to text
|
# convert the geli's to text
|
||||||
jobid = None
|
jobid = None
|
||||||
variantsOut = map( lambda geli: os.path.split(geli)[1] + '.calls', gelis)
|
variantsOut = gelis2gelisText( gelis )
|
||||||
for geli, variantOut in zip(gelis, variantsOut):
|
for geli, variantOut in zip(gelis, variantsOut):
|
||||||
name = os.path.split(geli)[1]
|
name = os.path.split(geli)[1]
|
||||||
if not os.path.exists(variantOut):
|
if not os.path.exists(variantOut):
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@ import operator
|
||||||
MAX_QUAL_SCORE = 50
|
MAX_QUAL_SCORE = 50
|
||||||
|
|
||||||
def phredQScore( nMismatches, nBases ):
|
def phredQScore( nMismatches, nBases ):
|
||||||
|
"""Calculates a phred-scaled score for nMismatches in nBases"""
|
||||||
#print 'phredQScore', nMismatches, nBases
|
#print 'phredQScore', nMismatches, nBases
|
||||||
if nMismatches == 0:
|
if nMismatches == 0:
|
||||||
return MAX_QUAL_SCORE
|
return MAX_QUAL_SCORE
|
||||||
|
|
@ -24,10 +25,12 @@ def phredQScore( nMismatches, nBases ):
|
||||||
|
|
||||||
|
|
||||||
def phredScore2ErrorProp(qual):
|
def phredScore2ErrorProp(qual):
|
||||||
|
"""Converts a phred-scaled quality score to an error probability"""
|
||||||
#print 'phredScore2ErrorProp', qual
|
#print 'phredScore2ErrorProp', qual
|
||||||
return math.pow(10.0, float(qual) / -10.0)
|
return math.pow(10.0, float(qual) / -10.0)
|
||||||
|
|
||||||
def tryByInt(s):
|
def tryByInt(s):
|
||||||
|
"""Try to cast something to an int, or return it as a string"""
|
||||||
try:
|
try:
|
||||||
return int(s)
|
return int(s)
|
||||||
except:
|
except:
|
||||||
|
|
@ -36,11 +39,13 @@ def tryByInt(s):
|
||||||
expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',')
|
expectedHeader = 'rg,pos,Qrep,dn,nBases,nMismatches,Qemp'.split(',')
|
||||||
defaultValues = '0,0,0,**,0,0,0'.split(',')
|
defaultValues = '0,0,0,**,0,0,0'.split(',')
|
||||||
class RecalData(dict):
|
class RecalData(dict):
|
||||||
|
"""Basic recalibration data -- corresponds exactly to the Java version in GATK"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.parse(expectedHeader, defaultValues)
|
self.parse(expectedHeader, defaultValues)
|
||||||
|
|
||||||
def parse(self, header, data):
|
def parse(self, header, data):
|
||||||
|
"""Parse the comma-separated data line with corresponding header. Throws an error
|
||||||
|
if the header doesn't correspond to the expectedHeader"""
|
||||||
# rg,pos,Qrep,dn,NBases,MMismatches,Qemp
|
# rg,pos,Qrep,dn,NBases,MMismatches,Qemp
|
||||||
types = [str, tryByInt, int, str, int, int, int]
|
types = [str, tryByInt, int, str, int, int, int]
|
||||||
for head, expected, datum, type in zip(header, expectedHeader, data, types):
|
for head, expected, datum, type in zip(header, expectedHeader, data, types):
|
||||||
|
|
@ -58,7 +63,10 @@ class RecalData(dict):
|
||||||
def __getattr__(self, name):
|
def __getattr__(self, name):
|
||||||
return self[name]
|
return self[name]
|
||||||
|
|
||||||
# rg,dn,Qrep,pos,NBases,MMismatches,Qemp
|
|
||||||
|
#
|
||||||
|
# Trivial accessor functions
|
||||||
|
#
|
||||||
def readGroup(self): return self.rg
|
def readGroup(self): return self.rg
|
||||||
def dinuc(self): return self.dn
|
def dinuc(self): return self.dn
|
||||||
def qReported(self): return self.Qrep
|
def qReported(self): return self.Qrep
|
||||||
|
|
@ -204,8 +212,9 @@ def lsamplestdev (inlist, counts, mean):
|
||||||
for item, count in zip(inlist, counts):
|
for item, count in zip(inlist, counts):
|
||||||
diff = item - mean
|
diff = item - mean
|
||||||
inc = count * diff * diff
|
inc = count * diff * diff
|
||||||
#print item, count, mean, diff, diff*diff, inc
|
#print "%3d" % int(item), count, mean, diff, diff*diff, inc, sum
|
||||||
sum += inc
|
sum += inc
|
||||||
|
#print sum, n, sum / float(n-1), math.sqrt(sum / float(n-1))
|
||||||
return math.sqrt(sum / float(n-1))
|
return math.sqrt(sum / float(n-1))
|
||||||
|
|
||||||
def rmse(reportedList, empiricalList, counts):
|
def rmse(reportedList, empiricalList, counts):
|
||||||
|
|
@ -320,7 +329,7 @@ def analyzeFiles(files):
|
||||||
for file in files:
|
for file in files:
|
||||||
print 'Analyzing file', file
|
print 'Analyzing file', file
|
||||||
plotter = getPlotterForFile(file)
|
plotter = getPlotterForFile(file)
|
||||||
if plotter <> None:
|
if plotter <> None and not OPTIONS.noplots:
|
||||||
cmd = ' '.join([Rscript, plotter, file])
|
cmd = ' '.join([Rscript, plotter, file])
|
||||||
farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry)
|
farm_commands.cmd(cmd, None, None, just_print_commands = OPTIONS.dry)
|
||||||
|
|
||||||
|
|
@ -341,6 +350,9 @@ def main():
|
||||||
parser.add_option("-s", "--stdout", dest="toStdout",
|
parser.add_option("-s", "--stdout", dest="toStdout",
|
||||||
action='store_true', default=False,
|
action='store_true', default=False,
|
||||||
help="If provided, writes output to standard output, not to files")
|
help="If provided, writes output to standard output, not to files")
|
||||||
|
parser.add_option("", "--no_plots", dest="noplots",
|
||||||
|
action='store_true', default=False,
|
||||||
|
help="If provided, no plots will be generated")
|
||||||
parser.add_option("", "--dry", dest="dry",
|
parser.add_option("", "--dry", dest="dry",
|
||||||
action='store_true', default=False,
|
action='store_true', default=False,
|
||||||
help="If provided, nothing actually gets run, just a dry run")
|
help="If provided, nothing actually gets run, just a dry run")
|
||||||
|
|
|
||||||
|
|
@ -153,7 +153,7 @@ def aggregateGeliCalls( sortedGeliCalls ):
|
||||||
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
#return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||||
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
|
||||||
|
|
||||||
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m' ):
|
def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True, useSamtools = False, memLimit = '-Xmx4096m', compression_level = 1 ):
|
||||||
if useSamtools:
|
if useSamtools:
|
||||||
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
|
return SAMTOOLS_MERGE_BIN + ' ' + output_filename + ' ' + ' '.join(inputFiles)
|
||||||
else:
|
else:
|
||||||
|
|
@ -164,7 +164,7 @@ def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN, MSD = True,
|
||||||
MSDStr = ''
|
MSDStr = ''
|
||||||
if MSD: MSDStr = 'MSD=true'
|
if MSD: MSDStr = 'MSD=true'
|
||||||
|
|
||||||
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
return 'java ' + memLimit + ' -jar ' + mergeBin + ' ' + MSDStr + ' AS=true COMPRESSION_LEVEL=' + compression_level + ' SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||||
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
#return 'java -Xmx4096m -jar ' + mergeBin + ' AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
|
||||||
|
|
||||||
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,2 @@
|
||||||
|
# Package the GATK resources directory into a gzipped tarball.
# Stop if the package directory is missing so tar never runs in the wrong place.
cd dist/packages/GATKResources || exit 1
# h: follow symlinks, so the archive holds the real files rather than links.
tar cvhzf gatk_resources_062309.tgz resources
|
||||||
Loading…
Reference in New Issue