Sites that were marked NO_DINUC no longer get dinuc-corrected but are still recalibrated using the other available covariates. Solid cycle is now the same as Illumina cycle pending an analysis that looks at the effect of PrimerRoundCovariate. Solid color space methods cleaned up to reduce number of calls to read.getAttribute(). Polished NHashMap sort method in preparation for move to core/utils. Added additional plots in AnalyzeCovariates to look at reported quality as a function of the covariate.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2451 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
2a704e83df
commit
562db45fa5
|
|
@ -20,9 +20,9 @@ d.good <- c[c$nBases >= 1000,]
|
|||
d.1000 <- c[c$nBases < 1000,]
|
||||
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
|
||||
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
|
||||
theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
|
||||
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
|
||||
if( length(d.good$nBases) == length(c$nBases) ) {
|
||||
theTitle = paste("RMSE = ", round(rmseAll,digits=3))
|
||||
theTitle = paste("RMSE =", round(rmseAll,digits=3))
|
||||
}
|
||||
# Don't let residual error go off the edge of the plot
|
||||
d.good$residualError = d.good$Qempirical-d.good$Qreported
|
||||
|
|
@ -47,6 +47,28 @@ if( is.numeric(c$Covariate) ) {
|
|||
}
|
||||
dev.off()
|
||||
|
||||
|
||||
#
|
||||
# Plot mean quality versus the covariate
|
||||
#
|
||||
|
||||
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
pointType = "p"
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
pointType = "o"
|
||||
}
|
||||
theTitle = paste("Quality By", covariateName);
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
|
||||
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
|
||||
}
|
||||
dev.off()
|
||||
|
||||
#
|
||||
# Plot histogram of the covariate
|
||||
#
|
||||
|
|
|
|||
|
|
@ -20,9 +20,9 @@ f <- t[t$Qreported < Qcutoff,]
|
|||
e <- rbind(d.good, d.1000, d.10000)
|
||||
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
|
||||
rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
|
||||
theTitle = paste("RMSE_good = ", round(rmseGood,digits=3), ", RMSE_all = ", round(rmseAll,digits=3))
|
||||
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
|
||||
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
|
||||
theTitle = paste("RMSE = ", round(rmseAll,digits=3));
|
||||
theTitle = paste("RMSE =", round(rmseAll,digits=3));
|
||||
}
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,40), ylim=c(0,40), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
|
||||
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
|
||||
|
|
|
|||
|
|
@ -101,9 +101,9 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
|
||||
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted. BUGBUG: I don't understand what is going on in this case
|
||||
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
|
||||
private int numUnprocessed = 0; // Number of consecutive loci skipped because we are only processing every Nth site
|
||||
private static final String versionString = "v2.1.2"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.1.3"; // Major version, minor version, and build number
|
||||
private Pair<Long, Long> dbSNP_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for dbSNP loci
|
||||
private Pair<Long, Long> novel_counts = new Pair<Long, Long>(0L, 0L); // mismatch/base counts for non-dbSNP loci
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
|
|
@ -261,7 +261,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
byte[] bases;
|
||||
|
||||
// For each read at this locus
|
||||
for ( PileupElement p : context.getPileup() ) {
|
||||
for( PileupElement p : context.getPileup() ) {
|
||||
read = p.getRead();
|
||||
offset = p.getOffset();
|
||||
|
||||
|
|
@ -275,7 +275,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
refBase = (byte)ref.getBase();
|
||||
|
||||
// Skip if this base is an 'N' or etc.
|
||||
// BUGBUG: For DinucCovariate we should use previous reference base, not the previous base in this read.
|
||||
if( BaseUtils.isRegularBase( (char)(bases[offset]) ) ) {
|
||||
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
|
|
@ -334,7 +333,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Validate the dbSNP reference mismatch rates.
|
||||
*/
|
||||
private void validateDbsnpMismatchRate() {
|
||||
|
|
@ -463,7 +462,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
|
||||
if( SORTED_OUTPUT && requestedCovariates.size() == 4 )
|
||||
{
|
||||
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted4() ) { //BUGBUG: entrySetSorted4 isn't correct here
|
||||
for( Pair<List<? extends Comparable>,RecalDatum> entry : dataManager.data.entrySetSorted() ) {
|
||||
for( Comparable comp : entry.first ) {
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -38,8 +38,8 @@ import net.sf.samtools.SAMRecord;
|
|||
* The Cycle covariate.
|
||||
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
|
||||
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
|
||||
* For example, for the read: AAACCCCGAAATTTTTACT
|
||||
* the cycle would be 1111111122233333334
|
||||
* For example, for the read: AAACCCCGAAATTTTTACTG
|
||||
* the cycle would be 00000000111222222233
|
||||
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
|
||||
*/
|
||||
|
||||
|
|
@ -63,10 +63,10 @@ public class CycleCovariate implements StandardCovariate {
|
|||
int cycle = 0;
|
||||
|
||||
//-----------------------------
|
||||
// ILLUMINA
|
||||
// ILLUMINA and SOLID
|
||||
//-----------------------------
|
||||
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) ) {
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) {
|
||||
cycle = offset;
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
cycle = read.getReadLength() - (offset + 1);
|
||||
|
|
@ -109,17 +109,17 @@ public class CycleCovariate implements StandardCovariate {
|
|||
}
|
||||
|
||||
//-----------------------------
|
||||
// SOLID
|
||||
// SOLID (unused), only to be used in conjunction with PrimerRoundCovariate
|
||||
//-----------------------------
|
||||
|
||||
else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) {
|
||||
// The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
||||
int pos = offset;
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
pos = read.getReadLength() - (offset + 1);
|
||||
}
|
||||
cycle = pos / 5; // integer division
|
||||
}
|
||||
//else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) {
|
||||
// // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
||||
// int pos = offset;
|
||||
// if( read.getReadNegativeStrandFlag() ) {
|
||||
// pos = read.getReadLength() - (offset + 1);
|
||||
// }
|
||||
// cycle = pos / 5; // integer division
|
||||
//}
|
||||
|
||||
//-----------------------------
|
||||
// UNRECOGNIZED PLATFORM
|
||||
|
|
@ -127,10 +127,10 @@ public class CycleCovariate implements StandardCovariate {
|
|||
|
||||
else { // Platform is unrecognized so revert to the default platform but warn the user first
|
||||
if( !warnedUserBadPlatform ) {
|
||||
if( defaultPlatform != null) { // the user set a default platform
|
||||
if( defaultPlatform != null) { // The user set a default platform
|
||||
Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " +
|
||||
"Reverting to " + defaultPlatform + " definition of machine cycle." );
|
||||
} else { // the user did not set a default platform
|
||||
} else { // The user did not set a default platform
|
||||
Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " +
|
||||
"Reverting to Illumina definition of machine cycle. Users may set the default platform using the --default_platform <String> argument." );
|
||||
defaultPlatform = "Illumina";
|
||||
|
|
@ -138,12 +138,13 @@ public class CycleCovariate implements StandardCovariate {
|
|||
warnedUserBadPlatform = true;
|
||||
}
|
||||
read.getReadGroup().setPlatform( defaultPlatform );
|
||||
return getValue( read, offset ); // a recursive call
|
||||
return getValue( read, offset ); // A recursive call
|
||||
}
|
||||
|
||||
// differentiate between first and second of pair
|
||||
if ( read.getReadPairedFlag() && read.getSecondOfPairFlag() )
|
||||
// Differentiate between first and second of pair
|
||||
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
|
||||
cycle *= -1;
|
||||
}
|
||||
|
||||
return cycle;
|
||||
}
|
||||
|
|
@ -155,6 +156,6 @@ public class CycleCovariate implements StandardCovariate {
|
|||
|
||||
// Used to estimate the amount space required for the full data HashMap
|
||||
public final int estimatedNumberOfBins() {
|
||||
return 100;
|
||||
return 80;
|
||||
}
|
||||
}
|
||||
|
|
@ -51,13 +51,13 @@ public class DinucCovariate implements StandardCovariate {
|
|||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' };
|
||||
dinucHashMap = new HashMap<Integer, Dinuc>();
|
||||
for(byte byte1 : BASES) {
|
||||
for(byte byte2: BASES) {
|
||||
for( byte byte1 : BASES ) {
|
||||
for( byte byte2: BASES ) {
|
||||
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
|
||||
}
|
||||
}
|
||||
// add the "no dinuc" entry too
|
||||
dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC);
|
||||
// Add the "no dinuc" entry too
|
||||
dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC );
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
|
|
@ -68,29 +68,39 @@ public class DinucCovariate implements StandardCovariate {
|
|||
byte[] bases = read.getReadBases();
|
||||
// If this is a negative strand read then we need to reverse the direction for our previous base
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
// no dinuc at the beginning of the read
|
||||
if( offset == bases.length-1 )
|
||||
// No dinuc at the beginning of the read
|
||||
if( offset == bases.length-1 ) {
|
||||
return NO_DINUC;
|
||||
}
|
||||
base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) );
|
||||
// Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) );
|
||||
} else {
|
||||
// no dinuc at the beginning of the read
|
||||
if( offset == 0 )
|
||||
// No dinuc at the beginning of the read
|
||||
if( offset == 0 ) {
|
||||
return NO_DINUC;
|
||||
}
|
||||
base = bases[offset];
|
||||
// Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
prevBase = bases[offset - 1];
|
||||
}
|
||||
|
||||
// make sure the previous base is good
|
||||
if( !BaseUtils.isRegularBase(prevBase) )
|
||||
// Make sure the previous base is good
|
||||
if( !BaseUtils.isRegularBase( prevBase ) ) {
|
||||
return NO_DINUC;
|
||||
}
|
||||
|
||||
return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) );
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) );
|
||||
Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( (byte)str.charAt(0), (byte)str.charAt(1) ) );
|
||||
if( returnDinuc.compareTo(NO_DINUC) == 0 ) {
|
||||
return null;
|
||||
}
|
||||
return returnDinuc;
|
||||
|
||||
}
|
||||
|
||||
// Used to estimate the amount space required for the full data HashMap
|
||||
|
|
|
|||
|
|
@ -41,7 +41,7 @@ import java.util.*;
|
|||
|
||||
public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
|
||||
|
||||
private static final long serialVersionUID = 1L; //BUGBUG: what should I do here?, added by Eclipse
|
||||
private static final long serialVersionUID = 1L; // Added by Eclipse
|
||||
private ArrayList<ArrayList<Comparable>> keyLists;
|
||||
|
||||
public NHashMap() {
|
||||
|
|
@ -78,10 +78,7 @@ public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
|
|||
return super.put( key, value );
|
||||
}
|
||||
|
||||
// This method is very ugly but is here only to help facilitate direct comparison of old and refactored recalibrator.
|
||||
// The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to.
|
||||
@SuppressWarnings(value = "unchecked")
|
||||
public ArrayList<Pair<List<? extends Comparable>, T>> entrySetSorted4() {
|
||||
public ArrayList<Pair<List<? extends Comparable>, T>> entrySetSorted() {
|
||||
|
||||
ArrayList<Pair<List<? extends Comparable>, T>> theSet = new ArrayList<Pair<List<? extends Comparable>, T>>();
|
||||
|
||||
|
|
@ -89,35 +86,49 @@ public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
|
|||
Collections.sort(list);
|
||||
}
|
||||
|
||||
if( keyLists.size() != 4 ) {
|
||||
throw new StingException("Are you sure you want to be calling this ugly method? NHashMap.entrySetSorted4()");
|
||||
int[] keyIndex = new int[ keyLists.size() ];
|
||||
int[] maxIndex = new int[ keyLists.size() ];
|
||||
for( int iii = 0; iii < keyLists.size(); iii++ ) {
|
||||
keyIndex[iii] = 0;
|
||||
maxIndex[iii] = keyLists.get(iii).size();
|
||||
}
|
||||
|
||||
// Try all the possible keys in sorted order, add them to the output set if they are in the hashMap
|
||||
boolean triedAllKeys = false;
|
||||
ArrayList<Comparable> newKey = null;
|
||||
for( Comparable c0 : keyLists.get(0) ) {
|
||||
for( Comparable c1 : keyLists.get(1) ) {
|
||||
for( Comparable c2 : keyLists.get(2) ) {
|
||||
for( Comparable c3 : keyLists.get(3) ) {
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add(c0);
|
||||
newKey.add(c1);
|
||||
newKey.add(c2);
|
||||
newKey.add(c3);
|
||||
T value = this.get( newKey );
|
||||
if( value!= null ) {
|
||||
theSet.add(new Pair<List<? extends Comparable>,T>( newKey, value ) );
|
||||
}
|
||||
while( !triedAllKeys ) {
|
||||
newKey = new ArrayList<Comparable>();
|
||||
for( int iii = 0; iii < keyLists.size(); iii++ ) {
|
||||
newKey.add(keyLists.get(iii).get(keyIndex[iii]));
|
||||
}
|
||||
T value = this.get( newKey );
|
||||
if( value!= null ) {
|
||||
theSet.add(new Pair<List<? extends Comparable>,T>( newKey, value ) );
|
||||
}
|
||||
|
||||
// Increment the keyIndex
|
||||
keyIndex[keyLists.size() - 1]++;
|
||||
for( int iii = keyLists.size() - 1; iii >= 0; iii-- ) {
|
||||
if( keyIndex[iii] >= maxIndex[iii] ) { // Carry it forward
|
||||
keyIndex[iii] = 0;
|
||||
if( iii > 0 ) {
|
||||
keyIndex[iii-1]++;
|
||||
} else {
|
||||
triedAllKeys = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return theSet;
|
||||
}
|
||||
|
||||
// Used to make the key from a list of <T> objects
|
||||
public static <T> List<T> makeList(T... args) {
|
||||
List<T> list = new ArrayList<T>();
|
||||
for (T arg : args)
|
||||
for( T arg : args )
|
||||
{
|
||||
list.add(arg);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -128,12 +128,15 @@ public class RecalDataManager {
|
|||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add( key.get(0) ); // Make a new key with the read group ...
|
||||
newKey.add( key.get(1) ); // and quality score ...
|
||||
newKey.add( key.get(iii + 2) ); // and the given covariate
|
||||
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.increment( fullDatum );
|
||||
Comparable theCovariateElement = key.get(iii + 2); // and the given covariate
|
||||
if( theCovariateElement != null ) {
|
||||
newKey.add( theCovariateElement );
|
||||
collapsedDatum = dataCollapsedByCovariate.get(iii).get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( newKey, new RecalDatum(fullDatum) );
|
||||
} else {
|
||||
collapsedDatum.increment( fullDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -218,12 +221,14 @@ public class RecalDataManager {
|
|||
public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) {
|
||||
|
||||
// Check if we need to use the original quality scores instead
|
||||
if( RAC.USE_ORIGINAL_QUALS && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
Object obj = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if( obj instanceof String )
|
||||
read.setBaseQualities( QualityUtils.fastqToPhred((String)obj) );
|
||||
else {
|
||||
throw new RuntimeException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
if( RAC.USE_ORIGINAL_QUALS ) {
|
||||
Object attr = read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
if( attr instanceof String ) {
|
||||
read.setBaseQualities( QualityUtils.fastqToPhred((String)attr) );
|
||||
} else {
|
||||
throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -267,8 +272,16 @@ public class RecalDataManager {
|
|||
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") ) {
|
||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read
|
||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) {
|
||||
char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray();
|
||||
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
char[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
colorSpace = ((String)attr).toCharArray();
|
||||
} else {
|
||||
throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
|
||||
// Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read
|
||||
byte[] readBases = read.getReadBases();
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
|
|
@ -305,8 +318,15 @@ public class RecalDataManager {
|
|||
*/
|
||||
public static byte[] calcColorSpace( SAMRecord read, byte[] originalQualScores, final String SOLID_RECAL_MODE, final Random coinFlip, final char[] refBases ) {
|
||||
|
||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG) != null ) {
|
||||
char[] colorSpace = ((String)read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG)).toCharArray();
|
||||
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
char[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
colorSpace = ((String)attr).toCharArray();
|
||||
} else {
|
||||
throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
// Loop over the read and calculate first the infered bases from the color and then check if it is consistent with the read
|
||||
byte[] readBases = read.getReadBases();
|
||||
byte[] colorImpliedBases = readBases.clone();
|
||||
|
|
@ -334,7 +354,7 @@ public class RecalDataManager {
|
|||
boolean setBaseN = true;
|
||||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, isMappedToReference, setBaseN);
|
||||
} else if( SOLID_RECAL_MODE.equalsIgnoreCase("REMOVE_REF_BIAS") ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
|
||||
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBases, isMappedToReference, coinFlip);
|
||||
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead, isMappedToReference, coinFlip);
|
||||
}
|
||||
|
||||
} else if ( !warnUserNoColorSpace ) { // Warn the user if we can't find the color space tag
|
||||
|
|
@ -349,7 +369,7 @@ public class RecalDataManager {
|
|||
/**
|
||||
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read RC'd if necessary
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param originalQualScores The array of original quality scores to set to zero if needed
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
|
|
@ -360,7 +380,7 @@ public class RecalDataManager {
|
|||
private static byte[] solidRecalSetToQZero( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] originalQualScores,
|
||||
final char[] refBases, final boolean isMappedToRef, final boolean setBaseN ) {
|
||||
|
||||
for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) { // BUGBUG: This relies on the fact that in SOLID bams the first and last base are set to Q0 and aren't recalibrated anyway
|
||||
for( int iii = 1; iii < originalQualScores.length - 1; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
if( !isMappedToRef || (char)readBases[iii] == refBases[iii] ) {
|
||||
originalQualScores[iii] = (byte)0;
|
||||
|
|
@ -387,7 +407,7 @@ public class RecalDataManager {
|
|||
/**
|
||||
* Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read RC'd if necessary
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
|
|
@ -396,8 +416,15 @@ public class RecalDataManager {
|
|||
*/
|
||||
private static void solidRecalRemoveRefBias( SAMRecord read, byte[] readBases, int[] inconsistency, byte[] colorImpliedBases,
|
||||
final char[] refBases, final boolean isMappedToRef, final Random coinFlip ) {
|
||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG) != null ) {
|
||||
byte[] colorSpaceQuals = QualityUtils.fastqToPhred((String)read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG));
|
||||
|
||||
Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
byte[] colorSpaceQuals;
|
||||
if( attr instanceof String ) {
|
||||
colorSpaceQuals = QualityUtils.fastqToPhred((String)attr);
|
||||
} else {
|
||||
throw new StingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
for( int iii = 1; iii < inconsistency.length - 2; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
|
|
@ -413,8 +440,8 @@ public class RecalDataManager {
|
|||
int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
int diffInQuality = maxQuality - minQuality;
|
||||
int numLow = minQuality;
|
||||
if(numLow == 0) { numLow = 1; }
|
||||
int numHigh = Math.round(numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely
|
||||
if( numLow == 0 ) { numLow = 1; }
|
||||
int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely
|
||||
int rand = coinFlip.nextInt( numLow + numHigh );
|
||||
if( rand >= numLow ) {
|
||||
if( maxQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
|
|
|
|||
|
|
@ -94,7 +94,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final String versionString = "v2.1.2"; // Major version, minor version, and build number
|
||||
private static final String versionString = "v2.1.3"; // Major version, minor version, and build number
|
||||
private SAMFileWriter OUTPUT_BAM = null;// The File Writer that will write out the recalibrated bam
|
||||
private Random coinFlip; // Random number generator is used to remove reference bias in solid bams
|
||||
private static final long RANDOM_SEED = 1032861495;
|
||||
|
|
|
|||
|
|
@ -17,9 +17,9 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
public void testCountCovariates1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "c1b54d4221fb4fa88e0231a74310708e" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "95e26e8247d0c5e43705048c5ee64873");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "337ea30c4dcc2fe6a9adc442ffd0706b");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "ffbfd38b1720cfb67ba1bb63d4308552" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "427306f07f5fd905439b28a770f3d3d6" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "60e227ea8c3409fa85b92cae7ea6574f" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -49,10 +49,10 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testTableRecalibrator1() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "ca839a4eb0ef443e0486b96843304f92" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "fd27c3ab424ef01c77d2dbdedf721a8a");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "b6e16538bda18336f176417cdf686f6a" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "be9a46e0d9b7ad90ce303d08dfb7b4de" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12892.SLX.SRP000031.2009_06.selected.bam", "f7749792ffffbb86aec66e92a3bddf7f" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "f1780e3c3e12f07527e0468149312f10");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12873.454.SRP000031.2009_06.chr1.10_20mb.bam", "c54a67a1687a4139a8ae19762217987f" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.allTechs.bam", "d9ddbacdafc621d830a1db637973d795" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -80,7 +80,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesVCF() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "5e132283b906f6de3328986fd0101be7");
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12878.1kg.p2.chr1_10mb_11_mb.SOLID.bam", "3ee0e811682c0f29951128204765ece9");
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -108,7 +108,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testCountCovariatesNoReadGroups() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "199979b483f6c03c0977141f4fea9961" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "f49bc79225bffbf8b64590b65a4b4305" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
@ -138,7 +138,7 @@ public class RecalibrationWalkersIntegrationTest extends WalkerTest {
|
|||
@Test
|
||||
public void testTableRecalibratorNoReadGroups() {
|
||||
HashMap<String, String> e = new HashMap<String, String>();
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "2889c63854e233fadd9178ccf5b2517b" );
|
||||
e.put( "/humgen/gsa-scr1/GATK_Data/Validation_Data/NA12762.SOLID.SRP000031.2009_07.chr1.10_20mb.bam", "62413fdbfe99cd6e24992de4234de5bc" );
|
||||
|
||||
for ( Map.Entry<String, String> entry : e.entrySet() ) {
|
||||
String bam = entry.getKey();
|
||||
|
|
|
|||
Loading…
Reference in New Issue