Added a validateOldRecalibrator option to CovariateCounterWalker which reorders the output to match the old recalibrator exactly. This facilitates direct comparison of output. Changed the -cov argument slightly to require the user to specify both ReadGroupCovariate and QualityScoreCovariate to make it more clear to the user which covariates are being used. Some speed up improvements throughout.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2010 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
7e30fe230a
commit
1e7ddd2d9f
|
|
@ -32,11 +32,11 @@ import net.sf.samtools.SAMRecord;
|
|||
* User: rpoplin
|
||||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read and corresponding reference bases
|
||||
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read, offset, and corresponding reference bases
|
||||
*/
|
||||
|
||||
public interface Covariate {
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read and reference bases
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases); // used to pick out the value from the read, offset, and reference bases
|
||||
public Comparable getValue(String str); // used to get value from input file
|
||||
public int estimatedNumberOfBins(); // used to estimate the amount space required in the hashmap
|
||||
public int estimatedNumberOfBins(); // used to estimate the amount space required in the HashMap
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import org.broadinstitute.sting.utils.cmdLine.Argument;
|
|||
import org.broadinstitute.sting.utils.PackageUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileNotFoundException;
|
||||
|
|
@ -72,10 +73,16 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
public boolean USE_ORIGINAL_QUALS = false;
|
||||
@Argument(fullName="recal_file", shortName="rf", required=false, doc="Filename for the outputted covariates table recalibration file")
|
||||
public String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName="validateOldRecalibrator", shortName="valor", required=false, doc="If yes will reorder the output to match the old recalibrator exactly but makes the file useless to the refactored TableRecalibrationWalker")
|
||||
public boolean validateOldRecalibrator = false;
|
||||
|
||||
protected static RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
protected static ArrayList<Covariate> requestedCovariates; // A list to hold the covariate objects that were requested
|
||||
|
||||
private long countedSites = 0; // used for reporting at the end
|
||||
private long countedBases = 0; // used for reporting at the end
|
||||
private long skippedSites = 0; // used for reporting at the end
|
||||
|
||||
/**
|
||||
* Parse the -cov arguments and create a list of covariates to be used here
|
||||
* Based on the covariates' estimates for initial capacity allocate the data hashmap
|
||||
|
|
@ -99,25 +106,30 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
|
||||
// Initialize the requested covariates by parsing the -cov argument
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
requestedCovariates.add( new ReadGroupCovariate() ); // Read Group Covariate is a required covariate for the recalibration calculation
|
||||
requestedCovariates.add( new QualityScoreCovariate() ); // Quality Score Covariate is a required covariate for the recalibration calculation
|
||||
if( COVARIATES != null ) {
|
||||
if( validateOldRecalibrator ) {
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
requestedCovariates.add( new CycleCovariate() ); // unfortunately the ordering here is different to match the output of the old recalibrator
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
requestedCovariates.add( new DinucCovariate() );
|
||||
estimatedCapacity = 300 * 200 * 40 * 16;
|
||||
} else if( COVARIATES != null ) {
|
||||
int covNumber = 1;
|
||||
for( String requestedCovariateString : COVARIATES ) {
|
||||
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
|
||||
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
|
||||
foundClass = true;
|
||||
// Read Group Covariate and Quality Score Covariate are required covariates for the recalibration calculation and must begin the list
|
||||
if( (covNumber == 1 && !requestedCovariateString.equalsIgnoreCase( "ReadGroupCovariate" )) ||
|
||||
(covNumber == 2 && !requestedCovariateString.equalsIgnoreCase( "QualityScoreCovariate" )) ) {
|
||||
throw new StingException("ReadGroupCovariate and QualityScoreCovariate are required covariates for the recalibration calculation and must begin the list" );
|
||||
}
|
||||
covNumber++;
|
||||
try {
|
||||
// Now that we've found a matching class, try to instantiate it
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
estimatedCapacity *= covariate.estimatedNumberOfBins(); // update the estimated initial capacity
|
||||
|
||||
if (covariate instanceof ReadGroupCovariate || covariate instanceof QualityScoreCovariate) {
|
||||
throw new StingException( "ReadGroupCovariate and QualityScoreCovariate are required covariates and are therefore added for you. Please remove them from the -cov list" );
|
||||
}
|
||||
|
||||
} catch ( InstantiationException e ) {
|
||||
throw new StingException( String.format("Can not instantiate covariate class '%s': must be concrete class.", covClass.getSimpleName()) );
|
||||
} catch ( IllegalAccessException e ) {
|
||||
|
|
@ -130,6 +142,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
throw new StingException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
|
||||
}
|
||||
}
|
||||
} else { // Not validating old recalibrator and no covariates were specified by the user so add the default ones
|
||||
logger.info( "Note: Using default set of covariates because none were specified." );
|
||||
requestedCovariates.add( new ReadGroupCovariate() );
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
requestedCovariates.add( new CycleCovariate() );
|
||||
requestedCovariates.add( new DinucCovariate() );
|
||||
estimatedCapacity = 300 * 40 * 200 * 16;
|
||||
}
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
|
|
@ -172,7 +191,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
|
||||
// Only use data from reads with mapping quality above specified quality value and base quality greater than zero
|
||||
byte[] quals = read.getBaseQualities();
|
||||
if( read.getMappingQuality() >= MIN_MAPPING_QUALITY && quals[offset] > 0)
|
||||
if( read.getMappingQuality() >= MIN_MAPPING_QUALITY && quals[offset] > 0 )
|
||||
{
|
||||
// Skip first and last bases because they don't have a dinuc count
|
||||
if( offset > 0 && offset < (read.getReadLength() - 1) ) {
|
||||
|
|
@ -181,10 +200,13 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
}
|
||||
|
||||
}
|
||||
countedSites++;
|
||||
|
||||
} else { // We skipped over the dbSNP site
|
||||
skippedSites++;
|
||||
}
|
||||
|
||||
return 1;
|
||||
return 1; // This value isn't actually used anywhere
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -208,7 +230,7 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
if( keyElement != null ) {
|
||||
key.add( keyElement );
|
||||
} else {
|
||||
badKey = true; // covariate returned bad value, for example dinuc returns null because base = 'N'
|
||||
badKey = true; // Covariate returned bad value, for example dinuc returns null because base = 'N'
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -218,7 +240,11 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
datum = dataManager.data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
datum = new RecalDatum(); // initialized with zeros, will be incremented at end of method
|
||||
dataManager.data.put( key, datum );
|
||||
if( validateOldRecalibrator ) {
|
||||
dataManager.data.myPut( key, datum );
|
||||
} else {
|
||||
dataManager.data.put( key, datum );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -236,6 +262,12 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
if( datum != null ) {
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.increment( base, refBase );
|
||||
countedBases++;
|
||||
} else {
|
||||
if( validateOldRecalibrator ) {
|
||||
countedBases++; // This line here to replicate behavior in the old recalibrator
|
||||
// (reads with bad covariates [prev_base = 'N', for example] were properly tossed out but this count was still incremented)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -274,9 +306,6 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
*/
|
||||
public void onTraversalDone( PrintStream recalTableStream ) {
|
||||
out.print( "Writing raw recalibration data..." );
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
recalTableStream.println( "@!" + cov.getClass().getSimpleName() ); // The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
|
||||
}
|
||||
outputToCSV( recalTableStream );
|
||||
out.println( "...done!" );
|
||||
|
||||
|
|
@ -288,15 +317,45 @@ public class CovariateCounterWalker extends LocusWalker<Integer, PrintStream> {
|
|||
* @param recalTableStream the PrintStream to write out to
|
||||
*/
|
||||
private void outputToCSV( PrintStream recalTableStream ) {
|
||||
// For each entry in the data hashmap
|
||||
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
|
||||
// For each Covariate in the key
|
||||
for( Comparable comp : entry.getKey() ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( entry.getValue().outputToCSV() );
|
||||
|
||||
if( validateOldRecalibrator ) {
|
||||
boolean collapsePos = false;
|
||||
boolean collapseDinuc = false;
|
||||
recalTableStream.printf("# collapsed_pos %b%n", collapsePos);
|
||||
recalTableStream.printf("# collapsed_dinuc %b%n", collapseDinuc);
|
||||
recalTableStream.printf("# counted_sites %d%n", countedSites);
|
||||
recalTableStream.printf("# counted_bases %d%n", countedBases);
|
||||
recalTableStream.printf("# skipped_sites %d%n", skippedSites);
|
||||
recalTableStream.printf("# fraction_skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
|
||||
recalTableStream.println("rg,pos,Qrep,dn,nBases,nMismatches,Qemp");
|
||||
for(Pair<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySetSorted4() ) {
|
||||
// For each Covariate in the key
|
||||
for( Comparable comp : entry.getFirst() ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( entry.getSecond().outputToCSV() );
|
||||
}
|
||||
} else {
|
||||
recalTableStream.printf("# Counted Sites %d%n", countedSites);
|
||||
recalTableStream.printf("# Counted Bases %d%n", countedBases);
|
||||
recalTableStream.printf("# Skipped Sites %d%n", skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)countedSites / skippedSites);
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
// The "@!" is a code for TableRecalibrationWalker to recognize this line as a Covariate class name
|
||||
recalTableStream.println( "@!" + cov.getClass().getSimpleName() );
|
||||
}
|
||||
// For each entry in the data hashmap
|
||||
for( Map.Entry<List<? extends Comparable>, RecalDatum> entry : dataManager.data.entrySet() ) {
|
||||
// For each Covariate in the key
|
||||
for( Comparable comp : entry.getKey() ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( comp + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( entry.getValue().outputToCSV() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,14 @@ import net.sf.samtools.SAMRecord;
|
|||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Cycle covariate.
|
||||
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
|
||||
* Yet to be implemented:
|
||||
* - For 454 the cycle is the number of discontinuous nucleotides seen during the length of the read
|
||||
* For example, for the read: AAACCCCGAAATTTTTTT
|
||||
* the cycle would be 111222234445555555
|
||||
* - For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
|
||||
*/
|
||||
|
||||
public class CycleCovariate implements Covariate {
|
||||
|
|
|
|||
|
|
@ -3,8 +3,6 @@ package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
|
|
@ -34,34 +32,31 @@ import java.util.*;
|
|||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 3, 2009
|
||||
*
|
||||
* The Dinucleotide covariate. This base and the one that came before it in the read, remembering to swap directions on cardinality if negative strand read.
|
||||
* This covariate will return null if there are bases that don't belong such as 'N' or 'X'.
|
||||
* This covariate will also return null if attempting to get previous base at the start of the read.
|
||||
*/
|
||||
|
||||
public class DinucCovariate implements Covariate {
|
||||
|
||||
public static ArrayList<String> BASES;
|
||||
|
||||
public DinucCovariate() { // empty constructor is required to instantiate covariate in CovariateCounterWalker and TableRecalibrationWalker
|
||||
BASES = new ArrayList<String>();
|
||||
BASES.add("A");
|
||||
BASES.add("G");
|
||||
BASES.add("C");
|
||||
BASES.add("T");
|
||||
BASES.add("a");
|
||||
BASES.add("g");
|
||||
BASES.add("c");
|
||||
BASES.add("t");
|
||||
}
|
||||
|
||||
public Comparable getValue(SAMRecord read, int offset, char[] refBases) {
|
||||
byte[] bases = read.getReadBases();
|
||||
char base = (char)bases[offset];
|
||||
char prevBase = (char)bases[offset - 1];
|
||||
char prevBase; // set below
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
base = BaseUtils.simpleComplement(base);
|
||||
if ( offset + 1 > bases.length - 1 ) { return null; } // no prevBase because at the beginning of the read
|
||||
prevBase = BaseUtils.simpleComplement( (char)bases[offset + 1] );
|
||||
} else {
|
||||
if ( offset - 1 < 0 ) { return null; } // no prevBase because at the beginning of the read
|
||||
prevBase = (char)bases[offset - 1];
|
||||
}
|
||||
// Check if bad base, probably an 'N'
|
||||
if( !BASES.contains( String.format( "%c", prevBase ) ) || !BASES.contains( String.format( "%c", base) ) ) {
|
||||
if( ( BaseUtils.simpleBaseToBaseIndex(prevBase) == -1 ) || ( BaseUtils.simpleBaseToBaseIndex(base) == -1 ) ) {
|
||||
return null; // CovariateCounterWalker will recognize that null means skip this particular location in the read
|
||||
} else {
|
||||
return String.format("%c%c", prevBase, base);
|
||||
|
|
|
|||
|
|
@ -31,6 +31,8 @@ import net.sf.samtools.SAMRecord;
|
|||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 4, 2009
|
||||
*
|
||||
* The Mapping Quality covariate.
|
||||
*/
|
||||
|
||||
public class MappingQualityCovariate implements Covariate {
|
||||
|
|
|
|||
|
|
@ -32,6 +32,10 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 4, 2009
|
||||
*
|
||||
* The Minimum Neighborhood Quality Score covariate, originally described by Chris Hartl.
|
||||
* This covariate is the minimum base quality score along the length of the read.
|
||||
* BUGBUG: I don't think this is what was intended by Chris. Covariate interface must change to implement the real covariate.
|
||||
*/
|
||||
|
||||
public class MinimumNQSCovariate implements Covariate {
|
||||
|
|
@ -58,6 +62,7 @@ public class MinimumNQSCovariate implements Covariate {
|
|||
}
|
||||
}
|
||||
|
||||
// Loop over the list of qualities and find the minimum
|
||||
Integer minQual = (int)(quals[0]);
|
||||
for ( int qual : quals ) {
|
||||
if( qual < minQual ) {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,8 @@
|
|||
package org.broadinstitute.sting.playground.gatk.walkers.Recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/*
|
||||
|
|
@ -40,21 +43,26 @@ public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
|
|||
|
||||
public NHashMap() {
|
||||
super();
|
||||
keyLists = new ArrayList<ArrayList<Comparable>>();
|
||||
keyLists = null;
|
||||
}
|
||||
|
||||
public NHashMap( int initialCapacity, float loadingFactor ) {
|
||||
super( initialCapacity, loadingFactor );
|
||||
keyLists = new ArrayList<ArrayList<Comparable>>();
|
||||
keyLists = null;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
// This method is here only to help facilitate direct comparison of old and refactored recalibrator.
|
||||
// The old recalibrator prints out it's mappings in a sorted order but the refactored recalibrator doesn't need to.
|
||||
// Comment out this overriding method to improve performance
|
||||
public T put(List<? extends Comparable> key, T value) {
|
||||
// The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to.
|
||||
public T myPut(List<? extends Comparable> key, T value) {
|
||||
|
||||
if( keyLists == null ) {
|
||||
keyLists = new ArrayList<ArrayList<Comparable>>();
|
||||
for( Comparable comp : key ) {
|
||||
keyLists.add( new ArrayList<Comparable>() );
|
||||
}
|
||||
}
|
||||
|
||||
ArrayList<Comparable> thisList;
|
||||
for( int iii = 0; iii < key.size(); iii++ ) {
|
||||
thisList = keyLists.get( iii );
|
||||
|
|
@ -68,27 +76,46 @@ public class NHashMap<T> extends HashMap<List<? extends Comparable>, T> {
|
|||
return super.put( key, value );
|
||||
}
|
||||
|
||||
// This method is here only to help facilitate direct comparison of old and refactored recalibrator.
|
||||
// The old recalibrator prints out it's mappings in a sorted order but the refactored recalibrator doesn't need to.
|
||||
// Comment out this overriding method to improve performance
|
||||
public Set<Map.Entry<List<? extends Comparable>, T>> entrySet() {
|
||||
|
||||
// This method is very ugly but is here only to help facilitate direct comparison of old and refactored recalibrator.
|
||||
// The old recalibrator prints out its mappings in a sorted order but the refactored recalibrator doesn't need to.
|
||||
@SuppressWarnings(value = "unchecked")
|
||||
public ArrayList<Pair<List<? extends Comparable>, T>> entrySetSorted4() {
|
||||
|
||||
ArrayList<Pair<List<? extends Comparable>, T>> theSet = new ArrayList<Pair<List<? extends Comparable>, T>>();
|
||||
|
||||
for( ArrayList<Comparable> list : keyLists ) {
|
||||
Collections.sort(list);
|
||||
}
|
||||
|
||||
for( ArrayList<Comparable> list : keyLists ) {
|
||||
for( Comparable comp : list ) {
|
||||
// BUGBUG: unfinished
|
||||
if( keyLists.size() != 4 ) {
|
||||
throw new StingException("Are you sure you want to be calling this ugly method? NHashMap.entrySetSorted4()");
|
||||
}
|
||||
|
||||
ArrayList<Comparable> newKey = null;
|
||||
for( Comparable c0 : keyLists.get(0) ) {
|
||||
for( Comparable c1 : keyLists.get(1) ) {
|
||||
for( Comparable c2 : keyLists.get(2) ) {
|
||||
for( Comparable c3 : keyLists.get(3) ) {
|
||||
newKey = new ArrayList<Comparable>();
|
||||
newKey.add(c0);
|
||||
newKey.add(c1);
|
||||
newKey.add(c2);
|
||||
newKey.add(c3);
|
||||
T value = this.get( newKey );
|
||||
if( value!= null ) {
|
||||
theSet.add(new Pair<List<? extends Comparable>,T>( newKey, value ) );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
return theSet;
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
public static <T extends Comparable> List<T> makeList(T... args) {
|
||||
|
||||
public List<T> makeList(T... args) {
|
||||
List<T> list = new ArrayList<T>();
|
||||
for (T arg : args)
|
||||
{
|
||||
|
|
|
|||
|
|
@ -32,6 +32,8 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 3, 2009
|
||||
*
|
||||
* The Reported Quality Score covariate.
|
||||
*/
|
||||
|
||||
public class QualityScoreCovariate implements Covariate {
|
||||
|
|
|
|||
|
|
@ -31,6 +31,8 @@ import net.sf.samtools.SAMRecord;
|
|||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Read Group covariate.
|
||||
*/
|
||||
|
||||
public class ReadGroupCovariate implements Covariate{
|
||||
|
|
|
|||
|
|
@ -50,12 +50,12 @@ public class RecalDataManager {
|
|||
collapsedTablesCreated = false;
|
||||
}
|
||||
|
||||
// BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later
|
||||
// BUGBUG: A lot going on in this method, doing a lot of pre-calculations for use in the sequential mode calculation later in TableRecalibrationWalker
|
||||
public void createCollapsedTables( int numCovariates ) {
|
||||
dataCollapsedReadGroup = new NHashMap<RecalDatum>();
|
||||
dataCollapsedQualityScore = new NHashMap<RecalDatum>();
|
||||
dataCollapsedByCovariate = new ArrayList<NHashMap<RecalDatum>>();
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariate.add( new NHashMap<RecalDatum>() );
|
||||
}
|
||||
dataSumExpectedErrors = new NHashMap<Double>();
|
||||
|
|
@ -78,7 +78,6 @@ public class RecalDataManager {
|
|||
collapsedDatum = dataCollapsedReadGroup.get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( newKey, new RecalDatum( thisDatum ) );
|
||||
//System.out.println("Added key: " + newKey + " to the dataCollapsedReadGroup");
|
||||
} else {
|
||||
collapsedDatum.increment( thisDatum );
|
||||
}
|
||||
|
|
@ -89,7 +88,6 @@ public class RecalDataManager {
|
|||
if( sumExpectedErrors == null ) {
|
||||
dataSumExpectedErrors.put( newKey, 0.0 );
|
||||
} else {
|
||||
//System.out.println("updated += " + QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * thisDatum.getNumObservations());
|
||||
dataSumExpectedErrors.remove( newKey );
|
||||
sumExpectedErrors += QualityUtils.qualToErrorProb(Byte.parseByte(key.get(1).toString())) * thisDatum.getNumObservations();
|
||||
dataSumExpectedErrors.put( newKey, sumExpectedErrors );
|
||||
|
|
@ -101,8 +99,7 @@ public class RecalDataManager {
|
|||
newKey.add( key.get(1) ); // and quality score
|
||||
collapsedDatum = dataCollapsedQualityScore.get( newKey );
|
||||
if( collapsedDatum == null ) {
|
||||
//System.out.println("Added: " + newKey + " " + newKey.hashCode());
|
||||
dataCollapsedQualityScore.put( newKey, new RecalDatum( thisDatum ) );
|
||||
dataCollapsedQualityScore.put( newKey, new RecalDatum( thisDatum ) );
|
||||
} else {
|
||||
collapsedDatum.increment( thisDatum );
|
||||
}
|
||||
|
|
@ -127,7 +124,7 @@ public class RecalDataManager {
|
|||
|
||||
public NHashMap<RecalDatum> getCollapsedTable( int covariate ) {
|
||||
if( !collapsedTablesCreated ) {
|
||||
throw new StingException("Trying to get collapsed tables before they have been populated.");
|
||||
throw new StingException("Trying to get collapsed tables before they have been populated. Null pointers abound.");
|
||||
}
|
||||
|
||||
if( covariate == 0) {
|
||||
|
|
|
|||
|
|
@ -69,25 +69,23 @@ public class RecalDatum {
|
|||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
public RecalDatum increment( long incObservations, long incMismatches ) {
|
||||
public void increment( long incObservations, long incMismatches ) {
|
||||
numObservations += incObservations;
|
||||
numMismatches += incMismatches;
|
||||
return this;
|
||||
}
|
||||
|
||||
public RecalDatum increment( RecalDatum other ) {
|
||||
return increment( other.numObservations, other.numMismatches );
|
||||
public void increment( RecalDatum other ) {
|
||||
increment( other.numObservations, other.numMismatches );
|
||||
}
|
||||
|
||||
public RecalDatum increment( List<RecalDatum> data ) {
|
||||
public void increment( List<RecalDatum> data ) {
|
||||
for ( RecalDatum other : data ) {
|
||||
this.increment( other );
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public RecalDatum increment( char curBase, char ref ) {
|
||||
return increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // inc takes num observations, then num mismatches
|
||||
public void increment( char curBase, char ref ) {
|
||||
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(ref) ? 0 : 1 ); // increment takes num observations, then num mismatches
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -71,6 +71,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
protected RecalDataManager dataManager;
|
||||
protected ArrayList<Covariate> requestedCovariates;
|
||||
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static Pattern COVARIATE_PATTERN = Pattern.compile("^@!.*");
|
||||
public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ";
|
||||
|
||||
|
|
@ -93,7 +94,10 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
try {
|
||||
for ( String line : new xReadLines(new File( RECAL_FILE )) ) {
|
||||
lineNumber++;
|
||||
if ( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data
|
||||
if( COMMENT_PATTERN.matcher(line).matches() ) {
|
||||
; // skip over the comment lines, (which start with '#')
|
||||
}
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // the line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new StingException( "Malformed input recalibration file. Found covariate names intermingled with data. " + RECAL_FILE );
|
||||
} else { // found another covariate in input file
|
||||
|
|
@ -121,8 +125,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
}
|
||||
|
||||
}
|
||||
else { // found some data
|
||||
} else { // found some data
|
||||
if( !foundAllCovariates ) {
|
||||
foundAllCovariates = true;
|
||||
logger.info( "The covariates being used here: " );
|
||||
|
|
@ -137,7 +140,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
} catch ( FileNotFoundException e ) {
|
||||
Utils.scareUser("Can not find input file: " + RECAL_FILE);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber, e);
|
||||
throw new StingException("Error parsing recalibration data at line " + lineNumber + ". Was your table generated by CountCovariatesRefactored?");
|
||||
}
|
||||
logger.info( "...done!" );
|
||||
|
||||
|
|
@ -146,14 +149,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
dataManager.createCollapsedTables( requestedCovariates.size() );
|
||||
logger.info( "...done!" );
|
||||
}
|
||||
|
||||
//System.out.println(dataManager.getCollapsedTable(1));
|
||||
}
|
||||
|
||||
private void addCSVData(String line) {
|
||||
String[] vals = line.split(",");
|
||||
ArrayList<Comparable> key = new ArrayList<Comparable>();
|
||||
Covariate cov; // preallocated for use in for loop below
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
|
|
@ -181,7 +182,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
for( int iii = 1; iii < read.getReadLength() - 1; iii++ ) { // skip first and last base because there is no dinuc
|
||||
List<Comparable> key = new ArrayList<Comparable>();
|
||||
for( Covariate covariate : requestedCovariates ) {
|
||||
key.add( covariate.getValue( read, iii, refBases ) );
|
||||
key.add( covariate.getValue( read, iii, refBases ) ); // BUGBUG offset should be 1 based but old recalibrator is 0 based?
|
||||
}
|
||||
|
||||
if( MODE == RecalibrationMode.COMBINATORIAL ) {
|
||||
|
|
@ -257,7 +258,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
String.format( " => %d + %.2f + %.2f + %.2f = %d", qualFromRead, globalDeltaQ, deltaQReported, deltaQCovariates, newQualityByte ) );
|
||||
}
|
||||
|
||||
//System.out.println("returning: " + newQualityByte);
|
||||
return newQualityByte;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue