Totally experimental, barely usable, not-yet-to-be-used implementation of an "Indel Quality Recalibrator". The idea is that any indel that's not in the input dbsnp is treated as an artifact, and a csv is then built with the # of indels and # of observations as a function of each input covariate (initially, only cycle, read group and homopolymer run are useful). Then, when computing the likelihoods of indels based on input haplotypes, we compute gap penalties from the values of the covariates at the read. The feature is disabled by default and its arguments are hidden. TBD whether the usefulness of the feature is worth the extra time and pain.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5724 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
3224bbe750
commit
2a80ffa2ee
|
|
@ -63,7 +63,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood
|
|||
protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
super(UAC, logger);
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit);
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE);
|
||||
alleleList = new ArrayList<Allele>();
|
||||
getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
|
||||
minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING;
|
||||
|
|
|
|||
|
|
@ -27,6 +27,9 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
|
||||
public class UnifiedArgumentCollection {
|
||||
|
|
@ -101,6 +104,13 @@ public class UnifiedArgumentCollection {
|
|||
public boolean DO_CONTEXT_DEPENDENT_PENALTIES = true;
|
||||
//gdebug+
|
||||
@Hidden
|
||||
// experimental arguments, NOT TO BE USED BY ANYONE WHOSE INITIALS AREN'T GDA!!!
|
||||
@Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false)
|
||||
public boolean GET_GAP_PENALTIES_FROM_DATA = false;
|
||||
|
||||
@Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE")
|
||||
public File INDEL_RECAL_FILE = new File("indel.recal_data.csv");
|
||||
|
||||
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
|
||||
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
|
||||
@Hidden
|
||||
|
|
@ -145,7 +155,9 @@ public class UnifiedArgumentCollection {
|
|||
uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO;
|
||||
uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE;
|
||||
uac.DO_CONTEXT_DEPENDENT_PENALTIES = DO_CONTEXT_DEPENDENT_PENALTIES;
|
||||
|
||||
|
||||
uac.GET_GAP_PENALTIES_FROM_DATA = GET_GAP_PENALTIES_FROM_DATA;
|
||||
uac.INDEL_RECAL_FILE = INDEL_RECAL_FILE;
|
||||
// todo- arguments to remove
|
||||
uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT;
|
||||
uac.dovit = dovit;
|
||||
|
|
|
|||
|
|
@ -30,17 +30,28 @@ import net.sf.samtools.CigarElement;
|
|||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDatum;
|
||||
import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.genotype.Haplotype;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
||||
public class PairHMMIndelErrorModel {
|
||||
|
|
@ -103,10 +114,30 @@ public class PairHMMIndelErrorModel {
|
|||
private final double[] GAP_OPEN_PROB_TABLE;
|
||||
private final double[] GAP_CONT_PROB_TABLE;
|
||||
|
||||
private boolean getGapPenaltiesFromFile = false;
|
||||
|
||||
static {
|
||||
private int SMOOTHING = 1;
|
||||
private int MAX_QUALITY_SCORE = 50;
|
||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
//copy+
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
|
||||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
protected static final String EOF_MARKER = "EOF";
|
||||
private long numReadsWithMalformedColorSpace = 0;
|
||||
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
|
||||
|
||||
//copy-
|
||||
static {
|
||||
LOG_ONE_HALF= -Math.log10(2.0);
|
||||
END_GAP_COST = LOG_ONE_HALF;
|
||||
END_GAP_COST = LOG_ONE_HALF;
|
||||
|
||||
baseMatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
|
||||
|
|
@ -119,6 +150,130 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Constructs the indel error model, optionally loading per-covariate gap penalties
 * from a recalibration csv table.
 *
 * @param indelGOP     indel gap open penalty
 * @param indelGCP     indel gap continuation penalty
 * @param deb          emit debug output
 * @param doCDP        use context-dependent penalties
 * @param dovit        run the Viterbi path computation
 * @param gpf          if true, read gap penalties from RECAL_FILE
 * @param RECAL_FILE   covariate count table csv (only read when gpf is true)
 */
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit, boolean gpf, File RECAL_FILE) {
    this(indelGOP, indelGCP, deb, doCDP, dovit);
    this.getGapPenaltiesFromFile = gpf;

    // Nothing more to do unless gap penalties come from a recal file.
    if (!gpf)
        return;

    // Parse the recalibration csv (logic copied from TableRecalibrationWalker):
    // a header line names the covariates, the remaining lines carry count data.
    boolean seenEofMarker = false;
    boolean requireEofMarker = false; // the EOF marker is currently optional
    int currentLineNumber = 0;
    boolean covariateListParsed = false;

    // All covariate implementations discoverable on the classpath.
    final List<Class<? extends Covariate>> covariateClasses =
            new PluginManager<Covariate>(Covariate.class).getPlugins();

    try {
        for (final String line : new XReadLines(RECAL_FILE)) {
            currentLineNumber++;
            if (EOF_MARKER.equals(line)) {
                seenEofMarker = true;
            } else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
                // comment lines (starting with '#') and legacy headers are skipped
            } else if (COVARIATE_PATTERN.matcher(line).matches()) {
                // Header line listing the covariates that were used to build the table.
                if (covariateListParsed) {
                    throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
                }
                // Instantiate each named covariate; the last three columns are
                // nObservations, nMismatch, and Qempirical, not covariates.
                final String[] names = line.split(",");
                for (int col = 0; col < names.length - 3; col++) {
                    boolean matched = false;
                    for (final Class<?> covClass : covariateClasses) {
                        if ((names[col] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
                            matched = true;
                            try {
                                requestedCovariates.add((Covariate) covClass.newInstance());
                            } catch (Exception e) {
                                throw new DynamicClassResolutionException(covClass, e);
                            }
                        }
                    }
                    if (!matched) {
                        throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (names[col] + "Covariate") + ") isn't a valid covariate option." );
                    }
                }
            } else {
                // A data line; on the first one, finish covariate setup.
                if (!covariateListParsed) {
                    covariateListParsed = true;

                    // At this point all the covariates should have been found and initialized.
                    if (requestedCovariates.size() < 2) {
                        throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
                    }

                    final boolean createCollapsedTables = true;

                    // Initialize any covariate member variables using the shared argument collection.
                    for (final Covariate cov : requestedCovariates) {
                        cov.initialize(RAC);
                    }
                    // Initialize the data hashMaps.
                    dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size());
                }
                addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
            }
        }
    } catch (FileNotFoundException e) {
        throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
    } catch (NumberFormatException e) {
        throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + currentLineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
    }

    if (!seenEofMarker) {
        final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
        if (requireEofMarker)
            throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
    }

    if (dataManager == null) {
        throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
    }

    // Create the tables of empirical quality scores used in the sequential calculation.
    dataManager.generateEmpiricalQualities(SMOOTHING, MAX_QUALITY_SCORE);
}
|
||||
/**
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private void addCSVData(final File file, final String line) {
|
||||
final String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
||||
final Object[] key = new Object[requestedCovariates.size()];
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key[iii] = cov.getValue( vals[iii] );
|
||||
}
|
||||
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
|
||||
}
|
||||
|
||||
|
||||
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit) {
|
||||
this(indelGOP, indelGCP, deb, doCDP);
|
||||
this.doViterbi = dovit;
|
||||
|
|
@ -152,11 +307,11 @@ public class PairHMMIndelErrorModel {
|
|||
for (int i=START_HRUN_GAP_IDX; i < MAX_HRUN_GAP_IDX; i++) {
|
||||
gop += step;
|
||||
if (gop > maxGOP)
|
||||
gop = maxGOP;
|
||||
gop = maxGOP;
|
||||
|
||||
gcp += step;
|
||||
if(gcp > maxGCP)
|
||||
gcp = maxGCP;
|
||||
gcp = maxGCP;
|
||||
GAP_OPEN_PROB_TABLE[i] = gop;
|
||||
GAP_CONT_PROB_TABLE[i] = gcp;
|
||||
}
|
||||
|
|
@ -399,14 +554,20 @@ public class PairHMMIndelErrorModel {
|
|||
bestActionArrayM[indI][indJ] = ACTIONS_M[bestMetricIdx];
|
||||
|
||||
// update X array
|
||||
// State X(i,j): X(1:i) aligned to a gap in Y(1:j).
|
||||
// State X(i,j): X(1:i) aligned to a gap in Y(1:j).
|
||||
// When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps
|
||||
|
||||
//c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]);
|
||||
//d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]);
|
||||
c = currentGOP[jm1];
|
||||
d = currentGCP[jm1];
|
||||
if (indJ == Y_METRIC_LENGTH-1)
|
||||
if (getGapPenaltiesFromFile) {
|
||||
c = currentGOP[im1];
|
||||
d = logGapContinuationProbability;
|
||||
|
||||
} else {
|
||||
c = currentGOP[jm1];
|
||||
d = currentGCP[jm1];
|
||||
}
|
||||
if (indJ == Y_METRIC_LENGTH-1)
|
||||
c = d = END_GAP_COST;
|
||||
|
||||
if (doViterbi) {
|
||||
|
|
@ -426,8 +587,14 @@ public class PairHMMIndelErrorModel {
|
|||
// update Y array
|
||||
//c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]);
|
||||
//d = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]);
|
||||
c = currentGOP[jm1];
|
||||
d = currentGCP[jm1];
|
||||
if (getGapPenaltiesFromFile) {
|
||||
c = currentGOP[im1];
|
||||
d = logGapContinuationProbability;
|
||||
}
|
||||
else {
|
||||
c = currentGOP[jm1];
|
||||
d = currentGCP[jm1];
|
||||
}
|
||||
if (indI == X_METRIC_LENGTH-1)
|
||||
c = d = END_GAP_COST;
|
||||
|
||||
|
|
@ -511,7 +678,7 @@ public class PairHMMIndelErrorModel {
|
|||
i--; j--;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
@ -541,7 +708,7 @@ public class PairHMMIndelErrorModel {
|
|||
else {
|
||||
contextLogGapOpenProbabilities[hIndex][i] = GAP_OPEN_PROB_TABLE[hrunProfile[i]];
|
||||
contextLogGapContinuationProbabilities[hIndex][i] = GAP_CONT_PROB_TABLE[hrunProfile[i]];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
public synchronized double[][] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, List<Haplotype> haplotypesInVC,
|
||||
|
|
@ -559,7 +726,7 @@ public class PairHMMIndelErrorModel {
|
|||
System.out.println(new String(ref.getBases()));
|
||||
}
|
||||
|
||||
if (doContextDependentPenalties) {
|
||||
if (doContextDependentPenalties && !getGapPenaltiesFromFile) {
|
||||
// will context dependent probabilities based on homopolymet run. Probabilities are filled based on total complete haplotypes.
|
||||
|
||||
for (int j=0; j < haplotypesInVC.size(); j++) {
|
||||
|
|
@ -584,23 +751,59 @@ public class PairHMMIndelErrorModel {
|
|||
}
|
||||
}
|
||||
for (SAMRecord pread : pileup.getReads()) {
|
||||
SAMRecord read = ReadUtils.hardClipAdaptorSequence(pread);
|
||||
SAMRecord read = ReadUtils.hardClipAdaptorSequence(pread);
|
||||
if (read == null)
|
||||
continue;
|
||||
|
||||
if(ReadUtils.is454Read(read)) {
|
||||
if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
if (DEBUG)
|
||||
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
|
||||
read.getAlignmentStart(),
|
||||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||
read.getCigarString());
|
||||
double[] recalQuals = null;
|
||||
if (getGapPenaltiesFromFile) {
|
||||
RecalDataManager.parseSAMRecord( read, RAC );
|
||||
|
||||
|
||||
recalQuals = new double[read.getReadLength()];
|
||||
|
||||
//compute all covariate values for this read
|
||||
final Comparable[][] covariateValues_offset_x_covar =
|
||||
RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
|
||||
// For each base in the read
|
||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
||||
|
||||
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
||||
|
||||
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
||||
if(qualityScore == null)
|
||||
{
|
||||
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
|
||||
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
||||
}
|
||||
|
||||
recalQuals[offset] = (double)qualityScore;
|
||||
}
|
||||
|
||||
// for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi))
|
||||
// = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent
|
||||
if (DEBUG) {
|
||||
System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(),
|
||||
read.getAlignmentStart(),
|
||||
read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(),
|
||||
read.getCigarString());
|
||||
|
||||
byte[] bases = read.getReadBases();
|
||||
for (int k = 0; k < recalQuals.length; k++) {
|
||||
System.out.format("%c",bases[k]);
|
||||
}
|
||||
System.out.println();
|
||||
|
||||
for (int k = 0; k < recalQuals.length; k++) {
|
||||
System.out.format("%.0f ",recalQuals[k]);
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
// get bases of candidate haplotypes that overlap with reads
|
||||
final int trailingBases = 3;
|
||||
|
||||
|
|
@ -641,12 +844,12 @@ public class PairHMMIndelErrorModel {
|
|||
*/
|
||||
// remove soft clips if necessary
|
||||
if ((read.getAlignmentStart()>=eventStartPos-eventLength && read.getAlignmentStart() <= eventStartPos+1) ||
|
||||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
|
||||
(read.getAlignmentEnd() >= eventStartPos && read.getAlignmentEnd() <= eventStartPos + eventLength)) {
|
||||
numStartSoftClippedBases = 0;
|
||||
numEndSoftClippedBases = 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
byte[] unclippedReadBases, unclippedReadQuals;
|
||||
|
||||
|
|
@ -715,6 +918,12 @@ public class PairHMMIndelErrorModel {
|
|||
byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
double[] recalCDP = null;
|
||||
if (getGapPenaltiesFromFile) {
|
||||
recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases,
|
||||
unclippedReadBases.length-numEndClippedBases);
|
||||
|
||||
}
|
||||
|
||||
if (DEBUG) {
|
||||
System.out.println("Read bases:");
|
||||
|
|
@ -751,11 +960,17 @@ public class PairHMMIndelErrorModel {
|
|||
double[] currentContextGCP = null;
|
||||
|
||||
if (doContextDependentPenalties) {
|
||||
currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop);
|
||||
currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop);
|
||||
}
|
||||
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
|
||||
|
||||
if (getGapPenaltiesFromFile) {
|
||||
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null);
|
||||
|
||||
} else {
|
||||
currentContextGOP = Arrays.copyOfRange(contextLogGapOpenProbabilities[j], (int)indStart, (int)indStop);
|
||||
currentContextGCP = Arrays.copyOfRange(contextLogGapContinuationProbabilities[j], (int)indStart, (int)indStop);
|
||||
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
readLikelihoods[readIdx][j]= computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals);
|
||||
|
|
@ -824,5 +1039,76 @@ public class PairHMMIndelErrorModel {
|
|||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Implements a serial recalibration of the reads using the combinational table.
|
||||
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
|
||||
*
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
private byte performSequentialQualityCalculation( final Object... key ) {
|
||||
|
||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
|
||||
// The global quality shift (over the read group only)
|
||||
readGroupCollapsedKey[0] = key[0];
|
||||
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalRecalDatum != null ) {
|
||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
|
||||
// The shift in quality between reported and empirical
|
||||
qualityScoreCollapsedKey[0] = key[0];
|
||||
qualityScoreCollapsedKey[1] = key[1];
|
||||
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
|
||||
double deltaQReported = 0.0;
|
||||
if( qReportedRecalDatum != null ) {
|
||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
||||
// The shift in quality due to each covariate by itself in turn
|
||||
double deltaQCovariates = 0.0;
|
||||
double deltaQCovariateEmpirical;
|
||||
covariateCollapsedKey[0] = key[0];
|
||||
covariateCollapsedKey[1] = key[1];
|
||||
for( int iii = 2; iii < key.length; iii++ ) {
|
||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
|
||||
if( covariateRecalDatum != null ) {
|
||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
}
|
||||
}
|
||||
|
||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
|
||||
|
||||
// Verbose printouts used to validate with old recalibrator
|
||||
//if(key.contains(null)) {
|
||||
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
|
||||
//}
|
||||
//else {
|
||||
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
|
||||
//}
|
||||
|
||||
//return newQualityByte;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,106 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Gatherer;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: 3/29/11
|
||||
* Time: 3:54 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
|
||||
|
||||
public class CountCovariatesGatherer extends Gatherer {
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
private static final String EOF_MARKER = "EOF";
|
||||
|
||||
private HashMap<String, RecalDatumOptimized> dataMap;
|
||||
|
||||
|
||||
private void addCSVData (String line) {
|
||||
String[] covariates = line.split(",");
|
||||
String key = "";
|
||||
RecalDatumOptimized values;
|
||||
|
||||
for (int i = 0; i < covariates.length-3; i++) {
|
||||
key += covariates[i] + ",";
|
||||
}
|
||||
|
||||
values = new RecalDatumOptimized(Integer.parseInt(covariates[covariates.length-3]),
|
||||
Integer.parseInt(covariates[covariates.length-2]));
|
||||
|
||||
if (dataMap.get(key) != null) {
|
||||
RecalDatumOptimized currentValues = dataMap.get(key);
|
||||
values.increment(currentValues);
|
||||
}
|
||||
|
||||
dataMap.put(key, values);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void gather(List<File> inputs, File output) {
|
||||
dataMap = new HashMap<String, RecalDatumOptimized>();
|
||||
PrintStream o;
|
||||
try {
|
||||
o = new PrintStream(output);
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new UserException("File to be output by CountCovariates Gather function was not found");
|
||||
}
|
||||
|
||||
boolean sawEOF = false;
|
||||
boolean printedHeader = false;
|
||||
|
||||
// Read input files
|
||||
for ( File RECAL_FILE : inputs) {
|
||||
try {
|
||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||
if ( EOF_MARKER.equals(line) ) {
|
||||
sawEOF = true; // sanity check
|
||||
}
|
||||
else if(COMMENT_PATTERN.matcher(line).matches()) {
|
||||
; // It doesn't make any sense to print intermediate comments, unless we merge them somehow (would require strict definition for the header)
|
||||
}
|
||||
else if (COVARIATE_PATTERN.matcher(line).matches()) {
|
||||
if (!printedHeader)
|
||||
o.println(line);
|
||||
}
|
||||
else { // Found a line of data
|
||||
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||
}
|
||||
|
||||
if ( !sawEOF ) {
|
||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted!";
|
||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||
}
|
||||
printedHeader = true;
|
||||
}
|
||||
|
||||
// Write output file from dataMap
|
||||
for(String key : dataMap.keySet()) {
|
||||
RecalDatumOptimized values = dataMap.get(key);
|
||||
String v = values.getNumObservations() + "," + values.getNumMismatches() + "," + values.empiricalQualByte();
|
||||
o.println(key + v);
|
||||
}
|
||||
o.println("EOF");
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read, offset, and corresponding reference bases
|
||||
* In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code.
|
||||
* This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up.
|
||||
*/
|
||||
|
||||
public interface Covariate {
|
||||
public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate
|
||||
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
|
||||
//read-specific calculations to be done just once rather than for each offset.
|
||||
}
|
||||
|
||||
interface RequiredCovariate extends Covariate {
|
||||
}
|
||||
|
||||
interface StandardCovariate extends Covariate {
|
||||
}
|
||||
|
||||
interface ExperimentalCovariate extends Covariate {
|
||||
}
|
||||
|
|
@ -0,0 +1,280 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Cycle covariate.
|
||||
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
|
||||
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
|
||||
* For example, for the read: AAACCCCGAAATTTTTACTG
|
||||
* the cycle would be 11111111222333333344
|
||||
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
|
||||
*/
|
||||
|
||||
public class CycleCovariate implements StandardCovariate {
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
if( RAC.DEFAULT_PLATFORM != null ) {
|
||||
if( RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SLX" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ILLUMINA" ) ||
|
||||
RAC.DEFAULT_PLATFORM.contains( "454" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SOLID" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ABI_SOLID" ) ) {
|
||||
// nothing to do
|
||||
} else {
|
||||
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM +") is not a recognized platform. Implemented options are illumina, 454, and solid");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
|
||||
int cycle = 1;
|
||||
|
||||
//-----------------------------
|
||||
// ILLUMINA and SOLID
|
||||
//-----------------------------
|
||||
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX"
|
||||
read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID"
|
||||
cycle = offset + 1;
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
cycle = read.getReadLength() - offset;
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------
|
||||
// 454
|
||||
//-----------------------------
|
||||
|
||||
else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454"
|
||||
final byte[] bases = read.getReadBases();
|
||||
|
||||
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
|
||||
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
|
||||
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
|
||||
int iii = 0;
|
||||
while( iii <= offset )
|
||||
{
|
||||
while( iii <= offset && bases[iii] == (byte)'T' ) { iii++; }
|
||||
while( iii <= offset && bases[iii] == (byte)'A' ) { iii++; }
|
||||
while( iii <= offset && bases[iii] == (byte)'C' ) { iii++; }
|
||||
while( iii <= offset && bases[iii] == (byte)'G' ) { iii++; }
|
||||
if( iii <= offset ) { cycle++; }
|
||||
if( iii <= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii++; }
|
||||
|
||||
}
|
||||
} else { // Negative direction
|
||||
int iii = bases.length-1;
|
||||
while( iii >= offset )
|
||||
{
|
||||
while( iii >= offset && bases[iii] == (byte)'T' ) { iii--; }
|
||||
while( iii >= offset && bases[iii] == (byte)'A' ) { iii--; }
|
||||
while( iii >= offset && bases[iii] == (byte)'C' ) { iii--; }
|
||||
while( iii >= offset && bases[iii] == (byte)'G' ) { iii--; }
|
||||
if( iii >= offset ) { cycle++; }
|
||||
if( iii >= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii--; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------
|
||||
// SOLID (unused), only to be used in conjunction with PrimerRoundCovariate
|
||||
//-----------------------------
|
||||
|
||||
//else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) {
|
||||
// // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
||||
// int pos = offset + 1;
|
||||
// if( read.getReadNegativeStrandFlag() ) {
|
||||
// pos = read.getReadLength() - offset;
|
||||
// }
|
||||
// cycle = pos / 5; // integer division
|
||||
//}
|
||||
|
||||
//-----------------------------
|
||||
// UNRECOGNIZED PLATFORM
|
||||
//-----------------------------
|
||||
|
||||
else { // Platform is unrecognized so revert to the default platform but warn the user first
|
||||
if( defaultPlatform != null) { // The user set a default platform
|
||||
if( !warnedUserBadPlatform ) {
|
||||
Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " +
|
||||
"Defaulting to platform = " + defaultPlatform + "." );
|
||||
}
|
||||
warnedUserBadPlatform = true;
|
||||
|
||||
read.getReadGroup().setPlatform( defaultPlatform );
|
||||
return getValue( read, offset ); // A recursive call
|
||||
} else { // The user did not set a default platform
|
||||
throw new StingException( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " +
|
||||
"No default platform specified. Users must set the default platform using the --default_platform <String> argument." );
|
||||
}
|
||||
}
|
||||
|
||||
// Differentiate between first and second of pair.
|
||||
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
|
||||
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
|
||||
// Therefore the cycle covariate must differentiate between first and second of pair reads.
|
||||
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
|
||||
// the current sequential model would consider the effects independently instead of jointly.
|
||||
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
|
||||
cycle *= -1;
|
||||
}
|
||||
|
||||
return cycle;
|
||||
}
|
||||
*/
|
||||
|
||||
// todo -- this should be put into a common place in the code base
|
||||
private static List<String> PACBIO_NAMES = Arrays.asList("PACBIO");
|
||||
private static List<String> ILLUMINA_NAMES = Arrays.asList("ILLUMINA", "SLX", "SOLEXA");
|
||||
private static List<String> SOLID_NAMES = Arrays.asList("SOLID");
|
||||
private static List<String> LS454_NAMES = Arrays.asList("454");
|
||||
|
||||
private static boolean isPlatform(SAMRecord read, List<String> names) {
|
||||
String pl = read.getReadGroup().getPlatform().toUpperCase();
|
||||
for ( String name : names )
|
||||
if ( pl.contains( name ) )
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
|
||||
//-----------------------------
|
||||
// ILLUMINA and SOLID
|
||||
//-----------------------------
|
||||
|
||||
|
||||
if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES)) {
|
||||
final int init;
|
||||
final int increment;
|
||||
if( !read.getReadNegativeStrandFlag() ) {
|
||||
// Differentiate between first and second of pair.
|
||||
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
|
||||
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
|
||||
// Therefore the cycle covariate must differentiate between first and second of pair reads.
|
||||
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
|
||||
// the current sequential model would consider the effects independently instead of jointly.
|
||||
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
|
||||
//second of pair, positive strand
|
||||
init = -1;
|
||||
increment = -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
//first of pair, positive strand
|
||||
init = 1;
|
||||
increment = 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
|
||||
//second of pair, negative strand
|
||||
init = -read.getReadLength();
|
||||
increment = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
//first of pair, negative strand
|
||||
init = read.getReadLength();
|
||||
increment = -1;
|
||||
}
|
||||
}
|
||||
|
||||
int cycle = init;
|
||||
for(int i = 0; i < read.getReadLength(); i++) {
|
||||
comparable[i] = cycle;
|
||||
cycle += increment;
|
||||
}
|
||||
}
|
||||
else if ( isPlatform(read, LS454_NAMES) ) { // Some bams have "LS454" and others have just "454"
|
||||
|
||||
final int readLength = read.getReadLength();
|
||||
final byte[] bases = read.getReadBases();
|
||||
|
||||
// Differentiate between first and second of pair.
|
||||
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
|
||||
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
|
||||
// Therefore the cycle covariate must differentiate between first and second of pair reads.
|
||||
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
|
||||
// the current sequential model would consider the effects independently instead of jointly.
|
||||
final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag();
|
||||
|
||||
int cycle = multiplyByNegative1 ? -1 : 1;
|
||||
|
||||
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
|
||||
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
|
||||
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
|
||||
int iii = 0;
|
||||
while( iii < readLength )
|
||||
{
|
||||
while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; }
|
||||
while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; }
|
||||
while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; }
|
||||
while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; }
|
||||
if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; }
|
||||
if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; }
|
||||
|
||||
}
|
||||
} else { // Negative direction
|
||||
int iii = readLength-1;
|
||||
while( iii >= 0 )
|
||||
{
|
||||
while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; }
|
||||
while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; }
|
||||
while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; }
|
||||
while( iii >= 0 && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; }
|
||||
if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; }
|
||||
if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; }
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
 * An ordered pair of bases (a dinucleotide) used as a covariate value.
 *
 * Ordering is lexicographic on the raw byte values: first base, then second.
 * toString() is how the Dinuc gets written out to the table recalibration file.
 *
 * User: rpoplin
 * Date: Nov 16, 2009
 */
public class Dinuc implements Comparable<Dinuc> {
    private byte first;   // first base of the pair
    private byte second;  // second base of the pair

    public Dinuc() {
        first = 0;
        second = 0;
    }

    public Dinuc(final byte _first, final byte _second) {
        first = _first;
        second = _second;
    }

    /** Reset both bases in place (avoids allocating a new Dinuc per site). */
    public final void setValues(final byte _first, final byte _second) {
        first = _first;
        second = _second;
    }

    /** Lexicographic comparison: by first base, then by second base. */
    public int compareTo(final Dinuc that) {
        if (this.first != that.first) {
            return this.first > that.first ? 1 : -1;
        }
        if (this.second != that.second) {
            return this.second > that.second ? 1 : -1;
        }
        return 0;
    }

    /**
     * Packs two bytes into a single int key.
     *
     * BUG FIX: '+' binds tighter than '<<', so the original expression
     * "byte1 << 8 + byte2" computed byte1 << (8 + byte2) — a shift by a
     * data-dependent amount — instead of the intended (byte1 << 8) + byte2.
     */
    public static int hashBytes(final byte byte1, final byte byte2) {
        return (byte1 << 8) + byte2;
    }

    /** This method call is how the Dinuc gets written out to the table recalibration file. */
    @Override
    public String toString() {
        final byte[] byteArray = { first, second };
        return new String(byteArray);
    }
}
|
||||
|
|
@ -0,0 +1,141 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
//g import org.broadinstitute.sting.gatk.walkers.recalibration.ExperimentalCovariate;
|
||||
//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Dec 4, 2009
|
||||
*
|
||||
* The Homopolymer Run Covariate. This is the number of consecutive bases in the previous N that match the current base.
|
||||
* For example, if N = 10:
|
||||
* ATTGCCCCGTAAAAAAAAATA
|
||||
* 001001230001234567800
|
||||
*/
|
||||
|
||||
public class HomopolymerCovariate implements ExperimentalCovariate {
|
||||
|
||||
int numBack = 7;
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
numBack = RAC.HOMOPOLYMER_NBACK;
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
|
||||
// This block of code is for if you don't want to only count consecutive bases
|
||||
// ATTGCCCCGTAAAAAAAAATA
|
||||
// 001001231211234567819
|
||||
/*
|
||||
int numAgree = 0; // The number of bases that agree with you in the previous numBack bases of the read
|
||||
int startPos = 0;
|
||||
int stopPos = 0;
|
||||
byte[] bases = read.getReadBases();
|
||||
byte thisBase = bases[offset];
|
||||
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
|
||||
startPos = Math.max(offset - numBack, 0);
|
||||
stopPos = Math.max(offset - 1, 0);
|
||||
} else { // Negative direction
|
||||
startPos = Math.min(offset + 2, bases.length);
|
||||
stopPos = Math.min(offset + numBack + 1, bases.length);
|
||||
}
|
||||
|
||||
for( int iii = startPos; iii < stopPos; iii++ ) {
|
||||
if( bases[iii] == thisBase ) { numAgree++; }
|
||||
}
|
||||
*/
|
||||
|
||||
int numAgree = 0; // The number of consecutive bases that agree with you in the previous numBack bases of the read
|
||||
final byte[] bases = read.getReadBases();
|
||||
int iii = offset;
|
||||
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
|
||||
while( iii <= bases.length-2 && bases[iii] == bases[iii+1] && numAgree < numBack ) {
|
||||
numAgree++;
|
||||
iii++;
|
||||
}
|
||||
} else { // Negative direction
|
||||
while( iii >= 1 && bases[iii] == bases[iii-1] && numAgree < numBack ) {
|
||||
numAgree++;
|
||||
iii--;
|
||||
}
|
||||
}
|
||||
|
||||
return numAgree;
|
||||
}
|
||||
|
||||
private void getContextHomopolymerLength(final byte[] refBytes, Comparable[] hrunArray) {
|
||||
// compute forward hrun length, example:
|
||||
// AGGTGACCCCCCTGAGAG
|
||||
// 001000012345000000
|
||||
int runCount = 0;
|
||||
hrunArray[0] = 0;
|
||||
int[] hforward = new int[hrunArray.length];
|
||||
int[] hreverse = new int[hrunArray.length];
|
||||
|
||||
for (int i = 1; i < refBytes.length; i++) {
|
||||
if (refBytes[i] == refBytes[i-1])
|
||||
hforward[i] = hforward[i-1]+1;
|
||||
else
|
||||
hforward[i] = 0;
|
||||
}
|
||||
|
||||
// do similar thing for reverse length, example:
|
||||
// AGGTGACCCCCCTGAGAG
|
||||
// 021000543210000000
|
||||
// and then accumulate with forward values.
|
||||
// Total:
|
||||
// AGGTGACCCCCCTGAGAG
|
||||
// 022000555555000000
|
||||
for (int i=refBytes.length-1; i > 0; i--) {
|
||||
if (refBytes[i-1] == refBytes[i])
|
||||
hreverse[i-1] += hreverse[i]+1;
|
||||
}
|
||||
|
||||
for (int i = 1; i < refBytes.length; i++)
|
||||
hrunArray[i] = hforward[i]+hreverse[i];
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
|
||||
// getContextHomopolymerLength(read.getReadBases(), comparable);
|
||||
for(int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,113 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
//g import org.broadinstitute.sting.gatk.walkers.recalibration.ExperimentalCovariate;
|
||||
//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 3, 2009
|
||||
*
|
||||
* The Reported Quality Score covariate.
|
||||
*/
|
||||
|
||||
public class IndelCountCovariate implements ExperimentalCovariate {
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read ) {
|
||||
|
||||
Cigar c = read.getCigar();
|
||||
|
||||
int indelCount = 0;
|
||||
|
||||
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
switch( ce.getOperator() ) {
|
||||
case D:
|
||||
indelCount++;
|
||||
break;
|
||||
case I:
|
||||
indelCount++;
|
||||
break;
|
||||
case M:
|
||||
case N:
|
||||
case S:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return indelCount;
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
/* Comparable numIndels = getValue(read);
|
||||
for(int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = numIndels; // BUGBUG: this can be optimized
|
||||
} */
|
||||
int ind = 0;
|
||||
Cigar c = read.getCigar();
|
||||
System.out.println(c.toString());
|
||||
for ( int i = 0 ; i < c.numCigarElements() ; i++ ) {
|
||||
CigarElement ce = c.getCigarElement(i);
|
||||
|
||||
switch( ce.getOperator() ) {
|
||||
case D:
|
||||
case I:
|
||||
for (int k=0; k < ce.getLength(); k++)
|
||||
comparable[ind++] = ce.getLength();
|
||||
break;
|
||||
case M:
|
||||
case N:
|
||||
case S:
|
||||
for (int k=0; k < ce.getLength(); k++)
|
||||
comparable[ind++] = 0;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,637 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import org.broad.tribble.bed.BEDCodec;
|
||||
import org.broad.tribble.dbsnp.DbSNPCodec;
|
||||
import org.broad.tribble.util.variantcontext.VariantContext;
|
||||
import org.broad.tribble.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.commandline.Gather;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.filters.ZeroMappingQualityReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
//import org.broadinstitute.sting.gatk.walkers.recalibration.*;
|
||||
//import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.ArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* This walker is designed to work as the first pass in a two-pass processing step.
|
||||
* It does a by-locus traversal operating only at sites that are not in dbSNP.
|
||||
* We assume that all reference mismatches we see are therefore errors and indicative of poor base quality.
|
||||
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide)
|
||||
* Since there is a large amount of data one can then calculate an empirical probability of error
|
||||
* given the particular covariates seen at this site, where p(error) = num mismatches / num observations
|
||||
* The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score)
|
||||
* The first non-comment line of the output file gives the name of the covariates that were used for this calculation.
|
||||
*
|
||||
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified
|
||||
* Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Nov 3, 2009
|
||||
* @help.summary First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
|
||||
*/
|
||||
|
||||
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
|
||||
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
|
||||
@ReadFilters( {ZeroMappingQualityReadFilter.class} ) // Filter out all reads with zero mapping quality
|
||||
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
||||
@PartitionBy(PartitionType.LOCUS)
|
||||
// todo - merge with CountCovariates, all work is done, just need to port over
|
||||
public class IndelCountCovariatesWalker extends LocusWalker<IndelCountCovariatesWalker.CountedData, IndelCountCovariatesWalker.CountedData> implements TreeReducible<IndelCountCovariatesWalker.CountedData> {
|
||||
|
||||
/////////////////////////////
|
||||
// Constants
|
||||
/////////////////////////////
|
||||
private static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; //used to label GATKSAMRecords that should be skipped.
|
||||
private static final String SEEN_ATTRIBUTE = "SEEN"; //used to label GATKSAMRecords as processed.
|
||||
private static final String COVARS_ATTRIBUTE = "COVARS"; //used to store covariates array as a temporary attribute inside GATKSAMRecord.
|
||||
|
||||
/////////////////////////////
|
||||
// Shared Arguments
|
||||
/////////////////////////////
|
||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
@Output
|
||||
PrintStream out;
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the outputted covariates table recalibration file")
|
||||
@Gather(CountCovariatesGatherer.class)
|
||||
public PrintStream RECAL_FILE;
|
||||
|
||||
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
|
||||
private boolean LIST_ONLY = false;
|
||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
|
||||
private String[] COVARIATES = null;
|
||||
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
|
||||
private boolean USE_STANDARD_COVARIATES = false;
|
||||
|
||||
@Argument(fullName="count_indels", shortName="indels", doc="Count covariates at indel sites", required=false)
|
||||
private boolean COUNT_INDELS = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
|
||||
private boolean DONT_SORT_OUTPUT = false;
|
||||
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
private boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // A list to hold the covariate objects that were requested
|
||||
private static final double DBSNP_VS_NOVEL_MISMATCH_RATE = 2.0; // rate at which dbSNP sites (on an individual level) mismatch relative to novel sites (determined by looking at NA12878)
|
||||
private static int DBSNP_VALIDATION_CHECK_FREQUENCY = 1000000; // how often to validate dbsnp mismatch rate (in terms of loci seen)
|
||||
|
||||
public static class CountedData {
|
||||
private long countedSites = 0; // Number of loci used in the calculations, used for reporting in the output file
|
||||
private long countedBases = 0; // Number of bases used in the calculations, used for reporting in the output file
|
||||
private long skippedSites = 0; // Number of loci skipped because it was a dbSNP site, used for reporting in the output file
|
||||
private long solidInsertedReferenceBases = 0; // Number of bases where we believe SOLID has inserted the reference because the color space is inconsistent with the read base
|
||||
private long otherColorSpaceInconsistency = 0; // Number of bases where the color space is inconsistent with the read but the reference wasn't inserted.
|
||||
|
||||
private long dbSNPCountsMM = 0, dbSNPCountsBases = 0; // mismatch/base counts for dbSNP loci
|
||||
private long novelCountsMM = 0, novelCountsBases = 0; // mismatch/base counts for non-dbSNP loci
|
||||
private int lociSinceLastDbsnpCheck = 0; // loci since last dbsnp validation
|
||||
|
||||
/**
|
||||
* Adds the values of other to this, returning this
|
||||
* @param other
|
||||
* @return this object
|
||||
*/
|
||||
public CountedData add(CountedData other) {
|
||||
countedSites += other.countedSites;
|
||||
countedBases += other.countedBases;
|
||||
skippedSites += other.skippedSites;
|
||||
solidInsertedReferenceBases += other.solidInsertedReferenceBases;
|
||||
otherColorSpaceInconsistency += other.otherColorSpaceInconsistency;
|
||||
dbSNPCountsMM += other.dbSNPCountsMM;
|
||||
dbSNPCountsBases += other.dbSNPCountsBases;
|
||||
novelCountsMM += other.novelCountsMM;
|
||||
novelCountsBases += other.novelCountsBases;
|
||||
lociSinceLastDbsnpCheck += other.lociSinceLastDbsnpCheck;
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
||||
// enable deletions in the pileup
|
||||
public boolean includeReadsWithDeletionAtLoci() { return COUNT_INDELS; }
|
||||
|
||||
// enable extended events for indels
|
||||
public boolean generateExtendedEvents() { return COUNT_INDELS; }
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Parse the -cov arguments and create a list of covariates to be used here
|
||||
* Based on the covariates' estimates for initial capacity allocate the data hashmap
|
||||
*/
|
||||
public void initialize() {
|
||||
|
||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>( Covariate.class ).getPlugins();
|
||||
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>( RequiredCovariate.class ).getPlugins();
|
||||
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>( StandardCovariate.class ).getPlugins();
|
||||
|
||||
// Print and exit if that's what was requested
|
||||
if ( LIST_ONLY ) {
|
||||
out.println( "Available covariates:" );
|
||||
for( Class<?> covClass : covariateClasses ) {
|
||||
out.println( covClass.getSimpleName() );
|
||||
}
|
||||
out.println();
|
||||
|
||||
System.exit( 0 ); // Early exit here because user requested it
|
||||
}
|
||||
|
||||
// Warn the user if no dbSNP file or other variant mask was specified
|
||||
boolean foundDBSNP = false;
|
||||
for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
|
||||
if( rod != null ) {
|
||||
if( rod.getType().equals(DbSNPCodec.class) ||
|
||||
rod.getType().equals(VCFCodec.class) ||
|
||||
rod.getType().equals(BEDCodec.class) ) {
|
||||
foundDBSNP = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( !foundDBSNP && !RUN_WITHOUT_DBSNP ) {
|
||||
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a dbSNP ROD or a VCF file containing known sites of genetic variation.");
|
||||
}
|
||||
|
||||
// Initialize the requested covariates by parsing the -cov argument
|
||||
// First add the required covariates
|
||||
if( requiredClasses.size() == 2) { // readGroup and reported quality score
|
||||
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
} else {
|
||||
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
|
||||
}
|
||||
// Next add the standard covariates if -standard was specified by the user
|
||||
if( USE_STANDARD_COVARIATES ) {
|
||||
// We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
|
||||
// A list of Classes can't be sorted, but a list of Class names can be
|
||||
final List<String> standardClassNames = new ArrayList<String>();
|
||||
for( Class<?> covClass : standardClasses ) {
|
||||
standardClassNames.add( covClass.getName() );
|
||||
}
|
||||
Collections.sort(standardClassNames); // Sort the list of class names
|
||||
for( String className : standardClassNames ) {
|
||||
for( Class<?> covClass : standardClasses ) { // Find the class that matches this class name
|
||||
if( covClass.getName().equals( className ) ) {
|
||||
try {
|
||||
final Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Finally parse the -cov arguments that were provided, skipping over the ones already specified
|
||||
if( COVARIATES != null ) {
|
||||
for( String requestedCovariateString : COVARIATES ) {
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : covariateClasses ) {
|
||||
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
|
||||
foundClass = true;
|
||||
if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) {
|
||||
try {
|
||||
// Now that we've found a matching class, try to instantiate it
|
||||
final Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
|
||||
}
|
||||
|
||||
// try {
|
||||
// stream = new PrintStream( RAC.RECAL_FILE );
|
||||
// } catch ( FileNotFoundException e ) {
|
||||
// throw new RuntimeException( "Couldn't open output file: ", e );
|
||||
// }
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* For each read at this locus get the various covariate values and increment that location in the map based on
|
||||
* whether or not the base matches the reference at this particular location
|
||||
* @param tracker The reference metadata tracker
|
||||
* @param ref The reference context
|
||||
* @param context The alignment context
|
||||
* @return Returns 1, but this value isn't used in the reduce step
|
||||
*/
|
||||
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
|
||||
// Pull out data for this locus for all the input RODs and check if this is a known variant site in any of them
|
||||
boolean isKnownVariant = false;
|
||||
for( final VariantContext vc : tracker.getAllVariantContexts(ref, null, context.getLocation(), false, false) ) {
|
||||
if( vc != null ) {
|
||||
isKnownVariant = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Only use data from non-dbsnp sites
|
||||
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
|
||||
CountedData counter = new CountedData();
|
||||
if( !isKnownVariant ) {
|
||||
|
||||
if (COUNT_INDELS && context.hasExtendedEventPileup())
|
||||
{
|
||||
|
||||
for ( ExtendedEventPileupElement p : context.getExtendedEventPileup().toExtendedIterable() ) {
|
||||
|
||||
GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead();
|
||||
parsePileupElement(gatkRead, p.getOffset(), ref, counter, RAC, p.getType(), true);
|
||||
|
||||
}
|
||||
}
|
||||
else {
|
||||
// For each read at this locus
|
||||
for( PileupElement p : context.getBasePileup() ) {
|
||||
GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead();
|
||||
parsePileupElement(gatkRead, p.getOffset(), ref, counter, RAC, ExtendedEventPileupElement.Type.NOEVENT, false);
|
||||
}
|
||||
}
|
||||
counter.countedSites++;
|
||||
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||
counter.skippedSites++;
|
||||
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
}
|
||||
|
||||
return counter;
|
||||
}
|
||||
|
||||
private void parsePileupElement(GATKSAMRecord gatkRead, int offset, ReferenceContext ref, CountedData counter,
|
||||
RecalibrationArgumentCollection RAC, ExtendedEventPileupElement.Type type, boolean inExtendedPileup) {
|
||||
|
||||
|
||||
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
|
||||
return;
|
||||
}
|
||||
|
||||
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) )
|
||||
{
|
||||
gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true );
|
||||
RecalDataManager.parseSAMRecord( gatkRead, RAC );
|
||||
|
||||
// Skip over reads with no calls in the color space if the user requested it
|
||||
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) {
|
||||
gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true);
|
||||
return;
|
||||
}
|
||||
|
||||
RecalDataManager.parseColorSpace( gatkRead );
|
||||
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE,
|
||||
RecalDataManager.computeCovariates( gatkRead, requestedCovariates ));
|
||||
}
|
||||
|
||||
// Skip this position if base quality is zero
|
||||
boolean hasIndelAtThisPosition = false;
|
||||
if (inExtendedPileup) {
|
||||
// only count indel events in extended pileups
|
||||
if (type.equals(ExtendedEventPileupElement.Type.NOEVENT))
|
||||
return;
|
||||
|
||||
else
|
||||
hasIndelAtThisPosition = true;
|
||||
} else {
|
||||
// in regular pileups we still get del bases: ignore now
|
||||
if (offset < 0)
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
if (COUNT_INDELS) {
|
||||
if (offset < 0) {
|
||||
// recompute offset in case of a deletion
|
||||
offset = ref.getLocus().getStart() - gatkRead.getUnclippedStart();
|
||||
// further edge case: read starting w/insertion: ignore for now
|
||||
if (offset < 0) return;
|
||||
}
|
||||
updateDataFromRead( counter, gatkRead, offset, ref.getBase(), hasIndelAtThisPosition );
|
||||
|
||||
}
|
||||
else {
|
||||
// Skip this position if base quality is zero
|
||||
if( gatkRead.getBaseQualities()[offset] > 0 ) {
|
||||
|
||||
byte[] bases = gatkRead.getReadBases();
|
||||
byte refBase = ref.getBase();
|
||||
|
||||
// Skip if this base is an 'N' or etc.
|
||||
if( BaseUtils.isRegularBase( bases[offset] ) ) {
|
||||
|
||||
// SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it
|
||||
if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||
!RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) {
|
||||
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( counter, gatkRead, offset, refBase, hasIndelAtThisPosition );
|
||||
|
||||
} else { // calculate SOLID reference insertion rate
|
||||
if( refBase == bases[offset] ) {
|
||||
counter.solidInsertedReferenceBases++;
|
||||
} else {
|
||||
counter.otherColorSpaceInconsistency++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Update the mismatch / total_base counts for a given class of loci.
|
||||
*
|
||||
* @param counter The CountedData to be updated
|
||||
* @param context The AlignmentContext which holds the reads covered by this locus
|
||||
* @param refBase The reference base
|
||||
*/
|
||||
private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) {
|
||||
|
||||
if (context.hasExtendedEventPileup()){
|
||||
for( PileupElement p : context.getExtendedEventPileup() ) {
|
||||
counter.novelCountsBases++;
|
||||
}
|
||||
return;
|
||||
|
||||
}
|
||||
|
||||
for( PileupElement p : context.getBasePileup() ) {
|
||||
final byte readBase = p.getBase();
|
||||
final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase);
|
||||
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||
|
||||
if( readBaseIndex != -1 && refBaseIndex != -1 ) {
|
||||
if( readBaseIndex != refBaseIndex ) {
|
||||
counter.novelCountsMM++;
|
||||
}
|
||||
counter.novelCountsBases++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Major workhorse routine for this walker.
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
* @param counter Data structure which holds the counted bases
|
||||
* @param gatkRead The SAMRecord holding all the data for this read
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase,
|
||||
final boolean hasIndelAtThisPosition) {
|
||||
final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||
|
||||
if (offset < 0) {
|
||||
int k=0;
|
||||
}
|
||||
final Object[] key = covars[offset];
|
||||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
final NestedHashMap data = dataManager.data; //optimization - create local reference
|
||||
RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
// initialized with zeros, will be incremented at end of method
|
||||
datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key );
|
||||
}
|
||||
|
||||
// Need the bases to determine whether or not we have a mismatch
|
||||
final byte base = gatkRead.getReadBases()[offset];
|
||||
final long curMismatches = datum.getNumMismatches();
|
||||
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
if (COUNT_INDELS)
|
||||
datum.incrementBaseCounts(hasIndelAtThisPosition);
|
||||
else
|
||||
datum.incrementBaseCounts( base, refBase );
|
||||
|
||||
counter.countedBases++;
|
||||
counter.novelCountsBases++;
|
||||
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
|
||||
* @return returns A PrintStream created from the -recalFile filename argument specified to the walker
|
||||
*/
|
||||
public CountedData reduceInit() {
|
||||
return new CountedData();
|
||||
}
|
||||
|
||||
/**
|
||||
* The Reduce method doesn't do anything for this walker.
|
||||
* @param mapped Result of the map. This value is immediately ignored.
|
||||
* @param sum The summing CountedData used to output the CSV data
|
||||
* @return returns The sum used to output the CSV data
|
||||
*/
|
||||
public CountedData reduce( CountedData mapped, CountedData sum ) {
|
||||
// Do a dbSNP sanity check every so often
|
||||
return validatingDbsnpMismatchRate(sum.add(mapped));
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate the dbSNP reference mismatch rates.
|
||||
*/
|
||||
private CountedData validatingDbsnpMismatchRate(CountedData counter) {
|
||||
if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
||||
counter.lociSinceLastDbsnpCheck = 0;
|
||||
|
||||
if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) {
|
||||
final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases;
|
||||
final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases;
|
||||
|
||||
if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
|
||||
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " +
|
||||
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
|
||||
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return counter;
|
||||
}
|
||||
|
||||
public CountedData treeReduce( CountedData sum1, CountedData sum2 ) {
|
||||
return validatingDbsnpMismatchRate(sum1.add(sum2));
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the full data hashmap to disk in CSV format
|
||||
* @param sum The CountedData to write out to RECAL_FILE
|
||||
*/
|
||||
public void onTraversalDone( CountedData sum ) {
|
||||
logger.info( "Writing raw recalibration data..." );
|
||||
outputToCSV( sum, RECAL_FILE );
|
||||
logger.info( "...done!" );
|
||||
}
|
||||
|
||||
/**
|
||||
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
|
||||
* @param recalTableStream The PrintStream to write out to
|
||||
*/
|
||||
private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) {
|
||||
recalTableStream.printf("# Counted Sites %d%n", sum.countedSites);
|
||||
recalTableStream.printf("# Counted Bases %d%n", sum.countedBases);
|
||||
recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites);
|
||||
|
||||
if( sum.solidInsertedReferenceBases != 0 ) {
|
||||
recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases);
|
||||
recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency);
|
||||
}
|
||||
|
||||
// Output header saying which covariates were used and in what order
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," );
|
||||
}
|
||||
|
||||
if (COUNT_INDELS)
|
||||
recalTableStream.println("nObservations,nTrueIndels,Qempirical");
|
||||
else
|
||||
recalTableStream.println("nObservations,nMismatches,Qempirical");
|
||||
|
||||
if( DONT_SORT_OUTPUT ) {
|
||||
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||
} else {
|
||||
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||
}
|
||||
|
||||
// print out an EOF marker
|
||||
recalTableStream.println(TableRecalibrationWalker.EOF_MARKER);
|
||||
}
|
||||
|
||||
private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
final ArrayList<Comparable> keyList = new ArrayList<Comparable>();
|
||||
for( Object comp : data.keySet() ) {
|
||||
keyList.add((Comparable) comp);
|
||||
}
|
||||
|
||||
Collections.sort(keyList);
|
||||
|
||||
for( Comparable comp : keyList ) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
for( Object comp : data.keySet() ) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappings( recalTableStream, curPos + 1, key, (Map) val );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,39 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
//g import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
|
||||
//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
|
||||
/**
 * Covariate reporting each base's position within the read in machine-cycle order:
 * the raw offset for forward-strand reads, and the reversed offset
 * (readLength - offset - 1) for negative-strand reads. Part of the experimental
 * indel covariate counting machinery.
 *
 * @author delangel
 * @since Jan 17, 2011
 */
|
||||
public class IndelPositionCovariate implements Covariate {
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
int cycle = offset;
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
cycle = read.getReadLength() - (offset + 1);
|
||||
}
|
||||
return cycle;
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
for(int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,528 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.helpers.DbSNPHelper;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.
|
||||
|
||||
* For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
|
||||
* Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read.
|
||||
* This walker then outputs a new bam file with these updated (recalibrated) reads.
|
||||
*
|
||||
* Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker.
|
||||
* Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since Nov 3, 2009
|
||||
* @help.summary Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
|
||||
*/
|
||||
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
|
||||
@WalkerName("TableRecalibration")
|
||||
@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta
|
||||
public class IndelTableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {

    // Id prefix used for the PG record written into the recalibrated bam header.
    public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";

    /////////////////////////////
    // Shared Arguments
    /////////////////////////////
    @ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();

    @Input(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file")
    public File RECAL_FILE = new File("output.recal_data.csv");

    /////////////////////////////
    // Command Line Arguments
    /////////////////////////////
    @Argument(fullName="output_bam", shortName="outputBam", doc="Please use --out instead", required=false)
    @Deprecated
    protected String outbam;

    @Output(doc="The output BAM file", required=true)
    private StingSAMFileWriter OUTPUT_BAM = null;
    @Argument(fullName="preserve_qscores_less_than", shortName="pQ",
            doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
    private int PRESERVE_QSCORES_LESS_THAN = 5;
    @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1")
    private int SMOOTHING = 1;
    @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default=50")
    private int MAX_QUALITY_SCORE = 50;
    @Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read")
    private boolean DO_NOT_WRITE_OQ = false;

    /////////////////////////////
    // Debugging-only Arguments
    /////////////////////////////
    @Hidden
    @Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
    private boolean NO_PG_TAG = false;
    @Hidden
    @Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.")
    private boolean REQUIRE_EOF = false;
    @Hidden
    @Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations")
    private boolean skipUQUpdate = false;

    /////////////////////////////
    // Private Member Variables
    /////////////////////////////
    private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
    private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
    private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); // Comment lines in the recal csv start with '#'
    private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); // Header format emitted by the legacy recalibrator; skipped on read
    private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); // The csv line naming the covariate columns
    public static final String EOF_MARKER = "EOF"; // Sentinel line that must terminate a well-formed recal csv
    private long numReadsWithMalformedColorSpace = 0; // Count of SOLiD reads with color-space no-calls, reported in onTraversalDone

    /////////////////////////////
    // Optimization
    /////////////////////////////
    private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
    /**
     * Read in the recalibration table input file.
     * Parse the list of covariate classes used during CovariateCounterWalker.
     * Parse the CSV data and populate the hashmap.
     *
     * Also clones the input SAM header and (unless --no_pg_tag) appends a PG record
     * recording the approximate command line and covariate list, then writes the header
     * to the output bam.
     */
    public void initialize() {

        // A forced read group / platform overrides the defaults applied to reads missing that metadata.
        if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
        if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }

        // Get a list of all available covariates
        final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();

        int lineNumber = 0;
        boolean foundAllCovariates = false;

        // Warn the user if a dbSNP file was specified since it isn't being used here
        boolean foundDBSNP = false;
        for( ReferenceOrderedDataSource rod : this.getToolkit().getRodDataSources() ) {
            if( rod.getName().equalsIgnoreCase(DbSNPHelper.STANDARD_DBSNP_TRACK_NAME) ) {
                foundDBSNP = true;
            }
        }
        if( foundDBSNP ) {
            Utils.warnUser("A dbSNP rod file was specified but TableRecalibrationWalker doesn't make use of it.");
        }

        // Read in the data from the csv file and populate the data map and covariates list
        logger.info( "Reading in the data from input csv file..." );

        boolean sawEOF = false;
        try {
            for ( String line : new XReadLines(RECAL_FILE) ) {
                lineNumber++;
                if ( EOF_MARKER.equals(line) ) {
                    sawEOF = true;
                } else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
                    ; // Skip over the comment lines, (which start with '#')
                }
                // Read in the covariates that were used from the input file
                else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
                    if( foundAllCovariates ) {
                        // Covariate header after data lines means the file is corrupt
                        throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
                    } else { // Found the covariate list in input file, loop through all of them and instantiate them
                        String[] vals = line.split(",");
                        for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
                            boolean foundClass = false;
                            // Match each header name against the discovered Covariate plugins by simple class name
                            for( Class<?> covClass : classes ) {
                                if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
                                    foundClass = true;
                                    try {
                                        Covariate covariate = (Covariate)covClass.newInstance();
                                        requestedCovariates.add( covariate );
                                    } catch (Exception e) {
                                        throw new DynamicClassResolutionException(covClass, e);
                                    }
                                }
                            }

                            if( !foundClass ) {
                                throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
                            }
                        }
                    }

                } else { // Found a line of data
                    if( !foundAllCovariates ) {
                        // One-time setup triggered by the first data line
                        foundAllCovariates = true;

                        // At this point all the covariates should have been found and initialized
                        if( requestedCovariates.size() < 2 ) {
                            throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
                        }

                        final boolean createCollapsedTables = true;

                        // Initialize any covariate member variables using the shared argument collection
                        for( Covariate cov : requestedCovariates ) {
                            cov.initialize( RAC );
                        }
                        // Initialize the data hashMaps
                        dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );

                    }
                    addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
                }
            }

        } catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
        } catch ( NumberFormatException e ) {
            throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
        }
        logger.info( "...done!" );

        // A missing EOF marker is fatal only when --fail_with_no_eof_marker was given; otherwise just warn
        if ( !sawEOF ) {
            final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
            if ( REQUIRE_EOF )
                throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
            logger.warn(errorMessage);
        }

        logger.info( "The covariates being used here: " );
        for( Covariate cov : requestedCovariates ) {
            logger.info( "\t" + cov.getClass().getSimpleName() );
        }

        if( dataManager == null ) {
            throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
        }

        // Create the tables of empirical quality scores that will be used in the sequential calculation
        logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
        dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE );
        logger.info( "...done!" );

        // Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used
        final SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
        if( !NO_PG_TAG ) {
            final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
            final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
            try {
                final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
                programRecord.setProgramVersion(version);
            } catch (MissingResourceException e) {} // version string is best-effort; absence is deliberately ignored

            StringBuffer sb = new StringBuffer();
            sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
            sb.append(" Covariates=[");
            for( Covariate cov : requestedCovariates ) {
                sb.append(cov.getClass().getSimpleName());
                sb.append(", ");
            }
            // Turn the trailing ", " into "] " to close the covariate list
            sb.setCharAt(sb.length()-2, ']');
            sb.setCharAt(sb.length()-1, ' ');
            programRecord.setCommandLine(sb.toString());

            // Drop any pre-existing PG record with our name so reruns don't accumulate duplicates
            List<SAMProgramRecord> oldRecords = header.getProgramRecords();
            List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
            for ( SAMProgramRecord record : oldRecords ) {
                if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) )
                    newRecords.add(record);
            }
            newRecords.add(programRecord);
            header.setProgramRecords(newRecords);

            // Write out the new header
            OUTPUT_BAM.writeHeader( header );
        }
    }
|
||||
|
||||
    /**
     * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
     * @param file The recalibration csv file being parsed (used only for error reporting)
     * @param line A line of CSV data read from the recalibration table data file
     */
    private void addCSVData(final File file, final String line) {
        final String[] vals = line.split(",");

        // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
        // NOTE(review): this also fires when there are too FEW fields, despite the message's wording
        if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
            throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
                    " --Perhaps the read group string contains a comma and isn't being parsed correctly.");
        }

        // Let each covariate parse its own column into the key used by the data tables
        final Object[] key = new Object[requestedCovariates.size()];
        Covariate cov;
        int iii;
        for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
            cov = requestedCovariates.get( iii );
            key[iii] = cov.getValue( vals[iii] );
        }

        // Create a new datum using the number of observations, number of mismatches, and reported quality score
        // (vals[1] is the QualityScore column, which serves as the reported quality)
        final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
        // Add that datum to all the collapsed tables which will be used in the sequential calculation
        dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
    }
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
    /**
     * For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
     * @param refBases References bases over the length of the read
     * @param read The read to be recalibrated
     * @param metaDataTracker Per-read reference-ordered data (unused here)
     * @return The read with quality scores replaced
     */
    public SAMRecord map( ReferenceContext refBases, SAMRecord read, ReadMetaDataTracker metaDataTracker ) {

        if( read.getReadLength() == 0 ) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
            return read;
        }

        // Fill in default read group / platform if the read is missing them
        RecalDataManager.parseSAMRecord( read, RAC );

        byte[] originalQuals = read.getBaseQualities();
        final byte[] recalQuals = originalQuals.clone();

        // SOLiD-specific preprocessing: deal with color-space no-calls and reference bias before recalibrating
        final String platform = read.getReadGroup().getPlatform();
        if( platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING) ) {
            if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) ) {
                final boolean badColor = RecalDataManager.checkNoCallColorSpace( read );
                if( badColor ) {
                    numReadsWithMalformedColorSpace++;
                    if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) {
                        return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
                    } else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
                        read.setReadFailsVendorQualityCheckFlag(true);
                        return read;
                    }
                }
            }
            originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases() );
        }

        //compute all covariate values for this read
        final Comparable[][] covariateValues_offset_x_covar =
                RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);

        // For each base in the read
        for( int offset = 0; offset < read.getReadLength(); offset++ ) {

            final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];

            // Memoize the sequential calculation per full covariate key; many bases share a key
            Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
            if(qualityScore == null)
            {
                qualityScore = performSequentialQualityCalculation( fullCovariateKey );
                qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
            }

            recalQuals[offset] = qualityScore;
        }

        preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low

        read.setBaseQualities( recalQuals ); // Overwrite old qualities with new recalibrated qualities
        if ( !DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
            read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
        }

        // Recompute the UQ tag since the base qualities just changed
        if (! skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
            read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
        }

        // Reference bases may have been written into the read in SET_Q_ZERO_BASE_N mode, so refresh NM too
        if (RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N && refBases != null && read.getAttribute(SAMTag.NM.name()) != null) {
            read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
        }

        return read;
    }
|
||||
|
||||
    /**
     * Implements a serial recalibration of the reads using the combinational table.
     * First, we perform a positional recalibration, and then a subsequent dinuc correction.
     *
     * Given the full recalibration table, we perform the following preprocessing steps:
     *
     *   - calculate the global quality score shift across all data [DeltaQ]
     *   - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
     *     -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
     *   - The final shift equation is:
     *
     *     Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
     *
     * Key layout convention: key[0] is the read group, key[1] is the reported quality score,
     * and key[2..] are the optional covariates, matching the collapsed-table indices in dataManager.
     *
     * @param key The list of Comparables that were calculated from the covariates
     * @return A recalibrated quality score as a byte, capped at MAX_QUALITY_SCORE
     */
    private byte performSequentialQualityCalculation( final Object... key ) {

        final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
        final Object[] readGroupCollapsedKey = new Object[1];
        final Object[] qualityScoreCollapsedKey = new Object[2];
        final Object[] covariateCollapsedKey = new Object[3];

        // The global quality shift (over the read group only)
        readGroupCollapsedKey[0] = key[0];
        final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
        double globalDeltaQ = 0.0;
        if( globalRecalDatum != null ) { // missing table entries contribute a zero shift
            final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
            final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
            globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
        }

        // The shift in quality between reported and empirical
        qualityScoreCollapsedKey[0] = key[0];
        qualityScoreCollapsedKey[1] = key[1];
        final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
        double deltaQReported = 0.0;
        if( qReportedRecalDatum != null ) {
            final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
            deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
        }

        // The shift in quality due to each covariate by itself in turn
        double deltaQCovariates = 0.0;
        double deltaQCovariateEmpirical;
        covariateCollapsedKey[0] = key[0];
        covariateCollapsedKey[1] = key[1];
        for( int iii = 2; iii < key.length; iii++ ) {
            covariateCollapsedKey[2] = key[iii]; // The given covariate
            final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
            if( covariateRecalDatum != null ) {
                deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
                // Each covariate contributes only its shift beyond what the coarser tables already explain
                deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
            }
        }

        final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
        return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );

        // Verbose printouts used to validate with old recalibrator
        //if(key.contains(null)) {
        //    System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
        //                 qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
        //}
        //else {
        //    System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
        //                 key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
        //}

        //return newQualityByte;
    }
|
||||
|
||||
/**
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Start the reduce with a handle to the output bam file
|
||||
* @return A FileWriter pointing to a new bam file
|
||||
*/
|
||||
public SAMFileWriter reduceInit() {
|
||||
return OUTPUT_BAM;
|
||||
}
|
||||
|
||||
/**
|
||||
* Output each read to disk
|
||||
* @param read The read to output
|
||||
* @param output The FileWriter to write the read to
|
||||
* @return The FileWriter
|
||||
*/
|
||||
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
|
||||
if( output != null ) {
|
||||
output.addAlignment(read);
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
/**
|
||||
* Do nothing
|
||||
* @param output The SAMFileWriter that outputs the bam file
|
||||
*/
|
||||
public void onTraversalDone(SAMFileWriter output) {
|
||||
if( numReadsWithMalformedColorSpace != 0 ) {
|
||||
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) {
|
||||
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
|
||||
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
|
||||
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads were completely removed from the output bam file.");
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,65 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
//g import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 3, 2009
|
||||
*
|
||||
* The Reported Quality Score covariate.
|
||||
*/
|
||||
|
||||
public class QualityScoreCovariate implements RequiredCovariate {
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
/*
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
return (int)(read.getBaseQualities()[offset]);
|
||||
}
|
||||
*/
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
byte[] baseQualities = read.getBaseQualities();
|
||||
for(int i = 0; i < read.getReadLength(); i++) {
|
||||
// comparable[i] = (int) baseQualities[i];
|
||||
comparable[i] = (int) 45;
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
//import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Read Group covariate.
|
||||
*/
|
||||
|
||||
public class ReadGroupCovariate implements RequiredCovariate {
|
||||
|
||||
public static final String defaultReadGroup = "DefaultReadGroup";
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
}
|
||||
|
||||
/*
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
return read.getReadGroup().getReadGroupId();
|
||||
}
|
||||
*/
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
final String readGroupId = read.getReadGroup().getReadGroupId();
|
||||
for(int i = 0; i < read.getReadLength(); i++) {
|
||||
comparable[i] = readGroupId;
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return str;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,649 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMUtils;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 6, 2009
|
||||
*
|
||||
* This helper class holds the data HashMap as well as submaps that represent the marginal distributions collapsed over all needed dimensions.
|
||||
* It also has static methods that are used to perform the various solid recalibration modes that attempt to correct the reference bias.
|
||||
* This class holds the parsing methods that are shared between CountCovariates and TableRecalibration.
|
||||
*/
|
||||
|
||||
public class RecalDataManager {
|
||||
|
||||
public final NestedHashMap data; // The full dataset
private final NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed
private final NestedHashMap dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
private final ArrayList<NestedHashMap> dataCollapsedByCovariate; // Tables where everything except read group, quality score, and given covariate has been collapsed

public final static String ORIGINAL_QUAL_ATTRIBUTE_TAG = "OQ"; // The tag that holds the original quality scores
public final static String COLOR_SPACE_QUAL_ATTRIBUTE_TAG = "CQ"; // The tag that holds the color space quality scores for SOLID bams
public final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams
public final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color
// One-shot flags so the missing-read-group / missing-platform warnings are only printed once per run
private static boolean warnUserNullReadGroup = false;
private static boolean warnUserNullPlatform = false;
|
||||
|
||||
// How calcColorSpace corrects for the reference bias SOLiD introduces when the machine inserts the reference base
public enum SOLID_RECAL_MODE {
DO_NOTHING,        // leave color-inconsistent bases untouched
SET_Q_ZERO,        // set color-inconsistent bases (and the base before each) to quality zero
SET_Q_ZERO_BASE_N, // as SET_Q_ZERO, but also overwrite the base with 'N' (e.g. for visualization)
REMOVE_REF_BIAS    // probabilistically replace ref-matching bases using the color space qualities
}
|
||||
|
||||
// What to do when a SOLiD read contains a color no-call (see checkNoCallColorSpace)
public enum SOLID_NOCALL_STRATEGY {
THROW_EXCEPTION,           // fail hard on a no-call color
LEAVE_READ_UNRECALIBRATED, // keep the read but skip recalibrating it
PURGE_READ                 // drop the read entirely
}
|
||||
|
||||
/**
 * Construct a manager that only tracks the full data table (the CountCovariates case);
 * none of the collapsed marginal tables are allocated.
 */
public RecalDataManager() {
    this.data = new NestedHashMap();
    this.dataCollapsedReadGroup = null;
    this.dataCollapsedQualityScore = null;
    this.dataCollapsedByCovariate = null;
}
|
||||
|
||||
public RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) {
|
||||
if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
|
||||
data = null;
|
||||
dataCollapsedReadGroup = new NestedHashMap();
|
||||
dataCollapsedQualityScore = new NestedHashMap();
|
||||
dataCollapsedByCovariate = new ArrayList<NestedHashMap>();
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariate.add( new NestedHashMap() );
|
||||
}
|
||||
} else {
|
||||
data = new NestedHashMap();
|
||||
dataCollapsedReadGroup = null;
|
||||
dataCollapsedQualityScore = null;
|
||||
dataCollapsedByCovariate = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
|
||||
*/
|
||||
public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN ) {
|
||||
|
||||
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
||||
//data.put(key, thisDatum); // add the mapping to the main table
|
||||
|
||||
final int qualityScore = Integer.parseInt( key[1].toString() );
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
RecalDatum collapsedDatum;
|
||||
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
if( qualityScore >= PRESERVE_QSCORES_LESS_THAN ) {
|
||||
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
|
||||
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get( readGroupCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
}
|
||||
|
||||
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
||||
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
qualityScoreCollapsedKey[1] = key[1]; // and quality score
|
||||
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get( qualityScoreCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedQualityScore.put( new RecalDatum(fullDatum), qualityScoreCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.increment( fullDatum );
|
||||
}
|
||||
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) {
|
||||
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
covariateCollapsedKey[1] = key[1]; // and quality score ...
|
||||
final Object theCovariateElement = key[iii + 2]; // and the given covariate
|
||||
if( theCovariateElement != null ) {
|
||||
covariateCollapsedKey[2] = theCovariateElement;
|
||||
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.increment( fullDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
|
||||
* @param maxQual At which value to cap the quality scores
|
||||
*/
|
||||
public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) {
|
||||
|
||||
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual);
|
||||
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual);
|
||||
for( NestedHashMap map : dataCollapsedByCovariate ) {
|
||||
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
|
||||
checkForSingletons(map.data);
|
||||
}
|
||||
}
|
||||
|
||||
private void recursivelyGenerateEmpiricalQualities( final Map data, final int smoothing, final int maxQual ) {
|
||||
|
||||
for( Object comp : data.keySet() ) {
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
|
||||
((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing, maxQual);
|
||||
} else { // Another layer in the nested hash map
|
||||
recursivelyGenerateEmpiricalQualities( (Map) val, smoothing, maxQual);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkForSingletons( final Map data ) {
|
||||
// todo -- this looks like it's better just as a data.valueSet() call?
|
||||
for( Object comp : data.keySet() ) {
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
|
||||
if( data.keySet().size() == 1) {
|
||||
data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ...
|
||||
// in a previous step of the sequential calculation model
|
||||
}
|
||||
} else { // Another layer in the nested hash map
|
||||
checkForSingletons( (Map) val );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the appropriate collapsed table out of the set of all the tables held by this Object
|
||||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
*/
|
||||
public final NestedHashMap getCollapsedTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
} else if( covariate == 1 ) {
|
||||
return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariate.get( covariate - 2 ); // Table where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
|
||||
* @param read The read to adjust
|
||||
* @param RAC The list of shared command line arguments
|
||||
*/
|
||||
public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) {
|
||||
|
||||
SAMReadGroupRecord readGroup = read.getReadGroup();
|
||||
|
||||
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
|
||||
if( readGroup == null ) {
|
||||
if( RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) {
|
||||
if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) {
|
||||
Utils.warnUser("The input .bam file contains reads with no read group. " +
|
||||
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName() );
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
// There is no readGroup so defaulting to these values
|
||||
readGroup = new SAMReadGroupRecord( RAC.DEFAULT_READ_GROUP );
|
||||
readGroup.setPlatform( RAC.DEFAULT_PLATFORM );
|
||||
((GATKSAMRecord)read).setReadGroup( readGroup );
|
||||
} else {
|
||||
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() +
|
||||
" Users must set both the default read group using the --default_read_group <String> argument and the default platform using the --default_platform <String> argument." );
|
||||
}
|
||||
}
|
||||
|
||||
if( RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user
|
||||
final String oldPlatform = readGroup.getPlatform();
|
||||
readGroup = new SAMReadGroupRecord( RAC.FORCE_READ_GROUP );
|
||||
readGroup.setPlatform( oldPlatform );
|
||||
((GATKSAMRecord)read).setReadGroup( readGroup );
|
||||
}
|
||||
|
||||
if( RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
|
||||
readGroup.setPlatform( RAC.FORCE_PLATFORM );
|
||||
}
|
||||
|
||||
if ( readGroup.getPlatform() == null ) {
|
||||
if( RAC.DEFAULT_PLATFORM != null ) {
|
||||
if( !warnUserNullPlatform ) {
|
||||
Utils.warnUser("The input .bam file contains reads with no platform information. " +
|
||||
"Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName() );
|
||||
warnUserNullPlatform = true;
|
||||
}
|
||||
readGroup.setPlatform( RAC.DEFAULT_PLATFORM );
|
||||
} else {
|
||||
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() +
|
||||
" Users must set the default platform using the --default_platform <String> argument." );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space
|
||||
* @param read The SAMRecord to parse
|
||||
*/
|
||||
public static void parseColorSpace( final SAMRecord read ) {
|
||||
|
||||
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
|
||||
if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) {
|
||||
if( read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read
|
||||
final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
byte[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
colorSpace = ((String)attr).getBytes();
|
||||
} else {
|
||||
throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
|
||||
byte[] readBases = read.getReadBases();
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
|
||||
}
|
||||
final byte[] inconsistency = new byte[readBases.length];
|
||||
int iii;
|
||||
byte prevBase = colorSpace[0]; // The sentinel
|
||||
for( iii = 0; iii < readBases.length; iii++ ) {
|
||||
final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] );
|
||||
inconsistency[iii] = (byte)( thisBase == readBases[iii] ? 0 : 1 );
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
read.setAttribute( org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency );
|
||||
|
||||
} else {
|
||||
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases
|
||||
* This method doesn't add the inconsistent tag to the read like parseColorSpace does
|
||||
* @param read The SAMRecord to parse
|
||||
* @param originalQualScores The array of original quality scores to modify during the correction
|
||||
* @param solidRecalMode Which mode of solid recalibration to apply
|
||||
* @param refBases The reference for this read
|
||||
* @return A new array of quality scores that have been ref bias corrected
|
||||
*/
|
||||
public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases ) {
|
||||
|
||||
final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
byte[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
colorSpace = ((String)attr).getBytes();
|
||||
} else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
|
||||
byte[] readBases = read.getReadBases();
|
||||
final byte[] colorImpliedBases = readBases.clone();
|
||||
byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray( read.getCigar(), read.getReadBases(), refBases ); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
|
||||
refBasesDirRead = BaseUtils.simpleReverseComplement( refBasesDirRead.clone() );
|
||||
}
|
||||
final int[] inconsistency = new int[readBases.length];
|
||||
byte prevBase = colorSpace[0]; // The sentinel
|
||||
for( int iii = 0; iii < readBases.length; iii++ ) {
|
||||
final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] );
|
||||
colorImpliedBases[iii] = thisBase;
|
||||
inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 );
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
|
||||
// Now that we have the inconsistency array apply the desired correction to the inconsistent bases
|
||||
if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO ) { // Set inconsistent bases and the one before it to Q0
|
||||
final boolean setBaseN = false;
|
||||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
|
||||
} else if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N ) {
|
||||
final boolean setBaseN = true;
|
||||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
|
||||
} else if( solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
|
||||
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead);
|
||||
}
|
||||
|
||||
} else {
|
||||
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
|
||||
return originalQualScores;
|
||||
}
|
||||
|
||||
public static boolean checkNoCallColorSpace( final SAMRecord read ) {
|
||||
if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) {
|
||||
final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
byte[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
colorSpace = ((String)attr).substring(1).getBytes(); // trim off the Sentinel
|
||||
} else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
for( byte color : colorSpace ) {
|
||||
if( color != (byte)'0' && color != (byte)'1' && color != (byte)'2' && color != (byte)'3' ) {
|
||||
return true; // There is a bad color in this SOLiD read and the user wants to skip over it
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
}
|
||||
|
||||
return false; // There aren't any color no calls in this SOLiD read
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param originalQualScores The array of original quality scores to set to zero if needed
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
|
||||
* @return The byte array of original quality scores some of which might have been set to zero
|
||||
*/
|
||||
private static byte[] solidRecalSetToQZero( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores,
|
||||
final byte[] refBases, final boolean setBaseN ) {
|
||||
|
||||
final boolean negStrand = read.getReadNegativeStrandFlag();
|
||||
for( int iii = 1; iii < originalQualScores.length; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
if( readBases[iii] == refBases[iii] ) {
|
||||
if( negStrand ) { originalQualScores[originalQualScores.length-(iii+1)] = (byte)0; }
|
||||
else { originalQualScores[iii] = (byte)0; }
|
||||
if( setBaseN ) { readBases[iii] = (byte)'N'; }
|
||||
}
|
||||
// Set the prev base to Q0 as well
|
||||
if( readBases[iii-1] == refBases[iii-1] ) {
|
||||
if( negStrand ) { originalQualScores[originalQualScores.length-iii] = (byte)0; }
|
||||
else { originalQualScores[iii-1] = (byte)0; }
|
||||
if( setBaseN ) { readBases[iii-1] = (byte)'N'; }
|
||||
}
|
||||
}
|
||||
}
|
||||
if( negStrand ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases( readBases );
|
||||
|
||||
return originalQualScores;
|
||||
}
|
||||
|
||||
/**
|
||||
* Peform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be change to match the color or left as reference
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
*/
|
||||
private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases,
|
||||
final byte[] refBases) {
|
||||
|
||||
final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
byte[] colorSpaceQuals;
|
||||
if( attr instanceof String ) {
|
||||
String x = (String)attr;
|
||||
colorSpaceQuals = x.getBytes();
|
||||
SAMUtils.fastqToPhred(colorSpaceQuals);
|
||||
} else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
for( int iii = 1; iii < inconsistency.length - 1; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
for( int jjj = iii - 1; jjj <= iii; jjj++ ) { // Correct this base and the one before it along the direction of the read
|
||||
if( jjj == iii || inconsistency[jjj] == 0 ) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
|
||||
if( readBases[jjj] == refBases[jjj] ) {
|
||||
if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( 2 );
|
||||
if( rand == 0 ) { // The color implied base won the coin flip
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
}
|
||||
} else {
|
||||
final int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
final int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
int diffInQuality = maxQuality - minQuality;
|
||||
int numLow = minQuality;
|
||||
if( numLow == 0 ) {
|
||||
numLow++;
|
||||
diffInQuality++;
|
||||
}
|
||||
final int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( numLow + numHigh );
|
||||
if( rand >= numLow ) { // higher q score won
|
||||
if( maxQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had higher q score, and won out, so nothing to do here
|
||||
} else { // lower q score won
|
||||
if( minQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had lower q score, and won out, so nothing to do here
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases( readBases );
|
||||
} else { // No color space quality tag in file
|
||||
throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the base and the color calculate the next base in the sequence
|
||||
* @param prevBase The base
|
||||
* @param color The color
|
||||
* @return The next base in the sequence
|
||||
*/
|
||||
private static byte getNextBaseFromColor( SAMRecord read, final byte prevBase, final byte color ) {
|
||||
switch(color) {
|
||||
case '0':
|
||||
return prevBase;
|
||||
case '1':
|
||||
return performColorOne( prevBase );
|
||||
case '2':
|
||||
return performColorTwo( prevBase );
|
||||
case '3':
|
||||
return performColorThree( prevBase );
|
||||
default:
|
||||
throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char)color +
|
||||
" Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality
|
||||
* @param read The read which contains the color space to check against
|
||||
* @param offset The offset in the read at which to check
|
||||
* @return Returns true if the base was inconsistent with the color space
|
||||
*/
|
||||
public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) {
|
||||
final Object attr = read.getAttribute(org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
|
||||
if( attr != null ) {
|
||||
final byte[] inconsistency = (byte[])attr;
|
||||
// NOTE: The inconsistency array is in the direction of the read, not aligned to the reference!
|
||||
if( read.getReadNegativeStrandFlag() ) { // Negative direction
|
||||
return inconsistency[inconsistency.length - offset - 1] != (byte)0;
|
||||
} else { // Forward direction
|
||||
return inconsistency[offset] != (byte)0;
|
||||
}
|
||||
|
||||
// This block of code is for if you want to check both the offset and the next base for color space inconsistency
|
||||
//if( read.getReadNegativeStrandFlag() ) { // Negative direction
|
||||
// if( offset == 0 ) {
|
||||
// return inconsistency[0] != 0;
|
||||
// } else {
|
||||
// return (inconsistency[inconsistency.length - offset - 1] != 0) || (inconsistency[inconsistency.length - offset] != 0);
|
||||
// }
|
||||
//} else { // Forward direction
|
||||
// if( offset == inconsistency.length - 1 ) {
|
||||
// return inconsistency[inconsistency.length - 1] != 0;
|
||||
// } else {
|
||||
// return (inconsistency[offset] != 0) || (inconsistency[offset + 1] != 0);
|
||||
// }
|
||||
//}
|
||||
|
||||
} else { // No inconsistency array, so nothing is inconsistent
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes all requested covariates for every offset in the given read
|
||||
* by calling covariate.getValues(..).
|
||||
*
|
||||
* @param gatkRead The read for which to compute covariate values.
|
||||
* @param requestedCovariates The list of requested covariates.
|
||||
* @return An array of covariate values where result[i][j] is the covariate
|
||||
* value for the ith position in the read and the jth covariate in
|
||||
* reqeustedCovariates list.
|
||||
*/
|
||||
public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates) {
|
||||
//compute all covariates for this read
|
||||
final List<Covariate> requestedCovariatesRef = requestedCovariates;
|
||||
final int numRequestedCovariates = requestedCovariatesRef.size();
|
||||
final int readLength = gatkRead.getReadLength();
|
||||
|
||||
final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates];
|
||||
final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
|
||||
|
||||
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
|
||||
for( int i = 0; i < numRequestedCovariates; i++ ) {
|
||||
requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder );
|
||||
for(int j = 0; j < readLength; j++) {
|
||||
//copy values into a 2D array that allows all covar types to be extracted at once for
|
||||
//an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
|
||||
covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j];
|
||||
}
|
||||
}
|
||||
|
||||
return covariateValues_offset_x_covar;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform a ceratin transversion (A <-> C or G <-> T) on the base.
|
||||
*
|
||||
* @param base the base [AaCcGgTt]
|
||||
* @return the transversion of the base, or the input base if it's not one of the understood ones
|
||||
*/
|
||||
private static byte performColorOne(byte base) {
|
||||
switch (base) {
|
||||
case 'A':
|
||||
case 'a': return 'C';
|
||||
case 'C':
|
||||
case 'c': return 'A';
|
||||
case 'G':
|
||||
case 'g': return 'T';
|
||||
case 'T':
|
||||
case 't': return 'G';
|
||||
default: return base;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform a transition (A <-> G or C <-> T) on the base.
|
||||
*
|
||||
* @param base the base [AaCcGgTt]
|
||||
* @return the transition of the base, or the input base if it's not one of the understood ones
|
||||
*/
|
||||
private static byte performColorTwo(byte base) {
|
||||
switch (base) {
|
||||
case 'A':
|
||||
case 'a': return 'G';
|
||||
case 'C':
|
||||
case 'c': return 'T';
|
||||
case 'G':
|
||||
case 'g': return 'A';
|
||||
case 'T':
|
||||
case 't': return 'C';
|
||||
default: return base;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the complement (A <-> T or C <-> G) of a base.
|
||||
*
|
||||
* @param base the base [AaCcGgTt]
|
||||
* @return the complementary base, or the input base if it's not one of the understood ones
|
||||
*/
|
||||
private static byte performColorThree(byte base) {
|
||||
switch (base) {
|
||||
case 'A':
|
||||
case 'a': return 'T';
|
||||
case 'C':
|
||||
case 'c': return 'G';
|
||||
case 'G':
|
||||
case 'g': return 'C';
|
||||
case 'T':
|
||||
case 't': return 'A';
|
||||
default: return base;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,112 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 3, 2009
|
||||
*
|
||||
* An individual piece of recalibration data. Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
|
||||
*/
|
||||
|
||||
public class RecalDatum extends RecalDatumOptimized {
|
||||
|
||||
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
|
||||
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// constructors
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public RecalDatum() {
|
||||
numObservations = 0L;
|
||||
numMismatches = 0L;
|
||||
estimatedQReported = 0.0;
|
||||
empiricalQuality = 0.0;
|
||||
}
|
||||
|
||||
public RecalDatum( final long _numObservations, final long _numMismatches, final double _estimatedQReported, final double _empiricalQuality ) {
|
||||
numObservations = _numObservations;
|
||||
numMismatches = _numMismatches;
|
||||
estimatedQReported = _estimatedQReported;
|
||||
empiricalQuality = _empiricalQuality;
|
||||
}
|
||||
|
||||
public RecalDatum( final RecalDatum copy ) {
|
||||
this.numObservations = copy.numObservations;
|
||||
this.numMismatches = copy.numMismatches;
|
||||
this.estimatedQReported = copy.estimatedQReported;
|
||||
this.empiricalQuality = copy.empiricalQuality;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// increment methods
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final void combine( final RecalDatum other ) {
|
||||
final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors();
|
||||
this.increment( other.numObservations, other.numMismatches );
|
||||
this.estimatedQReported = -10 * Math.log10(sumErrors / (double)this.numObservations);
|
||||
//if( this.estimatedQReported > QualityUtils.MAX_REASONABLE_Q_SCORE ) { this.estimatedQReported = QualityUtils.MAX_REASONABLE_Q_SCORE; }
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// methods to derive empirical quality score
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final void calcCombinedEmpiricalQuality( final int smoothing, final int maxQual ) {
|
||||
this.empiricalQuality = empiricalQualDouble(smoothing, maxQual); // cache the value so we don't call log over and over again
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// misc. methods
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final double getEstimatedQReported() {
|
||||
return estimatedQReported;
|
||||
}
|
||||
|
||||
public final double getEmpiricalQuality() {
|
||||
return empiricalQuality;
|
||||
}
|
||||
|
||||
private double calcExpectedErrors() {
|
||||
return (double)this.numObservations * qualToErrorProb( estimatedQReported );
|
||||
}
|
||||
|
||||
private double qualToErrorProb( final double qual ) {
|
||||
return Math.pow(10.0, qual / -10.0);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,139 @@
|
|||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Jan 6, 2010
|
||||
*
|
||||
* An individual piece of recalibration data. Optimized for CountCovariates. Extras added to make TableRecalibration fast have been removed.
|
||||
* Each bin counts up the number of observations and the number of reference mismatches seen for that combination of covariates.
|
||||
*/
|
||||
|
||||
public class RecalDatumOptimized {
|
||||
|
||||
protected long numObservations; // number of bases seen in total
|
||||
protected long numMismatches; // number of bases seen that didn't match the reference
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// constructors
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public RecalDatumOptimized() {
|
||||
numObservations = 0L;
|
||||
numMismatches = 0L;
|
||||
}
|
||||
|
||||
public RecalDatumOptimized( final long _numObservations, final long _numMismatches) {
|
||||
numObservations = _numObservations;
|
||||
numMismatches = _numMismatches;
|
||||
}
|
||||
|
||||
public RecalDatumOptimized( final RecalDatumOptimized copy ) {
|
||||
this.numObservations = copy.numObservations;
|
||||
this.numMismatches = copy.numMismatches;
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// increment methods
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public synchronized final void increment( final long incObservations, final long incMismatches ) {
|
||||
numObservations += incObservations;
|
||||
numMismatches += incMismatches;
|
||||
}
|
||||
|
||||
public synchronized final void increment( final RecalDatumOptimized other ) {
|
||||
increment( other.numObservations, other.numMismatches );
|
||||
}
|
||||
|
||||
public synchronized final void increment( final List<RecalDatumOptimized> data ) {
|
||||
for ( RecalDatumOptimized other : data ) {
|
||||
this.increment( other );
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized final void incrementBaseCounts( final byte curBase, final byte refBase ) {
|
||||
increment( 1, BaseUtils.simpleBaseToBaseIndex(curBase) == BaseUtils.simpleBaseToBaseIndex(refBase) ? 0 : 1 ); // increment takes num observations, then num mismatches
|
||||
}
|
||||
|
||||
public synchronized final void incrementBaseCounts(boolean hasIndelAtThisPosition) {
|
||||
increment(1, hasIndelAtThisPosition ? 1 : 0);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// methods to derive empirical quality score
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final double empiricalQualDouble( final int smoothing, final double maxQual ) {
|
||||
final double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||
final double doubleObservations = (double) ( numObservations + smoothing );
|
||||
double empiricalQual = -10 * Math.log10(doubleMismatches / doubleObservations);
|
||||
if (empiricalQual > maxQual) { empiricalQual = maxQual; }
|
||||
return empiricalQual;
|
||||
}
|
||||
public final double empiricalQualDouble() { return empiricalQualDouble( 0, QualityUtils.MAX_REASONABLE_Q_SCORE ); } // 'default' behavior is to use smoothing value of zero
|
||||
|
||||
public final byte empiricalQualByte( final int smoothing ) {
|
||||
final double doubleMismatches = (double) ( numMismatches + smoothing );
|
||||
final double doubleObservations = (double) ( numObservations + smoothing );
|
||||
return QualityUtils.probToQual( 1.0 - doubleMismatches / doubleObservations ); // This is capped at Q40
|
||||
}
|
||||
public final byte empiricalQualByte() { return empiricalQualByte( 0 ); } // 'default' behavior is to use smoothing value of zero
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// misc. methods
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
public final long getNumObservations() {
|
||||
return numObservations;
|
||||
}
|
||||
|
||||
public final long getNumMismatches() {
|
||||
return numMismatches;
|
||||
}
|
||||
|
||||
public final String outputToCSV( ) {
|
||||
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte() );
|
||||
}
|
||||
public final String outputToCSV( final int smoothing ) {
|
||||
return String.format( "%d,%d,%d", numObservations, numMismatches, (int)empiricalQualByte(smoothing) );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Nov 27, 2009
|
||||
*
|
||||
* A collection of the arguments that are common to both CovariateCounterWalker and TableRecalibrationWalker.
|
||||
* This set of arguments will also be passed to the constructor of every Covariate when it is instantiated.
|
||||
*/
|
||||
|
||||
public class RecalibrationArgumentCollection {

    //////////////////////////////////
    // Shared Command Line Arguments
    //////////////////////////////////
    // Fallback read group ID used when a read carries no read group.
    @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.")
    public String DEFAULT_READ_GROUP = null;
    // Fallback platform used when a read's read group has no platform entry.
    @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
    public String DEFAULT_PLATFORM = null;
    // When set, overrides the read group of EVERY read (collapses all data into one read group).
    @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
    public String FORCE_READ_GROUP = null;
    // When set, overrides the platform of EVERY read.
    @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
    public String FORCE_PLATFORM = null;
    // Window size consumed by MinimumNQSCovariate.
    @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
    public int WINDOW_SIZE = 5;
    // Look-back length consumed by HomopolymerCovariate.
    @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
    public int HOMOPOLYMER_NBACK = 7;
    // If true, TileCovariate throws instead of defaulting to tile = -1 when no tile is found.
    @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false)
    public boolean EXCEPTION_IF_NO_TILE = false;
    // Strategy for recalibrating SOLiD bases where the reference was inserted; see RecalDataManager.SOLID_RECAL_MODE.
    @Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
    public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;
    // Behavior on color-space no-calls; see RecalDataManager.SOLID_NOCALL_STRATEGY.
    @Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false)
    public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
}
|
||||
Loading…
Reference in New Issue