Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable
This commit is contained in:
commit
51ac87b28c
|
|
@ -139,11 +139,11 @@ public class AnalyzeCovariates extends CommandLineProgram {
|
|||
*/
|
||||
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
|
||||
private int MAX_HISTOGRAM_VALUE = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting")
|
||||
private boolean DO_INDEL_QUALITY = false;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
|
@ -274,7 +274,6 @@ public class AnalyzeCovariates extends CommandLineProgram {
|
|||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN );
|
||||
|
||||
}
|
||||
|
||||
private void writeDataTables() {
|
||||
|
|
@ -341,7 +340,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
|
|||
|
||||
// for each covariate
|
||||
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
|
||||
Covariate cov = requestedCovariates.get(iii);
|
||||
final Covariate cov = requestedCovariates.get(iii);
|
||||
final File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
|
||||
if (DO_INDEL_QUALITY) {
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
|
|
@ -349,7 +348,7 @@ public class AnalyzeCovariates extends CommandLineProgram {
|
|||
// The second argument is the name of the covariate in order to make the plots look nice
|
||||
executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
|
||||
executor.exec();
|
||||
} else {
|
||||
} else {
|
||||
if( iii == 1 ) {
|
||||
// Analyze reported quality
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
|||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
|
@ -179,10 +180,18 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
private static final long GATK_RANDOM_SEED = 47382911L;
|
||||
private static Random randomGenerator = new Random(GATK_RANDOM_SEED);
|
||||
|
||||
/**
 * Gives access to the engine's shared static random number generator.
 *
 * @return the static Random instance held by this class
 */
public static Random getRandomGenerator() {
    return randomGenerator;
}
|
||||
/**
 * Re-seeds the shared random number generator with the fixed GATK_RANDOM_SEED.
 */
public static void resetRandomGenerator() {
    resetRandomGenerator(GATK_RANDOM_SEED);
}
|
||||
/**
 * Re-seeds the shared random number generator with a caller-supplied seed.
 *
 * @param seed the seed to apply to the shared generator
 */
public static void resetRandomGenerator(long seed) {
    getRandomGenerator().setSeed(seed);
}
|
||||
|
||||
/**
|
||||
* Static base quality score recalibration helper object
|
||||
*/
|
||||
private static BaseRecalibration baseRecalibration = null;
|
||||
/**
 * Gives access to the static base quality score recalibration helper.
 *
 * @return the current helper object, or null if none has been set
 */
public static BaseRecalibration getBaseRecalibration() {
    return baseRecalibration;
}
|
||||
/**
 * Tells whether a base recalibration helper has been installed.
 *
 * @return true if the static helper is non-null, false otherwise
 */
public static boolean hasBaseRecalibration() {
    final boolean configured = (baseRecalibration != null);
    return configured;
}
|
||||
/**
 * Builds a BaseRecalibration from the given file and installs it as the static helper,
 * replacing any helper that was set before.
 *
 * @param recalFile the recalibration file handed to the BaseRecalibration constructor
 */
public static void setBaseRecalibration(File recalFile) {
    final BaseRecalibration loaded = new BaseRecalibration(recalFile);
    baseRecalibration = loaded;
}
|
||||
|
||||
/**
|
||||
* Actually run the GATK with the specified walker.
|
||||
*
|
||||
|
|
@ -205,6 +214,10 @@ public class GenomeAnalysisEngine {
|
|||
if (this.getArguments().nonDeterministicRandomSeed)
|
||||
resetRandomGenerator(System.currentTimeMillis());
|
||||
|
||||
// if the user specified an input BQSR recalibration table then enable on-the-fly recalibration
|
||||
if (this.getArguments().BQSR_RECAL_FILE != null)
|
||||
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE);
|
||||
|
||||
// Determine how the threads should be divided between CPU vs. IO.
|
||||
determineThreadAllocation();
|
||||
|
||||
|
|
@ -224,7 +237,7 @@ public class GenomeAnalysisEngine {
|
|||
// create temp directories as necessary
|
||||
initializeTempDirectory();
|
||||
|
||||
// create the output streams "
|
||||
// create the output streams
|
||||
initializeOutputStreams(microScheduler.getOutputTracker());
|
||||
|
||||
Iterable<Shard> shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals);
|
||||
|
|
|
|||
|
|
@ -75,6 +75,7 @@ public class GATKArgumentCollection {
|
|||
* Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times.
|
||||
* One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals).
|
||||
* Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf).
|
||||
* To specify the completely unmapped reads in the BAM file (i.e. those without a reference contig) use -L unmapped.
|
||||
*/
|
||||
@Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false)
|
||||
public List<IntervalBinding<Feature>> intervals = null;
|
||||
|
|
@ -185,6 +186,15 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
|
||||
public Boolean useOriginalBaseQualities = false;
|
||||
|
||||
/**
|
||||
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
|
||||
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
|
||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
|
||||
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
|
||||
|
||||
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
|
||||
public byte defaultBaseQualities = -1;
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,8 @@ public class GATKReport {
|
|||
|
||||
/**
|
||||
* Create a new GATKReport with the contents of a GATKReport on disk.
|
||||
* @param filename the path to the file to load
|
||||
*
|
||||
* @param filename the path to the file to load
|
||||
*/
|
||||
public GATKReport(String filename) {
|
||||
this(new File(filename));
|
||||
|
|
@ -32,7 +33,8 @@ public class GATKReport {
|
|||
|
||||
/**
|
||||
* Create a new GATKReport with the contents of a GATKReport on disk.
|
||||
* @param file the file to load
|
||||
*
|
||||
* @param file the file to load
|
||||
*/
|
||||
public GATKReport(File file) {
|
||||
loadReport(file);
|
||||
|
|
@ -40,7 +42,8 @@ public class GATKReport {
|
|||
|
||||
/**
|
||||
* Load a GATKReport file from disk
|
||||
* @param file the file to load
|
||||
*
|
||||
* @param file the file to load
|
||||
*/
|
||||
private void loadReport(File file) {
|
||||
try {
|
||||
|
|
@ -48,12 +51,11 @@ public class GATKReport {
|
|||
|
||||
GATKReportTable table = null;
|
||||
String[] header = null;
|
||||
int id = 0;
|
||||
GATKReportVersion version = null;
|
||||
List<Integer> columnStarts = null;
|
||||
|
||||
String line;
|
||||
while ( (line = reader.readLine()) != null ) {
|
||||
while ((line = reader.readLine()) != null) {
|
||||
|
||||
if (line.startsWith(GATKREPORT_HEADER_PREFIX)) {
|
||||
|
||||
|
|
@ -71,7 +73,7 @@ public class GATKReport {
|
|||
|
||||
header = null;
|
||||
columnStarts = null;
|
||||
} else if ( line.trim().isEmpty() ) {
|
||||
} else if (line.trim().isEmpty()) {
|
||||
// do nothing
|
||||
} else {
|
||||
if (table != null) {
|
||||
|
|
@ -97,19 +99,22 @@ public class GATKReport {
|
|||
if (header == null) {
|
||||
header = splitLine;
|
||||
|
||||
table.addPrimaryKey("id", false);
|
||||
|
||||
for ( String columnName : header ) {
|
||||
table.addColumn(columnName, "");
|
||||
// Set the first column as the primary key
|
||||
table.addPrimaryKey(header[0]);
|
||||
// Set every other column as column
|
||||
for (int i = 1; i < header.length; i++) {
|
||||
table.addColumn(header[i], "");
|
||||
}
|
||||
|
||||
id = 0;
|
||||
} else {
|
||||
for (int columnIndex = 0; columnIndex < header.length; columnIndex++) {
|
||||
table.set(id, header[columnIndex], splitLine[columnIndex]);
|
||||
//Get primary key Value from the current line array
|
||||
String primaryKey = splitLine[0];
|
||||
//Input all the remaining values
|
||||
for (int columnIndex = 1; columnIndex < header.length; columnIndex++) {
|
||||
table.set(primaryKey, header[columnIndex], splitLine[columnIndex]);
|
||||
}
|
||||
|
||||
id++;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -124,8 +129,8 @@ public class GATKReport {
|
|||
/**
|
||||
* Add a new table to the collection
|
||||
*
|
||||
* @param tableName the name of the table
|
||||
* @param tableDescription the description of the table
|
||||
* @param tableName the name of the table
|
||||
* @param tableDescription the description of the table
|
||||
*/
|
||||
public void addTable(String tableName, String tableDescription) {
|
||||
addTable(tableName, tableDescription, true);
|
||||
|
|
@ -139,7 +144,7 @@ public class GATKReport {
|
|||
/**
|
||||
* Return true if table with a given name exists
|
||||
*
|
||||
* @param tableName the name of the table
|
||||
* @param tableName the name of the table
|
||||
* @return true if the table exists, false otherwise
|
||||
*/
|
||||
public boolean hasTable(String tableName) {
|
||||
|
|
@ -149,8 +154,8 @@ public class GATKReport {
|
|||
/**
|
||||
* Return a table with a given name
|
||||
*
|
||||
* @param tableName the name of the table
|
||||
* @return the table object
|
||||
* @param tableName the name of the table
|
||||
* @return the table object
|
||||
*/
|
||||
public GATKReportTable getTable(String tableName) {
|
||||
GATKReportTable table = tables.get(tableName);
|
||||
|
|
@ -162,7 +167,7 @@ public class GATKReport {
|
|||
/**
|
||||
* Print all tables contained within this container to a PrintStream
|
||||
*
|
||||
* @param out the PrintStream to which the tables should be written
|
||||
* @param out the PrintStream to which the tables should be written
|
||||
*/
|
||||
public void print(PrintStream out) {
|
||||
for (GATKReportTable table : tables.values()) {
|
||||
|
|
@ -175,4 +180,24 @@ public class GATKReport {
|
|||
public Collection<GATKReportTable> getTables() {
|
||||
return tables.values();
|
||||
}
|
||||
|
||||
public void combineWith(GATKReport input) {
|
||||
|
||||
// For every input table, add values
|
||||
System.out.println("This.tables: keySet");
|
||||
for (String s : tables.keySet())
|
||||
System.out.println(s);
|
||||
|
||||
// todo test tables exist
|
||||
|
||||
|
||||
for (String tableName : input.tables.keySet()) {
|
||||
System.out.println("Input table key: " + tableName);
|
||||
if (tables.containsKey(tableName))
|
||||
tables.get(tableName).mergeRows(input.getTable(tableName));
|
||||
else
|
||||
throw new ReviewedStingException("Failed to combine GATKReport, tables don't match!");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,46 @@
|
|||
package org.broadinstitute.sting.gatk.report;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Gatherer;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: roger
|
||||
* Date: 1/9/12
|
||||
* Time: 11:17 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class GATKReportGatherer extends Gatherer {
|
||||
@Override
|
||||
public void gather(List<File> inputs, File output) {
|
||||
//Combines inputs GATKReport to one output
|
||||
|
||||
PrintStream o;
|
||||
try {
|
||||
o = new PrintStream(output);
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException("File to be output by CoverageByRG Gather function was not found");
|
||||
}
|
||||
|
||||
GATKReport current = new GATKReport();
|
||||
boolean isFirst = true;
|
||||
for (File input : inputs) {
|
||||
|
||||
// If the table is empty
|
||||
if (isFirst) {
|
||||
current = new GATKReport(input);
|
||||
isFirst = false;
|
||||
} else {
|
||||
GATKReport toAdd = new GATKReport(input);
|
||||
current.combineWith(toAdd);
|
||||
}
|
||||
}
|
||||
|
||||
current.print(o);
|
||||
}
|
||||
}
|
||||
|
|
@ -4,7 +4,10 @@ import org.apache.commons.lang.ObjectUtils;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
|
@ -12,12 +15,12 @@ import java.util.regex.Pattern;
|
|||
* A data structure that allows data to be collected over the course of a walker's computation, then have that data
|
||||
* written to a PrintStream such that it's human-readable, AWK-able, and R-friendly (given that you load it using the
|
||||
* GATKReport loader module).
|
||||
*
|
||||
* <p/>
|
||||
* The goal of this object is to use the same data structure for both accumulating data during a walker's computation
|
||||
* and emitting that data to a file for easy analysis in R (or any other program/language that can take in a table of
|
||||
* results). Thus, all of the infrastructure below is designed simply to make printing the following as easy as
|
||||
* possible:
|
||||
*
|
||||
* <p/>
|
||||
* ##:GATKReport.v0.1 ErrorRatePerCycle : The error rate per sequenced position in the reads
|
||||
* cycle errorrate.61PA8.7 qualavg.61PA8.7
|
||||
* 0 0.007451835696110506 25.474613284804366
|
||||
|
|
@ -29,60 +32,60 @@ import java.util.regex.Pattern;
|
|||
* 6 5.452562704471102E-4 36.1217248908297
|
||||
* 7 5.452562704471102E-4 36.1910480349345
|
||||
* 8 5.452562704471102E-4 36.00345705967977
|
||||
*
|
||||
* <p/>
|
||||
* Here, we have a GATKReport table - a well-formatted, easy to read representation of some tabular data. Every single
|
||||
* table has this same GATKReport.v0.1 header, which permits multiple files from different sources to be cat-ed
|
||||
* together, which makes it very easy to pull tables from different programs into R via a single file.
|
||||
*
|
||||
* <p/>
|
||||
* ------------
|
||||
* Definitions:
|
||||
*
|
||||
* <p/>
|
||||
* Table info:
|
||||
* The first line, structured as
|
||||
* ##:<report version> <table name> : <table description>
|
||||
*
|
||||
* The first line, structured as
|
||||
* ##:<report version> <table name> : <table description>
|
||||
* <p/>
|
||||
* Table header:
|
||||
* The second line, specifying a unique name for each column in the table.
|
||||
*
|
||||
* The first column mentioned in the table header is the "primary key" column - a column that provides the unique
|
||||
* identifier for each row in the table. Once this column is created, any element in the table can be referenced by
|
||||
* the row-column coordinate, i.e. "primary key"-"column name" coordinate.
|
||||
*
|
||||
* When a column is added to a table, a default value must be specified (usually 0). This is the initial value for
|
||||
* an element in a column. This permits operations like increment() and decrement() to work properly on columns that
|
||||
* are effectively counters for a particular event.
|
||||
*
|
||||
* Finally, the display property for each column can be set during column creation. This is useful when a given
|
||||
* column stores an intermediate result that will be used later on, perhaps to calculate the value of another column.
|
||||
* In these cases, it's obviously necessary to store the value required for further computation, but it's not
|
||||
* necessary to actually print the intermediate column.
|
||||
*
|
||||
* The second line, specifying a unique name for each column in the table.
|
||||
* <p/>
|
||||
* The first column mentioned in the table header is the "primary key" column - a column that provides the unique
|
||||
* identifier for each row in the table. Once this column is created, any element in the table can be referenced by
|
||||
* the row-column coordinate, i.e. "primary key"-"column name" coordinate.
|
||||
* <p/>
|
||||
* When a column is added to a table, a default value must be specified (usually 0). This is the initial value for
|
||||
* an element in a column. This permits operations like increment() and decrement() to work properly on columns that
|
||||
* are effectively counters for a particular event.
|
||||
* <p/>
|
||||
* Finally, the display property for each column can be set during column creation. This is useful when a given
|
||||
* column stores an intermediate result that will be used later on, perhaps to calculate the value of another column.
|
||||
* In these cases, it's obviously necessary to store the value required for further computation, but it's not
|
||||
* necessary to actually print the intermediate column.
|
||||
* <p/>
|
||||
* Table body:
|
||||
* The values of the table itself.
|
||||
*
|
||||
* The values of the table itself.
|
||||
* <p/>
|
||||
* ---------------
|
||||
* Implementation:
|
||||
*
|
||||
* <p/>
|
||||
* The implementation of this table has two components:
|
||||
* 1. A TreeSet<Object> that stores all the values ever specified for the primary key. Any get() operation that
|
||||
* refers to an element where the primary key object does not exist will result in its implicit creation. I
|
||||
* haven't yet decided if this is a good idea...
|
||||
*
|
||||
* 2. A HashMap<String, GATKReportColumn> that stores a mapping from column name to column contents. Each
|
||||
* GATKReportColumn is effectively a map (in fact, GATKReportColumn extends TreeMap<Object, Object>) between
|
||||
* primary key and the column value. This means that, given N columns, the primary key information is stored
|
||||
* N+1 times. This is obviously wasteful and can likely be handled much more elegantly in future implementations.
|
||||
*
|
||||
* 1. A TreeSet<Object> that stores all the values ever specified for the primary key. Any get() operation that
|
||||
* refers to an element where the primary key object does not exist will result in its implicit creation. I
|
||||
* haven't yet decided if this is a good idea...
|
||||
* <p/>
|
||||
* 2. A HashMap<String, GATKReportColumn> that stores a mapping from column name to column contents. Each
|
||||
* GATKReportColumn is effectively a map (in fact, GATKReportColumn extends TreeMap<Object, Object>) between
|
||||
* primary key and the column value. This means that, given N columns, the primary key information is stored
|
||||
* N+1 times. This is obviously wasteful and can likely be handled much more elegantly in future implementations.
|
||||
* <p/>
|
||||
* ------------------------------
|
||||
* Element and column operations:
|
||||
*
|
||||
* <p/>
|
||||
* In addition to simply getting and setting values, this object also permits some simple operations to be applied to
|
||||
* individual elements or to whole columns. For instance, an element can be easily incremented without the hassle of
|
||||
* calling get(), incrementing the obtained value by 1, and then calling set() with the new value. Also, some vector
|
||||
* operations are supported. For instance, two whole columns can be divided and have the result be set to a third
|
||||
* column. This is especially useful when aggregating counts in two intermediate columns that will eventually need to
|
||||
* be manipulated row-by-row to compute the final column.
|
||||
*
|
||||
* <p/>
|
||||
* Note: I've made no attempt whatsoever to make these operations efficient. Right now, some of the methods check the
|
||||
* type of the stored object using an instanceof call and attempt to do the right thing. Others cast the contents of
|
||||
* the cell to a Number, call the Number.toDouble() method and compute a result. This is clearly not the ideal design,
|
||||
|
|
@ -92,7 +95,9 @@ import java.util.regex.Pattern;
|
|||
* @author Khalid Shakir
|
||||
*/
|
||||
public class GATKReportTable {
|
||||
/** REGEX that matches any table with an invalid name */
|
||||
/**
|
||||
* REGEX that matches any table with an invalid name
|
||||
*/
|
||||
public final static String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]";
|
||||
private static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V0_2;
|
||||
private String tableName;
|
||||
|
|
@ -109,8 +114,8 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Verifies that a table or column name has only alphanumeric characters - no spaces or special characters allowed
|
||||
*
|
||||
* @param name the name of the table or column
|
||||
* @return true if the name is valid, false if otherwise
|
||||
* @param name the name of the table or column
|
||||
* @return true if the name is valid, false if otherwise
|
||||
*/
|
||||
private boolean isValidName(String name) {
|
||||
Pattern p = Pattern.compile(INVALID_TABLE_NAME_REGEX);
|
||||
|
|
@ -122,8 +127,8 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Verifies that a table description does not contain carriage returns or newlines (it is matched against the pattern \r|\n)
|
||||
*
|
||||
* @param description the name of the table or column
|
||||
* @return true if the name is valid, false if otherwise
|
||||
* @param description the name of the table or column
|
||||
* @return true if the name is valid, false if otherwise
|
||||
*/
|
||||
private boolean isValidDescription(String description) {
|
||||
Pattern p = Pattern.compile("\\r|\\n");
|
||||
|
|
@ -135,15 +140,15 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Construct a new GATK report table with the specified name and description
|
||||
*
|
||||
* @param tableName the name of the table
|
||||
* @param tableDescription the description of the table
|
||||
* @param tableName the name of the table
|
||||
* @param tableDescription the description of the table
|
||||
*/
|
||||
public GATKReportTable(String tableName, String tableDescription) {
|
||||
this(tableName, tableDescription, true);
|
||||
}
|
||||
|
||||
public GATKReportTable(String tableName, String tableDescription, boolean sortByPrimaryKey) {
|
||||
if (!isValidName(tableName)) {
|
||||
if (!isValidName(tableName)) {
|
||||
throw new ReviewedStingException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed.");
|
||||
}
|
||||
|
||||
|
|
@ -169,7 +174,7 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Add a primary key column. This becomes the unique identifier for every row in the table.
|
||||
*
|
||||
* @param primaryKeyName the name of the primary key column
|
||||
* @param primaryKeyName the name of the primary key column
|
||||
*/
|
||||
public void addPrimaryKey(String primaryKeyName) {
|
||||
addPrimaryKey(primaryKeyName, true);
|
||||
|
|
@ -178,8 +183,8 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Add an optionally visible primary key column. This becomes the unique identifier for every row in the table, and will always be printed as the first column.
|
||||
*
|
||||
* @param primaryKeyName the name of the primary key column
|
||||
* @param display should this primary key be displayed?
|
||||
* @param primaryKeyName the name of the primary key column
|
||||
* @param display should this primary key be displayed?
|
||||
*/
|
||||
public void addPrimaryKey(String primaryKeyName, boolean display) {
|
||||
if (!isValidName(primaryKeyName)) {
|
||||
|
|
@ -195,6 +200,7 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Returns the first primary key matching the dotted column values.
|
||||
* Ex: dbsnp.eval.called.all.novel.all
|
||||
*
|
||||
* @param dottedColumnValues Period concatenated values.
|
||||
* @return The first primary key matching the column values or throws an exception.
|
||||
*/
|
||||
|
|
@ -208,6 +214,7 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Returns true if there is at least one row with the dotted column values.
|
||||
* Ex: dbsnp.eval.called.all.novel.all
|
||||
*
|
||||
* @param dottedColumnValues Period concatenated values.
|
||||
* @return true if there is at least one row matching the columns.
|
||||
*/
|
||||
|
|
@ -218,6 +225,7 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Returns the first primary key matching the dotted column values.
|
||||
* Ex: dbsnp.eval.called.all.novel.all
|
||||
*
|
||||
* @param dottedColumnValues Period concatenated values.
|
||||
* @return The first primary key matching the column values or null.
|
||||
*/
|
||||
|
|
@ -228,6 +236,7 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Returns the first primary key matching the column values.
|
||||
* Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" }
|
||||
*
|
||||
* @param columnValues column values.
|
||||
* @return The first primary key matching the column values.
|
||||
*/
|
||||
|
|
@ -235,7 +244,7 @@ public class GATKReportTable {
|
|||
for (Object primaryKey : primaryKeyColumn) {
|
||||
boolean matching = true;
|
||||
for (int i = 0; matching && i < columnValues.length; i++) {
|
||||
matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i+1));
|
||||
matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i + 1));
|
||||
}
|
||||
if (matching)
|
||||
return primaryKey;
|
||||
|
|
@ -246,8 +255,8 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Add a column to the report and specify the default value that should be supplied if a given position in the table is never explicitly set.
|
||||
*
|
||||
* @param columnName the name of the column
|
||||
* @param defaultValue the default value for the column
|
||||
* @param columnName the name of the column
|
||||
* @param defaultValue the default value for the column
|
||||
*/
|
||||
public void addColumn(String columnName, Object defaultValue) {
|
||||
addColumn(columnName, defaultValue, null);
|
||||
|
|
@ -256,12 +265,13 @@ public class GATKReportTable {
|
|||
public void addColumn(String columnName, Object defaultValue, String format) {
|
||||
addColumn(columnName, defaultValue, true, format);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a column to the report, specify the default column value, and specify whether the column should be displayed in the final output (useful when intermediate columns are necessary for later calculations, but are not required to be in the output file).
|
||||
*
|
||||
* @param columnName the name of the column
|
||||
* @param defaultValue the default value of the column
|
||||
* @param display if true - the column will be displayed; if false - the column will be hidden
|
||||
* @param columnName the name of the column
|
||||
* @param defaultValue the default value of the column
|
||||
* @param display if true - the column will be displayed; if false - the column will be hidden
|
||||
*/
|
||||
public void addColumn(String columnName, Object defaultValue, boolean display) {
|
||||
addColumn(columnName, defaultValue, display, null);
|
||||
|
|
@ -277,8 +287,8 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Check if the requested element exists, and if not, create it.
|
||||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
*/
|
||||
private void verifyEntry(Object primaryKey, String columnName) {
|
||||
if (!columns.containsKey(columnName)) {
|
||||
|
|
@ -299,9 +309,9 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Set the value for a given position in the table
|
||||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param value the value to set
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param value the value to set
|
||||
*/
|
||||
public void set(Object primaryKey, String columnName, Object value) {
|
||||
verifyEntry(primaryKey, columnName);
|
||||
|
|
@ -312,13 +322,13 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Get a value from the given position in the table
|
||||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @return the value stored at the specified position in the table
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @return the value stored at the specified position in the table
|
||||
*/
|
||||
public Object get(Object primaryKey, String columnName) {
|
||||
verifyEntry(primaryKey, columnName);
|
||||
|
||||
|
||||
return columns.get(columnName).get(primaryKey);
|
||||
}
|
||||
|
||||
|
|
@ -327,7 +337,7 @@ public class GATKReportTable {
|
|||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnIndex the index of the column
|
||||
* @return the value stored at the specified position in the table
|
||||
* @return the value stored at the specified position in the table
|
||||
*/
|
||||
private Object get(Object primaryKey, int columnIndex) {
|
||||
return columns.getByIndex(columnIndex).get(primaryKey);
|
||||
|
|
@ -336,8 +346,8 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Increment an element in the table. This implementation is awful - a functor would probably be better.
|
||||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
*/
|
||||
public void increment(Object primaryKey, String columnName) {
|
||||
Object oldValue = get(primaryKey, columnName);
|
||||
|
|
@ -365,8 +375,8 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Decrement an element in the table. This implementation is awful - a functor would probably be better.
|
||||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
*/
|
||||
public void decrement(Object primaryKey, String columnName) {
|
||||
Object oldValue = get(primaryKey, columnName);
|
||||
|
|
@ -394,9 +404,9 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Add the specified value to an element in the table
|
||||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param valueToAdd the value to add
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param valueToAdd the value to add
|
||||
*/
|
||||
public void add(Object primaryKey, String columnName, Object valueToAdd) {
|
||||
Object oldValue = get(primaryKey, columnName);
|
||||
|
|
@ -424,8 +434,8 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Subtract the specified value from an element in the table
|
||||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param valueToSubtract the value to subtract
|
||||
*/
|
||||
public void subtract(Object primaryKey, String columnName, Object valueToSubtract) {
|
||||
|
|
@ -454,9 +464,9 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Multiply the specified value to an element in the table
|
||||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param valueToMultiply the value to multiply by
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param valueToMultiply the value to multiply by
|
||||
*/
|
||||
public void multiply(Object primaryKey, String columnName, Object valueToMultiply) {
|
||||
Object oldValue = get(primaryKey, columnName);
|
||||
|
|
@ -484,9 +494,9 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Divide the specified value from an element in the table
|
||||
*
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param valueToDivide the value to divide by
|
||||
* @param primaryKey the primary key value
|
||||
* @param columnName the name of the column
|
||||
* @param valueToDivide the value to divide by
|
||||
*/
|
||||
public void divide(Object primaryKey, String columnName, Object valueToDivide) {
|
||||
Object oldValue = get(primaryKey, columnName);
|
||||
|
|
@ -514,9 +524,9 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Add two columns to each other and set the results to a third column
|
||||
*
|
||||
* @param columnToSet the column that should hold the results
|
||||
* @param augend the column that shall be the augend
|
||||
* @param addend the column that shall be the addend
|
||||
* @param columnToSet the column that should hold the results
|
||||
* @param augend the column that shall be the augend
|
||||
* @param addend the column that shall be the addend
|
||||
*/
|
||||
public void addColumns(String columnToSet, String augend, String addend) {
|
||||
for (Object primaryKey : primaryKeyColumn) {
|
||||
|
|
@ -532,8 +542,8 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Subtract one column from another and set the results to a third column
|
||||
*
|
||||
* @param columnToSet the column that should hold the results
|
||||
* @param minuend the column that shall be the minuend (the a in a - b)
|
||||
* @param columnToSet the column that should hold the results
|
||||
* @param minuend the column that shall be the minuend (the a in a - b)
|
||||
* @param subtrahend the column that shall be the subtrahend (the b in a - b)
|
||||
*/
|
||||
public void subtractColumns(String columnToSet, String minuend, String subtrahend) {
|
||||
|
|
@ -551,8 +561,8 @@ public class GATKReportTable {
|
|||
* Multiply two columns by each other and set the results to a third column
|
||||
*
|
||||
* @param columnToSet the column that should hold the results
|
||||
* @param multiplier the column that shall be the multiplier
|
||||
* @param multiplicand the column that shall be the multiplicand
|
||||
* @param multiplier the column that shall be the multiplier
|
||||
* @param multiplicand the column that shall be the multiplicand
|
||||
*/
|
||||
public void multiplyColumns(String columnToSet, String multiplier, String multiplicand) {
|
||||
for (Object primaryKey : primaryKeyColumn) {
|
||||
|
|
@ -568,9 +578,9 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Divide two columns by each other and set the results to a third column
|
||||
*
|
||||
* @param columnToSet the column that should hold the results
|
||||
* @param numeratorColumn the column that shall be the numerator
|
||||
* @param denominatorColumn the column that shall be the denominator
|
||||
* @param columnToSet the column that should hold the results
|
||||
* @param numeratorColumn the column that shall be the numerator
|
||||
* @param denominatorColumn the column that shall be the denominator
|
||||
*/
|
||||
public void divideColumns(String columnToSet, String numeratorColumn, String denominatorColumn) {
|
||||
for (Object primaryKey : primaryKeyColumn) {
|
||||
|
|
@ -585,10 +595,11 @@ public class GATKReportTable {
|
|||
|
||||
/**
|
||||
* Return the print width of the primary key column
|
||||
* @return the width of the primary key column
|
||||
*
|
||||
* @return the width of the primary key column
|
||||
*/
|
||||
public int getPrimaryKeyColumnWidth() {
|
||||
int maxWidth = primaryKeyName.length();
|
||||
int maxWidth = getPrimaryKeyName().length();
|
||||
|
||||
for (Object primaryKey : primaryKeyColumn) {
|
||||
int width = primaryKey.toString().length();
|
||||
|
|
@ -604,7 +615,7 @@ public class GATKReportTable {
|
|||
/**
|
||||
* Write the table to the PrintStream, formatted nicely to be human-readable, AWK-able, and R-friendly.
|
||||
*
|
||||
* @param out the PrintStream to which the table should be written
|
||||
* @param out the PrintStream to which the table should be written
|
||||
*/
|
||||
public void write(PrintStream out) {
|
||||
// Get the column widths for everything
|
||||
|
|
@ -620,13 +631,15 @@ public class GATKReportTable {
|
|||
// Emit the table header, taking into account the padding requirement if the primary key is a hidden column
|
||||
boolean needsPadding = false;
|
||||
if (primaryKeyDisplay) {
|
||||
out.printf(primaryKeyFormat, primaryKeyName);
|
||||
out.printf(primaryKeyFormat, getPrimaryKeyName());
|
||||
needsPadding = true;
|
||||
}
|
||||
|
||||
for (String columnName : columns.keySet()) {
|
||||
if (columns.get(columnName).isDisplayable()) {
|
||||
if (needsPadding) { out.printf(" "); }
|
||||
if (needsPadding) {
|
||||
out.printf(" ");
|
||||
}
|
||||
out.printf(columnFormats.get(columnName).getNameFormat(), columnName);
|
||||
|
||||
needsPadding = true;
|
||||
|
|
@ -645,7 +658,9 @@ public class GATKReportTable {
|
|||
|
||||
for (String columnName : columns.keySet()) {
|
||||
if (columns.get(columnName).isDisplayable()) {
|
||||
if (needsPadding) { out.printf(" "); }
|
||||
if (needsPadding) {
|
||||
out.printf(" ");
|
||||
}
|
||||
String value = columns.get(columnName).getStringValue(primaryKey);
|
||||
out.printf(columnFormats.get(columnName).getValueFormat(), value);
|
||||
|
||||
|
|
@ -675,4 +690,49 @@ public class GATKReportTable {
|
|||
public GATKReportColumns getColumns() {
|
||||
return columns;
|
||||
}
|
||||
|
||||
public void mergeRows(GATKReportTable input) {
|
||||
/*
|
||||
* This function is different from addRowsFrom because we will add the ability to sum,average, etc rows
|
||||
* TODO: Add other combining algorithms
|
||||
*/
|
||||
|
||||
// Make sure the columns match AND the Primary Key
|
||||
if (input.getColumns().keySet().equals(this.getColumns().keySet()) &&
|
||||
input.getPrimaryKeyName().equals(this.getPrimaryKeyName())) {
|
||||
this.addRowsFrom(input);
|
||||
} else
|
||||
throw new ReviewedStingException("Failed to combine GATKReportTable, columns don't match!");
|
||||
}
|
||||
|
||||
public void addRowsFrom(GATKReportTable input) {
|
||||
// add column by column
|
||||
|
||||
// For every column
|
||||
for (String columnKey : input.getColumns().keySet()) {
|
||||
GATKReportColumn current = this.getColumns().get(columnKey);
|
||||
GATKReportColumn toAdd = input.getColumns().get(columnKey);
|
||||
// We want to take the current column and add all the values from input
|
||||
|
||||
// The column is a map of values <Key, Value>
|
||||
for (Object rowKey : toAdd.keySet()) {
|
||||
// We add every value from toAdd to the current
|
||||
if (!current.containsKey(rowKey)) {
|
||||
this.set(rowKey, columnKey, toAdd.get(rowKey));
|
||||
System.out.printf("Putting row with PK: %s \n", rowKey);
|
||||
} else {
|
||||
|
||||
// TODO we should be able to handle combining data by adding, averaging, etc.
|
||||
this.set(rowKey, columnKey, toAdd.get(rowKey));
|
||||
|
||||
System.out.printf("OVERWRITING Row with PK: %s \n", rowKey);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public String getPrimaryKeyName() {
|
||||
return primaryKeyName;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
package org.broadinstitute.sting.gatk.traversals;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.WalkerManager;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
|
@ -29,7 +28,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
protected static Logger logger = Logger.getLogger(TraversalEngine.class);
|
||||
|
||||
private final Queue<ActiveRegion> workQueue = new LinkedList<ActiveRegion>();
|
||||
private final LinkedHashSet<SAMRecord> myReads = new LinkedHashSet<SAMRecord>();
|
||||
private final LinkedHashSet<GATKSAMRecord> myReads = new LinkedHashSet<GATKSAMRecord>();
|
||||
|
||||
@Override
|
||||
protected String getTraversalType() {
|
||||
|
|
@ -101,7 +100,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
|
||||
// Grab all the previously unseen reads from this pileup and add them to the massive read list
|
||||
for( final PileupElement p : locus.getBasePileup() ) {
|
||||
final SAMRecord read = p.getRead();
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
|
|
@ -111,7 +110,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
// which active regions in the work queue are now safe to process
|
||||
if( !locusView.hasNext() ) {
|
||||
for( final PileupElement p : locus.getBasePileup() ) {
|
||||
final SAMRecord read = p.getRead();
|
||||
final GATKSAMRecord read = p.getRead();
|
||||
if( !myReads.contains(read) ) {
|
||||
myReads.add(read);
|
||||
}
|
||||
|
|
@ -124,7 +123,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
|
||||
// Take the individual isActive calls and integrate them into contiguous active regions and
|
||||
// add these blocks of work to the work queue
|
||||
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension );
|
||||
final ArrayList<ActiveRegion> activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension, walker.presetActiveRegions != null );
|
||||
logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." );
|
||||
if( walker.activeRegionOutStream == null ) {
|
||||
workQueue.addAll( activeRegions );
|
||||
|
|
@ -156,9 +155,9 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
return sum;
|
||||
}
|
||||
|
||||
private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet<SAMRecord> reads, final Queue<ActiveRegion> workQueue, final T sum, final ActiveRegionWalker<M,T> walker ) {
|
||||
final ArrayList<SAMRecord> placedReads = new ArrayList<SAMRecord>();
|
||||
for( final SAMRecord read : reads ) {
|
||||
private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet<GATKSAMRecord> reads, final Queue<ActiveRegion> workQueue, final T sum, final ActiveRegionWalker<M,T> walker ) {
|
||||
final ArrayList<GATKSAMRecord> placedReads = new ArrayList<GATKSAMRecord>();
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read );
|
||||
if( activeRegion.getLocation().overlapsP( readLoc ) ) {
|
||||
// The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region)
|
||||
|
|
@ -170,22 +169,22 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
bestRegion = otherRegionToTest;
|
||||
}
|
||||
}
|
||||
bestRegion.add( (GATKSAMRecord) read );
|
||||
bestRegion.add( read );
|
||||
|
||||
// The read is also added to all other regions in which it overlaps but marked as non-primary
|
||||
if( walker.wantsNonPrimaryReads() ) {
|
||||
if( !bestRegion.equals(activeRegion) ) {
|
||||
activeRegion.add( (GATKSAMRecord) read );
|
||||
activeRegion.add( read );
|
||||
}
|
||||
for( final ActiveRegion otherRegionToTest : workQueue ) {
|
||||
if( !bestRegion.equals(otherRegionToTest) && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) {
|
||||
otherRegionToTest.add( (GATKSAMRecord) read );
|
||||
otherRegionToTest.add( read );
|
||||
}
|
||||
}
|
||||
}
|
||||
placedReads.add( read );
|
||||
} else if( activeRegion.getExtendedLoc().overlapsP( readLoc ) && walker.wantsNonPrimaryReads() ) {
|
||||
activeRegion.add( (GATKSAMRecord) read );
|
||||
activeRegion.add( read );
|
||||
}
|
||||
}
|
||||
reads.removeAll( placedReads ); // remove all the reads which have been placed into their active region
|
||||
|
|
@ -214,7 +213,7 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
}
|
||||
|
||||
// band-pass filter the list of isActive probabilities and turn into active regions
|
||||
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<Double> activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension ) {
|
||||
private ArrayList<ActiveRegion> integrateActiveList( final ArrayList<Double> activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension, final boolean presetRegions ) {
|
||||
|
||||
final double ACTIVE_PROB_THRESHOLD = 0.2; // BUGBUG: needs to be set-able by the walker author
|
||||
final ArrayList<ActiveRegion> returnList = new ArrayList<ActiveRegion>();
|
||||
|
|
@ -227,11 +226,11 @@ public class TraverseActiveRegions <M,T> extends TraversalEngine<M,T,ActiveRegio
|
|||
} else {
|
||||
final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]);
|
||||
final double[] filteredProbArray = new double[activeProbArray.length];
|
||||
final int FILTER_SIZE = 50; // BUGBUG: needs to be set-able by the walker author
|
||||
final int MAX_ACTIVE_REGION = 425; // BUGBUG: needs to be set-able by the walker author
|
||||
final int FILTER_SIZE = ( presetRegions ? 0 : 50 ); // BUGBUG: needs to be set-able by the walker author
|
||||
final int MAX_ACTIVE_REGION = ( presetRegions ? 16001 : 425 ); // BUGBUG: needs to be set-able by the walker author
|
||||
for( int iii = 0; iii < activeProbArray.length; iii++ ) {
|
||||
double maxVal = 0;
|
||||
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE); jjj++ ) {
|
||||
for( int jjj = Math.max(0, iii-FILTER_SIZE); jjj < Math.min(activeList.size(), iii+FILTER_SIZE+1); jjj++ ) {
|
||||
if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; }
|
||||
}
|
||||
filteredProbArray[iii] = maxVal;
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.samples.Sample;
|
|||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
|
|
@ -17,16 +18,13 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin, lfran
|
||||
* User: rpoplin, lfran, ebanks
|
||||
* Date: 11/14/11
|
||||
*/
|
||||
|
||||
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation {
|
||||
public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation {
|
||||
|
||||
private Set<Sample> trios = null;
|
||||
private final static int REF = 0;
|
||||
private final static int HET = 1;
|
||||
private final static int HOM = 2;
|
||||
private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information
|
||||
|
||||
public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
|
||||
|
|
@ -38,10 +36,10 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
|
|||
}
|
||||
}
|
||||
|
||||
final Map<String,Object> toRet = new HashMap<String,Object>(1);
|
||||
final Map<String, Object> toRet = new HashMap<String, Object>(1);
|
||||
final HashSet<Sample> triosToTest = new HashSet<Sample>();
|
||||
|
||||
for( final Sample child : trios) {
|
||||
for( final Sample child : trios ) {
|
||||
final boolean hasAppropriateGenotypes = vc.hasGenotype(child.getID()) && vc.getGenotype(child.getID()).hasLikelihoods() &&
|
||||
vc.hasGenotype(child.getPaternalID()) && vc.getGenotype(child.getPaternalID()).hasLikelihoods() &&
|
||||
vc.hasGenotype(child.getMaternalID()) && vc.getGenotype(child.getMaternalID()).hasLikelihoods();
|
||||
|
|
@ -65,28 +63,55 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation implemen
|
|||
// Following derivation in http://en.wikipedia.org/wiki/Transmission_disequilibrium_test#A_modified_version_of_the_TDT
|
||||
private double calculateTDT( final VariantContext vc, final Set<Sample> triosToTest ) {
|
||||
|
||||
final double nABGivenABandBB = calculateNChildren(vc, triosToTest, HET, HET, HOM) + calculateNChildren(vc, triosToTest, HET, HOM, HET);
|
||||
final double nBBGivenABandBB = calculateNChildren(vc, triosToTest, HOM, HET, HOM) + calculateNChildren(vc, triosToTest, HOM, HOM, HET);
|
||||
final double nAAGivenABandAB = calculateNChildren(vc, triosToTest, REF, HET, HET);
|
||||
final double nBBGivenABandAB = calculateNChildren(vc, triosToTest, HOM, HET, HET);
|
||||
final double nAAGivenAAandAB = calculateNChildren(vc, triosToTest, REF, REF, HET) + calculateNChildren(vc, triosToTest, REF, HET, REF);
|
||||
final double nABGivenAAandAB = calculateNChildren(vc, triosToTest, HET, REF, HET) + calculateNChildren(vc, triosToTest, HET, HET, REF);
|
||||
double nABGivenABandBB = 0.0;
|
||||
double nBBGivenABandBB = 0.0;
|
||||
double nAAGivenABandAB = 0.0;
|
||||
double nBBGivenABandAB = 0.0;
|
||||
double nAAGivenAAandAB = 0.0;
|
||||
double nABGivenAAandAB = 0.0;
|
||||
|
||||
// for each pair of alleles, add the likelihoods
|
||||
int numAlleles = vc.getNAlleles();
|
||||
for ( int allele1 = 0; allele1 < numAlleles; allele1++ ) {
|
||||
final int HOM1index = determineHomIndex(allele1, numAlleles);
|
||||
|
||||
for ( int allele2 = allele1 + 1; allele2 < numAlleles; allele2++ ) {
|
||||
|
||||
// TODO -- cache these for better performance
|
||||
final int HETindex = HOM1index + (allele2 - allele1);
|
||||
final int HOM2index = determineHomIndex(allele2, numAlleles);
|
||||
|
||||
nABGivenABandBB += calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HETindex, HOM2index, HETindex);
|
||||
nBBGivenABandBB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HOM2index) + calculateNChildren(vc, triosToTest, HOM2index, HOM2index, HETindex);
|
||||
nAAGivenABandAB += calculateNChildren(vc, triosToTest, HOM1index, HETindex, HETindex);
|
||||
nBBGivenABandAB += calculateNChildren(vc, triosToTest, HOM2index, HETindex, HETindex);
|
||||
nAAGivenAAandAB += calculateNChildren(vc, triosToTest, HOM1index, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HOM1index, HETindex, HOM1index);
|
||||
nABGivenAAandAB += calculateNChildren(vc, triosToTest, HETindex, HOM1index, HETindex) + calculateNChildren(vc, triosToTest, HETindex, HETindex, HOM1index);
|
||||
}
|
||||
}
|
||||
|
||||
final double numer = (nABGivenABandBB - nBBGivenABandBB) + 2.0 * (nAAGivenABandAB - nBBGivenABandAB) + (nAAGivenAAandAB - nABGivenAAandAB);
|
||||
final double denom = (nABGivenABandBB + nBBGivenABandBB) + 4.0 * (nAAGivenABandAB + nBBGivenABandAB) + (nAAGivenAAandAB + nABGivenAAandAB);
|
||||
return (numer * numer) / denom;
|
||||
}
|
||||
|
||||
private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int parent1Idx, final int parent2Idx ) {
|
||||
private double calculateNChildren( final VariantContext vc, final Set<Sample> triosToTest, final int childIdx, final int momIdx, final int dadIdx ) {
|
||||
final double likelihoodVector[] = new double[triosToTest.size()];
|
||||
int iii = 0;
|
||||
for( final Sample child : triosToTest ) {
|
||||
final double[] momGL = vc.getGenotype(child.getMaternalID()).getLikelihoods().getAsVector();
|
||||
final double[] dadGL = vc.getGenotype(child.getPaternalID()).getLikelihoods().getAsVector();
|
||||
final double[] childGL = vc.getGenotype(child.getID()).getLikelihoods().getAsVector();
|
||||
likelihoodVector[iii++] = momGL[parent1Idx] + dadGL[parent2Idx] + childGL[childIdx];
|
||||
likelihoodVector[iii++] = momGL[momIdx] + dadGL[dadIdx] + childGL[childIdx];
|
||||
}
|
||||
|
||||
return MathUtils.sumLog10(likelihoodVector);
|
||||
}
|
||||
|
||||
private static int determineHomIndex(final int alleleIndex, int numAlleles) {
|
||||
int result = 0;
|
||||
for ( int i = 0; i < alleleIndex; i++ )
|
||||
result += numAlleles--;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,22 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
|
||||
/**
|
||||
* The callability states that can be assigned to a locus by the target diagnostics in this package.
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 2/1/12
|
||||
*/
|
||||
public enum CallableStatus {
|
||||
/** the reference base was an N, which is not considered callable by the GATK */
|
||||
REF_N,
|
||||
/** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */
|
||||
CALLABLE,
|
||||
/** absolutely no reads were seen at this locus, regardless of the filtering parameters */
|
||||
NO_COVERAGE,
|
||||
/** there were less than min. depth bases at the locus, after applying filters */
|
||||
LOW_COVERAGE,
|
||||
/** more than -maxDepth reads at the locus, indicating some sort of mapping problem */
|
||||
EXCESSIVE_COVERAGE,
|
||||
/** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
|
||||
POOR_QUALITY
|
||||
}
|
||||
|
|
@ -0,0 +1,172 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.IntervalBinding;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.By;
|
||||
import org.broadinstitute.sting.gatk.walkers.DataSource;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocComparator;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* Short one line description of the walker.
|
||||
*
|
||||
* <p>
|
||||
* [Long description of the walker]
|
||||
* </p>
|
||||
*
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* [Description of the Input]
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Description of the Output]
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T [walker name]
|
||||
* </pre>
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 2/1/12
|
||||
*/
|
||||
@By(value = DataSource.READS)
|
||||
public class DiagnoseTargets extends LocusWalker<Long, Long> {
|
||||
@Input(fullName = "interval_track", shortName = "int", doc = "", required = true)
|
||||
private IntervalBinding<Feature> intervalTrack = null;
|
||||
|
||||
@Output
|
||||
private PrintStream out = System.out;
|
||||
|
||||
@Argument(fullName = "expand_interval", shortName = "exp", doc = "", required = false)
|
||||
private int expandInterval = 50;
|
||||
|
||||
@Argument(fullName = "minimum_base_quality", shortName = "mbq", doc = "", required = false)
|
||||
private int minimumBaseQuality = 20;
|
||||
|
||||
@Argument(fullName = "minimum_mapping_quality", shortName = "mmq", doc = "", required = false)
|
||||
private int minimumMappingQuality = 20;
|
||||
|
||||
@Argument(fullName = "minimum_coverage", shortName = "mincov", doc = "", required = false)
|
||||
private int minimumCoverage = 5;
|
||||
|
||||
@Argument(fullName = "maximum_coverage", shortName = "maxcov", doc = "", required = false)
|
||||
private int maximumCoverage = 700;
|
||||
|
||||
private TreeSet<GenomeLoc> intervalList = null; // The list of intervals of interest (plus expanded intervals if user wants them)
|
||||
private HashMap<GenomeLoc, IntervalStatistics> intervalMap = null; // interval => statistics
|
||||
private Iterator<GenomeLoc> intervalListIterator; // An iterator to go over all the intervals provided as we traverse the genome
|
||||
private GenomeLoc currentInterval = null; // The "current" interval loaded and being filled with statistics
|
||||
private IntervalStatistics currentIntervalStatistics = null; // The "current" interval loaded and being filled with statistics
|
||||
|
||||
private GenomeLocParser parser; // just an object to allow us to create genome locs (for the expanded intervals)
|
||||
|
||||
@Override
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
|
||||
if (intervalTrack == null)
|
||||
throw new UserException("This tool currently only works if you provide an interval track");
|
||||
|
||||
parser = new GenomeLocParser(getToolkit().getMasterSequenceDictionary()); // Important to initialize the parser before creating the intervals below
|
||||
|
||||
List<GenomeLoc> originalList = intervalTrack.getIntervals(getToolkit()); // The original list of targets provided by the user that will be expanded or not depending on the options provided
|
||||
intervalList = new TreeSet<GenomeLoc>(new GenomeLocComparator());
|
||||
intervalMap = new HashMap<GenomeLoc, IntervalStatistics>(originalList.size() * 2);
|
||||
for (GenomeLoc interval : originalList)
|
||||
addAndExpandIntervalToLists(interval);
|
||||
|
||||
intervalListIterator = intervalList.iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
GenomeLoc refLocus = ref.getLocus();
|
||||
while (currentInterval == null || currentInterval.isBefore(refLocus)) {
|
||||
if (!intervalListIterator.hasNext())
|
||||
return 0L;
|
||||
|
||||
currentInterval = intervalListIterator.next();
|
||||
currentIntervalStatistics = intervalMap.get(currentInterval);
|
||||
}
|
||||
|
||||
if (currentInterval.isPast(refLocus))
|
||||
return 0L;
|
||||
|
||||
byte[] mappingQualities = context.getBasePileup().getMappingQuals();
|
||||
byte[] baseQualities = context.getBasePileup().getQuals();
|
||||
int coverage = context.getBasePileup().getBaseAndMappingFilteredPileup(minimumBaseQuality, minimumMappingQuality).depthOfCoverage();
|
||||
int rawCoverage = context.size();
|
||||
|
||||
IntervalStatisticLocus locusData = new IntervalStatisticLocus(mappingQualities, baseQualities, coverage, rawCoverage);
|
||||
currentIntervalStatistics.addLocus(refLocus, locusData);
|
||||
|
||||
return 1L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long reduceInit() {
|
||||
return 0L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long reduce(Long value, Long sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Long result) {
|
||||
super.onTraversalDone(result);
|
||||
out.println("Interval\tCallStatus\tCOV\tAVG");
|
||||
for (GenomeLoc interval : intervalList) {
|
||||
IntervalStatistics stats = intervalMap.get(interval);
|
||||
out.println(String.format("%s\t%s\t%d\t%f", interval, stats.callableStatus(), stats.totalCoverage(), stats.averageCoverage()));
|
||||
}
|
||||
}
|
||||
|
||||
private GenomeLoc createIntervalBefore(GenomeLoc interval) {
|
||||
int start = Math.max(interval.getStart() - expandInterval, 0);
|
||||
int stop = Math.max(interval.getStart() - 1, 0);
|
||||
return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop);
|
||||
}
|
||||
|
||||
private GenomeLoc createIntervalAfter(GenomeLoc interval) {
|
||||
int contigLimit = getToolkit().getSAMFileHeader().getSequenceDictionary().getSequence(interval.getContigIndex()).getSequenceLength();
|
||||
int start = Math.min(interval.getStop() + 1, contigLimit);
|
||||
int stop = Math.min(interval.getStop() + expandInterval, contigLimit);
|
||||
return parser.createGenomeLoc(interval.getContig(), interval.getContigIndex(), start, stop);
|
||||
}
|
||||
|
||||
private void addAndExpandIntervalToLists(GenomeLoc interval) {
|
||||
if (expandInterval > 0) {
|
||||
GenomeLoc before = createIntervalBefore(interval);
|
||||
GenomeLoc after = createIntervalAfter(interval);
|
||||
intervalList.add(before);
|
||||
intervalList.add(after);
|
||||
intervalMap.put(before, new IntervalStatistics(before, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
|
||||
intervalMap.put(after, new IntervalStatistics(after, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
|
||||
}
|
||||
intervalList.add(interval);
|
||||
intervalMap.put(interval, new IntervalStatistics(interval, minimumCoverage, maximumCoverage, minimumMappingQuality, minimumBaseQuality));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
|
||||
/**
 * The definition of a locus for the DiagnoseTargets walker statistics calculation.
 *
 * Holds the values handed to the constructor for one locus: the mapping quality and base quality arrays
 * (stored by reference, not copied) and the two coverage counts.
 *
 * @author Mauricio Carneiro
 * @since 2/3/12
 */
class IntervalStatisticLocus {
    private final byte[] mappingQuality;
    private final byte[] baseQuality;
    private final int coverage;
    private final int rawCoverage;

    /**
     * Creates a placeholder locus: two one-element quality arrays and zero for both coverage values.
     */
    public IntervalStatisticLocus() {
        this(new byte[1], new byte[1], 0, 0);
    }

    /**
     * @param mappingQuality the mapping qualities at the locus
     * @param baseQuality    the base qualities at the locus
     * @param coverage       the coverage value returned by getCoverage()
     * @param rawCoverage    the coverage value returned by getRawCoverage()
     */
    public IntervalStatisticLocus(byte[] mappingQuality, byte[] baseQuality, int coverage, int rawCoverage) {
        this.coverage = coverage;
        this.rawCoverage = rawCoverage;
        this.mappingQuality = mappingQuality;
        this.baseQuality = baseQuality;
    }

    /**
     * @return the coverage given at construction
     */
    public int getCoverage() {
        return this.coverage;
    }

    /**
     * @return the raw coverage given at construction
     */
    public int getRawCoverage() {
        return this.rawCoverage;
    }
}
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.diagnostics.targets;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
|
||||
* Short one line description of the walker.
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 2/1/12
|
||||
*/
|
||||
class IntervalStatistics {
|
||||
private final GenomeLoc interval;
|
||||
private final ArrayList<IntervalStatisticLocus> loci;
|
||||
|
||||
private final int minimumCoverageThreshold;
|
||||
private final int maximumCoverageThreshold;
|
||||
private final int minimumMappingQuality;
|
||||
private final int minimumBaseQuality;
|
||||
|
||||
private int preComputedTotalCoverage = -1; // avoids re-calculating the total sum (-1 means we haven't pre-computed it yet)
|
||||
|
||||
private IntervalStatistics(GenomeLoc interval, ArrayList<IntervalStatisticLocus> loci, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
|
||||
this.interval = interval;
|
||||
this.loci = loci;
|
||||
this.minimumCoverageThreshold = minimumCoverageThreshold;
|
||||
this.maximumCoverageThreshold = maximumCoverageThreshold;
|
||||
this.minimumMappingQuality = minimumMappingQuality;
|
||||
this.minimumBaseQuality = minimumBaseQuality;
|
||||
}
|
||||
|
||||
public IntervalStatistics(GenomeLoc interval, int minimumCoverageThreshold, int maximumCoverageThreshold, int minimumMappingQuality, int minimumBaseQuality) {
|
||||
this(interval, new ArrayList<IntervalStatisticLocus>(interval.size()), minimumCoverageThreshold, maximumCoverageThreshold, minimumMappingQuality, minimumBaseQuality);
|
||||
|
||||
// Initialize every loci (this way we don't have to worry about non-existent loci in the object
|
||||
for (int i = 0; i < interval.size(); i++)
|
||||
this.loci.add(i, new IntervalStatisticLocus());
|
||||
|
||||
}
|
||||
|
||||
public long totalCoverage() {
|
||||
if (preComputedTotalCoverage < 0)
|
||||
calculateTotalCoverage();
|
||||
return preComputedTotalCoverage;
|
||||
}
|
||||
|
||||
public double averageCoverage() {
|
||||
if (preComputedTotalCoverage < 0)
|
||||
calculateTotalCoverage();
|
||||
return (double) preComputedTotalCoverage / loci.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the callable status of the entire interval
|
||||
*
|
||||
* @return the callable status of the entire interval
|
||||
*/
|
||||
public CallableStatus callableStatus() {
|
||||
long max = -1;
|
||||
CallableStatus maxCallableStatus = null;
|
||||
HashMap<CallableStatus, Integer> statusCounts = new HashMap<CallableStatus, Integer>(CallableStatus.values().length);
|
||||
|
||||
// initialize the statusCounts with all callable states
|
||||
for (CallableStatus key : CallableStatus.values())
|
||||
statusCounts.put(key, 0);
|
||||
|
||||
// calculate the callable status for each locus
|
||||
for (int i = 0; i < loci.size(); i++) {
|
||||
CallableStatus status = callableStatus(i);
|
||||
int count = statusCounts.get(status) + 1;
|
||||
statusCounts.put(status, count);
|
||||
|
||||
if (count > max) {
|
||||
max = count;
|
||||
maxCallableStatus = status;
|
||||
}
|
||||
}
|
||||
|
||||
return maxCallableStatus;
|
||||
}
|
||||
|
||||
public void addLocus(GenomeLoc locus, IntervalStatisticLocus locusData) {
|
||||
if (!interval.containsP(locus))
|
||||
throw new ReviewedStingException(String.format("Locus %s is not part of the Interval", locus));
|
||||
|
||||
int locusIndex = locus.getStart() - interval.getStart();
|
||||
|
||||
loci.add(locusIndex, locusData);
|
||||
}
|
||||
|
||||
/**
|
||||
* returns the callable status of this locus without taking the reference base into account.
|
||||
*
|
||||
* @param locusIndex location in the genome to inquire (only one locus)
|
||||
* @return the callable status of a locus
|
||||
*/
|
||||
private CallableStatus callableStatus(int locusIndex) {
|
||||
if (loci.get(locusIndex).getCoverage() > maximumCoverageThreshold)
|
||||
return CallableStatus.EXCESSIVE_COVERAGE;
|
||||
|
||||
if (loci.get(locusIndex).getCoverage() >= minimumCoverageThreshold)
|
||||
return CallableStatus.CALLABLE;
|
||||
|
||||
if (loci.get(locusIndex).getRawCoverage() >= minimumCoverageThreshold)
|
||||
return CallableStatus.POOR_QUALITY;
|
||||
|
||||
if (loci.get(locusIndex).getRawCoverage() > 0)
|
||||
return CallableStatus.LOW_COVERAGE;
|
||||
|
||||
return CallableStatus.NO_COVERAGE;
|
||||
}
|
||||
|
||||
private void calculateTotalCoverage() {
|
||||
preComputedTotalCoverage = 0;
|
||||
for (IntervalStatisticLocus locus : loci)
|
||||
preComputedTotalCoverage += locus.getCoverage();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -253,7 +253,7 @@ public class UnifiedGenotyperEngine {
|
|||
VariantContext vcInput = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, rawContext.getLocation(), false, logger, UAC.alleles);
|
||||
if ( vcInput == null )
|
||||
return null;
|
||||
vc = new VariantContextBuilder(vcInput).source("UG_call").noID().referenceBaseForIndel(ref.getBase()).make();
|
||||
vc = new VariantContextBuilder(vcInput).source("UG_call").noID().referenceBaseForIndel(ref.getBase()).attributes(new HashMap<String, Object>()).filters(new HashSet<String>()).make();
|
||||
} else {
|
||||
// deal with bad/non-standard reference bases
|
||||
if ( !Allele.acceptableAlleleBases(new byte[]{ref.getBase()}) )
|
||||
|
|
|
|||
|
|
@ -0,0 +1,70 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 9/26/11
|
||||
*/
|
||||
|
||||
public class ContextCovariate implements ExperimentalCovariate {
|
||||
|
||||
private int CONTEXT_SIZE;
|
||||
private String allN = "";
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
CONTEXT_SIZE = RAC.CONTEXT_SIZE;
|
||||
|
||||
if (CONTEXT_SIZE <= 0)
|
||||
throw new UserException("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead");
|
||||
|
||||
// initialize allN given the size of the context
|
||||
for (int i = 0; i < CONTEXT_SIZE; i++)
|
||||
allN += "N";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
byte[] bases = read.getReadBases();
|
||||
for (int i = 0; i < read.getReadLength(); i++)
|
||||
comparable[i] = (i < CONTEXT_SIZE) ? allN : new String(Arrays.copyOfRange(bases, i - CONTEXT_SIZE, i));
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return str;
|
||||
}
|
||||
}
|
||||
|
|
@ -41,6 +41,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
|||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -76,20 +77,20 @@ import java.util.Map;
|
|||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
|
||||
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
|
||||
* It is a comma-separated text file relating the desired covariates to the number of such bases and their rate of mismatch in the genome, and its implied empirical quality score.
|
||||
*
|
||||
* The first 20 lines of such a file is shown below.
|
||||
* The first 20 lines of such a file is shown below.
|
||||
* * The file begins with a series of comment lines describing:
|
||||
* ** The number of counted loci
|
||||
* ** The number of counted bases
|
||||
* ** The number of skipped loci and the fraction skipped, due to presence in dbSNP or bad reference bases
|
||||
*
|
||||
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
|
||||
*
|
||||
* * After the comments appears a header line indicating which covariates were used as well as the ordering of elements in the subsequent records.
|
||||
*
|
||||
* * After the header, data records occur one per line until the end of the file. The first several items on a line are the values of the individual covariates and will change
|
||||
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
|
||||
* depending on which covariates were specified at runtime. The last three items are the data- that is, number of observations for this combination of covariates, number of
|
||||
* reference mismatches, and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*
|
||||
*
|
||||
* <pre>
|
||||
* # Counted Sites 19451059
|
||||
* # Counted Bases 56582018
|
||||
|
|
@ -128,13 +129,14 @@ import java.util.Map;
|
|||
* -cov DinucCovariate \
|
||||
* -recalFile my_reads.recal_data.csv
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
||||
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
|
||||
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
|
||||
@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
|
||||
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
|
||||
@By(DataSource.READS) // Only look at covered loci, not every loci of the reference file
|
||||
@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class})
|
||||
// Filter out all reads with zero or unavailable mapping quality
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
|
||||
// This walker requires both -I input.bam and -R reference.fasta
|
||||
@PartitionBy(PartitionType.LOCUS)
|
||||
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
|
||||
|
||||
|
|
@ -148,7 +150,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
/////////////////////////////
|
||||
// Shared Arguments
|
||||
/////////////////////////////
|
||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
@ArgumentCollection
|
||||
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
|
|
@ -159,7 +162,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
|
||||
* Please note however that the statistics reported by the tool will not accurately reflect those sites skipped by the -XL argument.
|
||||
*/
|
||||
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
|
||||
@Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
|
||||
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
|
||||
|
||||
/**
|
||||
|
|
@ -168,31 +171,31 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
|
||||
@Output(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the output covariates table recalibration file")
|
||||
@Gather(CountCovariatesGatherer.class)
|
||||
public PrintStream RECAL_FILE;
|
||||
|
||||
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
|
||||
@Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
|
||||
private boolean LIST_ONLY = false;
|
||||
|
||||
/**
|
||||
* See the -list argument to view available covariates.
|
||||
*/
|
||||
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
|
||||
@Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
|
||||
private String[] COVARIATES = null;
|
||||
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
|
||||
@Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
|
||||
private boolean USE_STANDARD_COVARIATES = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
|
||||
@Argument(fullName = "dont_sort_output", shortName = "unsorted", required = false, doc = "If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
|
||||
private boolean DONT_SORT_OUTPUT = false;
|
||||
|
||||
/**
|
||||
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
|
||||
*/
|
||||
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
@Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
|
||||
private boolean RUN_WITHOUT_DBSNP = false;
|
||||
|
||||
/////////////////////////////
|
||||
|
|
@ -216,6 +219,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* Adds the values of other to this, returning this
|
||||
*
|
||||
* @param other
|
||||
* @return this object
|
||||
*/
|
||||
|
|
@ -246,53 +250,55 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
*/
|
||||
public void initialize() {
|
||||
|
||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
if (RAC.FORCE_PLATFORM != null) {
|
||||
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
|
||||
}
|
||||
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>( Covariate.class ).getPlugins();
|
||||
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>( RequiredCovariate.class ).getPlugins();
|
||||
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>( StandardCovariate.class ).getPlugins();
|
||||
final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
final List<Class<? extends RequiredCovariate>> requiredClasses = new PluginManager<RequiredCovariate>(RequiredCovariate.class).getPlugins();
|
||||
final List<Class<? extends StandardCovariate>> standardClasses = new PluginManager<StandardCovariate>(StandardCovariate.class).getPlugins();
|
||||
|
||||
// Print and exit if that's what was requested
|
||||
if ( LIST_ONLY ) {
|
||||
logger.info( "Available covariates:" );
|
||||
for( Class<?> covClass : covariateClasses ) {
|
||||
logger.info( covClass.getSimpleName() );
|
||||
if (LIST_ONLY) {
|
||||
logger.info("Available covariates:");
|
||||
for (Class<?> covClass : covariateClasses) {
|
||||
logger.info(covClass.getSimpleName());
|
||||
}
|
||||
logger.info("");
|
||||
|
||||
System.exit( 0 ); // Early exit here because user requested it
|
||||
System.exit(0); // Early exit here because user requested it
|
||||
}
|
||||
|
||||
// Warn the user if no dbSNP file or other variant mask was specified
|
||||
if( knownSites.isEmpty() && !RUN_WITHOUT_DBSNP ) {
|
||||
if (knownSites.isEmpty() && !RUN_WITHOUT_DBSNP) {
|
||||
throw new UserException.CommandLineException("This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation.");
|
||||
}
|
||||
|
||||
// Initialize the requested covariates by parsing the -cov argument
|
||||
// First add the required covariates
|
||||
if( requiredClasses.size() == 2) { // readGroup and reported quality score
|
||||
requestedCovariates.add( new ReadGroupCovariate() ); // Order is important here
|
||||
requestedCovariates.add( new QualityScoreCovariate() );
|
||||
} else {
|
||||
if (requiredClasses.size() == 2) { // readGroup and reported quality score
|
||||
requestedCovariates.add(new ReadGroupCovariate()); // Order is important here
|
||||
requestedCovariates.add(new QualityScoreCovariate());
|
||||
}
|
||||
else {
|
||||
throw new UserException.CommandLineException("There are more required covariates than expected. The instantiation list needs to be updated with the new required covariate and in the correct order.");
|
||||
}
|
||||
// Next add the standard covariates if -standard was specified by the user
|
||||
if( USE_STANDARD_COVARIATES ) {
|
||||
if (USE_STANDARD_COVARIATES) {
|
||||
// We want the standard covariates to appear in a consistent order but the packageUtils method gives a random order
|
||||
// A list of Classes can't be sorted, but a list of Class names can be
|
||||
final List<String> standardClassNames = new ArrayList<String>();
|
||||
for( Class<?> covClass : standardClasses ) {
|
||||
standardClassNames.add( covClass.getName() );
|
||||
for (Class<?> covClass : standardClasses) {
|
||||
standardClassNames.add(covClass.getName());
|
||||
}
|
||||
Collections.sort(standardClassNames); // Sort the list of class names
|
||||
for( String className : standardClassNames ) {
|
||||
for( Class<?> covClass : standardClasses ) { // Find the class that matches this class name
|
||||
if( covClass.getName().equals( className ) ) {
|
||||
for (String className : standardClassNames) {
|
||||
for (Class<?> covClass : standardClasses) { // Find the class that matches this class name
|
||||
if (covClass.getName().equals(className)) {
|
||||
try {
|
||||
final Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
final Covariate covariate = (Covariate) covClass.newInstance();
|
||||
requestedCovariates.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
|
@ -301,17 +307,17 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
}
|
||||
}
|
||||
// Finally parse the -cov arguments that were provided, skipping over the ones already specified
|
||||
if( COVARIATES != null ) {
|
||||
for( String requestedCovariateString : COVARIATES ) {
|
||||
if (COVARIATES != null) {
|
||||
for (String requestedCovariateString : COVARIATES) {
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : covariateClasses ) {
|
||||
if( requestedCovariateString.equalsIgnoreCase( covClass.getSimpleName() ) ) { // -cov argument matches the class name for an implementing class
|
||||
for (Class<?> covClass : covariateClasses) {
|
||||
if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class
|
||||
foundClass = true;
|
||||
if( !requiredClasses.contains( covClass ) && (!USE_STANDARD_COVARIATES || !standardClasses.contains( covClass )) ) {
|
||||
if (!requiredClasses.contains(covClass) && (!USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) {
|
||||
try {
|
||||
// Now that we've found a matching class, try to instantiate it
|
||||
final Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
final Covariate covariate = (Covariate) covClass.newInstance();
|
||||
requestedCovariates.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
|
@ -319,20 +325,19 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new UserException.CommandLineException( "The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates." );
|
||||
if (!foundClass) {
|
||||
throw new UserException.CommandLineException("The requested covariate type (" + requestedCovariateString + ") isn't a valid covariate option. Use --list to see possible covariates.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||
cov.initialize( RAC ); // Initialize any covariate member variables using the shared argument collection
|
||||
logger.info("The covariates being used here: ");
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
logger.info("\t" + cov.getClass().getSimpleName());
|
||||
cov.initialize(RAC); // Initialize any covariate member variables using the shared argument collection
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
|
|
@ -341,62 +346,63 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* For each read at this locus get the various covariate values and increment that location in the map based on
|
||||
* whether or not the base matches the reference at this particular location
|
||||
* whether or not the base matches the reference at this particular location
|
||||
*
|
||||
* @param tracker The reference metadata tracker
|
||||
* @param ref The reference context
|
||||
* @param ref The reference context
|
||||
* @param context The alignment context
|
||||
* @return Returns 1, but this value isn't used in the reduce step
|
||||
*/
|
||||
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
|
||||
public CountedData map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
// Only use data from non-dbsnp sites
|
||||
// Assume every mismatch at a non-dbsnp site is indicative of poor quality
|
||||
CountedData counter = new CountedData();
|
||||
if( tracker.getValues(knownSites).size() == 0 ) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
|
||||
if (tracker.getValues(knownSites).size() == 0) { // If something here is in one of the knownSites tracks then skip over it, otherwise proceed
|
||||
// For each read at this locus
|
||||
for( final PileupElement p : context.getBasePileup() ) {
|
||||
final GATKSAMRecord gatkRead = (GATKSAMRecord) p.getRead();
|
||||
for (final PileupElement p : context.getBasePileup()) {
|
||||
final GATKSAMRecord gatkRead = p.getRead();
|
||||
int offset = p.getOffset();
|
||||
|
||||
if( gatkRead.containsTemporaryAttribute( SKIP_RECORD_ATTRIBUTE ) ) {
|
||||
if (gatkRead.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if( !gatkRead.containsTemporaryAttribute( SEEN_ATTRIBUTE ) )
|
||||
{
|
||||
gatkRead.setTemporaryAttribute( SEEN_ATTRIBUTE, true );
|
||||
RecalDataManager.parseSAMRecord( gatkRead, RAC );
|
||||
if (!gatkRead.containsTemporaryAttribute(SEEN_ATTRIBUTE)) {
|
||||
gatkRead.setTemporaryAttribute(SEEN_ATTRIBUTE, true);
|
||||
RecalDataManager.parseSAMRecord(gatkRead, RAC);
|
||||
|
||||
// Skip over reads with no calls in the color space if the user requested it
|
||||
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace( gatkRead ) ) {
|
||||
gatkRead.setTemporaryAttribute( SKIP_RECORD_ATTRIBUTE, true);
|
||||
if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) && RecalDataManager.checkNoCallColorSpace(gatkRead)) {
|
||||
gatkRead.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true);
|
||||
continue;
|
||||
}
|
||||
|
||||
RecalDataManager.parseColorSpace( gatkRead );
|
||||
gatkRead.setTemporaryAttribute( COVARS_ATTRIBUTE,
|
||||
RecalDataManager.computeCovariates( gatkRead, requestedCovariates ));
|
||||
RecalDataManager.parseColorSpace(gatkRead);
|
||||
gatkRead.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalDataManager.computeCovariates(gatkRead, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION));
|
||||
}
|
||||
|
||||
// Skip this position if base quality is zero
|
||||
if( gatkRead.getBaseQualities()[offset] > 0 ) {
|
||||
if (gatkRead.getBaseQualities()[offset] > 0) {
|
||||
|
||||
byte[] bases = gatkRead.getReadBases();
|
||||
byte refBase = ref.getBase();
|
||||
|
||||
// Skip if this base is an 'N' or etc.
|
||||
if( BaseUtils.isRegularBase( bases[offset] ) ) {
|
||||
if (BaseUtils.isRegularBase(bases[offset])) {
|
||||
|
||||
// SOLID bams have inserted the reference base into the read if the color space is inconsistent with the read base so skip it
|
||||
if( !gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||
!RecalDataManager.isInconsistentColorSpace( gatkRead, offset ) ) {
|
||||
if (!gatkRead.getReadGroup().getPlatform().toUpperCase().contains("SOLID") || RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING ||
|
||||
!RecalDataManager.isInconsistentColorSpace(gatkRead, offset)) {
|
||||
|
||||
// This base finally passed all the checks for a good base, so add it to the big data hashmap
|
||||
updateDataFromRead( counter, gatkRead, offset, refBase );
|
||||
updateDataFromRead(counter, gatkRead, offset, refBase);
|
||||
|
||||
} else { // calculate SOLID reference insertion rate
|
||||
if( refBase == bases[offset] ) {
|
||||
}
|
||||
else { // calculate SOLID reference insertion rate
|
||||
if (refBase == bases[offset]) {
|
||||
counter.solidInsertedReferenceBases++;
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
counter.otherColorSpaceInconsistency++;
|
||||
}
|
||||
}
|
||||
|
|
@ -404,7 +410,8 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
}
|
||||
}
|
||||
counter.countedSites++;
|
||||
} else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||
}
|
||||
else { // We skipped over the dbSNP site, and we are only processing every Nth locus
|
||||
counter.skippedSites++;
|
||||
updateMismatchCounts(counter, context, ref.getBase()); // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
}
|
||||
|
|
@ -412,7 +419,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
return counter;
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Update the mismatch / total_base counts for a given class of loci.
|
||||
*
|
||||
* @param counter The CountedData to be updated
|
||||
|
|
@ -420,13 +427,13 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* @param refBase The reference base
|
||||
*/
|
||||
private static void updateMismatchCounts(CountedData counter, final AlignmentContext context, final byte refBase) {
|
||||
for( PileupElement p : context.getBasePileup() ) {
|
||||
for (PileupElement p : context.getBasePileup()) {
|
||||
final byte readBase = p.getBase();
|
||||
final int readBaseIndex = BaseUtils.simpleBaseToBaseIndex(readBase);
|
||||
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||
final int refBaseIndex = BaseUtils.simpleBaseToBaseIndex(refBase);
|
||||
|
||||
if( readBaseIndex != -1 && refBaseIndex != -1 ) {
|
||||
if( readBaseIndex != refBaseIndex ) {
|
||||
if (readBaseIndex != -1 && refBaseIndex != -1) {
|
||||
if (readBaseIndex != refBaseIndex) {
|
||||
counter.novelCountsMM++;
|
||||
}
|
||||
counter.novelCountsBases++;
|
||||
|
|
@ -438,13 +445,14 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* Major workhorse routine for this walker.
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* adding one to the number of observations and potentially one to the number of mismatches
|
||||
* Lots of things are passed as parameters to this method as a strategy for optimizing the covariate.getValue calls
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
* @param counter Data structure which holds the counted bases
|
||||
* because pulling things out of the SAMRecord is an expensive operation.
|
||||
*
|
||||
* @param counter Data structure which holds the counted bases
|
||||
* @param gatkRead The SAMRecord holding all the data for this read
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param refBase The reference base at this locus
|
||||
* @param offset The offset in the read for this locus
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
private void updateDataFromRead(CountedData counter, final GATKSAMRecord gatkRead, final int offset, final byte refBase) {
|
||||
final Object[][] covars = (Comparable[][]) gatkRead.getTemporaryAttribute(COVARS_ATTRIBUTE);
|
||||
|
|
@ -452,10 +460,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
// Using the list of covariate values as a key, pick out the RecalDatum from the data HashMap
|
||||
final NestedHashMap data = dataManager.data; //optimization - create local reference
|
||||
RecalDatumOptimized datum = (RecalDatumOptimized) data.get( key );
|
||||
if( datum == null ) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
RecalDatumOptimized datum = (RecalDatumOptimized) data.get(key);
|
||||
if (datum == null) { // key doesn't exist yet in the map so make a new bucket and add it
|
||||
// initialized with zeros, will be incremented at end of method
|
||||
datum = (RecalDatumOptimized)data.put( new RecalDatumOptimized(), true, (Object[])key );
|
||||
datum = (RecalDatumOptimized) data.put(new RecalDatumOptimized(), true, (Object[]) key);
|
||||
}
|
||||
|
||||
// Need the bases to determine whether or not we have a mismatch
|
||||
|
|
@ -463,13 +471,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
final long curMismatches = datum.getNumMismatches();
|
||||
|
||||
// Add one to the number of observations and potentially one to the number of mismatches
|
||||
datum.incrementBaseCounts( base, refBase );
|
||||
datum.incrementBaseCounts(base, refBase);
|
||||
counter.countedBases++;
|
||||
counter.novelCountsBases++;
|
||||
counter.novelCountsMM += datum.getNumMismatches() - curMismatches; // For sanity check to ensure novel mismatch rate vs dnsnp mismatch rate is reasonable
|
||||
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
|
|
@ -478,6 +485,7 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* Initialize the reduce step by creating a PrintStream from the filename specified as an argument to the walker.
|
||||
*
|
||||
* @return returns A PrintStream created from the -recalFile filename argument specified to the walker
|
||||
*/
|
||||
public CountedData reduceInit() {
|
||||
|
|
@ -486,11 +494,12 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
|
||||
/**
|
||||
* The Reduce method doesn't do anything for this walker.
|
||||
*
|
||||
* @param mapped Result of the map. This value is immediately ignored.
|
||||
* @param sum The summing CountedData used to output the CSV data
|
||||
* @param sum The summing CountedData used to output the CSV data
|
||||
* @return returns The sum used to output the CSV data
|
||||
*/
|
||||
public CountedData reduce( CountedData mapped, CountedData sum ) {
|
||||
public CountedData reduce(CountedData mapped, CountedData sum) {
|
||||
// Do a dbSNP sanity check every so often
|
||||
return validatingDbsnpMismatchRate(sum.add(mapped));
|
||||
}
|
||||
|
|
@ -499,16 +508,15 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
* Validate the dbSNP reference mismatch rates.
|
||||
*/
|
||||
private CountedData validatingDbsnpMismatchRate(CountedData counter) {
|
||||
if( ++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY ) {
|
||||
if (++counter.lociSinceLastDbsnpCheck >= DBSNP_VALIDATION_CHECK_FREQUENCY) {
|
||||
counter.lociSinceLastDbsnpCheck = 0;
|
||||
|
||||
if( counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L ) {
|
||||
final double fractionMM_novel = (double)counter.novelCountsMM / (double)counter.novelCountsBases;
|
||||
final double fractionMM_dbsnp = (double)counter.dbSNPCountsMM / (double)counter.dbSNPCountsBases;
|
||||
if (counter.novelCountsBases != 0L && counter.dbSNPCountsBases != 0L) {
|
||||
final double fractionMM_novel = (double) counter.novelCountsMM / (double) counter.novelCountsBases;
|
||||
final double fractionMM_dbsnp = (double) counter.dbSNPCountsMM / (double) counter.dbSNPCountsBases;
|
||||
|
||||
if( fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel ) {
|
||||
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " +
|
||||
String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel) );
|
||||
if (fractionMM_dbsnp < DBSNP_VS_NOVEL_MISMATCH_RATE * fractionMM_novel) {
|
||||
Utils.warnUser("The variation rate at the supplied list of known variant sites seems suspiciously low. Please double-check that the correct ROD is being used. " + String.format("[dbSNP variation rate = %.4f, novel variation rate = %.4f]", fractionMM_dbsnp, fractionMM_novel));
|
||||
DBSNP_VALIDATION_CHECK_FREQUENCY *= 2; // Don't annoyingly output the warning message every megabase of a large file
|
||||
}
|
||||
}
|
||||
|
|
@ -517,47 +525,50 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
return counter;
|
||||
}
|
||||
|
||||
public CountedData treeReduce( CountedData sum1, CountedData sum2 ) {
|
||||
public CountedData treeReduce(CountedData sum1, CountedData sum2) {
|
||||
return validatingDbsnpMismatchRate(sum1.add(sum2));
|
||||
}
|
||||
|
||||
/**
|
||||
* Write out the full data hashmap to disk in CSV format
|
||||
*
|
||||
* @param sum The CountedData to write out to RECAL_FILE
|
||||
*/
|
||||
public void onTraversalDone( CountedData sum ) {
|
||||
logger.info( "Writing raw recalibration data..." );
|
||||
if( sum.countedBases == 0L ) {
|
||||
public void onTraversalDone(CountedData sum) {
|
||||
logger.info("Writing raw recalibration data...");
|
||||
if (sum.countedBases == 0L) {
|
||||
throw new UserException.BadInput("Could not find any usable data in the input BAM file(s).");
|
||||
}
|
||||
outputToCSV( sum, RECAL_FILE );
|
||||
logger.info( "...done!" );
|
||||
outputToCSV(sum, RECAL_FILE);
|
||||
logger.info("...done!");
|
||||
}
|
||||
|
||||
/**
|
||||
* For each entry (key-value pair) in the data hashmap output the Covariate's values as well as the RecalDatum's data in CSV format
|
||||
*
|
||||
* @param recalTableStream The PrintStream to write out to
|
||||
*/
|
||||
private void outputToCSV( CountedData sum, final PrintStream recalTableStream ) {
|
||||
private void outputToCSV(CountedData sum, final PrintStream recalTableStream) {
|
||||
recalTableStream.printf("# Counted Sites %d%n", sum.countedSites);
|
||||
recalTableStream.printf("# Counted Bases %d%n", sum.countedBases);
|
||||
recalTableStream.printf("# Skipped Sites %d%n", sum.skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double)sum.countedSites / sum.skippedSites);
|
||||
recalTableStream.printf("# Fraction Skipped 1 / %.0f bp%n", (double) sum.countedSites / sum.skippedSites);
|
||||
|
||||
if( sum.solidInsertedReferenceBases != 0 ) {
|
||||
if (sum.solidInsertedReferenceBases != 0) {
|
||||
recalTableStream.printf("# Fraction SOLiD inserted reference 1 / %.0f bases%n", (double) sum.countedBases / sum.solidInsertedReferenceBases);
|
||||
recalTableStream.printf("# Fraction other color space inconsistencies 1 / %.0f bases%n", (double) sum.countedBases / sum.otherColorSpaceInconsistency);
|
||||
}
|
||||
|
||||
// Output header saying which covariates were used and in what order
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
recalTableStream.print( cov.getClass().getSimpleName().split("Covariate")[0] + "," );
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
recalTableStream.print(cov.getClass().getSimpleName().split("Covariate")[0] + ",");
|
||||
}
|
||||
recalTableStream.println("nObservations,nMismatches,Qempirical");
|
||||
|
||||
if( DONT_SORT_OUTPUT ) {
|
||||
if (DONT_SORT_OUTPUT) {
|
||||
printMappings(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
printMappingsSorted(recalTableStream, 0, new Object[requestedCovariates.size()], dataManager.data.data);
|
||||
}
|
||||
|
||||
|
|
@ -565,45 +576,47 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
|
|||
recalTableStream.println(TableRecalibrationWalker.EOF_MARKER);
|
||||
}
|
||||
|
||||
private void printMappingsSorted( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
private void printMappingsSorted(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
final ArrayList<Comparable> keyList = new ArrayList<Comparable>();
|
||||
for( Object comp : data.keySet() ) {
|
||||
for (Object comp : data.keySet()) {
|
||||
keyList.add((Comparable) comp);
|
||||
}
|
||||
|
||||
Collections.sort(keyList);
|
||||
|
||||
for( Comparable comp : keyList ) {
|
||||
for (Comparable comp : keyList) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
for (Object compToPrint : key) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
recalTableStream.print(compToPrint + ",");
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappingsSorted( recalTableStream, curPos + 1, key, (Map) val );
|
||||
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
printMappingsSorted(recalTableStream, curPos + 1, key, (Map) val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void printMappings( final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
for( Object comp : data.keySet() ) {
|
||||
private void printMappings(final PrintStream recalTableStream, final int curPos, final Object[] key, final Map data) {
|
||||
for (Object comp : data.keySet()) {
|
||||
key[curPos] = comp;
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatumOptimized ) { // We are at the end of the nested hash maps
|
||||
if (val instanceof RecalDatumOptimized) { // We are at the end of the nested hash maps
|
||||
// For each Covariate in the key
|
||||
for( Object compToPrint : key ) {
|
||||
for (Object compToPrint : key) {
|
||||
// Output the Covariate's value
|
||||
recalTableStream.print( compToPrint + "," );
|
||||
recalTableStream.print(compToPrint + ",");
|
||||
}
|
||||
// Output the RecalDatum entry
|
||||
recalTableStream.println( ((RecalDatumOptimized)val).outputToCSV() );
|
||||
} else { // Another layer in the nested hash map
|
||||
printMappings( recalTableStream, curPos + 1, key, (Map) val );
|
||||
recalTableStream.println(((RecalDatumOptimized) val).outputToCSV());
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
printMappings(recalTableStream, curPos + 1, key, (Map) val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -32,24 +33,24 @@ import net.sf.samtools.SAMRecord;
|
|||
* User: rpoplin
|
||||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read, offset, and corresponding reference bases
|
||||
* The Covariate interface. A Covariate is a feature used in the recalibration that can be picked out of the read.
|
||||
* In general most error checking and adjustments to the data are done before the call to the covariates getValue methods in order to speed up the code.
|
||||
* This unfortunately muddies the code, but most of these corrections can be done per read while the covariates get called per base, resulting in a big speed up.
|
||||
*/
|
||||
|
||||
public interface Covariate {
|
||||
public void initialize( RecalibrationArgumentCollection RAC ); // Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public Comparable getValue( String str ); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public void getValues( SAMRecord read, Comparable[] comparable ); //Takes an array of size (at least) read.getReadLength() and fills it with covariate
|
||||
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
|
||||
//read-specific calculations to be done just once rather than for each offset.
|
||||
public void initialize(RecalibrationArgumentCollection RAC); // Initialize any member variables using the command-line arguments passed to the walkers
|
||||
|
||||
public Comparable getValue(String str); // Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
|
||||
public void getValues(GATKSAMRecord read, Comparable[] comparable, BaseRecalibration.BaseRecalibrationType modelType);
|
||||
//Takes an array of size (at least) read.getReadLength() and fills it with covariate
|
||||
//values for each position in the read. This method was created as an optimization over calling getValue( read, offset ) for each offset and allows
|
||||
//read-specific calculations to be done just once rather than for each offset.
|
||||
}
|
||||
|
||||
interface RequiredCovariate extends Covariate {
|
||||
}
|
||||
interface RequiredCovariate extends Covariate {}
|
||||
|
||||
interface StandardCovariate extends Covariate {
|
||||
}
|
||||
interface StandardCovariate extends Covariate {}
|
||||
|
||||
interface ExperimentalCovariate extends Covariate {
|
||||
}
|
||||
interface ExperimentalCovariate extends Covariate {}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.EnumSet;
|
||||
|
|
@ -39,67 +39,69 @@ import java.util.EnumSet;
|
|||
* Date: Oct 30, 2009
|
||||
*
|
||||
* The Cycle covariate.
|
||||
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
|
||||
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
|
||||
* For example, for the read: AAACCCCGAAATTTTTACTG
|
||||
* the cycle would be 11111111222333333344
|
||||
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
|
||||
* For Solexa the cycle is simply the position in the read (counting backwards if it is a negative strand read)
|
||||
* For 454 the cycle is the TACG flow cycle, that is, each flow grabs all the TACG's in order in a single cycle
|
||||
* For example, for the read: AAACCCCGAAATTTTTACTG
|
||||
* the cycle would be 11111111222333333344
|
||||
* For SOLiD the cycle is a more complicated mixture of ligation cycle and primer round
|
||||
*/
|
||||
|
||||
public class CycleCovariate implements StandardCovariate {
|
||||
private final static EnumSet<NGSPlatform> DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS);
|
||||
private final static EnumSet<NGSPlatform> FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT);
|
||||
private final static EnumSet<NGSPlatform> FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT);
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
if( RAC.DEFAULT_PLATFORM != null ) {
|
||||
if( RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SLX" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ILLUMINA" ) ||
|
||||
RAC.DEFAULT_PLATFORM.contains( "454" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "SOLID" ) || RAC.DEFAULT_PLATFORM.equalsIgnoreCase( "ABI_SOLID" ) ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
if (RAC.DEFAULT_PLATFORM != null) {
|
||||
if (RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SLX") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ILLUMINA") ||
|
||||
RAC.DEFAULT_PLATFORM.contains("454") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("SOLID") || RAC.DEFAULT_PLATFORM.equalsIgnoreCase("ABI_SOLID")) {
|
||||
// nothing to do
|
||||
} else {
|
||||
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM +") is not a recognized platform. Implemented options are illumina, 454, and solid");
|
||||
}
|
||||
else {
|
||||
throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform. Implemented options are illumina, 454, and solid");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
|
||||
//-----------------------------
|
||||
// Illumina, Solid, PacBio, and Complete Genomics
|
||||
//-----------------------------
|
||||
|
||||
final NGSPlatform ngsPlatform = ((GATKSAMRecord)read).getNGSPlatform();
|
||||
if( DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform) ) {
|
||||
final NGSPlatform ngsPlatform = read.getNGSPlatform();
|
||||
if (DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform)) {
|
||||
final int init;
|
||||
final int increment;
|
||||
if( !read.getReadNegativeStrandFlag() ) {
|
||||
if (!read.getReadNegativeStrandFlag()) {
|
||||
// Differentiate between first and second of pair.
|
||||
// The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group
|
||||
// to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair.
|
||||
// Therefore the cycle covariate must differentiate between first and second of pair reads.
|
||||
// This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because
|
||||
// the current sequential model would consider the effects independently instead of jointly.
|
||||
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
|
||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
||||
//second of pair, positive strand
|
||||
init = -1;
|
||||
increment = -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
else {
|
||||
//first of pair, positive strand
|
||||
init = 1;
|
||||
increment = 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) {
|
||||
}
|
||||
else {
|
||||
if (read.getReadPairedFlag() && read.getSecondOfPairFlag()) {
|
||||
//second of pair, negative strand
|
||||
init = -read.getReadLength();
|
||||
increment = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
else {
|
||||
//first of pair, negative strand
|
||||
init = read.getReadLength();
|
||||
increment = -1;
|
||||
|
|
@ -107,7 +109,7 @@ public class CycleCovariate implements StandardCovariate {
|
|||
}
|
||||
|
||||
int cycle = init;
|
||||
for(int i = 0; i < read.getReadLength(); i++) {
|
||||
for (int i = 0; i < read.getReadLength(); i++) {
|
||||
comparable[i] = cycle;
|
||||
cycle += increment;
|
||||
}
|
||||
|
|
@ -116,7 +118,7 @@ public class CycleCovariate implements StandardCovariate {
|
|||
//-----------------------------
|
||||
// 454 and Ion Torrent
|
||||
//-----------------------------
|
||||
else if( FLOW_CYCLE_PLATFORMS.contains(ngsPlatform) ) {
|
||||
else if (FLOW_CYCLE_PLATFORMS.contains(ngsPlatform)) {
|
||||
|
||||
final int readLength = read.getReadLength();
|
||||
final byte[] bases = read.getReadBases();
|
||||
|
|
@ -133,38 +135,78 @@ public class CycleCovariate implements StandardCovariate {
|
|||
|
||||
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change
|
||||
// For example, AAAAAAA was probably read in two flow cycles but here we count it as one
|
||||
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
|
||||
if (!read.getReadNegativeStrandFlag()) { // Forward direction
|
||||
int iii = 0;
|
||||
while( iii < readLength )
|
||||
{
|
||||
while( iii < readLength && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii++; }
|
||||
while( iii < readLength && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii++; }
|
||||
while( iii < readLength && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii++; }
|
||||
while( iii < readLength && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii++; }
|
||||
if( iii < readLength ) { if (multiplyByNegative1) cycle--; else cycle++; }
|
||||
if( iii < readLength && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii++; }
|
||||
while (iii < readLength) {
|
||||
while (iii < readLength && bases[iii] == (byte) 'T') {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'A') {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'C') {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
while (iii < readLength && bases[iii] == (byte) 'G') {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
if (iii < readLength) {
|
||||
if (multiplyByNegative1)
|
||||
cycle--;
|
||||
else
|
||||
cycle++;
|
||||
}
|
||||
if (iii < readLength && !BaseUtils.isRegularBase(bases[iii])) {
|
||||
comparable[iii] = cycle;
|
||||
iii++;
|
||||
}
|
||||
|
||||
}
|
||||
} else { // Negative direction
|
||||
int iii = readLength-1;
|
||||
while( iii >= 0 )
|
||||
{
|
||||
while( iii >= 0 && bases[iii] == (byte)'T' ) { comparable[iii] = cycle; iii--; }
|
||||
while( iii >= 0 && bases[iii] == (byte)'A' ) { comparable[iii] = cycle; iii--; }
|
||||
while( iii >= 0 && bases[iii] == (byte)'C' ) { comparable[iii] = cycle; iii--; }
|
||||
while( iii >= 0 && bases[iii] == (byte)'G' ) { comparable[iii] = cycle; iii--; }
|
||||
if( iii >= 0 ) { if (multiplyByNegative1) cycle--; else cycle++; }
|
||||
if( iii >= 0 && !BaseUtils.isRegularBase(bases[iii]) ) { comparable[iii] = cycle; iii--; }
|
||||
}
|
||||
else { // Negative direction
|
||||
int iii = readLength - 1;
|
||||
while (iii >= 0) {
|
||||
while (iii >= 0 && bases[iii] == (byte) 'T') {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'A') {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'C') {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
while (iii >= 0 && bases[iii] == (byte) 'G') {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
if (iii >= 0) {
|
||||
if (multiplyByNegative1)
|
||||
cycle--;
|
||||
else
|
||||
cycle++;
|
||||
}
|
||||
if (iii >= 0 && !BaseUtils.isRegularBase(bases[iii])) {
|
||||
comparable[iii] = cycle;
|
||||
iii--;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
else {
|
||||
throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid");
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.HashMap;
|
||||
|
||||
|
|
@ -42,63 +43,30 @@ import java.util.HashMap;
|
|||
|
||||
public class DinucCovariate implements StandardCovariate {
|
||||
|
||||
private static final byte NO_CALL = (byte)'N';
|
||||
private static final byte NO_CALL = (byte) 'N';
|
||||
private static final Dinuc NO_DINUC = new Dinuc(NO_CALL, NO_CALL);
|
||||
|
||||
private HashMap<Integer, Dinuc> dinucHashMap;
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
final byte[] BASES = { (byte)'A', (byte)'C', (byte)'G', (byte)'T' };
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
final byte[] BASES = {(byte) 'A', (byte) 'C', (byte) 'G', (byte) 'T'};
|
||||
dinucHashMap = new HashMap<Integer, Dinuc>();
|
||||
for( byte byte1 : BASES ) {
|
||||
for( byte byte2: BASES ) {
|
||||
dinucHashMap.put( Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2) ); // This might seem silly, but Strings are too slow
|
||||
for (byte byte1 : BASES) {
|
||||
for (byte byte2 : BASES) {
|
||||
dinucHashMap.put(Dinuc.hashBytes(byte1, byte2), new Dinuc(byte1, byte2)); // This might seem silly, but Strings are too slow
|
||||
}
|
||||
}
|
||||
// Add the "no dinuc" entry too
|
||||
dinucHashMap.put( Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC );
|
||||
dinucHashMap.put(Dinuc.hashBytes(NO_CALL, NO_CALL), NO_DINUC);
|
||||
}
|
||||
|
||||
/*
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
|
||||
byte base;
|
||||
byte prevBase;
|
||||
final byte[] bases = read.getReadBases();
|
||||
// If this is a negative strand read then we need to reverse the direction for our previous base
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
// No dinuc at the beginning of the read
|
||||
if( offset == bases.length-1 ) {
|
||||
return NO_DINUC;
|
||||
}
|
||||
base = (byte)BaseUtils.simpleComplement( (char)(bases[offset]) );
|
||||
// Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
prevBase = (byte)BaseUtils.simpleComplement( (char)(bases[offset + 1]) );
|
||||
} else {
|
||||
// No dinuc at the beginning of the read
|
||||
if( offset == 0 ) {
|
||||
return NO_DINUC;
|
||||
}
|
||||
base = bases[offset];
|
||||
// Note: We are using the previous base in the read, not the previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
prevBase = bases[offset - 1];
|
||||
}
|
||||
|
||||
// Make sure the previous base is good
|
||||
if( !BaseUtils.isRegularBase( prevBase ) ) {
|
||||
return NO_DINUC;
|
||||
}
|
||||
|
||||
return dinucHashMap.get( Dinuc.hashBytes( prevBase, base ) );
|
||||
}
|
||||
*/
|
||||
|
||||
/**
|
||||
* Takes an array of size (at least) read.getReadLength() and fills it with the covariate values for each position in the read.
|
||||
*/
|
||||
public void getValues( SAMRecord read, Comparable[] result ) {
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
final HashMap<Integer, Dinuc> dinucHashMapRef = this.dinucHashMap; //optimize access to dinucHashMap
|
||||
final int readLength = read.getReadLength();
|
||||
final boolean negativeStrand = read.getReadNegativeStrandFlag();
|
||||
|
|
@ -108,50 +76,51 @@ public class DinucCovariate implements StandardCovariate {
|
|||
int offset = 0;
|
||||
// If this is a negative strand read then we need to reverse the direction for our previous base
|
||||
|
||||
if(negativeStrand) {
|
||||
if (negativeStrand) {
|
||||
bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place
|
||||
}
|
||||
result[0] = NO_DINUC; // No dinuc at the beginning of the read
|
||||
comparable[0] = NO_DINUC; // No dinuc at the beginning of the read
|
||||
|
||||
prevBase = bases[0];
|
||||
offset++;
|
||||
while(offset < readLength) {
|
||||
// Note: We are using the previous base in the read, not the
|
||||
// previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
base = bases[offset];
|
||||
if( BaseUtils.isRegularBase( prevBase ) ) {
|
||||
result[offset] = dinucHashMapRef.get( Dinuc.hashBytes( prevBase, base ) );
|
||||
} else {
|
||||
result[offset] = NO_DINUC;
|
||||
}
|
||||
while (offset < readLength) {
|
||||
// Note: We are using the previous base in the read, not the
|
||||
// previous base in the reference. This is done in part to be consistent with unmapped reads.
|
||||
base = bases[offset];
|
||||
if (BaseUtils.isRegularBase(prevBase)) {
|
||||
comparable[offset] = dinucHashMapRef.get(Dinuc.hashBytes(prevBase, base));
|
||||
}
|
||||
else {
|
||||
comparable[offset] = NO_DINUC;
|
||||
}
|
||||
|
||||
offset++;
|
||||
prevBase = base;
|
||||
offset++;
|
||||
prevBase = base;
|
||||
}
|
||||
if(negativeStrand) {
|
||||
reverse( result );
|
||||
if (negativeStrand) {
|
||||
reverse(comparable);
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
byte[] bytes = str.getBytes();
|
||||
final Dinuc returnDinuc = dinucHashMap.get( Dinuc.hashBytes( bytes[0], bytes[1] ) );
|
||||
if( returnDinuc.compareTo(NO_DINUC) == 0 ) {
|
||||
final Dinuc returnDinuc = dinucHashMap.get(Dinuc.hashBytes(bytes[0], bytes[1]));
|
||||
if (returnDinuc.compareTo(NO_DINUC) == 0) {
|
||||
return null;
|
||||
}
|
||||
return returnDinuc;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reverses the given array in place.
|
||||
*
|
||||
* @param array
|
||||
* @param array any array
|
||||
*/
|
||||
private static void reverse(final Comparable[] array) {
|
||||
final int arrayLength = array.length;
|
||||
for(int l = 0, r = arrayLength - 1; l < r; l++, r--) {
|
||||
for (int l = 0, r = arrayLength - 1; l < r; l++, r--) {
|
||||
final Comparable temp = array[l];
|
||||
array[l] = array[r];
|
||||
array[r] = temp;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
|
|
@ -38,55 +40,57 @@ import net.sf.samtools.SAMRecord;
|
|||
|
||||
public class GCContentCovariate implements ExperimentalCovariate {
|
||||
|
||||
int numBack = 7;
|
||||
private int numBack = 7;
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
numBack = RAC.HOMOPOLYMER_NBACK;
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
private Comparable getValue(final SAMRecord read, final int offset) {
|
||||
|
||||
// ATTGCCCCGTAAAAAAAGAGAA
|
||||
// 0000123456654321001122
|
||||
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) ) {
|
||||
if (read.getReadGroup().getPlatform().equalsIgnoreCase("ILLUMINA") || read.getReadGroup().getPlatform().equalsIgnoreCase("SLX")) {
|
||||
int numGC = 0;
|
||||
int startPos = 0;
|
||||
int stopPos = 0;
|
||||
int startPos;
|
||||
int stopPos;
|
||||
final byte[] bases = read.getReadBases();
|
||||
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
|
||||
if (!read.getReadNegativeStrandFlag()) { // Forward direction
|
||||
startPos = Math.max(offset - numBack, 0);
|
||||
stopPos = Math.max(offset - 1, 0);
|
||||
} else { // Negative direction
|
||||
}
|
||||
else { // Negative direction
|
||||
startPos = Math.min(offset + 2, bases.length);
|
||||
stopPos = Math.min(offset + numBack + 1, bases.length);
|
||||
}
|
||||
|
||||
for( int iii = startPos; iii < stopPos; iii++ ) {
|
||||
if( bases[iii] == (byte)'G' || bases[iii] == (byte)'C' ) {
|
||||
for (int iii = startPos; iii < stopPos; iii++) {
|
||||
if (bases[iii] == (byte) 'G' || bases[iii] == (byte) 'C') {
|
||||
numGC++;
|
||||
}
|
||||
}
|
||||
|
||||
return numGC;
|
||||
} else { // This effect is specific to the Illumina platform
|
||||
}
|
||||
else { // This effect is specific to the Illumina platform
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
for(int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
for (int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -1,6 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -40,15 +42,16 @@ import net.sf.samtools.SAMRecord;
|
|||
|
||||
public class HomopolymerCovariate implements ExperimentalCovariate {
|
||||
|
||||
int numBack = 7;
|
||||
private int numBack;
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
numBack = RAC.HOMOPOLYMER_NBACK;
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
private Comparable getValue(final SAMRecord read, final int offset) {
|
||||
|
||||
// This block of code is for if you don't want to only count consecutive bases
|
||||
// ATTGCCCCGTAAAAAAAAATA
|
||||
|
|
@ -75,13 +78,14 @@ public class HomopolymerCovariate implements ExperimentalCovariate {
|
|||
int numAgree = 0; // The number of consecutive bases that agree with you in the previous numBack bases of the read
|
||||
final byte[] bases = read.getReadBases();
|
||||
int iii = offset;
|
||||
if( !read.getReadNegativeStrandFlag() ) { // Forward direction
|
||||
while( iii <= bases.length-2 && bases[iii] == bases[iii+1] && numAgree < numBack ) {
|
||||
if (!read.getReadNegativeStrandFlag()) { // Forward direction
|
||||
while (iii <= bases.length - 2 && bases[iii] == bases[iii + 1] && numAgree < numBack) {
|
||||
numAgree++;
|
||||
iii++;
|
||||
}
|
||||
} else { // Negative direction
|
||||
while( iii >= 1 && bases[iii] == bases[iii-1] && numAgree < numBack ) {
|
||||
}
|
||||
else { // Negative direction
|
||||
while (iii >= 1 && bases[iii] == bases[iii - 1] && numAgree < numBack) {
|
||||
numAgree++;
|
||||
iii--;
|
||||
}
|
||||
|
|
@ -90,15 +94,16 @@ public class HomopolymerCovariate implements ExperimentalCovariate {
|
|||
return numAgree;
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
for(int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
for (int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -38,23 +39,25 @@ import net.sf.samtools.SAMRecord;
|
|||
public class MappingQualityCovariate implements ExperimentalCovariate {
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
private Comparable getValue(final GATKSAMRecord read) {
|
||||
return read.getMappingQuality();
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
for(int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
for (int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = getValue(read); // BUGBUG: this can be optimized
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -41,34 +43,37 @@ public class MinimumNQSCovariate implements ExperimentalCovariate {
|
|||
private int windowReach; // How far in each direction from the current base to look
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
windowReach = RAC.WINDOW_SIZE / 2; // integer division
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
private Comparable getValue(final SAMRecord read, final int offset) {
|
||||
|
||||
// Loop over the list of base quality scores in the window and find the minimum
|
||||
final byte[] quals = read.getBaseQualities();
|
||||
int minQual = quals[offset];
|
||||
final int minIndex = Math.max(offset - windowReach, 0);
|
||||
final int maxIndex = Math.min(offset + windowReach, quals.length - 1);
|
||||
for ( int iii = minIndex; iii < maxIndex; iii++ ) {
|
||||
if( quals[iii] < minQual ) {
|
||||
for (int iii = minIndex; iii < maxIndex; iii++) {
|
||||
if (quals[iii] < minQual) {
|
||||
minQual = quals[iii];
|
||||
}
|
||||
}
|
||||
return minQual;
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
for(int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
for (int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -39,27 +41,29 @@ import net.sf.samtools.SAMRecord;
|
|||
public class PositionCovariate implements ExperimentalCovariate {
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
private Comparable getValue(final SAMRecord read, final int offset) {
|
||||
int cycle = offset;
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
cycle = read.getReadLength() - (offset + 1);
|
||||
}
|
||||
return cycle;
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
for(int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
for (int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -33,38 +35,42 @@ import net.sf.samtools.SAMRecord;
|
|||
* Date: Nov 13, 2009
|
||||
*
|
||||
* The Primer Round covariate.
|
||||
* For Solexa and 454 this is the same value of the length of the read.
|
||||
* For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
||||
* For Solexa and 454 this is the same value of the length of the read.
|
||||
* For SOLiD this is different for each position according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
||||
*/
|
||||
|
||||
public class PrimerRoundCovariate implements ExperimentalCovariate {
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
}
|
||||
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" ) ) {
|
||||
private Comparable getValue(final SAMRecord read, final int offset) {
|
||||
if (read.getReadGroup().getPlatform().equalsIgnoreCase("SOLID") || read.getReadGroup().getPlatform().equalsIgnoreCase("ABI_SOLID")) {
|
||||
int pos = offset;
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
pos = read.getReadLength() - (offset + 1);
|
||||
}
|
||||
return pos % 5; // the primer round according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
return 1; // nothing to do here because it is always the same
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
}
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
for(int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
for (int iii = 0; iii < read.getReadLength(); iii++) {
|
||||
comparable[iii] = getValue(read, iii); // BUGBUG: this can be optimized
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -38,26 +41,27 @@ import net.sf.samtools.SAMRecord;
|
|||
public class QualityScoreCovariate implements RequiredCovariate {
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
}
|
||||
|
||||
/*
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
return (int)(read.getBaseQualities()[offset]);
|
||||
}
|
||||
*/
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
byte[] baseQualities = read.getBaseQualities();
|
||||
for(int i = 0; i < read.getReadLength(); i++) {
|
||||
comparable[i] = (int) baseQualities[i];
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
if (modelType == BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION) {
|
||||
byte[] baseQualities = read.getBaseQualities();
|
||||
for (int i = 0; i < read.getReadLength(); i++) {
|
||||
comparable[i] = (int) baseQualities[i];
|
||||
}
|
||||
}
|
||||
else { // model == BASE_INSERTION || model == BASE_DELETION
|
||||
Arrays.fill(comparable, 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will
|
||||
// be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
return Integer.parseInt( str );
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return Integer.parseInt(str);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -35,33 +36,26 @@ import net.sf.samtools.SAMRecord;
|
|||
* The Read Group covariate.
|
||||
*/
|
||||
|
||||
public class ReadGroupCovariate implements RequiredCovariate{
|
||||
|
||||
public static final String defaultReadGroup = "DefaultReadGroup";
|
||||
public class ReadGroupCovariate implements RequiredCovariate {
|
||||
|
||||
// Initialize any member variables using the command-line arguments passed to the walkers
|
||||
public void initialize( final RecalibrationArgumentCollection RAC ) {
|
||||
@Override
|
||||
public void initialize(final RecalibrationArgumentCollection RAC) {
|
||||
}
|
||||
|
||||
/*
|
||||
// Used to pick out the covariate's value from attributes of the read
|
||||
public final Comparable getValue( final SAMRecord read, final int offset ) {
|
||||
return read.getReadGroup().getReadGroupId();
|
||||
}
|
||||
*/
|
||||
|
||||
public void getValues(SAMRecord read, Comparable[] comparable) {
|
||||
@Override
|
||||
public void getValues(final GATKSAMRecord read, final Comparable[] comparable, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
final String readGroupId = read.getReadGroup().getReadGroupId();
|
||||
for(int i = 0; i < read.getReadLength(); i++) {
|
||||
for (int i = 0; i < read.getReadLength(); i++) {
|
||||
comparable[i] = readGroupId;
|
||||
}
|
||||
}
|
||||
|
||||
// Used to get the covariate's value from input csv file in TableRecalibrationWalker
|
||||
public final Comparable getValue( final String str ) {
|
||||
@Override
|
||||
public final Comparable getValue(final String str) {
|
||||
return str;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,8 +25,6 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
|
||||
import net.sf.samtools.SAMReadGroupRecord;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
|
@ -34,9 +32,11 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
|
@ -67,42 +67,57 @@ public class RecalDataManager {
|
|||
private static boolean warnUserNullPlatform = false;
|
||||
|
||||
public enum SOLID_RECAL_MODE {
|
||||
/** Treat reference inserted bases as reference matching bases. Very unsafe! */
|
||||
/**
|
||||
* Treat reference inserted bases as reference matching bases. Very unsafe!
|
||||
*/
|
||||
DO_NOTHING,
|
||||
/** Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. */
|
||||
/**
|
||||
* Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option.
|
||||
*/
|
||||
SET_Q_ZERO,
|
||||
/** In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. */
|
||||
/**
|
||||
* In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV.
|
||||
*/
|
||||
SET_Q_ZERO_BASE_N,
|
||||
/** Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. */
|
||||
/**
|
||||
* Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference.
|
||||
*/
|
||||
REMOVE_REF_BIAS
|
||||
}
|
||||
|
||||
public enum SOLID_NOCALL_STRATEGY {
|
||||
/** When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. */
|
||||
/**
|
||||
* When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option.
|
||||
*/
|
||||
THROW_EXCEPTION,
|
||||
/** Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. */
|
||||
/**
|
||||
* Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare.
|
||||
*/
|
||||
LEAVE_READ_UNRECALIBRATED,
|
||||
/** Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. */
|
||||
/**
|
||||
* Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses.
|
||||
*/
|
||||
PURGE_READ
|
||||
}
|
||||
|
||||
RecalDataManager() {
|
||||
public RecalDataManager() {
|
||||
data = new NestedHashMap();
|
||||
dataCollapsedReadGroup = null;
|
||||
dataCollapsedQualityScore = null;
|
||||
dataCollapsedByCovariate = null;
|
||||
}
|
||||
|
||||
RecalDataManager( final boolean createCollapsedTables, final int numCovariates ) {
|
||||
if( createCollapsedTables ) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
|
||||
public RecalDataManager(final boolean createCollapsedTables, final int numCovariates) {
|
||||
if (createCollapsedTables) { // Initialize all the collapsed tables, only used by TableRecalibrationWalker
|
||||
data = null;
|
||||
dataCollapsedReadGroup = new NestedHashMap();
|
||||
dataCollapsedQualityScore = new NestedHashMap();
|
||||
dataCollapsedByCovariate = new ArrayList<NestedHashMap>();
|
||||
for( int iii = 0; iii < numCovariates - 2; iii++ ) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariate.add( new NestedHashMap() );
|
||||
for (int iii = 0; iii < numCovariates - 2; iii++) { // readGroup and QualityScore aren't counted here, their tables are separate
|
||||
dataCollapsedByCovariate.add(new NestedHashMap());
|
||||
}
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
data = new NestedHashMap();
|
||||
dataCollapsedReadGroup = null;
|
||||
dataCollapsedQualityScore = null;
|
||||
|
|
@ -112,54 +127,58 @@ public class RecalDataManager {
|
|||
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
*
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
* @param PRESERVE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
|
||||
*/
|
||||
public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN ) {
|
||||
public final void addToAllTables(final Object[] key, final RecalDatum fullDatum, final int PRESERVE_QSCORES_LESS_THAN) {
|
||||
|
||||
// The full dataset isn't actually ever used for anything because of the sequential calculation so no need to keep the full data HashMap around
|
||||
//data.put(key, thisDatum); // add the mapping to the main table
|
||||
|
||||
final int qualityScore = Integer.parseInt( key[1].toString() );
|
||||
final int qualityScore = Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
RecalDatum collapsedDatum;
|
||||
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
if( qualityScore >= PRESERVE_QSCORES_LESS_THAN ) {
|
||||
if (qualityScore >= PRESERVE_QSCORES_LESS_THAN) {
|
||||
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
|
||||
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get( readGroupCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
collapsedDatum = (RecalDatum) dataCollapsedReadGroup.get(readGroupCollapsedKey);
|
||||
if (collapsedDatum == null) {
|
||||
dataCollapsedReadGroup.put(new RecalDatum(fullDatum), readGroupCollapsedKey);
|
||||
}
|
||||
else {
|
||||
collapsedDatum.combine(fullDatum); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
}
|
||||
|
||||
// Create dataCollapsedQuality, the table where everything except read group and quality score has been collapsed
|
||||
qualityScoreCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
qualityScoreCollapsedKey[1] = key[1]; // and quality score
|
||||
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get( qualityScoreCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedQualityScore.put( new RecalDatum(fullDatum), qualityScoreCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.increment( fullDatum );
|
||||
collapsedDatum = (RecalDatum) dataCollapsedQualityScore.get(qualityScoreCollapsedKey);
|
||||
if (collapsedDatum == null) {
|
||||
dataCollapsedQualityScore.put(new RecalDatum(fullDatum), qualityScoreCollapsedKey);
|
||||
}
|
||||
else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group, quality score, and given covariate has been collapsed
|
||||
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) {
|
||||
for (int iii = 0; iii < dataCollapsedByCovariate.size(); iii++) {
|
||||
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
covariateCollapsedKey[1] = key[1]; // and quality score ...
|
||||
final Object theCovariateElement = key[iii + 2]; // and the given covariate
|
||||
if( theCovariateElement != null ) {
|
||||
if (theCovariateElement != null) {
|
||||
covariateCollapsedKey[2] = theCovariateElement;
|
||||
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.increment( fullDatum );
|
||||
collapsedDatum = (RecalDatum) dataCollapsedByCovariate.get(iii).get(covariateCollapsedKey);
|
||||
if (collapsedDatum == null) {
|
||||
dataCollapsedByCovariate.get(iii).put(new RecalDatum(fullDatum), covariateCollapsedKey);
|
||||
}
|
||||
else {
|
||||
collapsedDatum.increment(fullDatum);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -167,150 +186,136 @@ public class RecalDataManager {
|
|||
|
||||
/**
|
||||
* Loop over all the collapsed tables and turn the recalDatums found there into an empirical quality score
|
||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||
* that will be used in the sequential calculation in TableRecalibrationWalker
|
||||
*
|
||||
* @param smoothing The smoothing parameter that goes into empirical quality score calculation
|
||||
* @param maxQual At which value to cap the quality scores
|
||||
* @param maxQual At which value to cap the quality scores
|
||||
*/
|
||||
public final void generateEmpiricalQualities( final int smoothing, final int maxQual ) {
|
||||
public final void generateEmpiricalQualities(final int smoothing, final int maxQual) {
|
||||
|
||||
recursivelyGenerateEmpiricalQualities(dataCollapsedReadGroup.data, smoothing, maxQual);
|
||||
recursivelyGenerateEmpiricalQualities(dataCollapsedQualityScore.data, smoothing, maxQual);
|
||||
for( NestedHashMap map : dataCollapsedByCovariate ) {
|
||||
for (NestedHashMap map : dataCollapsedByCovariate) {
|
||||
recursivelyGenerateEmpiricalQualities(map.data, smoothing, maxQual);
|
||||
checkForSingletons(map.data);
|
||||
}
|
||||
}
|
||||
|
||||
private void recursivelyGenerateEmpiricalQualities( final Map data, final int smoothing, final int maxQual ) {
|
||||
private void recursivelyGenerateEmpiricalQualities(final Map data, final int smoothing, final int maxQual) {
|
||||
|
||||
for( Object comp : data.keySet() ) {
|
||||
for (Object comp : data.keySet()) {
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
|
||||
((RecalDatum)val).calcCombinedEmpiricalQuality(smoothing, maxQual);
|
||||
} else { // Another layer in the nested hash map
|
||||
recursivelyGenerateEmpiricalQualities( (Map) val, smoothing, maxQual);
|
||||
if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
|
||||
((RecalDatum) val).calcCombinedEmpiricalQuality(smoothing, maxQual);
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
recursivelyGenerateEmpiricalQualities((Map) val, smoothing, maxQual);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void checkForSingletons( final Map data ) {
|
||||
private void checkForSingletons(final Map data) {
|
||||
// todo -- this looks like it's better just as a data.values() call?
|
||||
for( Object comp : data.keySet() ) {
|
||||
for (Object comp : data.keySet()) {
|
||||
final Object val = data.get(comp);
|
||||
if( val instanceof RecalDatum ) { // We are at the end of the nested hash maps
|
||||
if( data.keySet().size() == 1) {
|
||||
if (val instanceof RecalDatum) { // We are at the end of the nested hash maps
|
||||
if (data.keySet().size() == 1) {
|
||||
data.clear(); // don't TableRecalibrate a non-required covariate if it only has one element because that correction has already been done ...
|
||||
// in a previous step of the sequential calculation model
|
||||
// in a previous step of the sequential calculation model
|
||||
}
|
||||
} else { // Another layer in the nested hash map
|
||||
checkForSingletons( (Map) val );
|
||||
}
|
||||
else { // Another layer in the nested hash map
|
||||
checkForSingletons((Map) val);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the appropriate collapsed table out of the set of all the tables held by this Object
|
||||
*
|
||||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
*/
|
||||
public final NestedHashMap getCollapsedTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
public final NestedHashMap getCollapsedTable(final int covariate) {
|
||||
if (covariate == 0) {
|
||||
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
} else if( covariate == 1 ) {
|
||||
}
|
||||
else if (covariate == 1) {
|
||||
return dataCollapsedQualityScore; // Table where everything except read group and quality score has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariate.get( covariate - 2 ); // Table where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
else {
|
||||
return dataCollapsedByCovariate.get(covariate - 2); // Table where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Section of code shared between the two recalibration walkers which uses the command line arguments to adjust attributes of the read such as quals or platform string
|
||||
*
|
||||
* @param read The read to adjust
|
||||
* @param RAC The list of shared command line arguments
|
||||
* @param RAC The list of shared command line arguments
|
||||
*/
|
||||
public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) {
|
||||
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord)read).getReadGroup();
|
||||
public static void parseSAMRecord(final GATKSAMRecord read, final RecalibrationArgumentCollection RAC) {
|
||||
GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord) read).getReadGroup();
|
||||
|
||||
// If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments
|
||||
if( readGroup == null ) {
|
||||
if( RAC.DEFAULT_READ_GROUP != null && RAC.DEFAULT_PLATFORM != null) {
|
||||
if( !warnUserNullReadGroup && RAC.FORCE_READ_GROUP == null ) {
|
||||
Utils.warnUser("The input .bam file contains reads with no read group. " +
|
||||
"Defaulting to read group ID = " + RAC.DEFAULT_READ_GROUP + " and platform = " + RAC.DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName() );
|
||||
warnUserNullReadGroup = true;
|
||||
}
|
||||
// There is no readGroup so defaulting to these values
|
||||
readGroup = new GATKSAMReadGroupRecord( RAC.DEFAULT_READ_GROUP );
|
||||
readGroup.setPlatform( RAC.DEFAULT_PLATFORM );
|
||||
((GATKSAMRecord)read).setReadGroup( readGroup );
|
||||
} else {
|
||||
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() );
|
||||
}
|
||||
if (RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
|
||||
readGroup.setPlatform(RAC.FORCE_PLATFORM);
|
||||
}
|
||||
|
||||
if( RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user
|
||||
final String oldPlatform = readGroup.getPlatform();
|
||||
readGroup = new GATKSAMReadGroupRecord( RAC.FORCE_READ_GROUP );
|
||||
readGroup.setPlatform( oldPlatform );
|
||||
((GATKSAMRecord)read).setReadGroup( readGroup );
|
||||
}
|
||||
|
||||
if( RAC.FORCE_PLATFORM != null && (readGroup.getPlatform() == null || !readGroup.getPlatform().equals(RAC.FORCE_PLATFORM))) {
|
||||
readGroup.setPlatform( RAC.FORCE_PLATFORM );
|
||||
}
|
||||
|
||||
if ( readGroup.getPlatform() == null ) {
|
||||
if( RAC.DEFAULT_PLATFORM != null ) {
|
||||
if( !warnUserNullPlatform ) {
|
||||
if (readGroup.getPlatform() == null) {
|
||||
if (RAC.DEFAULT_PLATFORM != null) {
|
||||
if (!warnUserNullPlatform) {
|
||||
Utils.warnUser("The input .bam file contains reads with no platform information. " +
|
||||
"Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName() );
|
||||
"Defaulting to platform = " + RAC.DEFAULT_PLATFORM + ". " +
|
||||
"First observed at read with name = " + read.getReadName());
|
||||
warnUserNullPlatform = true;
|
||||
}
|
||||
readGroup.setPlatform( RAC.DEFAULT_PLATFORM );
|
||||
} else {
|
||||
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() );
|
||||
readGroup.setPlatform(RAC.DEFAULT_PLATFORM);
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse through the color space of the read and add a new tag to the SAMRecord that says which bases are inconsistent with the color space
|
||||
*
|
||||
* @param read The SAMRecord to parse
|
||||
*/
|
||||
public static void parseColorSpace( final SAMRecord read ) {
|
||||
public static void parseColorSpace(final GATKSAMRecord read) {
|
||||
|
||||
// If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base
|
||||
if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) {
|
||||
if( read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null ) { // Haven't calculated the inconsistency array yet for this read
|
||||
if (ReadUtils.isSOLiDRead(read)) {
|
||||
if (read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
if (attr != null) {
|
||||
byte[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
colorSpace = ((String)attr).getBytes();
|
||||
} else {
|
||||
if (attr instanceof String) {
|
||||
colorSpace = ((String) attr).getBytes();
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
|
||||
byte[] readBases = read.getReadBases();
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
|
||||
}
|
||||
final byte[] inconsistency = new byte[readBases.length];
|
||||
int iii;
|
||||
byte prevBase = colorSpace[0]; // The sentinel
|
||||
for( iii = 0; iii < readBases.length; iii++ ) {
|
||||
final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] );
|
||||
inconsistency[iii] = (byte)( thisBase == readBases[iii] ? 0 : 1 );
|
||||
for (iii = 0; iii < readBases.length; iii++) {
|
||||
final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
|
||||
inconsistency[iii] = (byte) (thisBase == readBases[iii] ? 0 : 1);
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
read.setAttribute( RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency );
|
||||
read.setAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency);
|
||||
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -319,52 +324,57 @@ public class RecalDataManager {
|
|||
/**
|
||||
* Parse through the color space of the read and apply the desired --solid_recal_mode correction to the bases
|
||||
* This method doesn't add the inconsistent tag to the read like parseColorSpace does
|
||||
* @param read The SAMRecord to parse
|
||||
*
|
||||
* @param read The SAMRecord to parse
|
||||
* @param originalQualScores The array of original quality scores to modify during the correction
|
||||
* @param solidRecalMode Which mode of solid recalibration to apply
|
||||
* @param refBases The reference for this read
|
||||
* @param solidRecalMode Which mode of solid recalibration to apply
|
||||
* @param refBases The reference for this read
|
||||
* @return A new array of quality scores that have been ref bias corrected
|
||||
*/
|
||||
public static byte[] calcColorSpace( final SAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases ) {
|
||||
public static byte[] calcColorSpace(final GATKSAMRecord read, byte[] originalQualScores, final SOLID_RECAL_MODE solidRecalMode, final byte[] refBases) {
|
||||
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
if (attr != null) {
|
||||
byte[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
colorSpace = ((String)attr).getBytes();
|
||||
} else {
|
||||
if (attr instanceof String) {
|
||||
colorSpace = ((String) attr).getBytes();
|
||||
}
|
||||
else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
// Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read
|
||||
byte[] readBases = read.getReadBases();
|
||||
final byte[] colorImpliedBases = readBases.clone();
|
||||
byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray( read.getCigar(), read.getReadBases(), refBases ); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( read.getReadBases() );
|
||||
refBasesDirRead = BaseUtils.simpleReverseComplement( refBasesDirRead.clone() );
|
||||
byte[] refBasesDirRead = AlignmentUtils.alignmentToByteArray(read.getCigar(), read.getReadBases(), refBases); //BUGBUG: This needs to change when read walkers are changed to give the aligned refBases
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
readBases = BaseUtils.simpleReverseComplement(read.getReadBases());
|
||||
refBasesDirRead = BaseUtils.simpleReverseComplement(refBasesDirRead.clone());
|
||||
}
|
||||
final int[] inconsistency = new int[readBases.length];
|
||||
byte prevBase = colorSpace[0]; // The sentinel
|
||||
for( int iii = 0; iii < readBases.length; iii++ ) {
|
||||
final byte thisBase = getNextBaseFromColor( read, prevBase, colorSpace[iii + 1] );
|
||||
for (int iii = 0; iii < readBases.length; iii++) {
|
||||
final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[iii + 1]);
|
||||
colorImpliedBases[iii] = thisBase;
|
||||
inconsistency[iii] = ( thisBase == readBases[iii] ? 0 : 1 );
|
||||
inconsistency[iii] = (thisBase == readBases[iii] ? 0 : 1);
|
||||
prevBase = readBases[iii];
|
||||
}
|
||||
|
||||
// Now that we have the inconsistency array apply the desired correction to the inconsistent bases
|
||||
if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO ) { // Set inconsistent bases and the one before it to Q0
|
||||
if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO) { // Set inconsistent bases and the one before it to Q0
|
||||
final boolean setBaseN = false;
|
||||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
|
||||
} else if( solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N ) {
|
||||
}
|
||||
else if (solidRecalMode == SOLID_RECAL_MODE.SET_Q_ZERO_BASE_N) {
|
||||
final boolean setBaseN = true;
|
||||
originalQualScores = solidRecalSetToQZero(read, readBases, inconsistency, originalQualScores, refBasesDirRead, setBaseN);
|
||||
} else if( solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS ) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
|
||||
}
|
||||
else if (solidRecalMode == SOLID_RECAL_MODE.REMOVE_REF_BIAS) { // Use the color space quality to probabilistically remove ref bases at inconsistent color space bases
|
||||
solidRecalRemoveRefBias(read, readBases, inconsistency, colorImpliedBases, refBasesDirRead);
|
||||
}
|
||||
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
|
|
@ -372,26 +382,28 @@ public class RecalDataManager {
|
|||
return originalQualScores;
|
||||
}
|
||||
|
||||
public static boolean checkNoCallColorSpace( final SAMRecord read ) {
|
||||
if( read.getReadGroup().getPlatform().toUpperCase().contains("SOLID") ) {
|
||||
public static boolean checkNoCallColorSpace(final GATKSAMRecord read) {
|
||||
if (ReadUtils.isSOLiDRead(read)) {
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
if (attr != null) {
|
||||
byte[] colorSpace;
|
||||
if( attr instanceof String ) {
|
||||
colorSpace = ((String)attr).substring(1).getBytes(); // trim off the Sentinel
|
||||
} else {
|
||||
if (attr instanceof String) {
|
||||
colorSpace = ((String) attr).substring(1).getBytes(); // trim off the Sentinel
|
||||
}
|
||||
else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
for( byte color : colorSpace ) {
|
||||
if( color != (byte)'0' && color != (byte)'1' && color != (byte)'2' && color != (byte)'3' ) {
|
||||
for (byte color : colorSpace) {
|
||||
if (color != (byte) '0' && color != (byte) '1' && color != (byte) '2' && color != (byte) '3') {
|
||||
return true; // There is a bad color in this SOLiD read and the user wants to skip over it
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() +
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
" Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias.");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -400,90 +412,105 @@ public class RecalDataManager {
|
|||
|
||||
/**
|
||||
* Perform the SET_Q_ZERO solid recalibration. Inconsistent color space bases and their previous base are set to quality zero
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
*
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param originalQualScores The array of original quality scores to set to zero if needed
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
* @param setBaseN Should we also set the base to N as well as quality zero in order to visualize in IGV or something similar
|
||||
* @return The byte array of original quality scores some of which might have been set to zero
|
||||
*/
|
||||
private static byte[] solidRecalSetToQZero( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores,
|
||||
final byte[] refBases, final boolean setBaseN ) {
|
||||
private static byte[] solidRecalSetToQZero(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] originalQualScores, final byte[] refBases, final boolean setBaseN) {
|
||||
|
||||
final boolean negStrand = read.getReadNegativeStrandFlag();
|
||||
for( int iii = 1; iii < originalQualScores.length; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
if( readBases[iii] == refBases[iii] ) {
|
||||
if( negStrand ) { originalQualScores[originalQualScores.length-(iii+1)] = (byte)0; }
|
||||
else { originalQualScores[iii] = (byte)0; }
|
||||
if( setBaseN ) { readBases[iii] = (byte)'N'; }
|
||||
for (int iii = 1; iii < originalQualScores.length; iii++) {
|
||||
if (inconsistency[iii] == 1) {
|
||||
if (readBases[iii] == refBases[iii]) {
|
||||
if (negStrand) {
|
||||
originalQualScores[originalQualScores.length - (iii + 1)] = (byte) 0;
|
||||
}
|
||||
else {
|
||||
originalQualScores[iii] = (byte) 0;
|
||||
}
|
||||
if (setBaseN) {
|
||||
readBases[iii] = (byte) 'N';
|
||||
}
|
||||
}
|
||||
// Set the prev base to Q0 as well
|
||||
if( readBases[iii-1] == refBases[iii-1] ) {
|
||||
if( negStrand ) { originalQualScores[originalQualScores.length-iii] = (byte)0; }
|
||||
else { originalQualScores[iii-1] = (byte)0; }
|
||||
if( setBaseN ) { readBases[iii-1] = (byte)'N'; }
|
||||
if (readBases[iii - 1] == refBases[iii - 1]) {
|
||||
if (negStrand) {
|
||||
originalQualScores[originalQualScores.length - iii] = (byte) 0;
|
||||
}
|
||||
else {
|
||||
originalQualScores[iii - 1] = (byte) 0;
|
||||
}
|
||||
if (setBaseN) {
|
||||
readBases[iii - 1] = (byte) 'N';
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if( negStrand ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read
|
||||
if (negStrand) {
|
||||
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases( readBases );
|
||||
read.setReadBases(readBases);
|
||||
|
||||
return originalQualScores;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform the REMOVE_REF_BIAS solid recalibration. Look at the color space qualities and probabilistically decide if the base should be changed to match the color or left as reference
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
*
|
||||
* @param read The SAMRecord to recalibrate
|
||||
* @param readBases The bases in the read which have been RC'd if necessary
|
||||
* @param inconsistency The array of 1/0 that says if this base is inconsistent with its color
|
||||
* @param colorImpliedBases The bases implied by the color space, RC'd if necessary
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
* @param refBases The reference which has been RC'd if necessary
|
||||
*/
|
||||
private static void solidRecalRemoveRefBias( final SAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases,
|
||||
final byte[] refBases) {
|
||||
private static void solidRecalRemoveRefBias(final GATKSAMRecord read, byte[] readBases, final int[] inconsistency, final byte[] colorImpliedBases, final byte[] refBases) {
|
||||
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG);
|
||||
if( attr != null ) {
|
||||
if (attr != null) {
|
||||
byte[] colorSpaceQuals;
|
||||
if( attr instanceof String ) {
|
||||
String x = (String)attr;
|
||||
if (attr instanceof String) {
|
||||
String x = (String) attr;
|
||||
colorSpaceQuals = x.getBytes();
|
||||
SAMUtils.fastqToPhred(colorSpaceQuals);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
throw new ReviewedStingException(String.format("Value encoded by %s in %s isn't a string!", RecalDataManager.COLOR_SPACE_QUAL_ATTRIBUTE_TAG, read.getReadName()));
|
||||
}
|
||||
|
||||
for( int iii = 1; iii < inconsistency.length - 1; iii++ ) {
|
||||
if( inconsistency[iii] == 1 ) {
|
||||
for( int jjj = iii - 1; jjj <= iii; jjj++ ) { // Correct this base and the one before it along the direction of the read
|
||||
if( jjj == iii || inconsistency[jjj] == 0 ) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
|
||||
if( readBases[jjj] == refBases[jjj] ) {
|
||||
if( colorSpaceQuals[jjj] == colorSpaceQuals[jjj+1] ) { // Equal evidence for the color implied base and the reference base, so flip a coin
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( 2 );
|
||||
if( rand == 0 ) { // The color implied base won the coin flip
|
||||
for (int iii = 1; iii < inconsistency.length - 1; iii++) {
|
||||
if (inconsistency[iii] == 1) {
|
||||
for (int jjj = iii - 1; jjj <= iii; jjj++) { // Correct this base and the one before it along the direction of the read
|
||||
if (jjj == iii || inconsistency[jjj] == 0) { // Don't want to correct the previous base a second time if it was already corrected in the previous step
|
||||
if (readBases[jjj] == refBases[jjj]) {
|
||||
if (colorSpaceQuals[jjj] == colorSpaceQuals[jjj + 1]) { // Equal evidence for the color implied base and the reference base, so flip a coin
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(2);
|
||||
if (rand == 0) { // The color implied base won the coin flip
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
}
|
||||
} else {
|
||||
final int maxQuality = Math.max((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
final int minQuality = Math.min((int)colorSpaceQuals[jjj], (int)colorSpaceQuals[jjj+1]);
|
||||
}
|
||||
else {
|
||||
final int maxQuality = Math.max((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
|
||||
final int minQuality = Math.min((int) colorSpaceQuals[jjj], (int) colorSpaceQuals[jjj + 1]);
|
||||
int diffInQuality = maxQuality - minQuality;
|
||||
int numLow = minQuality;
|
||||
if( numLow == 0 ) {
|
||||
if (numLow == 0) {
|
||||
numLow++;
|
||||
diffInQuality++;
|
||||
}
|
||||
final int numHigh = Math.round( numLow * (float)Math.pow(10.0f, (float) diffInQuality / 10.0f) ); // The color with higher quality is exponentially more likely
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt( numLow + numHigh );
|
||||
if( rand >= numLow ) { // higher q score won
|
||||
if( maxQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
final int numHigh = Math.round(numLow * (float) Math.pow(10.0f, (float) diffInQuality / 10.0f)); // The color with higher quality is exponentially more likely
|
||||
final int rand = GenomeAnalysisEngine.getRandomGenerator().nextInt(numLow + numHigh);
|
||||
if (rand >= numLow) { // higher q score won
|
||||
if (maxQuality == (int) colorSpaceQuals[jjj]) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had higher q score, and won out, so nothing to do here
|
||||
} else { // lower q score won
|
||||
if( minQuality == (int)colorSpaceQuals[jjj] ) {
|
||||
}
|
||||
else { // lower q score won
|
||||
if (minQuality == (int) colorSpaceQuals[jjj]) {
|
||||
readBases[jjj] = colorImpliedBases[jjj];
|
||||
} // else ref color had lower q score, and won out, so nothing to do here
|
||||
}
|
||||
|
|
@ -494,52 +521,56 @@ public class RecalDataManager {
|
|||
}
|
||||
}
|
||||
|
||||
if( read.getReadNegativeStrandFlag() ) {
|
||||
readBases = BaseUtils.simpleReverseComplement( readBases.clone() ); // Put the bases back in reverse order to stuff them back in the read
|
||||
if (read.getReadNegativeStrandFlag()) {
|
||||
readBases = BaseUtils.simpleReverseComplement(readBases.clone()); // Put the bases back in reverse order to stuff them back in the read
|
||||
}
|
||||
read.setReadBases( readBases );
|
||||
} else { // No color space quality tag in file
|
||||
read.setReadBases(readBases);
|
||||
}
|
||||
else { // No color space quality tag in file
|
||||
throw new UserException.MalformedBAM(read, "REMOVE_REF_BIAS recal mode requires color space qualities but they can't be found for read: " + read.getReadName());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the base and the color calculate the next base in the sequence
|
||||
*
|
||||
* @param prevBase The base
|
||||
* @param color The color
|
||||
* @param color The color
|
||||
* @return The next base in the sequence
|
||||
*/
|
||||
private static byte getNextBaseFromColor( SAMRecord read, final byte prevBase, final byte color ) {
|
||||
switch(color) {
|
||||
private static byte getNextBaseFromColor(GATKSAMRecord read, final byte prevBase, final byte color) {
|
||||
switch (color) {
|
||||
case '0':
|
||||
return prevBase;
|
||||
case '1':
|
||||
return performColorOne( prevBase );
|
||||
return performColorOne(prevBase);
|
||||
case '2':
|
||||
return performColorTwo( prevBase );
|
||||
return performColorTwo(prevBase);
|
||||
case '3':
|
||||
return performColorThree( prevBase );
|
||||
return performColorThree(prevBase);
|
||||
default:
|
||||
throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char)color +
|
||||
" Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias.");
|
||||
throw new UserException.MalformedBAM(read, "Unrecognized color space in SOLID read, color = " + (char) color +
|
||||
" Unfortunately this bam file can not be recalibrated without full color space information because of potential reference bias.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if this base is inconsistent with its color space. If it is then SOLID inserted the reference here and we should reduce the quality
|
||||
* @param read The read which contains the color space to check against
|
||||
*
|
||||
* @param read The read which contains the color space to check against
|
||||
* @param offset The offset in the read at which to check
|
||||
* @return Returns true if the base was inconsistent with the color space
|
||||
*/
|
||||
public static boolean isInconsistentColorSpace( final SAMRecord read, final int offset ) {
|
||||
public static boolean isInconsistentColorSpace(final GATKSAMRecord read, final int offset) {
|
||||
final Object attr = read.getAttribute(RecalDataManager.COLOR_SPACE_INCONSISTENCY_TAG);
|
||||
if( attr != null ) {
|
||||
final byte[] inconsistency = (byte[])attr;
|
||||
if (attr != null) {
|
||||
final byte[] inconsistency = (byte[]) attr;
|
||||
// NOTE: The inconsistency array is in the direction of the read, not aligned to the reference!
|
||||
if( read.getReadNegativeStrandFlag() ) { // Negative direction
|
||||
return inconsistency[inconsistency.length - offset - 1] != (byte)0;
|
||||
} else { // Forward direction
|
||||
return inconsistency[offset] != (byte)0;
|
||||
if (read.getReadNegativeStrandFlag()) { // Negative direction
|
||||
return inconsistency[inconsistency.length - offset - 1] != (byte) 0;
|
||||
}
|
||||
else { // Forward direction
|
||||
return inconsistency[offset] != (byte) 0;
|
||||
}
|
||||
|
||||
// This block of code is for if you want to check both the offset and the next base for color space inconsistency
|
||||
|
|
@ -557,7 +588,8 @@ public class RecalDataManager {
|
|||
// }
|
||||
//}
|
||||
|
||||
} else { // No inconsistency array, so nothing is inconsistent
|
||||
}
|
||||
else { // No inconsistency array, so nothing is inconsistent
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -566,36 +598,31 @@ public class RecalDataManager {
|
|||
* Computes all requested covariates for every offset in the given read
|
||||
* by calling covariate.getValues(..).
|
||||
*
|
||||
* @param gatkRead The read for which to compute covariate values.
|
||||
* @param gatkRead The read for which to compute covariate values.
|
||||
* @param requestedCovariates The list of requested covariates.
|
||||
* @return An array of covariate values where result[i][j] is the covariate
|
||||
* value for the ith position in the read and the jth covariate in
|
||||
* requestedCovariates list.
|
||||
* value for the ith position in the read and the jth covariate in
|
||||
* requestedCovariates list.
|
||||
*/
|
||||
public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates) {
|
||||
//compute all covariates for this read
|
||||
final List<Covariate> requestedCovariatesRef = requestedCovariates;
|
||||
final int numRequestedCovariates = requestedCovariatesRef.size();
|
||||
final int readLength = gatkRead.getReadLength();
|
||||
public static Comparable[][] computeCovariates(final GATKSAMRecord gatkRead, final List<Covariate> requestedCovariates, final BaseRecalibration.BaseRecalibrationType modelType) {
|
||||
//compute all covariates for this read
|
||||
final int numRequestedCovariates = requestedCovariates.size();
|
||||
final int readLength = gatkRead.getReadLength();
|
||||
|
||||
final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates];
|
||||
final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
|
||||
final Comparable[][] covariateValues_offset_x_covar = new Comparable[readLength][numRequestedCovariates];
|
||||
final Comparable[] tempCovariateValuesHolder = new Comparable[readLength];
|
||||
|
||||
// Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
|
||||
for( int i = 0; i < numRequestedCovariates; i++ ) {
|
||||
requestedCovariatesRef.get(i).getValues( gatkRead, tempCovariateValuesHolder );
|
||||
for(int j = 0; j < readLength; j++) {
|
||||
//copy values into a 2D array that allows all covar types to be extracted at once for
|
||||
//an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
|
||||
covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j];
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < numRequestedCovariates; i++) { // Loop through the list of requested covariates and compute the values of each covariate for all positions in this read
|
||||
requestedCovariates.get(i).getValues(gatkRead, tempCovariateValuesHolder, modelType);
|
||||
for (int j = 0; j < readLength; j++)
|
||||
covariateValues_offset_x_covar[j][i] = tempCovariateValuesHolder[j]; // copy values into a 2D array that allows all covar types to be extracted at once for an offset j by doing covariateValues_offset_x_covar[j]. This avoids the need to later iterate over covar types.
|
||||
}
|
||||
|
||||
return covariateValues_offset_x_covar;
|
||||
}
|
||||
return covariateValues_offset_x_covar;
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform a ceratin transversion (A <-> C or G <-> T) on the base.
|
||||
* Perform a certain transversion (A <-> C or G <-> T) on the base.
|
||||
*
|
||||
* @param base the base [AaCcGgTt]
|
||||
* @return the transversion of the base, or the input base if it's not one of the understood ones
|
||||
|
|
@ -603,14 +630,19 @@ public class RecalDataManager {
|
|||
private static byte performColorOne(byte base) {
|
||||
switch (base) {
|
||||
case 'A':
|
||||
case 'a': return 'C';
|
||||
case 'a':
|
||||
return 'C';
|
||||
case 'C':
|
||||
case 'c': return 'A';
|
||||
case 'c':
|
||||
return 'A';
|
||||
case 'G':
|
||||
case 'g': return 'T';
|
||||
case 'g':
|
||||
return 'T';
|
||||
case 'T':
|
||||
case 't': return 'G';
|
||||
default: return base;
|
||||
case 't':
|
||||
return 'G';
|
||||
default:
|
||||
return base;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -623,14 +655,19 @@ public class RecalDataManager {
|
|||
private static byte performColorTwo(byte base) {
|
||||
switch (base) {
|
||||
case 'A':
|
||||
case 'a': return 'G';
|
||||
case 'a':
|
||||
return 'G';
|
||||
case 'C':
|
||||
case 'c': return 'T';
|
||||
case 'c':
|
||||
return 'T';
|
||||
case 'G':
|
||||
case 'g': return 'A';
|
||||
case 'g':
|
||||
return 'A';
|
||||
case 'T':
|
||||
case 't': return 'C';
|
||||
default: return base;
|
||||
case 't':
|
||||
return 'C';
|
||||
default:
|
||||
return base;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -643,14 +680,19 @@ public class RecalDataManager {
|
|||
private static byte performColorThree(byte base) {
|
||||
switch (base) {
|
||||
case 'A':
|
||||
case 'a': return 'T';
|
||||
case 'a':
|
||||
return 'T';
|
||||
case 'C':
|
||||
case 'c': return 'G';
|
||||
case 'c':
|
||||
return 'G';
|
||||
case 'G':
|
||||
case 'g': return 'C';
|
||||
case 'g':
|
||||
return 'C';
|
||||
case 'T':
|
||||
case 't': return 'A';
|
||||
default: return base;
|
||||
case 't':
|
||||
return 'A';
|
||||
default:
|
||||
return base;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,36 +43,20 @@ public class RecalibrationArgumentCollection {
|
|||
// Shared Command Line Arguments
|
||||
//////////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.")
|
||||
public String DEFAULT_READ_GROUP = null;
|
||||
@Hidden
|
||||
@Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
@Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String DEFAULT_PLATFORM = null;
|
||||
@Hidden
|
||||
@Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.")
|
||||
public String FORCE_READ_GROUP = null;
|
||||
@Hidden
|
||||
@Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
@Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.")
|
||||
public String FORCE_PLATFORM = null;
|
||||
@Hidden
|
||||
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
|
||||
@Argument(fullName = "window_size_nqs", shortName = "nqs", doc = "The window size used by MinimumNQSCovariate for its calculation", required = false)
|
||||
public int WINDOW_SIZE = 5;
|
||||
|
||||
/**
|
||||
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
|
||||
public int HOMOPOLYMER_NBACK = 7;
|
||||
@Hidden
|
||||
@Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false)
|
||||
public boolean EXCEPTION_IF_NO_TILE = false;
|
||||
|
||||
/**
|
||||
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
|
||||
* reads which have had the reference inserted because of color space inconsistencies.
|
||||
*/
|
||||
@Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
|
||||
@Argument(fullName = "solid_recal_mode", shortName = "sMode", required = false, doc = "How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
|
||||
public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;
|
||||
|
||||
/**
|
||||
|
|
@ -80,6 +64,19 @@ public class RecalibrationArgumentCollection {
|
|||
* no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
|
||||
* their color space tag can not be recalibrated.
|
||||
*/
|
||||
@Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false)
|
||||
@Argument(fullName = "solid_nocall_strategy", shortName = "solid_nocall_strategy", doc = "Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required = false)
|
||||
public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
|
||||
|
||||
/**
|
||||
* The context covariate will use a context of this size to calculate its covariate value
|
||||
*/
|
||||
@Argument(fullName = "context_size", shortName = "cs", doc = "size of the k-mer context to be used", required = false)
|
||||
public int CONTEXT_SIZE = 8;
|
||||
|
||||
/**
|
||||
* The number of previous bases that HomopolymerCovariate looks at when computing its value.
|
||||
*/
|
||||
@Argument(fullName = "homopolymer_nback", shortName = "nback", doc = "The number of previous bases to look at in HomopolymerCovariate", required = false)
|
||||
public int HOMOPOLYMER_NBACK = 7;
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.classloader.PluginManager;
|
|||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
|
@ -85,12 +86,12 @@ import java.util.regex.Pattern;
|
|||
* -o my_reads.recal.bam \
|
||||
* -recalFile my_reads.recal_data.csv
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
||||
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
|
||||
@WalkerName("TableRecalibration")
|
||||
@Requires({ DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES }) // This walker requires -I input.bam, it also requires -R reference.fasta
|
||||
@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES})
|
||||
// This walker requires -I input.bam, it also requires -R reference.fasta
|
||||
public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
|
||||
|
||||
public static final String PROGRAM_RECORD_NAME = "GATK TableRecalibration";
|
||||
|
|
@ -98,7 +99,8 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
/////////////////////////////
|
||||
// Shared Arguments
|
||||
/////////////////////////////
|
||||
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
@ArgumentCollection
|
||||
private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
|
|
@ -109,12 +111,12 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Input(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the input covariates table recalibration .csv file")
|
||||
@Input(fullName = "recal_file", shortName = "recalFile", required = true, doc = "Filename for the input covariates table recalibration .csv file")
|
||||
public File RECAL_FILE = null;
|
||||
/**
|
||||
* A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
|
||||
*/
|
||||
@Output(doc="The output recalibrated BAM file", required=true)
|
||||
@Output(doc = "The output recalibrated BAM file", required = true)
|
||||
private StingSAMFileWriter OUTPUT_BAM = null;
|
||||
|
||||
/**
|
||||
|
|
@ -125,7 +127,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
|
||||
* are unmodified during recalibration, so they don't get inappropriately evaluated.
|
||||
*/
|
||||
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
|
||||
@Argument(fullName = "preserve_qscores_less_than", shortName = "pQ", doc = "Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required = false)
|
||||
private int PRESERVE_QSCORES_LESS_THAN = 5;
|
||||
|
||||
/**
|
||||
|
|
@ -134,37 +136,36 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
|
||||
* --smoothing 15 for a large amount of smoothing.
|
||||
*/
|
||||
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||
@Argument(fullName = "smoothing", shortName = "sm", required = false, doc = "Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
|
||||
private int SMOOTHING = 1;
|
||||
|
||||
/**
|
||||
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
|
||||
* by capping at the specified value. We've found that Q40 is too low when using a more complete database of known variation like dbSNP build 132 or later.
|
||||
*/
|
||||
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores")
|
||||
@Argument(fullName = "max_quality_score", shortName = "maxQ", required = false, doc = "The integer value at which to cap the quality scores")
|
||||
private int MAX_QUALITY_SCORE = 50;
|
||||
|
||||
/**
|
||||
* By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
|
||||
* the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
|
||||
*/
|
||||
@Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read")
|
||||
@Argument(fullName = "doNotWriteOriginalQuals", shortName = "noOQs", required = false, doc = "If true, we will not write the original quality (OQ) tag for each read")
|
||||
private boolean DO_NOT_WRITE_OQ = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Debugging-only Arguments
|
||||
/////////////////////////////
|
||||
@Hidden
|
||||
@Argument(fullName="no_pg_tag", shortName="noPG", required=false, doc="Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
@Argument(fullName = "no_pg_tag", shortName = "noPG", required = false, doc = "Don't output the usual PG tag in the recalibrated bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
|
||||
private boolean NO_PG_TAG = false;
|
||||
@Hidden
|
||||
@Argument(fullName="fail_with_no_eof_marker", shortName="requireEOF", required=false, doc="If no EOF marker is present in the covariates file, exit the program with an exception.")
|
||||
@Argument(fullName = "fail_with_no_eof_marker", shortName = "requireEOF", required = false, doc = "If no EOF marker is present in the covariates file, exit the program with an exception.")
|
||||
private boolean REQUIRE_EOF = false;
|
||||
@Hidden
|
||||
@Argument(fullName="skipUQUpdate", shortName="skipUQUpdate", required=false, doc="If true, we will skip the UQ updating step for each read, speeding up the calculations")
|
||||
@Argument(fullName = "skipUQUpdate", shortName = "skipUQUpdate", required = false, doc = "If true, we will skip the UQ updating step for each read, speeding up the calculations")
|
||||
private boolean skipUQUpdate = false;
|
||||
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
|
|
@ -181,7 +182,6 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
/////////////////////////////
|
||||
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
|
|
@ -195,8 +195,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
*/
|
||||
public void initialize() {
|
||||
|
||||
if( RAC.FORCE_READ_GROUP != null ) { RAC.DEFAULT_READ_GROUP = RAC.FORCE_READ_GROUP; }
|
||||
if( RAC.FORCE_PLATFORM != null ) { RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; }
|
||||
if (RAC.FORCE_PLATFORM != null) {
|
||||
RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM;
|
||||
}
|
||||
|
||||
// Get a list of all available covariates
|
||||
final List<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
|
|
@ -205,31 +206,33 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
boolean foundAllCovariates = false;
|
||||
|
||||
// Read in the data from the csv file and populate the data map and covariates list
|
||||
logger.info( "Reading in the data from input csv file..." );
|
||||
logger.info("Reading in the data from input csv file...");
|
||||
|
||||
boolean sawEOF = false;
|
||||
try {
|
||||
for ( String line : new XReadLines(RECAL_FILE) ) {
|
||||
for (String line : new XReadLines(RECAL_FILE)) {
|
||||
lineNumber++;
|
||||
if ( EOF_MARKER.equals(line) ) {
|
||||
if (EOF_MARKER.equals(line)) {
|
||||
sawEOF = true;
|
||||
} else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) {
|
||||
}
|
||||
else if (COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches()) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
// Read in the covariates that were used from the input file
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
else if (COVARIATE_PATTERN.matcher(line).matches()) { // The line string is either specifying a covariate or is giving csv data
|
||||
if (foundAllCovariates) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE);
|
||||
}
|
||||
else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
String[] vals = line.split(",");
|
||||
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
for (int iii = 0; iii < vals.length - 3; iii++) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
||||
for (Class<?> covClass : classes) {
|
||||
if ((vals[iii] + "Covariate").equalsIgnoreCase(covClass.getSimpleName())) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
Covariate covariate = (Covariate) covClass.newInstance();
|
||||
requestedCovariates.add(covariate);
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
|
|
@ -237,107 +240,110 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
if (!foundClass) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} else { // Found a line of data
|
||||
if( !foundAllCovariates ) {
|
||||
}
|
||||
else { // Found a line of data
|
||||
if (!foundAllCovariates) {
|
||||
foundAllCovariates = true;
|
||||
|
||||
// At this point all the covariates should have been found and initialized
|
||||
if( requestedCovariates.size() < 2 ) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
|
||||
if (requestedCovariates.size() < 2) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE);
|
||||
}
|
||||
|
||||
final boolean createCollapsedTables = true;
|
||||
|
||||
// Initialize any covariate member variables using the shared argument collection
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
cov.initialize( RAC );
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
cov.initialize(RAC);
|
||||
}
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
|
||||
dataManager = new RecalDataManager(createCollapsedTables, requestedCovariates.size());
|
||||
|
||||
}
|
||||
addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
|
||||
} catch ( NumberFormatException e ) {
|
||||
} catch (NumberFormatException e) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
logger.info( "...done!" );
|
||||
logger.info("...done!");
|
||||
|
||||
if ( !sawEOF ) {
|
||||
if (!sawEOF) {
|
||||
final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.";
|
||||
if ( REQUIRE_EOF )
|
||||
if (REQUIRE_EOF)
|
||||
throw new UserException.MalformedFile(RECAL_FILE, errorMessage);
|
||||
logger.warn(errorMessage);
|
||||
}
|
||||
|
||||
logger.info( "The covariates being used here: " );
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
logger.info( "\t" + cov.getClass().getSimpleName() );
|
||||
logger.info("The covariates being used here: ");
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
logger.info("\t" + cov.getClass().getSimpleName());
|
||||
}
|
||||
|
||||
if( dataManager == null ) {
|
||||
if (dataManager == null) {
|
||||
throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
|
||||
}
|
||||
|
||||
// Create the tables of empirical quality scores that will be used in the sequential calculation
|
||||
logger.info( "Generating tables of empirical qualities for use in sequential calculation..." );
|
||||
dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE );
|
||||
logger.info( "...done!" );
|
||||
logger.info("Generating tables of empirical qualities for use in sequential calculation...");
|
||||
dataManager.generateEmpiricalQualities(SMOOTHING, MAX_QUALITY_SCORE);
|
||||
logger.info("...done!");
|
||||
|
||||
// Take the header of the input SAM file and tweak it by adding in a new programRecord with the version number and list of covariates that were used
|
||||
final SAMFileHeader header = getToolkit().getSAMFileHeader().clone();
|
||||
if( !NO_PG_TAG ) {
|
||||
if (!NO_PG_TAG) {
|
||||
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME);
|
||||
final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText");
|
||||
try {
|
||||
final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version");
|
||||
programRecord.setProgramVersion(version);
|
||||
} catch (MissingResourceException e) {}
|
||||
} catch (MissingResourceException e) {
|
||||
}
|
||||
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this));
|
||||
sb.append(" Covariates=[");
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
for (Covariate cov : requestedCovariates) {
|
||||
sb.append(cov.getClass().getSimpleName());
|
||||
sb.append(", ");
|
||||
}
|
||||
sb.setCharAt(sb.length()-2, ']');
|
||||
sb.setCharAt(sb.length()-1, ' ');
|
||||
sb.setCharAt(sb.length() - 2, ']');
|
||||
sb.setCharAt(sb.length() - 1, ' ');
|
||||
programRecord.setCommandLine(sb.toString());
|
||||
|
||||
List<SAMProgramRecord> oldRecords = header.getProgramRecords();
|
||||
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size()+1);
|
||||
for ( SAMProgramRecord record : oldRecords ) {
|
||||
if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) )
|
||||
List<SAMProgramRecord> newRecords = new ArrayList<SAMProgramRecord>(oldRecords.size() + 1);
|
||||
for (SAMProgramRecord record : oldRecords) {
|
||||
if (!record.getId().startsWith(PROGRAM_RECORD_NAME))
|
||||
newRecords.add(record);
|
||||
}
|
||||
newRecords.add(programRecord);
|
||||
header.setProgramRecords(newRecords);
|
||||
|
||||
// Write out the new header
|
||||
OUTPUT_BAM.writeHeader( header );
|
||||
OUTPUT_BAM.writeHeader(header);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
*
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private void addCSVData(final File file, final String line) {
|
||||
final String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
if (vals.length != requestedCovariates.size() + 3) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
|
@ -345,15 +351,15 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
final Object[] key = new Object[requestedCovariates.size()];
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key[iii] = cov.getValue( vals[iii] );
|
||||
for (iii = 0; iii < requestedCovariates.size(); iii++) {
|
||||
cov = requestedCovariates.get(iii);
|
||||
key[iii] = cov.getValue(vals[iii]);
|
||||
}
|
||||
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
final RecalDatum datum = new RecalDatum(Long.parseLong(vals[iii]), Long.parseLong(vals[iii + 1]), Double.parseDouble(vals[1]), 0.0);
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN );
|
||||
dataManager.addToAllTables(key, datum, PRESERVE_QSCORES_LESS_THAN);
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -366,64 +372,63 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
* For each base in the read calculate a new recalibrated quality score and replace the quality scores in the read
|
||||
*
|
||||
* @param refBases Reference bases over the length of the read
|
||||
* @param read The read to be recalibrated
|
||||
* @param read The read to be recalibrated
|
||||
* @return The read with quality scores replaced
|
||||
*/
|
||||
public SAMRecord map( ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) {
|
||||
public SAMRecord map(ReferenceContext refBases, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
|
||||
if( read.getReadLength() == 0 ) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
|
||||
if (read.getReadLength() == 0) { // Some reads have '*' as the SEQ field and samtools returns length zero. We don't touch these reads.
|
||||
return read;
|
||||
}
|
||||
|
||||
RecalDataManager.parseSAMRecord( read, RAC );
|
||||
RecalDataManager.parseSAMRecord(read, RAC);
|
||||
|
||||
byte[] originalQuals = read.getBaseQualities();
|
||||
final byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
final String platform = read.getReadGroup().getPlatform();
|
||||
if( platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING) ) {
|
||||
if( !(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) ) {
|
||||
final boolean badColor = RecalDataManager.checkNoCallColorSpace( read );
|
||||
if( badColor ) {
|
||||
if (platform.toUpperCase().contains("SOLID") && !(RAC.SOLID_RECAL_MODE == RecalDataManager.SOLID_RECAL_MODE.DO_NOTHING)) {
|
||||
if (!(RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION)) {
|
||||
final boolean badColor = RecalDataManager.checkNoCallColorSpace(read);
|
||||
if (badColor) {
|
||||
numReadsWithMalformedColorSpace++;
|
||||
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) {
|
||||
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
|
||||
return read; // can't recalibrate a SOLiD read with no calls in the color space, and the user wants to skip over them
|
||||
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
|
||||
}
|
||||
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
|
||||
read.setReadFailsVendorQualityCheckFlag(true);
|
||||
return read;
|
||||
}
|
||||
}
|
||||
}
|
||||
originalQuals = RecalDataManager.calcColorSpace( read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases() );
|
||||
originalQuals = RecalDataManager.calcColorSpace(read, originalQuals, RAC.SOLID_RECAL_MODE, refBases == null ? null : refBases.getBases());
|
||||
}
|
||||
|
||||
//compute all covariate values for this read
|
||||
final Comparable[][] covariateValues_offset_x_covar =
|
||||
RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates);
|
||||
final Comparable[][] covariateValues_offset_x_covar = RecalDataManager.computeCovariates(read, requestedCovariates, BaseRecalibration.BaseRecalibrationType.BASE_SUBSTITUTION);
|
||||
|
||||
// For each base in the read
|
||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
||||
for (int offset = 0; offset < read.getReadLength(); offset++) {
|
||||
|
||||
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
||||
|
||||
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
||||
if(qualityScore == null)
|
||||
{
|
||||
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
|
||||
if (qualityScore == null) {
|
||||
qualityScore = performSequentialQualityCalculation(fullCovariateKey);
|
||||
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
||||
}
|
||||
|
||||
recalQuals[offset] = qualityScore;
|
||||
}
|
||||
|
||||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
||||
preserveQScores(originalQuals, recalQuals); // Overwrite the work done if original quality score is too low
|
||||
|
||||
read.setBaseQualities( recalQuals ); // Overwrite old qualities with new recalibrated qualities
|
||||
if ( !DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null ) { // Save the old qualities if the tag isn't already taken in the read
|
||||
read.setBaseQualities(recalQuals); // Overwrite old qualities with new recalibrated qualities
|
||||
if (!DO_NOT_WRITE_OQ && read.getAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG) == null) { // Save the old qualities if the tag isn't already taken in the read
|
||||
read.setAttribute(RecalDataManager.ORIGINAL_QUAL_ATTRIBUTE_TAG, SAMUtils.phredToFastq(originalQuals));
|
||||
}
|
||||
|
||||
if (! skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
|
||||
if (!skipUQUpdate && refBases != null && read.getAttribute(SAMTag.UQ.name()) != null) {
|
||||
read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, refBases.getBases(), read.getAlignmentStart() - 1, false));
|
||||
}
|
||||
|
||||
|
|
@ -440,27 +445,28 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
*
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
private byte performSequentialQualityCalculation( final Object... key ) {
|
||||
private byte performSequentialQualityCalculation(final Object... key) {
|
||||
|
||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
||||
final byte qualFromRead = (byte) Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
|
||||
// The global quality shift (over the read group only)
|
||||
readGroupCollapsedKey[0] = key[0];
|
||||
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
|
||||
final RecalDatum globalRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(0).get(readGroupCollapsedKey));
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalRecalDatum != null ) {
|
||||
if (globalRecalDatum != null) {
|
||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
|
|
@ -469,9 +475,9 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
// The shift in quality between reported and empirical
|
||||
qualityScoreCollapsedKey[0] = key[0];
|
||||
qualityScoreCollapsedKey[1] = key[1];
|
||||
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
|
||||
final RecalDatum qReportedRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(1).get(qualityScoreCollapsedKey));
|
||||
double deltaQReported = 0.0;
|
||||
if( qReportedRecalDatum != null ) {
|
||||
if (qReportedRecalDatum != null) {
|
||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
|
@ -481,17 +487,17 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
double deltaQCovariateEmpirical;
|
||||
covariateCollapsedKey[0] = key[0];
|
||||
covariateCollapsedKey[1] = key[1];
|
||||
for( int iii = 2; iii < key.length; iii++ ) {
|
||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
|
||||
if( covariateRecalDatum != null ) {
|
||||
for (int iii = 2; iii < key.length; iii++) {
|
||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||
final RecalDatum covariateRecalDatum = ((RecalDatum) dataManager.getCollapsedTable(iii).get(covariateCollapsedKey));
|
||||
if (covariateRecalDatum != null) {
|
||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
deltaQCovariates += (deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported));
|
||||
}
|
||||
}
|
||||
|
||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
|
||||
return QualityUtils.boundQual((int) Math.round(newQuality), (byte) MAX_QUALITY_SCORE);
|
||||
|
||||
// Verbose printouts used to validate with old recalibrator
|
||||
//if(key.contains(null)) {
|
||||
|
|
@ -508,12 +514,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
/**
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
*
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if( originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN ) {
|
||||
private void preserveQScores(final byte[] originalQuals, final byte[] recalQuals) {
|
||||
for (int iii = 0; iii < recalQuals.length; iii++) {
|
||||
if (originalQuals[iii] < PRESERVE_QSCORES_LESS_THAN) {
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
|
|
@ -527,6 +534,7 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
/**
|
||||
* Start the reduce with a handle to the output bam file
|
||||
*
|
||||
* @return A FileWriter pointing to a new bam file
|
||||
*/
|
||||
public SAMFileWriter reduceInit() {
|
||||
|
|
@ -535,12 +543,13 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
/**
|
||||
* Output each read to disk
|
||||
* @param read The read to output
|
||||
*
|
||||
* @param read The read to output
|
||||
* @param output The FileWriter to write the read to
|
||||
* @return The FileWriter
|
||||
*/
|
||||
public SAMFileWriter reduce( SAMRecord read, SAMFileWriter output ) {
|
||||
if( output != null ) {
|
||||
public SAMFileWriter reduce(SAMRecord read, SAMFileWriter output) {
|
||||
if (output != null) {
|
||||
output.addAlignment(read);
|
||||
}
|
||||
return output;
|
||||
|
|
@ -548,20 +557,22 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
|
|||
|
||||
/**
|
||||
* Do nothing
|
||||
*
|
||||
* @param output The SAMFileWriter that outputs the bam file
|
||||
*/
|
||||
public void onTraversalDone(SAMFileWriter output) {
|
||||
if( numReadsWithMalformedColorSpace != 0 ) {
|
||||
if( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED ) {
|
||||
if (numReadsWithMalformedColorSpace != 0) {
|
||||
if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.LEAVE_READ_UNRECALIBRATED) {
|
||||
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
|
||||
} else if ( RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ ) {
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads remain in the output bam file but haven't been corrected for reference bias. !!! USE AT YOUR OWN RISK !!!");
|
||||
}
|
||||
else if (RAC.SOLID_NOCALL_STRATEGY == RecalDataManager.SOLID_NOCALL_STRATEGY.PURGE_READ) {
|
||||
Utils.warnUser("Discovered " + numReadsWithMalformedColorSpace + " SOLiD reads with no calls in the color space. Unfortunately these reads cannot be recalibrated with this recalibration algorithm " +
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads were completely removed from the output bam file.");
|
||||
"because we use reference mismatch rate as the only indication of a base's true quality. These reads have had reference bases inserted as a way of correcting " +
|
||||
"for color space misalignments and there is now no way of knowing how often it mismatches the reference and therefore no way to recalibrate the quality score. " +
|
||||
"These reads were completely removed from the output bam file.");
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -71,25 +71,24 @@ public class KeepAFSpectrumFrequencySelector extends FrequencyModeSelector {
|
|||
// recompute AF,AC,AN based on genotypes:
|
||||
// todo - - maybe too inefficient??
|
||||
VariantContextUtils.calculateChromosomeCounts(vc, attributes, false);
|
||||
afArray = new double[] {Double.valueOf((String)attributes.get(VCFConstants.ALLELE_FREQUENCY_KEY))};
|
||||
} else {
|
||||
// sites-only vc or we explicitly tell to ignore genotypes; we trust the AF field if present
|
||||
if ( vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
|
||||
String afo = vc.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
|
||||
}
|
||||
|
||||
if (afo.contains(",")) {
|
||||
String[] afs = afo.split(",");
|
||||
afs[0] = afs[0].substring(1,afs[0].length());
|
||||
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
|
||||
// sites-only vc or we explicitly tell to ignore genotypes; we trust the AF field if present
|
||||
if ( vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
|
||||
String afo = vc.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
|
||||
|
||||
afArray = new double[afs.length];
|
||||
if (afo.contains(",")) {
|
||||
String[] afs = afo.split(",");
|
||||
afs[0] = afs[0].substring(1,afs[0].length());
|
||||
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
|
||||
|
||||
for (int k=0; k < afArray.length; k++)
|
||||
afArray[k] = Double.valueOf(afs[k]);
|
||||
}
|
||||
else
|
||||
afArray = new double[] {Double.valueOf(afo)};
|
||||
afArray = new double[afs.length];
|
||||
|
||||
for (int k=0; k < afArray.length; k++)
|
||||
afArray[k] = Double.valueOf(afs[k]);
|
||||
}
|
||||
else
|
||||
afArray = new double[] {Double.valueOf(afo)};
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -83,29 +83,39 @@ public class MultiallelicSummary extends VariantEvaluator { // implements Standa
|
|||
@DataPoint(description = "Multi-allelic SNP Novelty Rate")
|
||||
public String SNPNoveltyRate = "NA";
|
||||
|
||||
@DataPoint(description = "Multi-allelic Indels partially known")
|
||||
//TODO -- implement me
|
||||
//@DataPoint(description = "Multi-allelic Indels partially known")
|
||||
public int knownIndelsPartial = 0;
|
||||
@DataPoint(description = "Multi-allelic Indels completely known")
|
||||
//@DataPoint(description = "Multi-allelic Indels completely known")
|
||||
public int knownIndelsComplete = 0;
|
||||
@DataPoint(description = "Multi-allelic Indel Novelty Rate")
|
||||
//@DataPoint(description = "Multi-allelic Indel Novelty Rate")
|
||||
public String indelNoveltyRate = "NA";
|
||||
|
||||
@DataPoint(description="Histogram of allele frequencies")
|
||||
AFHistogram AFhistogram = new AFHistogram();
|
||||
@DataPoint(description="Histogram of allele frequencies for most common SNP alternate allele")
|
||||
AFHistogram AFhistogramMaxSnp = new AFHistogram();
|
||||
|
||||
@DataPoint(description="Histogram of allele frequencies for less common SNP alternate alleles")
|
||||
AFHistogram AFhistogramMinSnp = new AFHistogram();
|
||||
|
||||
@DataPoint(description="Histogram of allele frequencies for most common Indel alternate allele")
|
||||
AFHistogram AFhistogramMaxIndel = new AFHistogram();
|
||||
|
||||
@DataPoint(description="Histogram of allele frequencies for less common Indel alternate alleles")
|
||||
AFHistogram AFhistogramMinIndel = new AFHistogram();
|
||||
|
||||
/*
|
||||
* AF histogram table object
|
||||
*/
|
||||
static class AFHistogram implements TableType {
|
||||
private Object[] colKeys, rowKeys = {"pairwise_AF"};
|
||||
private Object[] rowKeys, colKeys = {"count"};
|
||||
private int[] AFhistogram;
|
||||
|
||||
private static final double AFincrement = 0.01;
|
||||
private static final int numBins = (int)(1.00 / AFincrement);
|
||||
|
||||
public AFHistogram() {
|
||||
colKeys = initColKeys();
|
||||
AFhistogram = new int[colKeys.length];
|
||||
rowKeys = initRowKeys();
|
||||
AFhistogram = new int[rowKeys.length];
|
||||
}
|
||||
|
||||
public Object[] getColumnKeys() {
|
||||
|
|
@ -117,10 +127,10 @@ public class MultiallelicSummary extends VariantEvaluator { // implements Standa
|
|||
}
|
||||
|
||||
public Object getCell(int row, int col) {
|
||||
return AFhistogram[col];
|
||||
return AFhistogram[row];
|
||||
}
|
||||
|
||||
private static Object[] initColKeys() {
|
||||
private static Object[] initRowKeys() {
|
||||
ArrayList<String> keyList = new ArrayList<String>(numBins + 1);
|
||||
for ( double a = 0.00; a <= 1.01; a += AFincrement ) {
|
||||
keyList.add(String.format("%.2f", a));
|
||||
|
|
@ -130,18 +140,10 @@ public class MultiallelicSummary extends VariantEvaluator { // implements Standa
|
|||
|
||||
public String getName() { return "AFHistTable"; }
|
||||
|
||||
public void update(VariantContext vc) {
|
||||
final Object obj = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY, null);
|
||||
if ( obj == null || !(obj instanceof List) )
|
||||
return;
|
||||
|
||||
List<String> list = (List<String>)obj;
|
||||
for ( String str : list ) {
|
||||
final double AF = Double.valueOf(str);
|
||||
final int bin = (int)(numBins * MathUtils.round(AF, 2));
|
||||
AFhistogram[bin]++;
|
||||
}
|
||||
}
|
||||
public void update(final double AF) {
|
||||
final int bin = (int)(numBins * MathUtils.round(AF, 2));
|
||||
AFhistogram[bin]++;
|
||||
}
|
||||
}
|
||||
|
||||
public void initialize(VariantEvalWalker walker) {}
|
||||
|
|
@ -168,6 +170,7 @@ public class MultiallelicSummary extends VariantEvaluator { // implements Standa
|
|||
nMultiSNPs++;
|
||||
calculatePairwiseTiTv(eval);
|
||||
calculateSNPPairwiseNovelty(eval, comp);
|
||||
updateAFhistogram(eval, AFhistogramMaxSnp, AFhistogramMinSnp);
|
||||
}
|
||||
break;
|
||||
case INDEL:
|
||||
|
|
@ -175,13 +178,13 @@ public class MultiallelicSummary extends VariantEvaluator { // implements Standa
|
|||
if ( !eval.isBiallelic() ) {
|
||||
nMultiIndels++;
|
||||
calculateIndelPairwiseNovelty(eval, comp);
|
||||
updateAFhistogram(eval, AFhistogramMaxIndel, AFhistogramMinIndel);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new UserException.BadInput("Unexpected variant context type: " + eval);
|
||||
}
|
||||
AFhistogram.update(eval);
|
||||
|
||||
|
||||
return null; // we don't capture any interesting sites
|
||||
}
|
||||
|
||||
|
|
@ -213,6 +216,24 @@ public class MultiallelicSummary extends VariantEvaluator { // implements Standa
|
|||
private void calculateIndelPairwiseNovelty(VariantContext eval, VariantContext comp) {
|
||||
}
|
||||
|
||||
private void updateAFhistogram(VariantContext vc, AFHistogram max, AFHistogram min) {
|
||||
|
||||
final Object obj = vc.getAttribute(VCFConstants.ALLELE_FREQUENCY_KEY, null);
|
||||
if ( obj == null || !(obj instanceof List) )
|
||||
return;
|
||||
|
||||
List<String> list = (List<String>)obj;
|
||||
ArrayList<Double> AFs = new ArrayList<Double>(list.size());
|
||||
for ( String str : list ) {
|
||||
AFs.add(Double.valueOf(str));
|
||||
}
|
||||
|
||||
Collections.sort(AFs);
|
||||
max.update(AFs.get(AFs.size()-1));
|
||||
for ( int i = 0; i < AFs.size() - 1; i++ )
|
||||
min.update(AFs.get(i));
|
||||
}
|
||||
|
||||
private final String noveltyRate(final int all, final int known) {
|
||||
final int novel = all - known;
|
||||
final double rate = (novel / (1.0 * all));
|
||||
|
|
|
|||
|
|
@ -128,13 +128,13 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
// get the true reference allele
|
||||
Allele reportedRefAllele = vc.getReference();
|
||||
Allele observedRefAllele;
|
||||
Allele observedRefAllele = null;
|
||||
// insertions
|
||||
if ( vc.isSimpleInsertion() ) {
|
||||
observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING);
|
||||
}
|
||||
// deletions
|
||||
else if ( vc.isSimpleDeletion() || vc.isMixed() || vc.isMNP() ) {
|
||||
else if ( vc.isSimpleDeletion() || vc.isMNP() ) {
|
||||
// we can't validate arbitrarily long deletions
|
||||
if ( reportedRefAllele.length() > 100 ) {
|
||||
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart()));
|
||||
|
|
@ -143,16 +143,15 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
|
|||
|
||||
// deletions are associated with the (position of) the last (preceding) non-deleted base;
|
||||
// hence to get actually deleted bases we need offset = 1
|
||||
int offset = 1 ;
|
||||
if ( vc.isMNP() ) offset = 0; // if it's an MNP, the reported position IS the first modified base
|
||||
int offset = vc.isMNP() ? 0 : 1;
|
||||
byte[] refBytes = ref.getBases();
|
||||
byte[] trueRef = new byte[reportedRefAllele.length()];
|
||||
for (int i = 0; i < reportedRefAllele.length(); i++)
|
||||
trueRef[i] = refBytes[i+offset];
|
||||
observedRefAllele = Allele.create(trueRef, true);
|
||||
}
|
||||
// SNPs, etc.
|
||||
else {
|
||||
// SNPs, etc. but not mixed types because they are too difficult
|
||||
else if ( !vc.isMixed() ) {
|
||||
byte[] refByte = new byte[1];
|
||||
refByte[0] = ref.getBase();
|
||||
observedRefAllele = Allele.create(refByte, true);
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ public class VariantsToPed extends RodWalker<Integer,Integer> {
|
|||
String mid = mVals.containsKey("mom") ? mVals.get("mom") : String.format("dummy_%d",++dummyID);
|
||||
String sex = mVals.containsKey("sex") ? mVals.get("sex") : "3";
|
||||
String pheno = mVals.get("phenotype");
|
||||
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,pid,sample,mid,sex,pheno);
|
||||
outFam.printf("%s\t%s\t%s\t%s\t%s\t%s%n",fid,sample,pid,mid,sex,pheno);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -272,12 +272,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
|
||||
getters.put("REF", new Getter() {
|
||||
public String get(VariantContext vc) {
|
||||
String x = "";
|
||||
if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x=x+new String(new byte[]{refByte});
|
||||
}
|
||||
return x+vc.getReference().getDisplayString();
|
||||
StringBuilder x = new StringBuilder();
|
||||
if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() )
|
||||
x.append((char)vc.getReferenceBaseForIndel().byteValue());
|
||||
x.append(vc.getReference().getDisplayString());
|
||||
return x.toString();
|
||||
}
|
||||
});
|
||||
getters.put("ALT", new Getter() {
|
||||
|
|
@ -285,13 +284,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
StringBuilder x = new StringBuilder();
|
||||
int n = vc.getAlternateAlleles().size();
|
||||
if ( n == 0 ) return ".";
|
||||
if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
|
||||
Byte refByte = vc.getReferenceBaseForIndel();
|
||||
x.append(new String(new byte[]{refByte}));
|
||||
}
|
||||
|
||||
for ( int i = 0; i < n; i++ ) {
|
||||
if ( i != 0 ) x.append(",");
|
||||
if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() )
|
||||
x.append((char)vc.getReferenceBaseForIndel().byteValue());
|
||||
x.append(vc.getAlternateAllele(i).getDisplayString());
|
||||
}
|
||||
return x.toString();
|
||||
|
|
|
|||
|
|
@ -2,57 +2,59 @@ package org.broadinstitute.sting.utils;
|
|||
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
|
||||
|
||||
/**
|
||||
* BaseUtils contains some basic utilities for manipulating nucleotides.
|
||||
*/
|
||||
public class BaseUtils {
|
||||
public final static byte A = (byte)'A';
|
||||
public final static byte C = (byte)'C';
|
||||
public final static byte G = (byte)'G';
|
||||
public final static byte T = (byte)'T';
|
||||
public final static byte A = (byte) 'A';
|
||||
public final static byte C = (byte) 'C';
|
||||
public final static byte G = (byte) 'G';
|
||||
public final static byte T = (byte) 'T';
|
||||
|
||||
public final static byte N = (byte)'N';
|
||||
public final static byte D = (byte)'D';
|
||||
public final static byte N = (byte) 'N';
|
||||
public final static byte D = (byte) 'D';
|
||||
|
||||
//
|
||||
// todo -- we need a generalized base abstraction using the Base enum.
|
||||
//
|
||||
public final static byte[] BASES = { 'A', 'C', 'G', 'T' };
|
||||
public final static byte[] EXTENDED_BASES = { 'A', 'C', 'G', 'T', 'N', 'D' };
|
||||
public final static byte[] BASES = {'A', 'C', 'G', 'T'};
|
||||
public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'};
|
||||
|
||||
public enum Base {
|
||||
A ( 'A', 0 ),
|
||||
C ( 'C', 1 ),
|
||||
G ( 'G', 2 ),
|
||||
T ( 'T', 3 );
|
||||
A('A', 0),
|
||||
C('C', 1),
|
||||
G('G', 2),
|
||||
T('T', 3);
|
||||
|
||||
byte b;
|
||||
int index;
|
||||
|
||||
private Base(char base, int index) {
|
||||
this.b = (byte)base;
|
||||
this.b = (byte) base;
|
||||
this.index = index;
|
||||
}
|
||||
|
||||
public byte getBase() { return b; }
|
||||
public char getBaseAsChar() { return (char)b; }
|
||||
|
||||
public char getBaseAsChar() { return (char) b; }
|
||||
|
||||
public int getIndex() { return index; }
|
||||
|
||||
public boolean sameBase(byte o) { return b == o; }
|
||||
public boolean sameBase(char o) { return b == (byte)o; }
|
||||
public boolean sameBase(int i) { return index == i; }
|
||||
}
|
||||
|
||||
public boolean sameBase(char o) { return b == (byte) o; }
|
||||
|
||||
public boolean sameBase(int i) { return index == i; }
|
||||
}
|
||||
|
||||
// todo -- fix me (enums?)
|
||||
public static final byte DELETION_INDEX = 4;
|
||||
public static final byte NO_CALL_INDEX = 5; // (this is 'N')
|
||||
|
||||
public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte)'G');
|
||||
public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte)'C');
|
||||
public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte)'A');
|
||||
public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte)'T');
|
||||
|
||||
public static int gIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'G');
|
||||
public static int cIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'C');
|
||||
public static int aIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'A');
|
||||
public static int tIndex = BaseUtils.simpleBaseToBaseIndex((byte) 'T');
|
||||
|
||||
/// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or
|
||||
// a pyrimidine to another pyrimidine nucleotide (C <-> T).
|
||||
|
|
@ -64,28 +66,31 @@ public class BaseUtils {
|
|||
|
||||
/**
|
||||
* Returns the base substitution type of the 2 state SNP
|
||||
*
|
||||
* @param base1
|
||||
* @param base2
|
||||
* @return
|
||||
*/
|
||||
public static BaseSubstitutionType SNPSubstitutionType( byte base1, byte base2 ) {
|
||||
public static BaseSubstitutionType SNPSubstitutionType(byte base1, byte base2) {
|
||||
BaseSubstitutionType t = isTransition(base1, base2) ? BaseSubstitutionType.TRANSITION : BaseSubstitutionType.TRANSVERSION;
|
||||
//System.out.printf("SNPSubstitutionType( char %c, char %c ) => %s%n", base1, base2, t);
|
||||
return t;
|
||||
}
|
||||
|
||||
public static boolean isTransition( byte base1, byte base2 ) {
|
||||
public static boolean isTransition(byte base1, byte base2) {
|
||||
int b1 = simpleBaseToBaseIndex(base1);
|
||||
int b2 = simpleBaseToBaseIndex(base2);
|
||||
return b1 == 0 && b2 == 2 || b1 == 2 && b2 == 0 ||
|
||||
b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1;
|
||||
b1 == 1 && b2 == 3 || b1 == 3 && b2 == 1;
|
||||
}
|
||||
|
||||
public static boolean isTransversion( byte base1, byte base2 ) {
|
||||
return ! isTransition(base1, base2);
|
||||
public static boolean isTransversion(byte base1, byte base2) {
|
||||
return !isTransition(base1, base2);
|
||||
}
|
||||
|
||||
/** Private constructor. No instantiating this class! */
|
||||
/**
|
||||
* Private constructor. No instantiating this class!
|
||||
*/
|
||||
private BaseUtils() {}
|
||||
|
||||
static public boolean basesAreEqual(byte base1, byte base2) {
|
||||
|
|
@ -96,7 +101,6 @@ public class BaseUtils {
|
|||
return extendedBaseToBaseIndex(base1) == extendedBaseToBaseIndex(base2);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Converts a IUPAC nucleotide code to a pair of bases
|
||||
*
|
||||
|
|
@ -163,33 +167,37 @@ public class BaseUtils {
|
|||
/**
|
||||
* Converts a simple base to a base index
|
||||
*
|
||||
* @param base [AaCcGgTt]
|
||||
* @param base [AaCcGgTt]
|
||||
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
||||
*/
|
||||
static public int simpleBaseToBaseIndex(byte base) {
|
||||
switch (base) {
|
||||
case '*': // the wildcard character counts as an A
|
||||
case 'A':
|
||||
case 'a': return 0;
|
||||
case 'a':
|
||||
return 0;
|
||||
|
||||
case 'C':
|
||||
case 'c': return 1;
|
||||
case 'c':
|
||||
return 1;
|
||||
|
||||
case 'G':
|
||||
case 'g': return 2;
|
||||
case 'g':
|
||||
return 2;
|
||||
|
||||
case 'T':
|
||||
case 't': return 3;
|
||||
case 't':
|
||||
return 3;
|
||||
|
||||
default: return -1;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Converts a simple base to a base index
|
||||
*
|
||||
* @param base [AaCcGgTt]
|
||||
* @param base [AaCcGgTt]
|
||||
* @return 0, 1, 2, 3, or -1 if the base can't be understood
|
||||
*/
|
||||
@Deprecated
|
||||
|
|
@ -197,29 +205,37 @@ public class BaseUtils {
|
|||
switch (base) {
|
||||
case '*': // the wildcard character counts as an A
|
||||
case 'A':
|
||||
case 'a': return 0;
|
||||
case 'a':
|
||||
return 0;
|
||||
|
||||
case 'C':
|
||||
case 'c': return 1;
|
||||
case 'c':
|
||||
return 1;
|
||||
|
||||
case 'G':
|
||||
case 'g': return 2;
|
||||
case 'g':
|
||||
return 2;
|
||||
|
||||
case 'T':
|
||||
case 't': return 3;
|
||||
case 't':
|
||||
return 3;
|
||||
|
||||
default: return -1;
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static public int extendedBaseToBaseIndex(byte base) {
|
||||
switch (base) {
|
||||
case 'd':
|
||||
case 'D': return DELETION_INDEX;
|
||||
case 'D':
|
||||
return DELETION_INDEX;
|
||||
case 'n':
|
||||
case 'N': return NO_CALL_INDEX;
|
||||
case 'N':
|
||||
return NO_CALL_INDEX;
|
||||
|
||||
default: return simpleBaseToBaseIndex(base);
|
||||
default:
|
||||
return simpleBaseToBaseIndex(base);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -232,11 +248,6 @@ public class BaseUtils {
|
|||
return simpleBaseToBaseIndex(base) != -1;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
static public boolean isNBase(char base) {
|
||||
return isNBase((byte)base);
|
||||
}
|
||||
|
||||
static public boolean isNBase(byte base) {
|
||||
return base == 'N' || base == 'n';
|
||||
}
|
||||
|
|
@ -244,68 +255,83 @@ public class BaseUtils {
|
|||
/**
|
||||
* Converts a base index to a simple base
|
||||
*
|
||||
* @param baseIndex 0, 1, 2, 3
|
||||
* @param baseIndex 0, 1, 2, 3
|
||||
* @return A, C, G, T, or '.' if the index can't be understood
|
||||
*/
|
||||
static public byte baseIndexToSimpleBase(int baseIndex) {
|
||||
switch (baseIndex) {
|
||||
case 0: return 'A';
|
||||
case 1: return 'C';
|
||||
case 2: return 'G';
|
||||
case 3: return 'T';
|
||||
default: return '.';
|
||||
case 0:
|
||||
return 'A';
|
||||
case 1:
|
||||
return 'C';
|
||||
case 2:
|
||||
return 'G';
|
||||
case 3:
|
||||
return 'T';
|
||||
default:
|
||||
return '.';
|
||||
}
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
static public char baseIndexToSimpleBaseAsChar(int baseIndex) {
|
||||
return (char)baseIndexToSimpleBase(baseIndex);
|
||||
return (char) baseIndexToSimpleBase(baseIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a base index to a base index representing its cross-talk partner
|
||||
*
|
||||
* @param baseIndex 0, 1, 2, 3
|
||||
* @param baseIndex 0, 1, 2, 3
|
||||
* @return 1, 0, 3, 2, or -1 if the index can't be understood
|
||||
*/
|
||||
static public int crossTalkPartnerIndex(int baseIndex) {
|
||||
switch (baseIndex) {
|
||||
case 0: return 1; // A -> C
|
||||
case 1: return 0; // C -> A
|
||||
case 2: return 3; // G -> T
|
||||
case 3: return 2; // T -> G
|
||||
default: return -1;
|
||||
case 0:
|
||||
return 1; // A -> C
|
||||
case 1:
|
||||
return 0; // C -> A
|
||||
case 2:
|
||||
return 3; // G -> T
|
||||
case 3:
|
||||
return 2; // T -> G
|
||||
default:
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a base to the base representing its cross-talk partner
|
||||
*
|
||||
* @param base [AaCcGgTt]
|
||||
* @param base [AaCcGgTt]
|
||||
* @return C, A, T, G, or '.' if the base can't be understood
|
||||
*/
|
||||
@Deprecated
|
||||
static public char crossTalkPartnerBase(char base) {
|
||||
return (char)baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
|
||||
return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the complement of a base index.
|
||||
*
|
||||
* @param baseIndex the base index (0:A, 1:C, 2:G, 3:T)
|
||||
* @param baseIndex the base index (0:A, 1:C, 2:G, 3:T)
|
||||
* @return the complementary base index
|
||||
*/
|
||||
static public byte complementIndex(int baseIndex) {
|
||||
switch (baseIndex) {
|
||||
case 0: return 3; // a -> t
|
||||
case 1: return 2; // c -> g
|
||||
case 2: return 1; // g -> c
|
||||
case 3: return 0; // t -> a
|
||||
default: return -1; // wtf?
|
||||
case 0:
|
||||
return 3; // a -> t
|
||||
case 1:
|
||||
return 2; // c -> g
|
||||
case 2:
|
||||
return 1; // g -> c
|
||||
case 3:
|
||||
return 0; // t -> a
|
||||
default:
|
||||
return -1; // wtf?
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base).
|
||||
*
|
||||
* @param base the base [AaCcGgTt]
|
||||
|
|
@ -314,20 +340,25 @@ public class BaseUtils {
|
|||
static public byte simpleComplement(byte base) {
|
||||
switch (base) {
|
||||
case 'A':
|
||||
case 'a': return 'T';
|
||||
case 'a':
|
||||
return 'T';
|
||||
case 'C':
|
||||
case 'c': return 'G';
|
||||
case 'c':
|
||||
return 'G';
|
||||
case 'G':
|
||||
case 'g': return 'C';
|
||||
case 'g':
|
||||
return 'C';
|
||||
case 'T':
|
||||
case 't': return 'A';
|
||||
default: return base;
|
||||
case 't':
|
||||
return 'A';
|
||||
default:
|
||||
return base;
|
||||
}
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
static public char simpleComplement(char base) {
|
||||
return (char)simpleComplement((byte)base);
|
||||
return (char) simpleComplement((byte) base);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -349,7 +380,7 @@ public class BaseUtils {
|
|||
/**
|
||||
* Complement a byte array of bases (that is, chars cast to bytes, *not* base indices in byte form)
|
||||
*
|
||||
* @param bases the byte array of bases
|
||||
* @param bases the byte array of bases
|
||||
* @return the complement of the base byte array
|
||||
*/
|
||||
static public byte[] simpleComplement(byte[] bases) {
|
||||
|
|
@ -382,7 +413,7 @@ public class BaseUtils {
|
|||
/**
|
||||
* Complement a char array of bases
|
||||
*
|
||||
* @param bases the char array of bases
|
||||
* @param bases the char array of bases
|
||||
* @return the complement of the base char array
|
||||
*/
|
||||
@Deprecated
|
||||
|
|
@ -399,7 +430,7 @@ public class BaseUtils {
|
|||
/**
|
||||
* Reverse complement a String of bases. Preserves ambiguous bases.
|
||||
*
|
||||
* @param bases the String of bases
|
||||
* @param bases the String of bases
|
||||
* @return the reverse complement of the String
|
||||
*/
|
||||
@Deprecated
|
||||
|
|
@ -407,11 +438,10 @@ public class BaseUtils {
|
|||
return new String(simpleReverseComplement(bases.getBytes()));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Complement a String of bases. Preserves ambiguous bases.
|
||||
*
|
||||
* @param bases the String of bases
|
||||
* @param bases the String of bases
|
||||
* @return the complement of the String
|
||||
*/
|
||||
@Deprecated
|
||||
|
|
@ -451,7 +481,7 @@ public class BaseUtils {
|
|||
/**
|
||||
* Returns the most common base in the basecounts array. To be used with pileup.getBaseCounts.
|
||||
*
|
||||
* @param baseCounts counts of a,c,g,t in order.
|
||||
* @param baseCounts counts of a,c,g,t in order.
|
||||
* @return the most common base
|
||||
*/
|
||||
static public byte mostFrequentSimpleBase(int[] baseCounts) {
|
||||
|
|
@ -461,13 +491,13 @@ public class BaseUtils {
|
|||
/**
|
||||
* For the most frequent base in the sequence, return the percentage of the read it constitutes.
|
||||
*
|
||||
* @param sequence the read sequence
|
||||
* @return the percentage of the read that's made up of the most frequent base
|
||||
* @param sequence the read sequence
|
||||
* @return the percentage of the read that's made up of the most frequent base
|
||||
*/
|
||||
static public double mostFrequentBaseFraction(byte[] sequence) {
|
||||
int[] baseCounts = new int[4];
|
||||
|
||||
for ( byte base : sequence ) {
|
||||
for (byte base : sequence) {
|
||||
int baseIndex = simpleBaseToBaseIndex(base);
|
||||
|
||||
if (baseIndex >= 0) {
|
||||
|
|
@ -477,7 +507,7 @@ public class BaseUtils {
|
|||
|
||||
int mostFrequentBaseIndex = mostFrequentBaseIndex(baseCounts);
|
||||
|
||||
return ((double) baseCounts[mostFrequentBaseIndex])/((double) sequence.length);
|
||||
return ((double) baseCounts[mostFrequentBaseIndex]) / ((double) sequence.length);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
|
|
@ -531,50 +561,50 @@ public class BaseUtils {
|
|||
static public byte getRandomBase(char excludeBase) {
|
||||
return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase)));
|
||||
}
|
||||
|
||||
|
||||
/** Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
|
||||
|
||||
/**
|
||||
* Computes the smallest period >= minPeriod for the specified string. The period is defined as such p,
|
||||
* that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p).
|
||||
* The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
|
||||
* of 2 (it has a period of 4 as well), and so does
|
||||
* "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is
|
||||
* The sequence does <i>not</i> have to contain whole number of periods. For instance, "ACACACAC" has a period
|
||||
* of 2 (it has a period of 4 as well), and so does
|
||||
* "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is
|
||||
* the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range
|
||||
* or if specified minPeriod is greater than the sequence length.
|
||||
*
|
||||
*
|
||||
* @param seq
|
||||
* @return
|
||||
*/
|
||||
public static int sequencePeriod(byte[] seq, int minPeriod) {
|
||||
int period = ( minPeriod > seq.length ? seq.length : minPeriod );
|
||||
// we assume that bases [0,period-1] repeat themselves and check this assumption
|
||||
// until we find correct period
|
||||
|
||||
for ( int pos = period ; pos < seq.length ; pos++ ) {
|
||||
|
||||
int offset = pos % period; // we are currently 'offset' bases into the putative repeat of period 'period'
|
||||
// if our current hypothesis holds, base[pos] must be the same as base[offset]
|
||||
|
||||
if ( Character.toUpperCase( seq[pos] ) !=
|
||||
Character.toUpperCase( seq[offset] )
|
||||
) {
|
||||
|
||||
// period we have been trying so far does not work.
|
||||
// two possibilities:
|
||||
// A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
|
||||
// in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
|
||||
// so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance
|
||||
// pos will be autoincremented and we will be checking next base
|
||||
// B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
|
||||
// hence we should first check if it matches the first base of the sequence, and to do that
|
||||
// we set period to pos (thus trying the hypothesis that bases from start up to the current one,
|
||||
// non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
|
||||
// on the next loop re-entrance after pos is autoincremented)
|
||||
if ( offset == 0 ) period = pos+1;
|
||||
else period = pos-- ;
|
||||
|
||||
}
|
||||
}
|
||||
return period;
|
||||
int period = (minPeriod > seq.length ? seq.length : minPeriod);
|
||||
// we assume that bases [0,period-1] repeat themselves and check this assumption
|
||||
// until we find correct period
|
||||
|
||||
for (int pos = period; pos < seq.length; pos++) {
|
||||
|
||||
int offset = pos % period; // we are currently 'offset' bases into the putative repeat of period 'period'
|
||||
// if our current hypothesis holds, base[pos] must be the same as base[offset]
|
||||
|
||||
if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) {
|
||||
|
||||
// period we have been trying so far does not work.
|
||||
// two possibilities:
|
||||
// A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not;
|
||||
// in this case only bases from start up to the current one, inclusive, may form a repeat, if at all;
|
||||
// so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance
|
||||
// pos will be autoincremented and we will be checking next base
|
||||
// B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one?
|
||||
// hence we should first check if it matches the first base of the sequence, and to do that
|
||||
// we set period to pos (thus trying the hypothesis that bases from start up to the current one,
|
||||
// non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base
|
||||
// on the next loop re-entrance after pos is autoincremented)
|
||||
if (offset == 0)
|
||||
period = pos + 1;
|
||||
else
|
||||
period = pos--;
|
||||
|
||||
}
|
||||
}
|
||||
return period;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -436,7 +436,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
|
|||
* never be < 1.
|
||||
*/
|
||||
@Ensures("result > 0")
|
||||
public long size() {
|
||||
public int size() {
|
||||
return stop - start + 1;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -49,7 +49,6 @@ public class MathUtils {
|
|||
* high precision
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* Private constructor. No instantiating this class!
|
||||
*/
|
||||
|
|
@ -60,48 +59,48 @@ public class MathUtils {
|
|||
// under/overflow checking, so this shouldn't be used in the general case (but is fine
|
||||
// if one is already making those checks before calling in to the rounding).
|
||||
public static int fastRound(double d) {
|
||||
return (d > 0) ? (int)(d + 0.5d) : (int)(d - 0.5d);
|
||||
return (d > 0) ? (int) (d + 0.5d) : (int) (d - 0.5d);
|
||||
}
|
||||
|
||||
public static double approximateLog10SumLog10(final double[] vals) {
|
||||
return approximateLog10SumLog10(vals, vals.length);
|
||||
return approximateLog10SumLog10(vals, vals.length);
|
||||
}
|
||||
|
||||
public static double approximateLog10SumLog10(final double[] vals, final int endIndex) {
|
||||
|
||||
final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex);
|
||||
double approxSum = vals[maxElementIndex];
|
||||
if ( approxSum == Double.NEGATIVE_INFINITY )
|
||||
final int maxElementIndex = MathUtils.maxElementIndex(vals, endIndex);
|
||||
double approxSum = vals[maxElementIndex];
|
||||
if (approxSum == Double.NEGATIVE_INFINITY)
|
||||
return approxSum;
|
||||
|
||||
for ( int i = 0; i < endIndex; i++ ) {
|
||||
if ( i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY )
|
||||
continue;
|
||||
for (int i = 0; i < endIndex; i++) {
|
||||
if (i == maxElementIndex || vals[i] == Double.NEGATIVE_INFINITY)
|
||||
continue;
|
||||
|
||||
final double diff = approxSum - vals[i];
|
||||
if ( diff < MathUtils.MAX_JACOBIAN_TOLERANCE ) {
|
||||
// See notes from the 2-input implementation below
|
||||
final int ind = fastRound(diff / MathUtils.JACOBIAN_LOG_TABLE_STEP); // hard rounding
|
||||
approxSum += MathUtils.jacobianLogTable[ind];
|
||||
}
|
||||
}
|
||||
final double diff = approxSum - vals[i];
|
||||
if (diff < MathUtils.MAX_JACOBIAN_TOLERANCE) {
|
||||
// See notes from the 2-input implementation below
|
||||
final int ind = fastRound(diff / MathUtils.JACOBIAN_LOG_TABLE_STEP); // hard rounding
|
||||
approxSum += MathUtils.jacobianLogTable[ind];
|
||||
}
|
||||
}
|
||||
|
||||
return approxSum;
|
||||
}
|
||||
|
||||
public static double approximateLog10SumLog10(double small, double big) {
|
||||
// make sure small is really the smaller value
|
||||
if ( small > big ) {
|
||||
if (small > big) {
|
||||
final double t = big;
|
||||
big = small;
|
||||
small = t;
|
||||
}
|
||||
|
||||
if ( small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY )
|
||||
if (small == Double.NEGATIVE_INFINITY || big == Double.NEGATIVE_INFINITY)
|
||||
return big;
|
||||
|
||||
final double diff = big - small;
|
||||
if ( diff >= MathUtils.MAX_JACOBIAN_TOLERANCE )
|
||||
final double diff = big - small;
|
||||
if (diff >= MathUtils.MAX_JACOBIAN_TOLERANCE)
|
||||
return big;
|
||||
|
||||
// OK, so |y-x| < tol: we use the following identity then:
|
||||
|
|
@ -138,10 +137,15 @@ public class MathUtils {
|
|||
return size;
|
||||
}
|
||||
|
||||
public static double average(Collection<Integer> x) {
|
||||
return (double) sum(x) / x.size();
|
||||
}
|
||||
|
||||
public static double average(Collection<Number> numbers, boolean ignoreNan) {
|
||||
if (ignoreNan) {
|
||||
return sum(numbers, true) / nonNanSize(numbers);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
return sum(numbers, false) / nonNanSize(numbers);
|
||||
}
|
||||
}
|
||||
|
|
@ -172,10 +176,17 @@ public class MathUtils {
|
|||
|
||||
public static double sum(double[] values) {
|
||||
double s = 0.0;
|
||||
for (double v : values) s += v;
|
||||
for (double v : values)
|
||||
s += v;
|
||||
return s;
|
||||
}
|
||||
|
||||
public static long sum(int[] x) {
|
||||
long total = 0;
|
||||
for (int v : x)
|
||||
total += v;
|
||||
return total;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the log10 cumulative sum of an array with log10 probabilities
|
||||
|
|
@ -218,21 +229,23 @@ public class MathUtils {
|
|||
|
||||
public static double sumDoubles(List<Double> values) {
|
||||
double s = 0.0;
|
||||
for (double v : values) s += v;
|
||||
for (double v : values)
|
||||
s += v;
|
||||
return s;
|
||||
}
|
||||
|
||||
public static int sumIntegers(List<Integer> values) {
|
||||
int s = 0;
|
||||
for (int v : values) s += v;
|
||||
for (int v : values)
|
||||
s += v;
|
||||
return s;
|
||||
}
|
||||
|
||||
public static double sumLog10(double[] log10values) {
|
||||
return Math.pow(10.0, log10sumLog10(log10values));
|
||||
// double s = 0.0;
|
||||
// for ( double v : log10values) s += Math.pow(10.0, v);
|
||||
// return s;
|
||||
// double s = 0.0;
|
||||
// for ( double v : log10values) s += Math.pow(10.0, v);
|
||||
// return s;
|
||||
}
|
||||
|
||||
public static double log10sumLog10(double[] log10values) {
|
||||
|
|
@ -445,7 +458,6 @@ public class MathUtils {
|
|||
return Math.sqrt(rms);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* calculate the Root Mean Square of an array of integers
|
||||
*
|
||||
|
|
@ -506,7 +518,6 @@ public class MathUtils {
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
|
||||
*
|
||||
|
|
@ -543,7 +554,8 @@ public class MathUtils {
|
|||
sum += normalized[i];
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
double x = normalized[i] / sum;
|
||||
if (takeLog10OfOutput) x = Math.log10(x);
|
||||
if (takeLog10OfOutput)
|
||||
x = Math.log10(x);
|
||||
normalized[i] = x;
|
||||
}
|
||||
|
||||
|
|
@ -565,7 +577,8 @@ public class MathUtils {
|
|||
sum += normalized[i];
|
||||
for (int i = 0; i < array.size(); i++) {
|
||||
double x = normalized[i] / sum;
|
||||
if (takeLog10OfOutput) x = Math.log10(x);
|
||||
if (takeLog10OfOutput)
|
||||
x = Math.log10(x);
|
||||
normalized[i] = x;
|
||||
}
|
||||
|
||||
|
|
@ -587,11 +600,12 @@ public class MathUtils {
|
|||
}
|
||||
|
||||
public static int maxElementIndex(final double[] array) {
|
||||
return maxElementIndex(array, array.length);
|
||||
return maxElementIndex(array, array.length);
|
||||
}
|
||||
|
||||
public static int maxElementIndex(final double[] array, final int endIndex) {
|
||||
if (array == null) throw new IllegalArgumentException("Array cannot be null!");
|
||||
if (array == null)
|
||||
throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
int maxI = -1;
|
||||
for (int i = 0; i < endIndex; i++) {
|
||||
|
|
@ -603,11 +617,12 @@ public class MathUtils {
|
|||
}
|
||||
|
||||
public static int maxElementIndex(final int[] array) {
|
||||
return maxElementIndex(array, array.length);
|
||||
return maxElementIndex(array, array.length);
|
||||
}
|
||||
|
||||
public static int maxElementIndex(final int[] array, int endIndex) {
|
||||
if (array == null) throw new IllegalArgumentException("Array cannot be null!");
|
||||
if (array == null)
|
||||
throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
int maxI = -1;
|
||||
for (int i = 0; i < endIndex; i++) {
|
||||
|
|
@ -635,7 +650,8 @@ public class MathUtils {
|
|||
}
|
||||
|
||||
public static int minElementIndex(double[] array) {
|
||||
if (array == null) throw new IllegalArgumentException("Array cannot be null!");
|
||||
if (array == null)
|
||||
throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
int minI = -1;
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
|
|
@ -647,7 +663,8 @@ public class MathUtils {
|
|||
}
|
||||
|
||||
public static int minElementIndex(byte[] array) {
|
||||
if (array == null) throw new IllegalArgumentException("Array cannot be null!");
|
||||
if (array == null)
|
||||
throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
int minI = -1;
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
|
|
@ -659,7 +676,8 @@ public class MathUtils {
|
|||
}
|
||||
|
||||
public static int minElementIndex(int[] array) {
|
||||
if (array == null) throw new IllegalArgumentException("Array cannot be null!");
|
||||
if (array == null)
|
||||
throw new IllegalArgumentException("Array cannot be null!");
|
||||
|
||||
int minI = -1;
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
|
|
@ -671,20 +689,26 @@ public class MathUtils {
|
|||
}
|
||||
|
||||
public static int arrayMaxInt(List<Integer> array) {
|
||||
if (array == null) throw new IllegalArgumentException("Array cannot be null!");
|
||||
if (array.size() == 0) throw new IllegalArgumentException("Array size cannot be 0!");
|
||||
if (array == null)
|
||||
throw new IllegalArgumentException("Array cannot be null!");
|
||||
if (array.size() == 0)
|
||||
throw new IllegalArgumentException("Array size cannot be 0!");
|
||||
|
||||
int m = array.get(0);
|
||||
for (int e : array) m = Math.max(m, e);
|
||||
for (int e : array)
|
||||
m = Math.max(m, e);
|
||||
return m;
|
||||
}
|
||||
|
||||
public static double arrayMaxDouble(List<Double> array) {
|
||||
if (array == null) throw new IllegalArgumentException("Array cannot be null!");
|
||||
if (array.size() == 0) throw new IllegalArgumentException("Array size cannot be 0!");
|
||||
if (array == null)
|
||||
throw new IllegalArgumentException("Array cannot be null!");
|
||||
if (array.size() == 0)
|
||||
throw new IllegalArgumentException("Array size cannot be 0!");
|
||||
|
||||
double m = array.get(0);
|
||||
for (double e : array) m = Math.max(m, e);
|
||||
for (double e : array)
|
||||
m = Math.max(m, e);
|
||||
return m;
|
||||
}
|
||||
|
||||
|
|
@ -722,6 +746,13 @@ public class MathUtils {
|
|||
return average(vals, vals.size());
|
||||
}
|
||||
|
||||
public static double average(int[] x) {
|
||||
int sum = 0;
|
||||
for (int v : x)
|
||||
sum += v;
|
||||
return (double) sum / x.length;
|
||||
}
|
||||
|
||||
public static byte average(byte[] vals) {
|
||||
int sum = 0;
|
||||
for (byte v : vals) {
|
||||
|
|
@ -798,7 +829,6 @@ public class MathUtils {
|
|||
return permutation;
|
||||
}
|
||||
|
||||
|
||||
public static int[] permuteArray(int[] array, Integer[] permutation) {
|
||||
int[] output = new int[array.length];
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
|
|
@ -839,7 +869,6 @@ public class MathUtils {
|
|||
return output;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Draw N random elements from list.
|
||||
*/
|
||||
|
|
@ -905,7 +934,8 @@ public class MathUtils {
|
|||
public static <T> int countOccurrences(T x, List<T> l) {
|
||||
int count = 0;
|
||||
for (T y : l) {
|
||||
if (x.equals(y)) count++;
|
||||
if (x.equals(y))
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
|
|
@ -1013,9 +1043,11 @@ public class MathUtils {
|
|||
for (Comparable y : list) {
|
||||
if (x.compareTo(y) > 0) {
|
||||
lessThanX.add(y);
|
||||
} else if (x.compareTo(y) < 0) {
|
||||
}
|
||||
else if (x.compareTo(y) < 0) {
|
||||
greaterThanX.add(y);
|
||||
} else
|
||||
}
|
||||
else
|
||||
equalToX.add(y);
|
||||
}
|
||||
|
||||
|
|
@ -1028,7 +1060,6 @@ public class MathUtils {
|
|||
|
||||
}
|
||||
|
||||
|
||||
public static Object getMedian(List<Comparable> list) {
|
||||
return orderStatisticSearch((int) Math.ceil(list.size() / 2), list);
|
||||
}
|
||||
|
|
@ -1058,10 +1089,12 @@ public class MathUtils {
|
|||
if (quality < qk) {
|
||||
lessThanQReads.add(read);
|
||||
lessThanQOffsets.add(offset);
|
||||
} else if (quality > qk) {
|
||||
}
|
||||
else if (quality > qk) {
|
||||
greaterThanQReads.add(read);
|
||||
greaterThanQOffsets.add(offset);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
equalToQReads.add(reads.get(iter));
|
||||
}
|
||||
}
|
||||
|
|
@ -1079,6 +1112,13 @@ public class MathUtils {
|
|||
return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.));
|
||||
}
|
||||
|
||||
public static long sum(Collection<Integer> x) {
|
||||
long sum = 0;
|
||||
for (int v : x)
|
||||
sum += v;
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* A utility class that computes on the fly average and standard deviation for a stream of numbers.
|
||||
* The number of observations does not have to be known in advance, and can be also very big (so that
|
||||
|
|
@ -1184,8 +1224,7 @@ public class MathUtils {
|
|||
log10Cache[k] = Math.log10(k);
|
||||
|
||||
for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
|
||||
jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k)
|
||||
* JACOBIAN_LOG_TABLE_STEP));
|
||||
jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP));
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -1232,7 +1271,8 @@ public class MathUtils {
|
|||
else if (diff >= 0) {
|
||||
int ind = (int) (diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
|
||||
return x + jacobianLogTable[ind];
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
int ind = (int) (-diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
|
||||
return y + jacobianLogTable[ind];
|
||||
}
|
||||
|
|
@ -1273,71 +1313,7 @@ public class MathUtils {
|
|||
/**
|
||||
* Constants to simplify the log gamma function calculation.
|
||||
*/
|
||||
private static final double
|
||||
zero = 0.0,
|
||||
one = 1.0,
|
||||
half = .5,
|
||||
a0 = 7.72156649015328655494e-02,
|
||||
a1 = 3.22467033424113591611e-01,
|
||||
a2 = 6.73523010531292681824e-02,
|
||||
a3 = 2.05808084325167332806e-02,
|
||||
a4 = 7.38555086081402883957e-03,
|
||||
a5 = 2.89051383673415629091e-03,
|
||||
a6 = 1.19270763183362067845e-03,
|
||||
a7 = 5.10069792153511336608e-04,
|
||||
a8 = 2.20862790713908385557e-04,
|
||||
a9 = 1.08011567247583939954e-04,
|
||||
a10 = 2.52144565451257326939e-05,
|
||||
a11 = 4.48640949618915160150e-05,
|
||||
tc = 1.46163214496836224576e+00,
|
||||
tf = -1.21486290535849611461e-01,
|
||||
tt = -3.63867699703950536541e-18,
|
||||
t0 = 4.83836122723810047042e-01,
|
||||
t1 = -1.47587722994593911752e-01,
|
||||
t2 = 6.46249402391333854778e-02,
|
||||
t3 = -3.27885410759859649565e-02,
|
||||
t4 = 1.79706750811820387126e-02,
|
||||
t5 = -1.03142241298341437450e-02,
|
||||
t6 = 6.10053870246291332635e-03,
|
||||
t7 = -3.68452016781138256760e-03,
|
||||
t8 = 2.25964780900612472250e-03,
|
||||
t9 = -1.40346469989232843813e-03,
|
||||
t10 = 8.81081882437654011382e-04,
|
||||
t11 = -5.38595305356740546715e-04,
|
||||
t12 = 3.15632070903625950361e-04,
|
||||
t13 = -3.12754168375120860518e-04,
|
||||
t14 = 3.35529192635519073543e-04,
|
||||
u0 = -7.72156649015328655494e-02,
|
||||
u1 = 6.32827064025093366517e-01,
|
||||
u2 = 1.45492250137234768737e+00,
|
||||
u3 = 9.77717527963372745603e-01,
|
||||
u4 = 2.28963728064692451092e-01,
|
||||
u5 = 1.33810918536787660377e-02,
|
||||
v1 = 2.45597793713041134822e+00,
|
||||
v2 = 2.12848976379893395361e+00,
|
||||
v3 = 7.69285150456672783825e-01,
|
||||
v4 = 1.04222645593369134254e-01,
|
||||
v5 = 3.21709242282423911810e-03,
|
||||
s0 = -7.72156649015328655494e-02,
|
||||
s1 = 2.14982415960608852501e-01,
|
||||
s2 = 3.25778796408930981787e-01,
|
||||
s3 = 1.46350472652464452805e-01,
|
||||
s4 = 2.66422703033638609560e-02,
|
||||
s5 = 1.84028451407337715652e-03,
|
||||
s6 = 3.19475326584100867617e-05,
|
||||
r1 = 1.39200533467621045958e+00,
|
||||
r2 = 7.21935547567138069525e-01,
|
||||
r3 = 1.71933865632803078993e-01,
|
||||
r4 = 1.86459191715652901344e-02,
|
||||
r5 = 7.77942496381893596434e-04,
|
||||
r6 = 7.32668430744625636189e-06,
|
||||
w0 = 4.18938533204672725052e-01,
|
||||
w1 = 8.33333333333329678849e-02,
|
||||
w2 = -2.77777777728775536470e-03,
|
||||
w3 = 7.93650558643019558500e-04,
|
||||
w4 = -5.95187557450339963135e-04,
|
||||
w5 = 8.36339918996282139126e-04,
|
||||
w6 = -1.63092934096575273989e-03;
|
||||
private static final double zero = 0.0, one = 1.0, half = .5, a0 = 7.72156649015328655494e-02, a1 = 3.22467033424113591611e-01, a2 = 6.73523010531292681824e-02, a3 = 2.05808084325167332806e-02, a4 = 7.38555086081402883957e-03, a5 = 2.89051383673415629091e-03, a6 = 1.19270763183362067845e-03, a7 = 5.10069792153511336608e-04, a8 = 2.20862790713908385557e-04, a9 = 1.08011567247583939954e-04, a10 = 2.52144565451257326939e-05, a11 = 4.48640949618915160150e-05, tc = 1.46163214496836224576e+00, tf = -1.21486290535849611461e-01, tt = -3.63867699703950536541e-18, t0 = 4.83836122723810047042e-01, t1 = -1.47587722994593911752e-01, t2 = 6.46249402391333854778e-02, t3 = -3.27885410759859649565e-02, t4 = 1.79706750811820387126e-02, t5 = -1.03142241298341437450e-02, t6 = 6.10053870246291332635e-03, t7 = -3.68452016781138256760e-03, t8 = 2.25964780900612472250e-03, t9 = -1.40346469989232843813e-03, t10 = 8.81081882437654011382e-04, t11 = -5.38595305356740546715e-04, t12 = 3.15632070903625950361e-04, t13 = -3.12754168375120860518e-04, t14 = 3.35529192635519073543e-04, u0 = -7.72156649015328655494e-02, u1 = 6.32827064025093366517e-01, u2 = 1.45492250137234768737e+00, u3 = 9.77717527963372745603e-01, u4 = 2.28963728064692451092e-01, u5 = 1.33810918536787660377e-02, v1 = 2.45597793713041134822e+00, v2 = 2.12848976379893395361e+00, v3 = 7.69285150456672783825e-01, v4 = 1.04222645593369134254e-01, v5 = 3.21709242282423911810e-03, s0 = -7.72156649015328655494e-02, s1 = 2.14982415960608852501e-01, s2 = 3.25778796408930981787e-01, s3 = 1.46350472652464452805e-01, s4 = 2.66422703033638609560e-02, s5 = 1.84028451407337715652e-03, s6 = 3.19475326584100867617e-05, r1 = 1.39200533467621045958e+00, r2 = 7.21935547567138069525e-01, r3 = 1.71933865632803078993e-01, r4 = 1.86459191715652901344e-02, r5 = 7.77942496381893596434e-04, r6 = 7.32668430744625636189e-06, w0 = 4.18938533204672725052e-01, w1 = 8.33333333333329678849e-02, w2 = -2.77777777728775536470e-03, w3 = 7.93650558643019558500e-04, w4 = 
-5.95187557450339963135e-04, w5 = 8.36339918996282139126e-04, w6 = -1.63092934096575273989e-03;
|
||||
|
||||
/**
|
||||
* Efficient rounding functions to simplify the log gamma function calculation
|
||||
|
|
@ -1368,14 +1344,17 @@ public class MathUtils {
|
|||
|
||||
/* purge off +-inf, NaN, +-0, and negative arguments */
|
||||
int ix = hx & 0x7fffffff;
|
||||
if (ix >= 0x7ff00000) return Double.POSITIVE_INFINITY;
|
||||
if ((ix | lx) == 0 || hx < 0) return Double.NaN;
|
||||
if (ix >= 0x7ff00000)
|
||||
return Double.POSITIVE_INFINITY;
|
||||
if ((ix | lx) == 0 || hx < 0)
|
||||
return Double.NaN;
|
||||
if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */
|
||||
return -Math.log(x);
|
||||
}
|
||||
|
||||
/* purge off 1 and 2 */
|
||||
if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) r = 0;
|
||||
if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0))
|
||||
r = 0;
|
||||
/* for x < 2.0 */
|
||||
else if (ix < 0x40000000) {
|
||||
if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */
|
||||
|
|
@ -1383,22 +1362,27 @@ public class MathUtils {
|
|||
if (ix >= 0x3FE76944) {
|
||||
y = one - x;
|
||||
i = 0;
|
||||
} else if (ix >= 0x3FCDA661) {
|
||||
}
|
||||
else if (ix >= 0x3FCDA661) {
|
||||
y = x - (tc - one);
|
||||
i = 1;
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
y = x;
|
||||
i = 2;
|
||||
}
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
r = zero;
|
||||
if (ix >= 0x3FFBB4C3) {
|
||||
y = 2.0 - x;
|
||||
i = 0;
|
||||
} /* [1.7316,2] */ else if (ix >= 0x3FF3B4C4) {
|
||||
} /* [1.7316,2] */
|
||||
else if (ix >= 0x3FF3B4C4) {
|
||||
y = x - tc;
|
||||
i = 1;
|
||||
} /* [1.23,1.73] */ else {
|
||||
} /* [1.23,1.73] */
|
||||
else {
|
||||
y = x - one;
|
||||
i = 2;
|
||||
}
|
||||
|
|
@ -1426,7 +1410,8 @@ public class MathUtils {
|
|||
p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
|
||||
r += (-0.5 * y + p1 / p2);
|
||||
}
|
||||
} else if (ix < 0x40200000) { /* x < 8.0 */
|
||||
}
|
||||
else if (ix < 0x40200000) { /* x < 8.0 */
|
||||
i = (int) x;
|
||||
t = zero;
|
||||
y = x - (double) i;
|
||||
|
|
@ -1449,13 +1434,15 @@ public class MathUtils {
|
|||
break;
|
||||
}
|
||||
/* 8.0 <= x < 2**58 */
|
||||
} else if (ix < 0x43900000) {
|
||||
}
|
||||
else if (ix < 0x43900000) {
|
||||
t = Math.log(x);
|
||||
z = one / x;
|
||||
y = z * z;
|
||||
w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
|
||||
r = (x - half) * (t - one) + w;
|
||||
} else
|
||||
}
|
||||
else
|
||||
/* 2**58 <= x <= inf */
|
||||
r = x * (Math.log(x) - one);
|
||||
return r;
|
||||
|
|
@ -1490,7 +1477,6 @@ public class MathUtils {
|
|||
return log10BinomialCoefficient(n, k) + log10p * k + log10OneMinusP * (n - k);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculates the log10 of the multinomial coefficient. Designed to prevent
|
||||
* overflows even with very large numbers.
|
||||
|
|
@ -1534,7 +1520,6 @@ public class MathUtils {
|
|||
return log10Gamma(x + 1);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds two arrays together and returns a new array with the sum.
|
||||
*
|
||||
|
|
@ -1572,17 +1557,18 @@ public class MathUtils {
|
|||
|
||||
/**
|
||||
* Vector operations
|
||||
*
|
||||
* @param v1 first numerical array
|
||||
* @param v2 second numerical array
|
||||
* @return a new array with the elements added
|
||||
* @return a new array with the elements added
|
||||
*/
|
||||
public static <E extends Number> Double[] vectorSum(E v1[], E v2[]) {
|
||||
if (v1.length != v2.length)
|
||||
throw new UserException("BUG: vectors v1, v2 of different size in vectorSum()");
|
||||
|
||||
Double[] result = new Double[v1.length];
|
||||
for (int k=0; k < v1.length; k++)
|
||||
result[k] = v1[k].doubleValue()+v2[k].doubleValue();
|
||||
for (int k = 0; k < v1.length; k++)
|
||||
result[k] = v1[k].doubleValue() + v2[k].doubleValue();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
|
@ -1590,19 +1576,19 @@ public class MathUtils {
|
|||
public static <E extends Number> Double[] scalarTimesVector(E a, E[] v1) {
|
||||
|
||||
Double result[] = new Double[v1.length];
|
||||
for (int k=0; k < v1.length; k++)
|
||||
result[k] = a.doubleValue()*v1[k].doubleValue();
|
||||
for (int k = 0; k < v1.length; k++)
|
||||
result[k] = a.doubleValue() * v1[k].doubleValue();
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public static <E extends Number> Double dotProduct(E[] v1, E[] v2) {
|
||||
public static <E extends Number> Double dotProduct(E[] v1, E[] v2) {
|
||||
if (v1.length != v2.length)
|
||||
throw new UserException("BUG: vectors v1, v2 of different size in vectorSum()");
|
||||
|
||||
Double result = 0.0;
|
||||
for (int k=0; k < v1.length; k++)
|
||||
result += v1[k].doubleValue() *v2[k].doubleValue();
|
||||
for (int k = 0; k < v1.length; k++)
|
||||
result += v1[k].doubleValue() * v2[k].doubleValue();
|
||||
|
||||
return result;
|
||||
|
||||
|
|
@ -1610,7 +1596,7 @@ public class MathUtils {
|
|||
|
||||
public static double[] vectorLog10(double v1[]) {
|
||||
double result[] = new double[v1.length];
|
||||
for (int k=0; k < v1.length; k++)
|
||||
for (int k = 0; k < v1.length; k++)
|
||||
result[k] = Math.log10(v1[k]);
|
||||
|
||||
return result;
|
||||
|
|
@ -1620,7 +1606,7 @@ public class MathUtils {
|
|||
// todo - silly overloading, just because Java can't unbox/box arrays of primitive types, and we can't do generics with primitive types!
|
||||
public static Double[] vectorLog10(Double v1[]) {
|
||||
Double result[] = new Double[v1.length];
|
||||
for (int k=0; k < v1.length; k++)
|
||||
for (int k = 0; k < v1.length; k++)
|
||||
result[k] = Math.log10(v1[k]);
|
||||
|
||||
return result;
|
||||
|
|
|
|||
|
|
@ -55,6 +55,14 @@ public class QualityUtils {
|
|||
return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc.
|
||||
}
|
||||
|
||||
static public double[] qualArrayToLog10ErrorProb(byte[] quals) {
|
||||
double[] returnArray = new double[quals.length];
|
||||
for( int iii = 0; iii < quals.length; iii++ ) {
|
||||
returnArray[iii] = ((double) quals[iii])/-10.0;
|
||||
}
|
||||
return returnArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a probability to a quality score. Note, this is capped at Q40.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -673,7 +673,7 @@ public class BAQ {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns true if we don't think this read is eligable for the BAQ calculation. Examples include non-PF reads,
|
||||
* Returns true if we don't think this read is eligible for the BAQ calculation. Examples include non-PF reads,
|
||||
* duplicates, or unmapped reads. Used by baqRead to determine if a read should fall through the calculation.
|
||||
*
|
||||
* @param read
|
||||
|
|
|
|||
|
|
@ -314,10 +314,10 @@ public class IntervalUtils {
|
|||
* @param reference The reference for the intervals.
|
||||
* @return A map of contig names with their sizes.
|
||||
*/
|
||||
public static Map<String, Long> getContigSizes(File reference) {
|
||||
public static Map<String, Integer> getContigSizes(File reference) {
|
||||
ReferenceDataSource referenceSource = new ReferenceDataSource(reference);
|
||||
List<GenomeLoc> locs = GenomeLocSortedSet.createSetFromSequenceDictionary(referenceSource.getReference().getSequenceDictionary()).toList();
|
||||
Map<String, Long> lengths = new LinkedHashMap<String, Long>();
|
||||
Map<String, Integer> lengths = new LinkedHashMap<String, Integer>();
|
||||
for (GenomeLoc loc: locs)
|
||||
lengths.put(loc.getContig(), loc.size());
|
||||
return lengths;
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
protected final boolean isBeforeInsertion;
|
||||
protected final boolean isNextToSoftClip;
|
||||
|
||||
|
||||
/**
|
||||
* Creates a new pileup element.
|
||||
*
|
||||
|
|
@ -89,6 +88,14 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
public byte getQual() {
|
||||
return getQual(offset);
|
||||
}
|
||||
|
||||
public byte getBaseInsertionQual() {
|
||||
return getBaseInsertionQual(offset);
|
||||
}
|
||||
|
||||
public byte getBaseDeletionQual() {
|
||||
return getBaseDeletionQual(offset);
|
||||
}
|
||||
|
||||
public int getMappingQual() {
|
||||
return read.getMappingQuality();
|
||||
|
|
@ -111,6 +118,14 @@ public class PileupElement implements Comparable<PileupElement> {
|
|||
return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseQualities()[offset];
|
||||
}
|
||||
|
||||
protected byte getBaseInsertionQual(final int offset) {
|
||||
return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseInsertionQualities()[offset];
|
||||
}
|
||||
|
||||
protected byte getBaseDeletionQual(final int offset) {
|
||||
return (isDeletion() || isInsertionAtBeginningOfRead()) ? DELETION_QUAL : read.getBaseDeletionQualities()[offset];
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(final PileupElement pileupElement) {
|
||||
if (offset < pileupElement.offset)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,295 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.utils.recalibration;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDataManager;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Utility methods to facilitate on-the-fly base quality score recalibration.
|
||||
*
|
||||
* User: rpoplin
|
||||
* Date: 2/4/12
|
||||
*/
|
||||
|
||||
public class BaseRecalibration {
|
||||
|
||||
public enum BaseRecalibrationType {
|
||||
BASE_SUBSTITUTION,
|
||||
BASE_INSERTION,
|
||||
BASE_DELETION
|
||||
}
|
||||
|
||||
private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private final ArrayList<Covariate> requestedCovariates = new ArrayList<Covariate>(); // List of covariates to be used in this calculation
|
||||
public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
public static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
public static final String EOF_MARKER = "EOF";
|
||||
private static final int MAX_QUALITY_SCORE = 65; //BUGBUG: what value to use here?
|
||||
private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values.
|
||||
|
||||
/**
 * Loads a recalibration csv file: instantiates the covariates named in its header line, feeds
 * every data line into the data manager, and finally generates the empirical qualities.
 *
 * @param RECAL_FILE the recalibration table to read
 * @throws UserException.CouldNotReadInputFile if the file cannot be found
 * @throws UserException.MalformedFile if the header is repeated after data, names an unknown
 *         covariate, lists fewer than two covariates, a number fails to parse, the EOF marker
 *         is missing, or the file holds no data lines
 */
public BaseRecalibration( final File RECAL_FILE ) {
    // Every covariate implementation the plugin manager knows about
    final List<Class<? extends Covariate>> covariateClasses = new PluginManager<Covariate>(Covariate.class).getPlugins();

    int lineNumber = 0;
    boolean headerComplete = false; // flips to true once the first data line has been reached
    boolean eofMarkerSeen = false;

    try {
        for ( final String line : new XReadLines(RECAL_FILE) ) {
            lineNumber++;

            if ( EOF_MARKER.equals(line) ) {
                eofMarkerSeen = true;
                continue;
            }

            // Comment lines (which start with '#') carry no data
            if ( COMMENT_PATTERN.matcher(line).matches() ) {
                continue;
            }

            // Header line naming the covariates that were used to build the table
            if ( COVARIATE_PATTERN.matcher(line).matches() ) {
                if ( headerComplete ) {
                    throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
                }

                final String[] columns = line.split(",");
                // The last three columns are nObservations, nMismatch, and Qempirical, not covariates
                final int covariateCount = columns.length - 3;
                for ( int col = 0; col < covariateCount; col++ ) {
                    final String requestedName = columns[col] + "Covariate";
                    boolean matched = false;
                    for ( final Class<?> candidate : covariateClasses ) {
                        if ( !requestedName.equalsIgnoreCase( candidate.getSimpleName() ) ) {
                            continue;
                        }
                        matched = true;
                        try {
                            final Covariate instance = (Covariate) candidate.newInstance();
                            requestedCovariates.add( instance );
                        } catch (Exception e) {
                            throw new DynamicClassResolutionException(candidate, e);
                        }
                    }

                    if ( !matched ) {
                        throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + requestedName + ") isn't a valid covariate option." );
                    }
                }
                continue;
            }

            // Anything else is a line of data
            if ( !headerComplete ) {
                headerComplete = true;

                // By now the header must have supplied at least two covariates
                if ( requestedCovariates.size() < 2 ) {
                    throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE );
                }

                // Initialize the covariates from a freshly constructed argument collection
                final RecalibrationArgumentCollection argumentCollection = new RecalibrationArgumentCollection();
                for ( final Covariate covariate : requestedCovariates ) {
                    covariate.initialize( argumentCollection );
                }

                // Set up the data tables, collapsed tables included
                final boolean createCollapsedTables = true;
                dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() );
            }
            addCSVData(RECAL_FILE, line);
        }
    } catch ( FileNotFoundException e ) {
        throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e);
    } catch ( NumberFormatException e ) {
        throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
    }

    if ( !eofMarkerSeen ) {
        throw new UserException.MalformedFile(RECAL_FILE, "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool.");
    }

    if ( dataManager == null ) {
        throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?");
    }

    dataManager.generateEmpiricalQualities( 1, MAX_QUALITY_SCORE );
}
|
||||
|
||||
/**
|
||||
* For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches)
|
||||
* @param line A line of CSV data read from the recalibration table data file
|
||||
*/
|
||||
private void addCSVData(final File file, final String line) {
|
||||
final String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new UserException.MalformedFile(file, "Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
||||
final Object[] key = new Object[requestedCovariates.size()];
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key[iii] = cov.getValue( vals[iii] );
|
||||
}
|
||||
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, QualityUtils.MIN_USABLE_Q_SCORE ); //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
||||
}
|
||||
|
||||
public byte[] recalibrateRead( final GATKSAMRecord read, final byte[] originalQuals, final BaseRecalibrationType modelType ) {
|
||||
|
||||
final byte[] recalQuals = originalQuals.clone();
|
||||
|
||||
//compute all covariate values for this read
|
||||
final Comparable[][] covariateValues_offset_x_covar =
|
||||
RecalDataManager.computeCovariates(read, requestedCovariates, modelType);
|
||||
|
||||
// For each base in the read
|
||||
for( int offset = 0; offset < read.getReadLength(); offset++ ) {
|
||||
|
||||
final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset];
|
||||
|
||||
Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey);
|
||||
if(qualityScore == null)
|
||||
{
|
||||
qualityScore = performSequentialQualityCalculation( fullCovariateKey );
|
||||
qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey);
|
||||
}
|
||||
|
||||
recalQuals[offset] = qualityScore;
|
||||
}
|
||||
|
||||
preserveQScores( originalQuals, recalQuals ); // Overwrite the work done if original quality score is too low
|
||||
|
||||
return recalQuals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Implements a serial recalibration of the reads using the combinational table.
|
||||
* First, we perform a positional recalibration, and then a subsequent dinuc correction.
|
||||
*
|
||||
* Given the full recalibration table, we perform the following preprocessing steps:
|
||||
*
|
||||
* - calculate the global quality score shift across all data [DeltaQ]
|
||||
* - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
|
||||
* -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
|
||||
* - The final shift equation is:
|
||||
*
|
||||
* Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
|
||||
* @param key The list of Comparables that were calculated from the covariates
|
||||
* @return A recalibrated quality score as a byte
|
||||
*/
|
||||
private byte performSequentialQualityCalculation( final Object... key ) {
|
||||
|
||||
final byte qualFromRead = (byte)Integer.parseInt(key[1].toString());
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] qualityScoreCollapsedKey = new Object[2];
|
||||
final Object[] covariateCollapsedKey = new Object[3];
|
||||
|
||||
// The global quality shift (over the read group only)
|
||||
readGroupCollapsedKey[0] = key[0];
|
||||
final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey ));
|
||||
double globalDeltaQ = 0.0;
|
||||
if( globalRecalDatum != null ) {
|
||||
final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality();
|
||||
final double aggregrateQReported = globalRecalDatum.getEstimatedQReported();
|
||||
globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported;
|
||||
}
|
||||
|
||||
// The shift in quality between reported and empirical
|
||||
qualityScoreCollapsedKey[0] = key[0];
|
||||
qualityScoreCollapsedKey[1] = key[1];
|
||||
final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey ));
|
||||
double deltaQReported = 0.0;
|
||||
if( qReportedRecalDatum != null ) {
|
||||
final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality();
|
||||
deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ;
|
||||
}
|
||||
|
||||
// The shift in quality due to each covariate by itself in turn
|
||||
double deltaQCovariates = 0.0;
|
||||
double deltaQCovariateEmpirical;
|
||||
covariateCollapsedKey[0] = key[0];
|
||||
covariateCollapsedKey[1] = key[1];
|
||||
for( int iii = 2; iii < key.length; iii++ ) {
|
||||
covariateCollapsedKey[2] = key[iii]; // The given covariate
|
||||
final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey ));
|
||||
if( covariateRecalDatum != null ) {
|
||||
deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality();
|
||||
deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) );
|
||||
}
|
||||
}
|
||||
|
||||
final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates;
|
||||
return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE );
|
||||
|
||||
// Verbose printouts used to validate with old recalibrator
|
||||
//if(key.contains(null)) {
|
||||
// System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte));
|
||||
//}
|
||||
//else {
|
||||
// System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d",
|
||||
// key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) );
|
||||
//}
|
||||
|
||||
//return newQualityByte;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop over the list of qualities and overwrite the newly recalibrated score to be the original score if it was less than some threshold
|
||||
* @param originalQuals The list of original base quality scores
|
||||
* @param recalQuals A list of the new recalibrated quality scores
|
||||
*/
|
||||
private void preserveQScores( final byte[] originalQuals, final byte[] recalQuals ) {
|
||||
for( int iii = 0; iii < recalQuals.length; iii++ ) {
|
||||
if( originalQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ) { //BUGBUG: used to be Q5 now is Q6, probably doesn't matter
|
||||
recalQuals[iii] = originalQuals[iii];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -25,8 +25,11 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.NGSPlatform;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
|
|
@ -48,6 +51,11 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start
|
||||
public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end
|
||||
|
||||
// Base Quality Score Recalibrator specific attribute tags
|
||||
public static final String BQSR_BASE_INSERTION_QUALITIES = "BI";
|
||||
public static final String BQSR_BASE_DELETION_QUALITIES = "BD";
|
||||
public static final String BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG = "BR";
|
||||
|
||||
// the SAMRecord data we're caching
|
||||
private String mReadString = null;
|
||||
private GATKSAMReadGroupRecord mReadGroup = null;
|
||||
|
|
@ -155,6 +163,62 @@ public class GATKSAMRecord extends BAMRecord {
|
|||
return super.equals(o);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public byte[] getBaseQualities() {
|
||||
return super.getBaseQualities();
|
||||
/*
|
||||
if( getAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG ) != null ) {
|
||||
return super.getBaseQualities();
|
||||
} else {
|
||||
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
|
||||
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
|
||||
final byte[] quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, super.getBaseQualities() );
|
||||
setBaseQualities(quals);
|
||||
setAttribute( BQSR_BASES_HAVE_BEEN_RECALIBRATED_TAG, true );
|
||||
return quals;
|
||||
} else { // just use the qualities that are in the read since we don't have the sufficient information to recalibrate on the fly
|
||||
return super.getBaseQualities();
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
/**
|
||||
* Accessors for base insertion and base deletion quality scores
|
||||
*/
|
||||
public byte[] getBaseInsertionQualities() {
|
||||
byte[] quals = getByteArrayAttribute( BQSR_BASE_INSERTION_QUALITIES );
|
||||
if( quals == null ) {
|
||||
quals = new byte[getBaseQualities().length];
|
||||
Arrays.fill(quals, (byte) 45); // allow for differing default values between BaseInsertions and BaseDeletions
|
||||
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
|
||||
// else give default values which are flat Q45
|
||||
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
|
||||
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_INSERTION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
|
||||
}
|
||||
// add the qual array to the read so that we don't have to do the recalibration work again
|
||||
setAttribute( BQSR_BASE_INSERTION_QUALITIES, quals );
|
||||
}
|
||||
return quals;
|
||||
}
|
||||
|
||||
public byte[] getBaseDeletionQualities() {
|
||||
byte[] quals = getByteArrayAttribute( BQSR_BASE_DELETION_QUALITIES );
|
||||
if( quals == null ) {
|
||||
quals = new byte[getBaseQualities().length];
|
||||
Arrays.fill(quals, (byte) 45);
|
||||
// if the recal data was populated in the engine then recalibrate the quality scores on the fly
|
||||
// else give default values which are flat Q45
|
||||
if( GenomeAnalysisEngine.hasBaseRecalibration() ) {
|
||||
quals = GenomeAnalysisEngine.getBaseRecalibration().recalibrateRead( this, quals, BaseRecalibration.BaseRecalibrationType.BASE_DELETION ); // the original quals here are the flat base insertion/deletion quals, NOT the original base qualities
|
||||
}
|
||||
// add the qual array to the read so that we don't have to do the recalibration work again
|
||||
setAttribute( BQSR_BASE_DELETION_QUALITIES, quals );
|
||||
}
|
||||
return quals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Efficient caching accessor that returns the GATK NGSPlatform of this read
|
||||
* @return
|
||||
|
|
|
|||
|
|
@ -920,6 +920,9 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
}
|
||||
|
||||
public void validateReferenceBases(Allele reference, Byte paddedRefBase) {
|
||||
if ( reference == null )
|
||||
return;
|
||||
|
||||
// don't validate if we're a complex event
|
||||
if ( !isComplexIndel() && !reference.isNull() && !reference.basesMatch(getReference()) ) {
|
||||
throw new TribbleException.InternalCodecException(String.format("the REF allele is incorrect for the record at position %s:%d, fasta says %s vs. VCF says %s", getChr(), getStart(), reference.getBaseString(), getReference().getBaseString()));
|
||||
|
|
@ -963,6 +966,9 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
}
|
||||
|
||||
public void validateChromosomeCounts() {
|
||||
if ( !hasGenotypes() )
|
||||
return;
|
||||
|
||||
// AN
|
||||
if ( hasAttribute(VCFConstants.ALLELE_NUMBER_KEY) ) {
|
||||
int reportedAN = Integer.valueOf(getAttribute(VCFConstants.ALLELE_NUMBER_KEY).toString());
|
||||
|
|
@ -993,7 +999,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
|
|||
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag doesn't have the correct number of values for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedACs.size(), observedACs.size()));
|
||||
for (int i = 0; i < observedACs.size(); i++) {
|
||||
if ( Integer.valueOf(reportedACs.get(i).toString()) != observedACs.get(i) )
|
||||
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %d vs. %d", getChr(), getStart(), reportedACs.get(i), observedACs.get(i)));
|
||||
throw new TribbleException.InternalCodecException(String.format("the Allele Count (AC) tag is incorrect for the record at position %s:%d, %s vs. %d", getChr(), getStart(), reportedACs.get(i), observedACs.get(i)));
|
||||
}
|
||||
} else {
|
||||
if ( observedACs.size() != 1 )
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ import org.testng.annotations.DataProvider;
|
|||
import org.testng.annotations.Test;
|
||||
|
||||
public class GATKReportUnitTest extends BaseTest {
|
||||
@Test
|
||||
@Test(enabled = false)
|
||||
public void testParse() throws Exception {
|
||||
String reportPath = validationDataLocation + "exampleGATKReport.eval";
|
||||
GATKReport report = new GATKReport(reportPath);
|
||||
|
|
@ -49,23 +49,23 @@ public class GATKReportUnitTest extends BaseTest {
|
|||
|
||||
@DataProvider(name = "rightAlignValues")
|
||||
public Object[][] getRightAlignValues() {
|
||||
return new Object[][] {
|
||||
new Object[] {null, true},
|
||||
new Object[] {"null", true},
|
||||
new Object[] {"NA", true},
|
||||
new Object[] {"0", true},
|
||||
new Object[] {"0.0", true},
|
||||
new Object[] {"-0", true},
|
||||
new Object[] {"-0.0", true},
|
||||
new Object[] {String.valueOf(Long.MAX_VALUE), true},
|
||||
new Object[] {String.valueOf(Long.MIN_VALUE), true},
|
||||
new Object[] {String.valueOf(Float.MIN_NORMAL), true},
|
||||
new Object[] {String.valueOf(Double.MAX_VALUE), true},
|
||||
new Object[] {String.valueOf(Double.MIN_VALUE), true},
|
||||
new Object[] {String.valueOf(Double.POSITIVE_INFINITY), true},
|
||||
new Object[] {String.valueOf(Double.NEGATIVE_INFINITY), true},
|
||||
new Object[] {String.valueOf(Double.NaN), true},
|
||||
new Object[] {"hello", false}
|
||||
return new Object[][]{
|
||||
new Object[]{null, true},
|
||||
new Object[]{"null", true},
|
||||
new Object[]{"NA", true},
|
||||
new Object[]{"0", true},
|
||||
new Object[]{"0.0", true},
|
||||
new Object[]{"-0", true},
|
||||
new Object[]{"-0.0", true},
|
||||
new Object[]{String.valueOf(Long.MAX_VALUE), true},
|
||||
new Object[]{String.valueOf(Long.MIN_VALUE), true},
|
||||
new Object[]{String.valueOf(Float.MIN_NORMAL), true},
|
||||
new Object[]{String.valueOf(Double.MAX_VALUE), true},
|
||||
new Object[]{String.valueOf(Double.MIN_VALUE), true},
|
||||
new Object[]{String.valueOf(Double.POSITIVE_INFINITY), true},
|
||||
new Object[]{String.valueOf(Double.NEGATIVE_INFINITY), true},
|
||||
new Object[]{String.valueOf(Double.NaN), true},
|
||||
new Object[]{"hello", false}
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -73,4 +73,96 @@ public class GATKReportUnitTest extends BaseTest {
|
|||
public void testIsRightAlign(String value, boolean expected) {
|
||||
Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGATKReportGatherer() {
|
||||
|
||||
/*
|
||||
GATKReportTable actual1 = new GATKReportTable("TableName", "Description");
|
||||
actual1.addPrimaryKey("key");
|
||||
actual1.addColumn("colA", 0);
|
||||
actual1.addColumn("colB", 0);
|
||||
actual1.set("row1", "colA", 1);
|
||||
actual1.set("row1", "colB", 2);
|
||||
|
||||
GATKReportTable actual2 = new GATKReportTable("TableName", "Description");
|
||||
actual2.addPrimaryKey("key");
|
||||
actual2.addColumn("colA", 0);
|
||||
actual2.addColumn("colB", 0);
|
||||
actual2.set("row2", "colA", 3);
|
||||
actual2.set("row2", "colB", 4);
|
||||
|
||||
GATKReportTable actual3 = new GATKReportTable("TableName", "Description");
|
||||
actual3.addPrimaryKey("key");
|
||||
actual3.addColumn("colA", 0);
|
||||
actual3.addColumn("colB", 0);
|
||||
actual3.set("row3", "colA", 5);
|
||||
actual3.set("row3", "colB", 6);
|
||||
|
||||
actual1.mergeRows(actual2);
|
||||
actual1.mergeRows(actual3);
|
||||
actual1.write(System.out);
|
||||
*/
|
||||
|
||||
GATKReportTable expected = new GATKReportTable("TableName", "Description");
|
||||
expected.addPrimaryKey("key");
|
||||
expected.addColumn("colA", 0);
|
||||
expected.addColumn("colB", 0);
|
||||
expected.set("row1", "colA", 1);
|
||||
expected.set("row1", "colB", 2);
|
||||
expected.set("row2", "colA", 3);
|
||||
expected.set("row2", "colB", 4);
|
||||
expected.set("row3", "colA", 5);
|
||||
expected.set("row3", "colB", 6);
|
||||
expected.write(System.out);
|
||||
|
||||
GATKReport report1, report2, report3;
|
||||
report1 = new GATKReport();
|
||||
report1.addTable("TableName", "Description");
|
||||
report1.getTable("TableName").addPrimaryKey("key");
|
||||
report1.getTable("TableName").addColumn("colA", 0);
|
||||
report1.getTable("TableName").addColumn("colB", 0);
|
||||
report1.getTable("TableName").set("row1", "colA", 1);
|
||||
report1.getTable("TableName").set("row1", "colB", 2);
|
||||
|
||||
report2 = new GATKReport();
|
||||
report2.addTable("TableName", "Description");
|
||||
report2.getTable("TableName").addPrimaryKey("key");
|
||||
report2.getTable("TableName").addColumn("colA", 0);
|
||||
report2.getTable("TableName").addColumn("colB", 0);
|
||||
report2.getTable("TableName").set("row2", "colA", 3);
|
||||
report2.getTable("TableName").set("row2", "colB", 4);
|
||||
|
||||
report3 = new GATKReport();
|
||||
report3.addTable("TableName", "Description");
|
||||
report3.getTable("TableName").addPrimaryKey("key");
|
||||
report3.getTable("TableName").addColumn("colA", 0);
|
||||
report3.getTable("TableName").addColumn("colB", 0);
|
||||
report3.getTable("TableName").set("row3", "colA", 5);
|
||||
report3.getTable("TableName").set("row3", "colB", 6);
|
||||
|
||||
report1.combineWith(report2);
|
||||
report1.combineWith(report3);
|
||||
|
||||
report1.print(System.out);
|
||||
/*
|
||||
File a = new File("/home/roger/tbls/a.tbl");
|
||||
File b = new File("/home/roger/tbls/b.tbl");
|
||||
File c = new File("/home/roger/tbls/c.tbl");
|
||||
File out = new File("/home/roger/tbls/out.tbl");
|
||||
|
||||
|
||||
List<File> FileList = new ArrayList<File>();
|
||||
FileList.add(a);
|
||||
FileList.add(b);
|
||||
FileList.add(c);
|
||||
|
||||
GATKReportGatherer gatherer = new GATKReportGatherer();
|
||||
gatherer.gather(FileList, out);
|
||||
System.out.print(out);
|
||||
*/
|
||||
|
||||
//Assert.assertEquals(1,1);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -8,13 +8,12 @@ import org.broadinstitute.sting.BaseTest;
|
|||
import org.broadinstitute.sting.commandline.IntervalBinding;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.testng.Assert;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
|
@ -341,7 +340,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
|
|||
|
||||
@Test
|
||||
public void testGetContigLengths() {
|
||||
Map<String, Long> lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference));
|
||||
Map<String, Integer> lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference));
|
||||
Assert.assertEquals((long)lengths.get("chr1"), 247249719);
|
||||
Assert.assertEquals((long)lengths.get("chr2"), 242951149);
|
||||
Assert.assertEquals((long)lengths.get("chr3"), 199501827);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,88 @@
|
|||
package org.broadinstitute.sting.queue.qscripts.lib
|
||||
|
||||
import org.broadinstitute.sting.queue.QScript
|
||||
import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
|
||||
import scala.collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import java.io.PrintStream
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.SelectVariants
|
||||
|
||||
/**
 * Splits a VCF into chunks by scheduling one SelectVariants job per chunk.
 *
 * The script is meant to be run twice:
 *  - Without an interval list (-I), it only schedules a VCFExtractIntervals job that
 *    produces an interval list named after the input VCF (".vcf" swapped for
 *    ".intervals.list"). Pass that list back in with -I on the second run.
 *  - With an interval list, the list is cut into per-chunk interval files under
 *    java.io.tmpdir and one SelectVariants job is added for each of them, writing
 *    "<input>.chunk<N>.vcf".
 *
 * User: chartl
 * Date: 2/2/12
 */
class ChunkVCF extends QScript {

  @Input(shortName="V",fullName="VCF",doc="The VCF you want to chunk",required=true)
  var inVCF : File = _

  @Input(shortName="N",fullName="numEntriesInChunk",doc="The number of variants per chunk",required=true)
  var numEntries : Int = _

  // Explicitly optional: script() relies on this being null to trigger the
  // "create the interval list first" pass described in the doc string.
  @Input(shortName="I",fullName="Intervals",doc="The SNP interval list to chunk. If not provided, one will be created for you to provide in a second run.",required=false)
  var intervals : File = _

  @Input(fullName="preserveChromosomes",doc="Restrict chunks to one chromosome (smaller chunk at end of chromosome)",required=false)
  var preserve : Boolean = false

  @Input(fullName="reference",doc="The reference file",required=false)
  var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")

  @Input(fullName="samples",doc="A file of sample IDs to condense VCF file to",required=false)
  var extractSamples : File = _

  val tmpdir : File = System.getProperty("java.io.tmpdir")

  /**
   * Entry point: either schedules the interval extraction (no -I given) or walks the
   * interval list and schedules one SelectVariants job per chunk.
   */
  def script = {
    if ( intervals == null ) {
      // create an interval list from the VCF
      val ivals : File = swapExt(inVCF,".vcf",".intervals.list")
      // NOTE(review): meaning of the trailing 'false' flag is not visible here; kept as in the original call.
      val extract : VCFExtractIntervals = new VCFExtractIntervals(inVCF,ivals,false)
      add(extract)
    } else {
      var chunkNum = 1
      var numLinesInChunk = 0
      // contig of the chunk currently being written; set from the first interval read
      var chromosome : String = null
      var chunkFile : File = chunkIntervalFile(chunkNum)
      var chunkWriter = new PrintStream(chunkFile)
      // single pass over the list; the reader is closed even if a line is malformed
      val reader = new XReadLines(intervals)
      try {
        asScalaIterator(reader).foreach( interval => {
          val contig = interval.split(":")(0)
          if ( chromosome == null )
            chromosome = contig
          val crossedChromosome = preserve && ! contig.equals(chromosome)
          // '>=' so that a chunk holds exactly numEntries intervals, not numEntries + 1
          val chunkIsFull = numLinesInChunk >= numEntries
          // never emit an empty chunk (guards against numEntries <= 0)
          if ( numLinesInChunk > 0 && ( crossedChromosome || chunkIsFull ) ) {
            chunkWriter.close()
            addChunkSelect(chunkFile,chunkNum)
            chunkNum += 1
            numLinesInChunk = 0
            chromosome = contig
            chunkFile = chunkIntervalFile(chunkNum)
            chunkWriter = new PrintStream(chunkFile)
          }
          chunkWriter.println(interval)
          numLinesInChunk += 1
        })
      } finally {
        chunkWriter.close()
        reader.close()
      }
      // last (possibly partial) chunk
      if ( numLinesInChunk > 0 )
        addChunkSelect(chunkFile,chunkNum)
    }
  }

  /** Temporary interval-list file that holds the intervals of chunk number chunkNum. */
  private def chunkIntervalFile(chunkNum : Int) : File =
    new File(tmpdir,"ChunkVCF.chunk%d.intervals.list".format(chunkNum))

  /**
   * Schedules the SelectVariants job that extracts one chunk from the input VCF.
   *
   * @param chunkFile interval list restricting the job to this chunk
   * @param chunkNum  1-based chunk number, used in the output file name
   */
  private def addChunkSelect(chunkFile : File, chunkNum : Int) {
    val chunkSelect : SelectVariants = new SelectVariants
    chunkSelect.reference_sequence = ref
    chunkSelect.memoryLimit = 2
    // the VCF being chunked is the job's variant input
    chunkSelect.variant = inVCF
    chunkSelect.intervals :+= chunkFile
    if ( extractSamples != null )
      chunkSelect.sample_file :+= extractSamples
    chunkSelect.out = swapExt(inVCF,".vcf",".chunk%d.vcf".format(chunkNum))
    add(chunkSelect)
  }
}
|
||||
|
|
@ -6,7 +6,7 @@ import org.broadinstitute.sting.queue.library.ipf.vcf.VCFExtractIntervals
|
|||
import org.broadinstitute.sting.utils.text.XReadLines
|
||||
import collection.JavaConversions._
|
||||
import java.io._
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.VariantsToPed
|
||||
import org.broadinstitute.sting.queue.extensions.gatk.{SelectVariants, VariantsToPed}
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -31,11 +31,14 @@ class VcfToPed extends QScript {
|
|||
var intervals : File = _
|
||||
|
||||
@Argument(shortName="R",fullName="Ref",required=false,doc="Reference file")
|
||||
var ref : File = new File("/humgen/1kg/references/human_g1k_v37.fasta")
|
||||
var ref : File = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
|
||||
|
||||
@Argument(shortName="D",fullName="dbsnp",required=false,doc="dbsnp file")
|
||||
var dbsnp : File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.vcf")
|
||||
|
||||
@Argument(shortName="sf",fullName="sampleFile",required=false,doc="sample file")
|
||||
var samFile : File = _
|
||||
|
||||
val tmpdir : File = System.getProperty("java.io.tmpdir")
|
||||
|
||||
def script = {
|
||||
|
|
@ -59,14 +62,27 @@ class VcfToPed extends QScript {
|
|||
val toPed : VariantsToPed = new VariantsToPed
|
||||
toPed.memoryLimit = 2
|
||||
toPed.reference_sequence = ref
|
||||
toPed.intervals :+= new File(subListFile)
|
||||
toPed.intervals :+= subListFile
|
||||
toPed.dbsnp = dbsnp
|
||||
toPed.variant = variants
|
||||
if ( samFile != null ) {
|
||||
val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
|
||||
val extract : SelectVariants = new SelectVariants
|
||||
extract.reference_sequence = ref
|
||||
extract.memoryLimit = 2
|
||||
extract.intervals :+= subListFile
|
||||
extract.variant = variants
|
||||
extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk))
|
||||
extract.sample_file :+= samFile
|
||||
add(extract)
|
||||
toPed.variant = extract.out
|
||||
} else {
|
||||
toPed.variant = variants
|
||||
}
|
||||
toPed.metaData = meta
|
||||
lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk)
|
||||
lazy val tBed = new File(tmpdir,base+".bed")
|
||||
lazy val bim = new File(tmpdir,base+".bim")
|
||||
lazy val fam = new File(tmpdir,base+".fam")
|
||||
val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
|
||||
val tBed = new File(tmpdir,base+".bed")
|
||||
val bim = new File(tmpdir,base+".bim")
|
||||
val fam = new File(tmpdir,base+".fam")
|
||||
toPed.bed = tBed
|
||||
toPed.bim = bim
|
||||
toPed.fam = fam
|
||||
|
|
@ -87,12 +103,26 @@ class VcfToPed extends QScript {
|
|||
toPed.reference_sequence = ref
|
||||
toPed.intervals :+= new File(subListFile)
|
||||
toPed.dbsnp = dbsnp
|
||||
toPed.variant = variants
|
||||
if ( samFile != null ) {
|
||||
val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
|
||||
val extract : SelectVariants = new SelectVariants
|
||||
extract.reference_sequence = ref
|
||||
extract.memoryLimit = 2
|
||||
extract.intervals :+= subListFile
|
||||
extract.variant = variants
|
||||
extract.out = new File(tmpdir,base+"_extract%d.vcf".format(chunk))
|
||||
extract.sample_file :+= samFile
|
||||
add(extract)
|
||||
toPed.variant = extract.out
|
||||
} else {
|
||||
toPed.variant = variants
|
||||
}
|
||||
toPed.metaData = meta
|
||||
lazy val base : String = bed.getName.stripSuffix(".bed")+"_%".format(chunk)
|
||||
lazy val tBed = new File(tmpdir,base+".bed")
|
||||
lazy val bim = new File(tmpdir,base+".bim")
|
||||
lazy val fam = new File(tmpdir,base+".fam")
|
||||
toPed.memoryLimit = 2
|
||||
val base : String = bed.getName.stripSuffix(".bed")+"_%d".format(chunk)
|
||||
val tBed = new File(tmpdir,base+".bed")
|
||||
val bim = new File(tmpdir,base+".bim")
|
||||
val fam = new File(tmpdir,base+".fam")
|
||||
toPed.bed = tBed
|
||||
toPed.bim = bim
|
||||
toPed.fam = fam
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ package org.broadinstitute.sting.queue.extensions.gatk
|
|||
|
||||
import java.io.File
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||
import org.broadinstitute.sting.utils.interval.{IntervalMergingRule, IntervalUtils}
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
|
||||
import net.sf.samtools.SAMFileHeader
|
||||
import java.util.Collections
|
||||
|
|
@ -50,6 +50,8 @@ case class GATKIntervals(reference: File, intervals: Seq[String]) {
|
|||
IntervalUtils.parseIntervalArguments(parser, intervals)
|
||||
Collections.sort(parsedLocs)
|
||||
Collections.unmodifiableList(parsedLocs)
|
||||
val mergedLocs = IntervalUtils.mergeIntervalLocations(parsedLocs, IntervalMergingRule.OVERLAPPING_ONLY)
|
||||
Collections.unmodifiableList(mergedLocs)
|
||||
}
|
||||
|
||||
lazy val contigs = locs.map(_.getContig).distinct.toSeq
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
|
|||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile
|
||||
import org.broadinstitute.sting.utils.{GenomeLocSortedSet, GenomeLocParser}
|
||||
import collection.JavaConversions._
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils
|
||||
|
||||
class GATKIntervalsUnitTest {
|
||||
private final lazy val hg18Reference = new File(BaseTest.hg18Reference)
|
||||
|
|
@ -60,7 +61,7 @@ class GATKIntervalsUnitTest {
|
|||
// for(Item item: javaConvertedScalaList)
|
||||
// This for loop is actually an O(N^2) operation as the iterator calls the
|
||||
// O(N) javaConvertedScalaList.size() for each iteration of the loop.
|
||||
//Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
|
||||
Assert.assertEquals(IntervalUtils.splitFixedIntervals(gi.locs, 189894).size(), 189894)
|
||||
Assert.assertEquals(gi.contigs.size, 24)
|
||||
}
|
||||
|
||||
|
|
@ -77,4 +78,17 @@ class GATKIntervalsUnitTest {
|
|||
Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1", "chr2", "chr3")).contigs, Seq("chr1", "chr2", "chr3"))
|
||||
Assert.assertEquals(new GATKIntervals(hg18Reference, Seq("chr1:1-2", "chr1:4-5", "chr2:1-1", "chr3:2-2")).contigs, Seq("chr1", "chr2", "chr3"))
|
||||
}
|
||||
|
||||
/**
 * Feeds several interval lists to GATKIntervals and checks the resulting locs:
 * duplicates and overlapping intervals are expected to collapse into one,
 * merely adjacent intervals are expected to stay separate, and unsorted input
 * is expected to come back sorted.
 */
@Test
def testSortAndMergeIntervals() {
  // Each pair is (intervals handed to GATKIntervals, locs expected back).
  val cases = Seq(
    Seq("chr1:1-10", "chr1:1-10", "chr1:1-10") -> Seq("chr1:1-10"),
    Seq("chr1:1-10", "chr1:1-11", "chr1:1-12") -> Seq("chr1:1-12"),
    Seq("chr1:1-10", "chr1:11-20", "chr1:21-30") -> Seq("chr1:1-10", "chr1:11-20", "chr1:21-30"),
    Seq("chr1:1-10", "chr1:10-20", "chr1:21-30") -> Seq("chr1:1-20", "chr1:21-30"),
    Seq("chr1:1-10", "chr1:21-30", "chr1:10-20") -> Seq("chr1:1-20", "chr1:21-30")
  )
  cases.foreach { case (input, merged) => testSortAndMergeIntervals(input, merged) }
}
|
||||
|
||||
/**
 * Asserts that GATKIntervals built from the given interval strings (against the
 * hg18 reference) exposes exactly the expected locs.
 *
 * @param actual   interval strings passed to GATKIntervals
 * @param expected interval strings that, once parsed, must equal the resulting locs
 */
private def testSortAndMergeIntervals(actual: Seq[String], expected: Seq[String]) {
  val observedLocs = new GATKIntervals(hg18Reference, actual).locs.toSeq
  val expectedLocs = expected.map(loc => hg18GenomeLocParser.parseGenomeLoc(loc))
  Assert.assertEquals(observedLocs, expectedLocs)
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue