diff --git a/build.xml b/build.xml
index 1df75cd1d..d3e25d424 100644
--- a/build.xml
+++ b/build.xml
@@ -47,6 +47,7 @@
+
@@ -567,6 +568,7 @@
+
@@ -615,6 +617,9 @@
+
+
+
@@ -879,6 +884,7 @@
+
diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
index 46bbf7eda..876cf5cbc 100644
--- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
@@ -4,11 +4,9 @@
colnames(d) = tableHeader;
for (i in 1:ncol(d)) {
- v = suppressWarnings(as.numeric(d[,i]));
-
- if (length(na.omit(as.numeric(v))) == length(d[,i])) {
- d[,i] = v;
- }
+ # use the general type.convert infrastructure of read.table to convert column data to R types
+ v = type.convert(d[,i])
+ d[,i] = v;
}
usedNames = ls(envir=tableEnv, pattern=tableName);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
index 32002e093..e5aaf2338 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
@@ -35,9 +35,12 @@ import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.walkers.Walker;
-import org.broadinstitute.sting.utils.classloader.JVMUtils;
+import org.broadinstitute.sting.utils.crypt.CryptUtils;
+import org.broadinstitute.sting.utils.crypt.GATKKey;
+import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.text.ListFileUtils;
+import java.security.PublicKey;
import java.util.*;
/**
@@ -78,6 +81,9 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
Walker,?> walker = engine.getWalkerByName(getAnalysisName());
try {
+ // Make sure a valid GATK user key is present, if required.
+ authorizeGATKRun();
+
engine.setArguments(getArgumentCollection());
// File lists can require a bit of additional expansion. Set these explicitly by the engine.
@@ -130,6 +136,28 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
return 0;
}
+ /**
+ * Authorizes this run of the GATK by checking for a valid GATK user key, if required.
+ * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options.
+ */
+ private void authorizeGATKRun() {
+ if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET ||
+ getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) {
+ if ( getArgumentCollection().gatkKeyFile == null ) {
+ throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " +
+ "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " +
+ "for more information and instructions on how to obtain a key.");
+ }
+ else {
+ PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey();
+ GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile);
+
+ if ( ! gatkUserKey.isValid() ) {
+ throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile);
+ }
+ }
+ }
+ }
/**
* Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 8ec707801..02d211a0c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -65,9 +65,12 @@ public class GATKArgumentCollection {
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null;
- @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? Standard is the default, can be verbose or NO_ET so nothing is posted to the run repository", required = false)
+ @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
+ @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false)
+ public File gatkKeyFile = null;
+
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
public List readFilters = new ArrayList();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
index e8627ef4c..f1f74069f 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
@@ -154,9 +154,7 @@ public class GATKRunReport {
/** Standard option. Writes to local repository if it can be found, or S3 otherwise */
STANDARD,
/** Force output to STDOUT. For debugging only */
- STDOUT,
- /** Force output to S3. For debugging only */
- AWS_S3 // todo -- remove me -- really just for testing purposes
+ STDOUT
}
private static final DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss");
@@ -239,11 +237,8 @@ public class GATKRunReport {
case STDOUT:
postReportToStream(System.out);
break;
- case AWS_S3:
- postReportToAWSS3();
- break;
default:
- exceptDuringRunReport("BUG: unexcepted PhoneHomeOption ");
+ exceptDuringRunReport("BUG: unexpected PhoneHomeOption ");
break;
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
index 3f555f780..00968943d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java
@@ -43,15 +43,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar
final ArrayList altQuals = new ArrayList();
if ( vc.isSNP() ) {
+ final List altAlleles = new ArrayList();
+ for ( final Allele a : vc.getAlternateAlleles() )
+ altAlleles.add(a.getBases()[0]);
+
for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) {
final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName());
if ( context == null )
continue;
- final List altAlleles = new ArrayList();
- for ( final Allele a : vc.getAlternateAlleles() )
- altAlleles.add(a.getBases()[0]);
-
fillQualsFromPileup(ref.getBase(), altAlleles, context.getBasePileup(), refQuals, altQuals);
}
} else if ( vc.isIndel() || vc.isMixed() ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java
index 89a30e4f5..a1ab73341 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java
@@ -26,10 +26,12 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.util.Arrays;
+import java.util.BitSet;
/**
* Created by IntelliJ IDEA.
@@ -43,10 +45,6 @@ public class ContextCovariate implements StandardCovariate {
private int insertionsContextSize;
private int deletionsContextSize;
- private String mismatchesNoContext = "";
- private String insertionsNoContext = "";
- private String deletionsNoContext = "";
-
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
public void initialize(final RecalibrationArgumentCollection RAC) {
@@ -57,29 +55,26 @@ public class ContextCovariate implements StandardCovariate {
if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0)
throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize));
- // initialize no context strings given the size of the context for each covariate type
- mismatchesNoContext = makeAllNStringWithLength(mismatchesContextSize);
- insertionsNoContext = makeAllNStringWithLength(insertionsContextSize);
- deletionsNoContext = makeAllNStringWithLength( deletionsContextSize);
}
@Override
public CovariateValues getValues(final GATKSAMRecord read) {
int l = read.getReadLength();
- String[] mismatches = new String [l];
- String[] insertions = new String [l];
- String[] deletions = new String [l];
+ BitSet[] mismatches = new BitSet[l];
+ BitSet[] insertions = new BitSet[l];
+ BitSet[] deletions = new BitSet[l];
final boolean negativeStrand = read.getReadNegativeStrandFlag();
byte[] bases = read.getReadBases();
- if (negativeStrand) {
- bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place
- }
+ if (negativeStrand)
+ bases = BaseUtils.simpleReverseComplement(bases);
+
for (int i = 0; i < read.getReadLength(); i++) {
- mismatches[i] = contextWith(bases, i, mismatchesContextSize, mismatchesNoContext);
- insertions[i] = contextWith(bases, i, insertionsContextSize, insertionsNoContext);
- deletions[i] = contextWith(bases, i, deletionsContextSize, deletionsNoContext);
+ mismatches[i] = contextWith(bases, i, mismatchesContextSize);
+ insertions[i] = contextWith(bases, i, insertionsContextSize);
+ deletions[i] = contextWith(bases, i, deletionsContextSize);
}
+
if (negativeStrand) {
reverse(mismatches);
reverse(insertions);
@@ -90,7 +85,7 @@ public class ContextCovariate implements StandardCovariate {
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
- public final Comparable getValue(final String str) {
+ public final Object getValue(final String str) {
return str;
}
@@ -100,29 +95,28 @@ public class ContextCovariate implements StandardCovariate {
* @param bases the bases in the read to build the context from
* @param offset the position in the read to calculate the context for
* @param contextSize context size to use building the context
- * @param noContextString string to return if the position is not far enough in the read to have a full context before.
* @return
*/
- private String contextWith(byte [] bases, int offset, int contextSize, String noContextString) {
- return (offset < contextSize) ? noContextString : new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
+ private BitSet contextWith(byte [] bases, int offset, int contextSize) {
+ if (offset < contextSize)
+ return null;
+
+ String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset));
+ if (context.contains("N"))
+ return null;
+
+ return MathUtils.bitSetFrom(context);
}
-
- private String makeAllNStringWithLength(int length) {
- String s = "";
- for (int i=0; i readGroupLookupTable = new HashMap();
+ private final HashMap readGroupReverseLookupTable = new HashMap();
+ private short nextId = 0;
// Initialize any member variables using the command-line arguments passed to the walkers
@Override
@@ -48,16 +53,29 @@ public class ReadGroupCovariate implements RequiredCovariate {
public CovariateValues getValues(final GATKSAMRecord read) {
final int l = read.getReadLength();
final String readGroupId = read.getReadGroup().getReadGroupId();
- String [] readGroups = new String[l];
- Arrays.fill(readGroups, readGroupId);
+ short shortId;
+ if (readGroupLookupTable.containsKey(readGroupId))
+ shortId = readGroupLookupTable.get(readGroupId);
+ else {
+ shortId = nextId;
+ readGroupLookupTable.put(readGroupId, nextId);
+ readGroupReverseLookupTable.put(nextId, readGroupId);
+ nextId++;
+ }
+ Short [] readGroups = new Short[l];
+ Arrays.fill(readGroups, shortId);
return new CovariateValues(readGroups, readGroups, readGroups);
}
// Used to get the covariate's value from input csv file during on-the-fly recalibration
@Override
- public final Comparable getValue(final String str) {
+ public final Object getValue(final String str) {
return str;
}
+
+ public final String decodeReadGroup(final short id) {
+ return readGroupReverseLookupTable.get(id);
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
index 38e7051e4..cc6f67cc9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java
@@ -25,8 +25,14 @@
package org.broadinstitute.sting.gatk.walkers.bqsr;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Hidden;
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer;
+
+import java.io.PrintStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
/**
* Created by IntelliJ IDEA.
@@ -39,6 +45,63 @@ import org.broadinstitute.sting.commandline.Hidden;
public class RecalibrationArgumentCollection {
+ /**
+ * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference,
+ * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.)
+ * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites.
+ * Please note however that the statistics reported by the tool will not accurately reflect those sites skipped by the -XL argument.
+ */
+ @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false)
+ protected List> knownSites = Collections.emptyList();
+
+ /**
+ * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+ * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+ * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
+ * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+ */
+ @Gather(CountCovariatesGatherer.class)
+ @Output
+ protected PrintStream RECAL_FILE;
+
+ /**
+ * List all implemented covariates.
+ */
+ @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false)
+ protected boolean LIST_ONLY = false;
+
+ /**
+ * Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you. See the list of covariates with -list.
+ */
+ @Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false)
+ protected String[] COVARIATES = null;
+
+ /*
+ * Use the standard set of covariates in addition to the ones listed using the -cov argument
+ */
+ @Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false)
+ protected boolean USE_STANDARD_COVARIATES = true;
+
+ /////////////////////////////
+ // Debugging-only Arguments
+ /////////////////////////////
+ /**
+ * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
+ */
+ @Hidden
+ @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
+ protected boolean RUN_WITHOUT_DBSNP = false;
+
+ /////////////////////////////
+ // protected Member Variables
+ /////////////////////////////
+ protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables)
+ protected final ArrayList requestedCovariates = new ArrayList();// A list to hold the covariate objects that were requested
+
+ protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped.
+ protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed.
+
+
/**
* CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
new file mode 100644
index 000000000..d7a48d321
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.diagnostics;
+
+import net.sf.samtools.SAMReadGroupRecord;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.report.GATKReport;
+import org.broadinstitute.sting.gatk.report.GATKReportTable;
+import org.broadinstitute.sting.gatk.walkers.ReadWalker;
+import org.broadinstitute.sting.utils.Median;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+import java.io.PrintStream;
+import java.text.DateFormat;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Emits a GATKReport containing read group, sample, library, platform, center, sequencing date,
+ * paired end status, simple read type name (e.g. 2x76), median insert size and median read length
+ * for each read group in every provided BAM file
+ *
+ * Note that this walker stops when all read groups have been observed at least a few thousand times so that
+ * the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate
+ * timeframe. It is safe to run it whole-genome and it'll finish in an appropriate
+ *
+ *
Input
+ *
+ * Any number of BAM files
+ *
+ *
+ *
Output
+ *
+ * GATKReport containing read group, sample, library, platform, center, median insert size and median read length.
+ *
+ * For example, running this tool on the NA12878 data sets:
+ *
+ *
+ *
+ * @author Mark DePristo
+ */
+
+
+
+public class ReadGroupProperties extends ReadWalker {
+ @Output
+ public PrintStream out;
+
+ @Argument(shortName="maxElementsForMedian", doc="Calculate median from the first maxElementsForMedian values observed", required=false)
+ public int MAX_VALUES_FOR_MEDIAN = 10000;
+
+ private final static String TABLE_NAME = "ReadGroupProperties";
+ private final Map readGroupInfo = new HashMap();
+
+ private class PerReadGroupInfo {
+ public final Median readLength = new Median(MAX_VALUES_FOR_MEDIAN);
+ public final Median insertSize = new Median(MAX_VALUES_FOR_MEDIAN);
+ public int nReadsSeen = 0, nReadsPaired = 0;
+
+ public boolean needsMoreData() {
+ return ! readLength.isFull() || ! insertSize.isFull();
+ }
+ }
+
+ @Override
+ public void initialize() {
+ for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
+ readGroupInfo.put(rg.getId(), new PerReadGroupInfo());
+ }
+ }
+
+ @Override
+ public boolean filter(ReferenceContext ref, GATKSAMRecord read) {
+ return ! (read.getReadFailsVendorQualityCheckFlag() || read.getReadUnmappedFlag());
+ }
+
+ @Override
+ public boolean isDone() {
+ for ( PerReadGroupInfo info : readGroupInfo.values() ) {
+ if ( info.needsMoreData() )
+ return false;
+ }
+
+ return true;
+ }
+
+ @Override
+ public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) {
+ final String rgID = read.getReadGroup().getId();
+ final PerReadGroupInfo info = readGroupInfo.get(rgID);
+
+ if ( info.needsMoreData() ) {
+ info.readLength.add(read.getReadLength());
+ info.nReadsSeen++;
+ if ( read.getReadPairedFlag() ) {
+ info.nReadsPaired++;
+ if ( read.getInferredInsertSize() != 0) {
+ info.insertSize.add(Math.abs(read.getInferredInsertSize()));
+ }
+ }
+ }
+
+ return null;
+ }
+
+ @Override
+ public Integer reduceInit() {
+ return null;
+ }
+
+ @Override
+ public Integer reduce(Integer integer, Integer integer1) {
+ return null;
+ }
+
+ @Override
+ public void onTraversalDone(Integer sum) {
+ final GATKReport report = new GATKReport();
+ report.addTable(TABLE_NAME, "Table of read group properties");
+ GATKReportTable table = report.getTable(TABLE_NAME);
+ DateFormat dateFormatter = DateFormat.getDateInstance(DateFormat.SHORT);
+
+ table.addPrimaryKey("readgroup");
+ //* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and
+ //* median read length for each read group in every BAM file.
+ table.addColumn("sample", "NA");
+ table.addColumn("library", "NA");
+ table.addColumn("platform", "NA");
+ table.addColumn("center", "NA");
+ table.addColumn("date", "NA");
+ table.addColumn("has.any.reads", "false");
+ table.addColumn("is.paired.end", "false");
+ table.addColumn("n.reads.analyzed", "NA");
+ table.addColumn("simple.read.type", "NA");
+ table.addColumn("median.read.length", Integer.valueOf(0));
+ table.addColumn("median.insert.size", Integer.valueOf(0));
+
+ for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) {
+ final String rgID = rg.getId();
+ PerReadGroupInfo info = readGroupInfo.get(rgID);
+
+ // we are paired if > 25% of reads are paired
+ final boolean isPaired = info.nReadsPaired / (1.0 * (info.nReadsSeen+1)) > 0.25;
+ final boolean hasAnyReads = info.nReadsSeen > 0;
+ final int readLength = info.readLength.getMedian(0);
+
+ setTableValue(table, rgID, "sample", rg.getSample());
+ setTableValue(table, rgID, "library", rg.getLibrary());
+ setTableValue(table, rgID, "platform", rg.getPlatform());
+ setTableValue(table, rgID, "center", rg.getSequencingCenter());
+ try {
+ setTableValue(table, rgID, "date", rg.getRunDate() != null ? dateFormatter.format(rg.getRunDate()) : "NA");
+ } catch ( NullPointerException e ) {
+ // TODO: remove me when bug in Picard is fixed that causes NPE when date isn't present
+ setTableValue(table, rgID, "date", "NA");
+ }
+ setTableValue(table, rgID, "has.any.reads", hasAnyReads);
+ setTableValue(table, rgID, "is.paired.end", isPaired);
+ setTableValue(table, rgID, "n.reads.analyzed", info.nReadsSeen);
+ setTableValue(table, rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA");
+ setTableValue(table, rgID, "median.read.length", hasAnyReads ? readLength : "NA" );
+ setTableValue(table, rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" );
+ }
+
+ report.print(out);
+ }
+
+ private final void setTableValue(GATKReportTable table, final String rgID, final String key, final Object value) {
+ table.set(rgID, key, value == null ? "NA" : value);
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
index 200a250f2..26023bd2f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
@@ -454,8 +454,7 @@ public class HaplotypeIndelErrorModel {
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2)
// First term is approximated by Jacobian log with table lookup.
// Second term is a constant added to both likelihoods so will be ignored
- haplotypeLikehoodMatrix[i][j] += MathUtils.softMax(readLikelihood[0],
- readLikelihood[1]);
+ haplotypeLikehoodMatrix[i][j] += MathUtils.approximateLog10SumLog10(readLikelihood[0], readLikelihood[1]);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
index 6410d619d..64993b43a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java
@@ -166,18 +166,17 @@ public class PairHMMIndelErrorModel {
final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual];
- matchMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead,
- YMetricArray[im1][jm1] + pBaseRead);
+ matchMetricArray[indI][indJ] = pBaseRead + MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[im1][jm1], XMetricArray[im1][jm1], YMetricArray[im1][jm1]});
final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
- XMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1);
+ XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1);
// update Y array
final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1];
final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1];
- YMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2);
+ YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2);
}
}
@@ -316,9 +315,7 @@ public class PairHMMIndelErrorModel {
final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1;
- final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ],
- XMetricArray[bestI][bestJ],
- YMetricArray[bestI][bestJ]);
+ final double bestMetric = MathUtils.approximateLog10SumLog10(new double[]{ matchMetricArray[bestI][bestJ], XMetricArray[bestI][bestJ], YMetricArray[bestI][bestJ] });
/*
if (DEBUG) {
@@ -651,7 +648,7 @@ public class PairHMMIndelErrorModel {
private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) {
final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes];
- // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix
+ // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix
for (int i=0; i < numHaplotypes; i++) {
for (int j=i; j < numHaplotypes; j++){
// combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j]
@@ -665,7 +662,7 @@ public class PairHMMIndelErrorModel {
final double li = readLikelihoods[readIdx][i];
final double lj = readLikelihoods[readIdx][j];
final int readCount = readCounts[readIdx];
- haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF);
+ haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + LOG_ONE_HALF);
}
}
}
@@ -678,7 +675,7 @@ public class PairHMMIndelErrorModel {
}
}
- // renormalize so that max element is zero.
+ // renormalize so that max element is zero.
return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
index b27bef265..e812fb53a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
@@ -110,6 +110,13 @@ public class ValidationAmplicons extends RodWalker {
@Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false)
boolean lowerCaseSNPs = false;
+ /**
+ * If onlyOutputValidAmplicons is true, the output fasta file will contain only valid sequences.
+ * Useful for producing delivery-ready files.
+ */
+ @Argument(doc="Only output valid sequences.",fullName="onlyOutputValidAmplicons",required=false)
+ boolean onlyOutputValidAmplicons = false;
+
/**
* BWA single-end alignment is used as a primer specificity proxy. Low-complexity regions (that don't align back to themselves as a best hit) are lowercased.
* This changes the size of the k-mer used for alignment.
@@ -486,14 +493,16 @@ public class ValidationAmplicons extends RodWalker {
valid = "Valid";
}
- String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
- if (!sequenomOutput)
- out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
- else {
- seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record
- probeName = probeName.replace("amplicon_","a");
- out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity);
+ if (!onlyOutputValidAmplicons || !sequenceInvalid) {
+ String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D');
+ if (!sequenomOutput)
+ out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity);
+ else {
+ seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record
+ probeName = probeName.replace("amplicon_","a");
+ out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity);
+ }
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
index 82776ca2e..6f0ae7c25 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java
@@ -140,15 +140,16 @@ public class GaussianMixtureModel {
}
for( final VariantDatum datum : data ) {
- final ArrayList pVarInGaussianLog10 = new ArrayList( gaussians.size() );
+ final double[] pVarInGaussianLog10 = new double[gaussians.size()];
+ int gaussianIndex = 0;
for( final MultivariateGaussian gaussian : gaussians ) {
final double pVarLog10 = gaussian.evaluateDatumLog10( datum );
- pVarInGaussianLog10.add( pVarLog10 );
+ pVarInGaussianLog10[gaussianIndex++] = pVarLog10;
}
- final double[] pVarInGaussianNormalized = MathUtils.normalizeFromLog10( pVarInGaussianLog10 );
- int iii = 0;
+ final double[] pVarInGaussianNormalized = MathUtils.normalizeFromLog10( pVarInGaussianLog10, false );
+ gaussianIndex = 0;
for( final MultivariateGaussian gaussian : gaussians ) {
- gaussian.assignPVarInGaussian( pVarInGaussianNormalized[iii++] ); //BUGBUG: to clean up
+ gaussian.assignPVarInGaussian( pVarInGaussianNormalized[gaussianIndex++] );
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
index 7cc5b1625..3cdcf4982 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java
@@ -372,6 +372,8 @@ public class VariantRecalibrator extends RodWalker= 0", "hapStartInRefCoords >= 0"})
+ public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) {
if( refAllele.length() != altAllele.length() ) { refInsertLocation++; }
- int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(refStart + numBasesAddedToStartOfHaplotype, haplotypeCigar, refInsertLocation);
+ int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(hapStartInRefCoords, haplotypeCigar, refInsertLocation);
if( haplotypeInsertLocation == -1 ) { // desired change falls inside deletion so don't bother creating a new haplotype
- return getBases().clone();
+ return bases.clone();
}
- haplotypeInsertLocation += numBasesAddedToStartOfHaplotype;
- final byte[] newHaplotype = getBases().clone();
+ byte[] newHaplotype;
try {
if( refAllele.length() == altAllele.length() ) { // SNP or MNP
+ newHaplotype = bases.clone();
for( int iii = 0; iii < altAllele.length(); iii++ ) {
newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii];
}
- } else if( refAllele.length() < altAllele.length() ) { // insertion
+ } else if( refAllele.length() < altAllele.length() ) { // insertion
final int altAlleleLength = altAllele.length();
+ newHaplotype = new byte[bases.length + altAlleleLength];
+ for( int iii = 0; iii < bases.length; iii++ ) {
+ newHaplotype[iii] = bases[iii];
+ }
for( int iii = newHaplotype.length - 1; iii > haplotypeInsertLocation + altAlleleLength - 1; iii-- ) {
newHaplotype[iii] = newHaplotype[iii-altAlleleLength];
}
@@ -138,24 +143,17 @@ public class Haplotype {
newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii];
}
} else { // deletion
- int refHaplotypeOffset = 0;
- for( final CigarElement ce : haplotypeCigar.getCigarElements()) {
- if(ce.getOperator() == CigarOperator.D) { refHaplotypeOffset += ce.getLength(); }
- else if(ce.getOperator() == CigarOperator.I) { refHaplotypeOffset -= ce.getLength(); }
- }
- for( int iii = 0; iii < altAllele.length(); iii++ ) {
- newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii];
- }
final int shift = refAllele.length() - altAllele.length();
- for( int iii = haplotypeInsertLocation + altAllele.length(); iii < newHaplotype.length - shift; iii++ ) {
- newHaplotype[iii] = newHaplotype[iii+shift];
+ newHaplotype = new byte[bases.length - shift];
+ for( int iii = 0; iii < haplotypeInsertLocation + altAllele.length(); iii++ ) {
+ newHaplotype[iii] = bases[iii];
}
- for( int iii = 0; iii < shift; iii++ ) {
- newHaplotype[iii+newHaplotype.length-shift] = paddedRef[refStart+refHaplotypeLength+refHaplotypeOffset+iii];
+ for( int iii = haplotypeInsertLocation + altAllele.length(); iii < newHaplotype.length; iii++ ) {
+ newHaplotype[iii] = bases[iii+shift];
}
}
} catch (Exception e) { // event already on haplotype is too large/complex to insert another allele, most likely because of not enough reference padding
- return getBases().clone();
+ return bases.clone();
}
return newHaplotype;
diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
index c9ab3b58e..a96cbffc5 100644
--- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
@@ -29,6 +29,7 @@ import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.math.BigDecimal;
@@ -40,14 +41,6 @@ import java.util.*;
* @author Kiran Garimella
*/
public class MathUtils {
- /** Public constants - used for the Lanczos approximation to the factorial function
- * (for the calculation of the binomial/multinomial probability in logspace)
- * @param LANC_SEQ[] - an array holding the constants which correspond to the product
- * of Chebyshev Polynomial coefficients, and points on the Gamma function (for interpolation)
- * [see A Precision Approximation of the Gamma Function J. SIAM Numer. Anal. Ser. B, Vol. 1 1964. pp. 86-96]
- * @param LANC_G - a value for the Lanczos approximation to the gamma function that works to
- * high precision
- */
/**
* Private constructor. No instantiating this class!
@@ -55,6 +48,28 @@ public class MathUtils {
private MathUtils() {
}
+ public static final double[] log10Cache;
+ private static final double[] jacobianLogTable;
+ private static final double JACOBIAN_LOG_TABLE_STEP = 0.001;
+ private static final double MAX_JACOBIAN_TOLERANCE = 10.0;
+ private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1;
+ private static final int MAXN = 11000;
+ private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients
+
+ static {
+ log10Cache = new double[LOG10_CACHE_SIZE];
+ jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE];
+
+ log10Cache[0] = Double.NEGATIVE_INFINITY;
+ for (int k = 1; k < LOG10_CACHE_SIZE; k++)
+ log10Cache[k] = Math.log10(k);
+
+ for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
+ jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP));
+
+ }
+ }
+
// A fast implementation of the Math.round() method. This method does not perform
// under/overflow checking, so this shouldn't be used in the general case (but is fine
// if one is already make those checks before calling in to the rounding).
@@ -535,7 +550,7 @@ public class MathUtils {
// all negative) the largest value; also, we need to convert to normal-space.
double maxValue = Utils.findMaxEntry(array);
- // we may decide to just normalize in log space with converting to linear space
+ // we may decide to just normalize in log space without converting to linear space
if (keepInLogSpace) {
for (int i = 0; i < array.length; i++)
array[i] -= maxValue;
@@ -562,29 +577,6 @@ public class MathUtils {
return normalized;
}
- public static double[] normalizeFromLog10(List array, boolean takeLog10OfOutput) {
- double[] normalized = new double[array.size()];
-
- // for precision purposes, we need to add (or really subtract, since they're
- // all negative) the largest value; also, we need to convert to normal-space.
- double maxValue = MathUtils.arrayMaxDouble(array);
- for (int i = 0; i < array.size(); i++)
- normalized[i] = Math.pow(10, array.get(i) - maxValue);
-
- // normalize
- double sum = 0.0;
- for (int i = 0; i < array.size(); i++)
- sum += normalized[i];
- for (int i = 0; i < array.size(); i++) {
- double x = normalized[i] / sum;
- if (takeLog10OfOutput)
- x = Math.log10(x);
- normalized[i] = x;
- }
-
- return normalized;
- }
-
/**
* normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE).
*
@@ -595,10 +587,6 @@ public class MathUtils {
return normalizeFromLog10(array, false);
}
- public static double[] normalizeFromLog10(List array) {
- return normalizeFromLog10(array, false);
- }
-
public static int maxElementIndex(final double[] array) {
return maxElementIndex(array, array.length);
}
@@ -1206,78 +1194,11 @@ public class MathUtils {
return ((double) num) / (Math.max(denom, 1));
}
- public static final double[] log10Cache;
- public static final double[] jacobianLogTable;
- public static final int JACOBIAN_LOG_TABLE_SIZE = 101;
- public static final double JACOBIAN_LOG_TABLE_STEP = 0.1;
- public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP;
- public static final double MAX_JACOBIAN_TOLERANCE = 10.0;
- private static final int MAXN = 11000;
- private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients
-
- static {
- log10Cache = new double[LOG10_CACHE_SIZE];
- jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE];
-
- log10Cache[0] = Double.NEGATIVE_INFINITY;
- for (int k = 1; k < LOG10_CACHE_SIZE; k++)
- log10Cache[k] = Math.log10(k);
-
- for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) {
- jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP));
-
- }
- }
-
- static public double softMax(final double[] vec) {
- double acc = vec[0];
- for (int k = 1; k < vec.length; k++)
- acc = softMax(acc, vec[k]);
-
- return acc;
-
- }
-
static public double max(double x0, double x1, double x2) {
double a = Math.max(x0, x1);
return Math.max(a, x2);
}
- static public double softMax(final double x0, final double x1, final double x2) {
- // compute naively log10(10^x[0] + 10^x[1]+...)
- // return Math.log10(MathUtils.sumLog10(vec));
-
- // better approximation: do Jacobian logarithm function on data pairs
- double a = softMax(x0, x1);
- return softMax(a, x2);
- }
-
- static public double softMax(final double x, final double y) {
- // we need to compute log10(10^x + 10^y)
- // By Jacobian logarithm identity, this is equal to
- // max(x,y) + log10(1+10^-abs(x-y))
- // we compute the second term as a table lookup
- // with integer quantization
-
- // slow exact version:
- // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y));
-
- double diff = x - y;
-
- if (diff > MAX_JACOBIAN_TOLERANCE)
- return x;
- else if (diff < -MAX_JACOBIAN_TOLERANCE)
- return y;
- else if (diff >= 0) {
- int ind = (int) (diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
- return x + jacobianLogTable[ind];
- }
- else {
- int ind = (int) (-diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5);
- return y + jacobianLogTable[ind];
- }
- }
-
public static double phredScaleToProbability(byte q) {
return Math.pow(10, (-q) / 10.0);
}
@@ -1619,30 +1540,118 @@ public class MathUtils {
* @param bitSet the bitset
* @return an integer with the bitset representation
*/
- public static int intFrom(final BitSet bitSet) {
- int integer = 0;
+ public static long intFrom(final BitSet bitSet) {
+ long number = 0;
for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1))
- integer |= 1 << bitIndex;
+ number |= 1L << bitIndex;
- return integer;
+ return number;
}
/**
* Creates a BitSet representation of a given integer
*
- * @param integer the number to turn into a bitset
+ * @param number the number to turn into a bitset
* @return a bitset representation of the integer
*/
- public static BitSet bitSetFrom(int integer) {
- BitSet bitSet = new BitSet((int) Math.ceil(Math.sqrt(integer)));
+ public static BitSet bitSetFrom(long number) {
+ BitSet bitSet = new BitSet();
int bitIndex = 0;
- while (integer > 0) {
- if (integer%2 > 0)
+ while (number > 0) {
+ if (number%2 > 0)
bitSet.set(bitIndex);
bitIndex++;
- integer /= 2;
+ number /= 2;
}
return bitSet;
}
+ /**
+ * Converts a BitSet into the dna string representation.
+ *
+ * Warning: This conversion is limited to long precision, therefore the dna sequence cannot
+ * be longer than 31 bases. To increase this limit, use BigInteger instead of long and create
+ * a bitSetFrom(BigInteger) method.
+ *
+ * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the
+ * base_10 representation of the sequence. This is important for us to know how to bring the number
+ * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented
+ * as 0's and leading 0's are omitted).
+ *
+ * quasi-canonical because A is represented by a 0, therefore,
+ * instead of : 0, 1, 2, 3, 10, 11, 12, ...
+ * we have : 0, 1, 2, 3, 00, 01, 02, ...
+ *
+ * but we can correctly decode it because we know the final length.
+ *
+ * @param bitSet the bitset representation of the dna sequence
+ * @return the dna sequence represented by the bitset
+ */
+ public static String dnaFrom(final BitSet bitSet) {
+ long number = intFrom(bitSet); // the base_10 representation of the bit set
+ long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later)
+ long nextContext = 4; // the next context (we advance it so we know which one was preceding it).
+ int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet.
+ while (nextContext <= number) { // find the length of the dna string (i)
+ preContext = nextContext; // keep track of the number of combinations in the preceding context
+ nextContext += Math.pow(4, ++i);// calculate the next context
+ }
+ number -= preContext; // subtract the number of combinations of the preceding context from the number to get to the quasi-canonical representation
+
+ String dna = "";
+ while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical)
+ byte base = (byte) (number % 4);
+ switch (base) {
+ case 0 : dna = "A" + dna; break;
+ case 1 : dna = "C" + dna; break;
+ case 2 : dna = "G" + dna; break;
+ case 3 : dna = "T" + dna; break;
+ }
+ number /= 4;
+ }
+ for (int j = dna.length(); j < i; j++)
+ dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above)
+
+ return dna;
+ }
+
+ /**
+ * Creates a BitSet representation of a given dna string.
+ *
+ * Warning: This conversion is limited to long precision, therefore the dna sequence cannot
+ * be longer than 31 bases. To increase this limit, use BigInteger instead of long and create
+ * a bitSetFrom(BigInteger) method.
+ *
+ * The bit representation of a dna string is the simple:
+ * 0 A 4 AA 8 CA
+ * 1 C 5 AC ...
+ * 2 G 6 AG 1343 TTGGT
+ * 3 T 7 AT 1364 TTTTT
+ *
+ * To convert from dna to number, we convert the dna string to base10 and add all combinations that
+ * preceded the string (with smaller lengths).
+ *
+ * @param dna the dna sequence
+ * @return the bitset representing the dna sequence
+ */
+ public static BitSet bitSetFrom(String dna) {
+ if (dna.length() > 31)
+ throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length()));
+
+ long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set
+ long preContext = 0; // the sum of all combinations that preceded the length of the dna string
+ for (int i=0; i0)
+ preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...)
+ }
+
+ return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length.
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/Median.java b/public/java/src/org/broadinstitute/sting/utils/Median.java
new file mode 100644
index 000000000..7ebe8d2d7
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/Median.java
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils;
+
+import java.util.*;
+
+/**
+ * Utility class for calculating median from a data set, potentially limiting the size of data to a
+ * fixed amount
+ *
+ * @author Your Name
+ * @since Date created
+ */
+public class Median {
+ final List values;
+ final int maxValuesToKeep;
+ boolean sorted = false;
+
+ public Median() {
+ this(Integer.MAX_VALUE);
+ }
+
+ public Median(final int maxValuesToKeep) {
+ this.maxValuesToKeep = maxValuesToKeep;
+ this.values = new ArrayList();
+ }
+
+ public boolean isFull() {
+ return values.size() >= maxValuesToKeep;
+ }
+
+ public int size() {
+ return values.size();
+ }
+
+ public boolean isEmpty() {
+ return values.isEmpty();
+ }
+
+ public T getMedian() {
+ if ( isEmpty() )
+ throw new IllegalStateException("Cannot get median value from empty array");
+ return getMedian(null); // note that value null will never be used
+ }
+
+ /**
+ * Returns the floor((n + 1) / 2)-th item from the sorted list of values if the list
+ * has values, or defaultValue if the list is empty.
+ */
+ public T getMedian(final T defaultValue) {
+ if ( isEmpty() )
+ return defaultValue;
+
+ if ( ! sorted ) {
+ sorted = true;
+ Collections.sort(values);
+ }
+
+ final int offset = (int)Math.floor((values.size() + 1) * 0.5) - 1;
+ return values.get(offset);
+ }
+
+ public boolean add(final T value) {
+ if ( ! isFull() ) {
+ sorted = false;
+ return values.add(value);
+ }
+ else
+ return false;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java
index fb133d902..62a67a1f2 100644
--- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java
+++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java
@@ -4,6 +4,7 @@ import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
+import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@@ -314,6 +315,15 @@ public class ClippingOp {
if (start == 0)
hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), cigarShift.cigar));
+ if (read.hasBaseIndelQualities()) {
+ byte[] newBaseInsertionQuals = new byte[newLength];
+ byte[] newBaseDeletionQuals = new byte[newLength];
+ System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength);
+ System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength);
+ hardClippedRead.setBaseQualities(newBaseInsertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION);
+ hardClippedRead.setBaseQualities(newBaseDeletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION);
+ }
+
return hardClippedRead;
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java
index d280ac804..8652d3c28 100755
--- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java
+++ b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java
@@ -34,7 +34,7 @@ import java.util.Map;
* Date: Dec 29, 2009
*/
-public class NestedHashMap{
+public class NestedHashMap {
public final Map data = new HashMap