diff --git a/build.xml b/build.xml
index e5ad9daf0..1f26e7b7a 100644
--- a/build.xml
+++ b/build.xml
@@ -163,6 +163,14 @@
* One or more bam files (with proper headers) to be analyzed for coverage statistics
- * (Optional) A REFSEQ Rod to aggregate coverage to the gene level
*
+ *(Optional) A REFSEQ Rod to aggregate coverage to the gene level
+ *
+ * (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation)
+ *Input
*
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents: @@ -93,7 +96,7 @@ import java.util.*; *
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
- * -T VariantEval \
+ * -T DepthOfCoverage \
* -o file_name_base \
* -I input_bams.list
* [-geneList refSeq.sorted.txt] \
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
index e68aa31e0..232e468f9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java
@@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel {
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
- double baseProb = QualityUtils.qualToProb(k);
+ double baseProb = QualityUtils.qualToProb((byte)k);
baseMatchArray[k] = probToQual(baseProb);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java
index a56c9e21e..63fb33295 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java
@@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray {
public PhasingRead(int length, int mappingQual) {
super(length);
- this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
+ this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual));
this.baseProbs = new PreciseNonNegativeDouble[length];
Arrays.fill(this.baseProbs, null);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java
index 55da1c152..f94140814 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/RefSeqDataParser.java
@@ -44,12 +44,12 @@ public class RefSeqDataParser {
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
Map entriesToNames = new HashMap();
- Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
- if (numRecords != null) {
+ int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1);
+ if (numRecords != -1) {
boolean done = false;
if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1":
- String name = vc.getAttributeAsStringNoException(nameKeyToUse);
+ String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
done = true;
@@ -59,14 +59,14 @@ public class RefSeqDataParser {
if (!done) {
for (int i = 1; i <= numRecords; i++) {
String key = nameKeyToUseMultiplePrefix + i;
- String name = vc.getAttributeAsStringNoException(key);
+ String name = vc.getAttributeAsString(key, null);
if (name != null)
entriesToNames.put(key, name);
}
}
}
else { // no entry with the # of records:
- String name = vc.getAttributeAsStringNoException(nameKeyToUse);
+ String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
index 2913c97a6..72058ba7b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
@@ -110,12 +110,12 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
case SNP:
nVariantLoci++;
nSNPs++;
- if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
+ if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
break;
case MNP:
nVariantLoci++;
nMNPs++;
- if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
+ if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
break;
case INDEL:
nVariantLoci++;
@@ -137,7 +137,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
String refStr = vc1.getReference().getBaseString().toUpperCase();
- String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null;
+ String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null;
// if (aaStr.equals(".")) {
// aaStr = refStr;
// }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java
index a476a2680..e69dbfb28 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java
@@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
}
public static Double getPQ(Genotype gt) {
- return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY);
+ Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
+ return d == -1 ? null : d;
}
public static boolean topMatchesTop(AllelePair b1, AllelePair b2) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java
index 203c15a85..2d0163206 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java
@@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
if ( eval.hasGenotypes() )
ac = eval.getChromosomeCount(eval.getAlternateAllele(0));
else if ( eval.hasAttribute("AC") ) {
- ac = Integer.valueOf(eval.getAttributeAsString("AC"));
+ ac = eval.getAttributeAsInt("AC", -1);
}
if ( ac != -1 ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java
index 1feb37e01..9b6e145e6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java
@@ -49,18 +49,14 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
else nTv++;
}
- String refStr = vc.getReference().getBaseString().toUpperCase();
- String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase();
-
- if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) {
- BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]);
-
- //System.out.println(refStr + " " + vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase() + " " + aaSubType);
-
- if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSITION) {
- nTiDerived++;
- } else if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSVERSION) {
- nTvDerived++;
+ if (vc.hasAttribute("ANCESTRALALLELE")) {
+ final String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", "null").toUpperCase();
+ if ( ! aaStr.equals(".") ) {
+ switch ( BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0] ) ) {
+ case TRANSITION: nTiDerived++; break;
+ case TRANSVERSION: nTvDerived++; break;
+ default: break;
+ }
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java
index 307b4f684..c60586017 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java
@@ -131,7 +131,7 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
//// System.out.printf(" ac = %d%n", ac);
}
else
- ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY);
+ ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO;
} else {
return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java
index 3cc22cc52..c7bea93b2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java
@@ -44,7 +44,7 @@ public class AlleleCount extends VariantStratifier {
if (eval != null) {
int AC = -1;
if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) {
- AC = eval.getAttributeAsInt("AC");
+ AC = eval.getAttributeAsInt("AC", 0);
} else if ( eval.isVariant() ) {
for (Allele allele : eval.getAlternateAlleles())
AC = Math.max(AC, eval.getChromosomeCount(allele));
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java
index 3d2dda651..cd2b8e475 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java
@@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier {
if (eval != null) {
try {
- relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3))));
+ relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3))));
} catch (Exception e) {
return relevantStates;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java
index 3223626c0..91c96e490 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java
@@ -90,8 +90,8 @@ public class Degeneracy extends VariantStratifier {
Integer frame = null;
if (eval.hasAttribute("refseq.functionalClass")) {
- aa = eval.getAttributeAsString("refseq.variantAA");
- frame = eval.getAttributeAsInt("refseq.frame");
+ aa = eval.getAttributeAsString("refseq.variantAA", null);
+ frame = eval.getAttributeAsInt("refseq.frame", 0);
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
int annotationId = 1;
String key;
@@ -99,7 +99,7 @@ public class Degeneracy extends VariantStratifier {
do {
key = String.format("refseq.functionalClass_%d", annotationId);
- String newtype = eval.getAttributeAsString(key);
+ String newtype = eval.getAttributeAsString(key, null);
if ( newtype != null &&
( type == null ||
@@ -109,13 +109,13 @@ public class Degeneracy extends VariantStratifier {
type = newtype;
String aakey = String.format("refseq.variantAA_%d", annotationId);
- aa = eval.getAttributeAsString(aakey);
+ aa = eval.getAttributeAsString(aakey, null);
if (aa != null) {
String framekey = String.format("refseq.frame_%d", annotationId);
if (eval.hasAttribute(framekey)) {
- frame = eval.getAttributeAsInt(framekey);
+ frame = eval.getAttributeAsInt(framekey, 0);
}
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
index 88ffcaaeb..dbe8295ee 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java
@@ -28,7 +28,7 @@ public class FunctionalClass extends VariantStratifier {
}
- public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
+public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
ArrayList relevantStates = new ArrayList();
relevantStates.add("all");
@@ -38,7 +38,7 @@ public class FunctionalClass extends VariantStratifier {
if (eval.hasAttribute("refseq.functionalClass")) {
try {
- type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass"));
+ type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass", null));
} catch ( Exception e ) {} // don't error out if the type isn't supported
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
int annotationId = 1;
@@ -47,7 +47,7 @@ public class FunctionalClass extends VariantStratifier {
do {
key = String.format("refseq.functionalClass_%d", annotationId);
- String newtypeStr = eval.getAttributeAsString(key);
+ String newtypeStr = eval.getAttributeAsString(key, null);
if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) {
try {
FunctionalType newType = FunctionalType.valueOf(newtypeStr);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java
index bc7252ec2..04ba3ff14 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java
@@ -115,7 +115,7 @@ public class VQSRCalibrationCurve {
if ( vc.isFiltered() )
return 0.0;
else if ( vc.hasAttribute(VQSRQualKey) ) {
- double qual = vc.getAttributeAsDouble(VQSRQualKey);
+ double qual = vc.getAttributeAsDouble(VQSRQualKey, 0.0);
return probTrueVariant(qual);
} else {
throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc);
@@ -143,7 +143,7 @@ public class VQSRCalibrationCurve {
for ( int i = 0; i < log10Likelihoods.length; i++) {
double p = Math.pow(10, log10Likelihoods[i]);
double q = alpha * p + (1-alpha) * noInfoPr;
- if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q);
+ if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey, 0.0), p, alpha, q);
updated[i] = Math.log10(q);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
index 018c4dcc2..89d103f52 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java
@@ -574,7 +574,7 @@ public class SelectVariants extends RodWalker {
// ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
// We then pick a variant with probablity AF*desiredFraction
if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
- String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
+ String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
double af;
double afBoost = 1.0;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
index 2a877fb09..c44d84136 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
@@ -192,7 +192,7 @@ public class VariantsToTable extends RodWalker {
if ( getters.containsKey(field) ) {
val = getters.get(field).get(vc);
} else if ( vc.hasAttribute(field) ) {
- val = vc.getAttributeAsString(field);
+ val = vc.getAttributeAsString(field, null);
} else if ( isWildCard(field) ) {
Set wildVals = new HashSet();
for ( Map.Entry elt : vc.getAttributes().entrySet()) {
@@ -309,6 +309,7 @@ public class VariantsToTable extends RodWalker {
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
+ getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } });
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
index b96923589..b66198713 100644
--- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
+++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
@@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome
@Override
public int hashCode() {
- return (int)( start << 16 + stop << 4 + contigIndex );
+ return start << 16 | stop << 4 | contigIndex;
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
index fad2320fc..19e03a19d 100755
--- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
@@ -9,14 +9,17 @@ import net.sf.samtools.SAMUtils;
* @author Kiran Garimella
*/
public class QualityUtils {
-
public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE;
public final static double MIN_REASONABLE_ERROR = 0.0001;
public final static byte MAX_REASONABLE_Q_SCORE = 40;
public final static byte MIN_USABLE_Q_SCORE = 6;
-
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
+ private static double qualToErrorProbCache[] = new double[256];
+ static {
+ for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw((byte)i);
+ }
+
/**
* Private constructor. No instantiating this class!
*/
@@ -33,10 +36,6 @@ public class QualityUtils {
return 1.0 - qualToErrorProb(qual);
}
- static public double qualToProb(int qual) {
- return qualToProb( (double)qual );
- }
-
static public double qualToProb(double qual) {
return 1.0 - Math.pow(10.0, qual/(-10.0));
}
@@ -48,10 +47,14 @@ public class QualityUtils {
* @param qual a quality score (0-40)
* @return a probability (0.0-1.0)
*/
- static public double qualToErrorProb(byte qual) {
+ static public double qualToErrorProbRaw(byte qual) {
return Math.pow(10.0, ((double) qual)/-10.0);
}
+ static public double qualToErrorProb(byte qual) {
+ return qualToErrorProbCache[qual];
+ }
+
/**
* Convert a probability to a quality score. Note, this is capped at Q40.
*
@@ -110,88 +113,4 @@ public class QualityUtils {
//return (byte) Math.min(qual, maxQual);
return (byte) Math.max(Math.min(qual, maxQual), 1);
}
-
- /**
- * Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field.
- * Note: the highest probability this function can encode is 64%, so this function should only never be used on the best base hypothesis.
- * Another note: the probability encoded here gets rounded to the nearest 1%.
- *
- * @param baseIndex the base index
- * @param prob the base probability
- * @return a byte containing the index and the probability
- */
- static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) {
- byte compressedQual = 0;
-
- compressedQual = (byte) baseIndex;
-
- byte cprob = (byte) (100.0*prob);
- byte qualmask = (byte) 252;
- compressedQual += ((cprob << 2) & qualmask);
-
- return compressedQual;
- }
-
- /**
- * From a compressed base, extract the base index (0:A, 1:C, 2:G, 3:T)
- *
- * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
- * @return base index
- */
- static public int compressedQualityToBaseIndex(byte compressedQual) {
- return (int) (compressedQual & 0x3);
- }
-
- /**
- * From a compressed base, extract the base probability
- *
- * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
- * @return the probability
- */
- static public double compressedQualityToProb(byte compressedQual) {
- // Because java natives are signed, extra care must be taken to avoid
- // shifting a 1 into the sign bit in the implicit promotion of 2 to an int.
- int x2 = ((int) compressedQual) & 0xff;
- x2 = (x2 >>> 2);
-
- return ((double) x2)/100.0;
- }
-
- /**
- * Return the complement of a compressed quality
- *
- * @param compressedQual the compressed quality score (as returned by baseAndProbToCompressedQuality)
- * @return the complementary compressed quality
- */
- static public byte complementCompressedQuality(byte compressedQual) {
- int baseIndex = compressedQualityToBaseIndex(compressedQual);
- double prob = compressedQualityToProb(compressedQual);
-
- return baseAndProbToCompressedQuality(BaseUtils.complementIndex(baseIndex), prob);
- }
-
- /**
- * Return the reverse complement of a byte array of compressed qualities
- *
- * @param compressedQuals a byte array of compressed quality scores
- * @return the reverse complement of the byte array
- */
- static public byte[] reverseComplementCompressedQualityArray(byte[] compressedQuals) {
- byte[] rcCompressedQuals = new byte[compressedQuals.length];
-
- for (int pos = 0; pos < compressedQuals.length; pos++) {
- rcCompressedQuals[compressedQuals.length - pos - 1] = complementCompressedQuality(compressedQuals[pos]);
- }
-
- return rcCompressedQuals;
- }
-
- /**
- * Return the reverse of a byte array of qualities (compressed or otherwise)
- * @param quals the array of bytes to be reversed
- * @return the reverse of the quality array
- */
- static public byte[] reverseQualityArray( byte[] quals ) {
- return Utils.reverse(quals); // no sense in duplicating functionality
- }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
index f9997bfd8..1b4703e4a 100755
--- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
@@ -190,11 +190,21 @@ public class SampleUtils {
}
- public static List getSamplesFromCommandLineInput(Collection sampleArgs) {
+ /**
+ * Returns a new set of samples, containing a final list of samples expanded from sampleArgs
+ *
+ * Each element E of sampleArgs can either be a literal sample name or a file. For each E,
+ * we try to read a file named E from disk, and if possible all lines from that file are expanded
+ * into unique sample names.
+ *
+ * @param sampleArgs
+ * @return
+ */
+ public static Set getSamplesFromCommandLineInput(Collection sampleArgs) {
if (sampleArgs != null) {
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
// sample list set, and treat the entries as if they had been specified on the command line.
- List samplesFromFiles = new ArrayList();
+ Set samplesFromFiles = new HashSet();
for (String SAMPLE_EXPRESSION : sampleArgs) {
File sampleFile = new File(SAMPLE_EXPRESSION);
@@ -203,7 +213,7 @@ public class SampleUtils {
List lines = reader.readLines();
for (String line : lines) {
- samplesFromFiles.add(line);
+ samplesFromFiles.add(line.trim());
}
} catch (FileNotFoundException e) {
samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
@@ -212,7 +222,8 @@ public class SampleUtils {
return samplesFromFiles;
}
- return new ArrayList();
+
+ return new HashSet();
}
public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) {
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
index d94d9ff84..f142fa5aa 100644
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/refseq/RefSeqCodec.java
@@ -12,19 +12,35 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.ArrayList;
/**
- * TODO FOR CHRIS HARTL
+ * Allows for reading in RefSeq information
*
*
- * Codec Description
+ * Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcrips get separate entries), exons, gene start/stop, coding start/stop,
+ * strandedness of transcription.
*
*
*
- * See also: link to file specification
+ * Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here
+ * http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq
*
+ * Usage
+ * The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example
+ *
+ * -refSeqBinding:REFSEQ /path/to/refSeq.txt
+ *
+ *
+ * You will need to consult individual walkers for the binding name ("refSeqBinding", above)
*
* File format example
+ * If you want to define your own file for use, the format is (tab delimited):
+ * bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete)
+ * and exon frames, for example:
+ *
+ * 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
+ *
+ * for more information see here
*
- * A BAM file containing exactly one sample.
+ *
*
*
* @author Mark DePristo
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
index bb212e128..624d06a71 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@@ -227,7 +227,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
throw new UserException.MalformedVCF(message, lineNo);
}
- private static void generateException(String message, int lineNo) {
+ protected static void generateException(String message, int lineNo) {
throw new UserException.MalformedVCF(message, lineNo);
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java
new file mode 100644
index 000000000..4ae87ddcb
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.codecs.vcf;
+
+import org.broad.tribble.Tribble;
+import org.broad.tribble.TribbleException;
+import org.broad.tribble.index.DynamicIndexCreator;
+import org.broad.tribble.index.Index;
+import org.broad.tribble.index.IndexFactory;
+import org.broad.tribble.util.LittleEndianOutputStream;
+import org.broad.tribble.util.PositionalStream;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+import java.io.*;
+
+/**
+ * this class writes VCF files
+ */
+public abstract class IndexingVCFWriter implements VCFWriter {
+ final private String name;
+
+ private File indexFile = null;
+ private OutputStream outputStream;
+ private PositionalStream positionalStream = null;
+ private DynamicIndexCreator indexer = null;
+ private LittleEndianOutputStream idxStream = null;
+
+ protected IndexingVCFWriter(String name, File location, OutputStream output, boolean enableOnTheFlyIndexing) {
+ outputStream = output;
+ this.name = name;
+
+ if ( enableOnTheFlyIndexing ) {
+ indexFile = Tribble.indexFile(location);
+ try {
+ idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
+ //System.out.println("Creating index on the fly for " + location);
+ indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
+ indexer.initialize(location, indexer.defaultBinSize());
+ positionalStream = new PositionalStream(output);
+ outputStream = positionalStream;
+ } catch ( IOException ex ) {
+ // No matter what we keep going, since we don't care if we can't create the index file
+ idxStream = null;
+ indexer = null;
+ positionalStream = null;
+ indexFile = null;
+ }
+ }
+ }
+
+ public OutputStream getOutputStream() {
+ return outputStream;
+ }
+
+ public String getStreamName() {
+ return name;
+ }
+
+ public abstract void writeHeader(VCFHeader header);
+
+ /**
+ * attempt to close the VCF file
+ */
+ public void close() {
+ // try to close the index stream (keep it separate to help debugging efforts)
+ if ( indexer != null ) {
+ try {
+ Index index = indexer.finalizeIndex(positionalStream.getPosition());
+ index.write(idxStream);
+ idxStream.close();
+ } catch (IOException e) {
+ throw new ReviewedStingException("Unable to close index for " + getStreamName(), e);
+ }
+ }
+ }
+
+ /**
+ * add a record to the file
+ *
+ * @param vc the Variant Context object
+ */
+ public void add(VariantContext vc) {
+ // if we are doing on the fly indexing, add the record ***before*** we write any bytes
+ if ( indexer != null )
+ indexer.addFeature(vc, positionalStream.getPosition());
+ }
+
+ protected static final String writerName(File location, OutputStream stream) {
+ return location == null ? stream.toString() : location.getAbsolutePath();
+ }
+
+ protected static OutputStream openOutputStream(File location) {
+ try {
+ return new FileOutputStream(location);
+ } catch (FileNotFoundException e) {
+ throw new ReviewedStingException("Unable to create VCF file at location: " + location, e);
+ }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java
index d3705813c..7cba5fc3e 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java
@@ -44,26 +44,19 @@ import java.util.*;
/**
* this class writes VCF files
*/
-public class StandardVCFWriter implements VCFWriter {
+public class StandardVCFWriter extends IndexingVCFWriter {
+ // the print stream we're writing to
+ final protected BufferedWriter mWriter;
+
+ // should we write genotypes or just sites?
+ final protected boolean doNotWriteGenotypes;
// the VCF header we're storing
protected VCFHeader mHeader = null;
- // the print stream we're writing to
- protected BufferedWriter mWriter;
- protected PositionalStream positionalStream = null;
-
// were filters applied?
protected boolean filtersWereAppliedToContext = false;
- // should we write genotypes or just sites?
- protected boolean doNotWriteGenotypes = false;
-
- protected DynamicIndexCreator indexer = null;
- protected File indexFile = null;
- LittleEndianOutputStream idxStream = null;
- File location = null;
-
/**
* create a VCF writer, given a file to write to
*
@@ -93,32 +86,22 @@ public class StandardVCFWriter implements VCFWriter {
* @param doNotWriteGenotypes do not write genotypes
*/
public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) {
- mWriter = new BufferedWriter(new OutputStreamWriter(output));
- this.doNotWriteGenotypes = doNotWriteGenotypes;
+ this(null, output, false, doNotWriteGenotypes);
}
public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
- this.location = location;
-
- if ( enableOnTheFlyIndexing ) {
- indexFile = Tribble.indexFile(location);
- try {
- idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
- //System.out.println("Creating index on the fly for " + location);
- indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
- indexer.initialize(location, indexer.defaultBinSize());
- positionalStream = new PositionalStream(output);
- output = positionalStream;
- } catch ( IOException ex ) {
- // No matter what we keep going, since we don't care if we can't create the index file
- }
- }
-
- //mWriter = new BufferedWriter(new OutputStreamWriter(new PositionalStream(output)));
- mWriter = new BufferedWriter(new OutputStreamWriter(output));
+ super(writerName(location, output), location, output, enableOnTheFlyIndexing);
+ mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
this.doNotWriteGenotypes = doNotWriteGenotypes;
}
+ // --------------------------------------------------------------------------------
+ //
+ // VCFWriter interface functions
+ //
+ // --------------------------------------------------------------------------------
+
+ @Override
public void writeHeader(VCFHeader header) {
mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header;
@@ -158,44 +141,24 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.flush(); // necessary so that writing to an output stream will work
}
catch (IOException e) {
- throw new TribbleException("IOException writing the VCF header to " + locationString(), e);
+ throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e);
}
}
- private String locationString() {
- return location == null ? mWriter.toString() : location.getAbsolutePath();
- }
-
/**
* attempt to close the VCF file
*/
+ @Override
public void close() {
// try to close the vcf stream
try {
mWriter.flush();
mWriter.close();
} catch (IOException e) {
- throw new TribbleException("Unable to close " + locationString() + " because of " + e.getMessage());
+ throw new ReviewedStingException("Unable to close " + getStreamName(), e);
}
- // try to close the index stream (keep it separate to help debugging efforts)
- if ( indexer != null ) {
- try {
- Index index = indexer.finalizeIndex(positionalStream.getPosition());
- index.write(idxStream);
- idxStream.close();
- } catch (IOException e) {
- throw new TribbleException("Unable to close index for " + locationString() + " because of " + e.getMessage());
- }
- }
- }
-
- protected static OutputStream openOutputStream(File location) {
- try {
- return new FileOutputStream(location);
- } catch (FileNotFoundException e) {
- throw new TribbleException("Unable to create VCF file at location: " + location);
- }
+ super.close();
}
/**
@@ -203,28 +166,17 @@ public class StandardVCFWriter implements VCFWriter {
*
* @param vc the Variant Context object
*/
+ @Override
public void add(VariantContext vc) {
- add(vc, false);
- }
-
- /**
- * add a record to the file
- *
- * @param vc the Variant Context object
- * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD)
- */
- public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) {
if ( mHeader == null )
- throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString());
+ throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName());
if ( doNotWriteGenotypes )
vc = VariantContext.modifyGenotypes(vc, null);
try {
- vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles);
-
- // if we are doing on the fly indexing, add the record ***before*** we write any bytes
- if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition());
+ vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false);
+ super.add(vc);
Map alleleMap = new HashMap(vc.getAlleles().size());
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
@@ -275,7 +227,7 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
// FILTER
- String filters = vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (filtersWereAppliedToContext || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
+ String filters = getFilterString(vc, filtersWereAppliedToContext);
mWriter.write(filters);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
@@ -317,9 +269,22 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.write("\n");
mWriter.flush(); // necessary so that writing to an output stream will work
} catch (IOException e) {
- throw new RuntimeException("Unable to write the VCF object to " + locationString());
+ throw new RuntimeException("Unable to write the VCF object to " + getStreamName());
}
+ }
+ // --------------------------------------------------------------------------------
+ //
+ // implementation functions
+ //
+ // --------------------------------------------------------------------------------
+
+ public static final String getFilterString(final VariantContext vc) {
+ return getFilterString(vc, false);
+ }
+
+ public static final String getFilterString(final VariantContext vc, boolean forcePASS) {
+ return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
}
private String getQualValue(double qual) {
@@ -462,7 +427,7 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.write(encoding);
}
- private static String formatVCFField(Object val) {
+ public static String formatVCFField(Object val) {
String result;
if ( val == null )
result = VCFConstants.MISSING_VALUE_v4;
@@ -524,12 +489,11 @@ public class StandardVCFWriter implements VCFWriter {
}
- public static int countOccurrences(char c, String s) {
+ private static int countOccurrences(char c, String s) {
int count = 0;
for (int i = 0; i < s.length(); i++) {
count += s.charAt(i) == c ? 1 : 0;
}
return count;
}
-
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
index fa030ef5f..42ea05355 100755
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
@@ -105,34 +105,37 @@ public class VCFCodec extends AbstractVCFCodec {
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
*/
protected Set parseFilters(String filterString) {
+ return parseFilters(filterHash, lineNo, filterString);
+ }
+ public static Set parseFilters(final Map> cache, final int lineNo, final String filterString) {
// null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null;
- // empty set for passes filters
- LinkedHashSet fFields = new LinkedHashSet();
-
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
- return fFields;
+ return Collections.emptySet();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
- generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4");
+ generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
if ( filterString.length() == 0 )
- generateException("The VCF specification requires a valid filter status");
+ generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
// do we have the filter string cached?
- if ( filterHash.containsKey(filterString) )
- return filterHash.get(filterString);
+ if ( cache != null && cache.containsKey(filterString) )
+ return Collections.unmodifiableSet(cache.get(filterString));
+ // empty set for passes filters
+ LinkedHashSet fFields = new LinkedHashSet();
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
fFields.add(filterString);
else
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
- filterHash.put(filterString, fFields);
+ fFields = fFields;
+ if ( cache != null ) cache.put(filterString, fFields);
- return fFields;
+ return Collections.unmodifiableSet(fFields);
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java
new file mode 100644
index 000000000..ef0d9ca42
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.gcf;
+
+import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+import org.broadinstitute.sting.utils.variantcontext.Genotype;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * GATK binary VCF record
+ *
+ * @author Your Name
+ * @since Date created
+ */
+public class GCF {
+ private final static int RECORD_TERMINATOR = 123456789;
+ private int chromOffset;
+ private int start, stop;
+ private String id;
+ private List alleleMap;
+ private int alleleOffsets[];
+ private float qual;
+ private byte refPad;
+ private String info;
+ private int filterOffset;
+
+ private List genotypes = Collections.emptyList();
+
+ public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boolean skipGenotypes) {
+ chromOffset = GCFHeaderBuilder.encodeString(vc.getChr());
+ start = vc.getStart();
+ stop = vc.getEnd();
+ refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0;
+ id = vc.getID();
+
+ // encode alleles
+ alleleMap = new ArrayList(vc.getNAlleles());
+ alleleOffsets = new int[vc.getNAlleles()];
+ alleleMap.add(vc.getReference());
+ alleleOffsets[0] = GCFHeaderBuilder.encodeAllele(vc.getReference());
+ for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
+ alleleMap.add(vc.getAlternateAllele(i));
+ alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i));
+ }
+
+ qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual());
+ info = infoFieldString(vc, GCFHeaderBuilder);
+ filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc));
+
+ if ( ! skipGenotypes ) {
+ genotypes = encodeGenotypes(GCFHeaderBuilder, vc);
+ }
+ }
+
+ public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException {
+ chromOffset = inputStream.readInt();
+
+ // have we reached the footer?
+ if ( chromOffset == GCFHeader.FOOTER_START_MARKER )
+ throw new EOFException();
+
+ start = inputStream.readInt();
+ stop = inputStream.readInt();
+ id = inputStream.readUTF();
+ refPad = inputStream.readByte();
+ alleleOffsets = readIntArray(inputStream);
+ qual = inputStream.readFloat();
+ info = inputStream.readUTF();
+ filterOffset = inputStream.readInt();
+
+ int nGenotypes = inputStream.readInt();
+ int sizeOfGenotypes = inputStream.readInt();
+ if ( skipGenotypes ) {
+ genotypes = Collections.emptyList();
+ inputStream.skipBytes(sizeOfGenotypes);
+ } else {
+ genotypes = new ArrayList(nGenotypes);
+ for ( int i = 0; i < nGenotypes; i++ )
+ genotypes.add(new GCFGenotype(this, inputStream));
+ }
+
+ int recordDone = inputStream.readInt();
+ if ( recordDone != RECORD_TERMINATOR )
+ throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key");
+ }
+
+ public int write(DataOutputStream outputStream) throws IOException {
+ int startSize = outputStream.size();
+ outputStream.writeInt(chromOffset);
+ outputStream.writeInt(start);
+ outputStream.writeInt(stop);
+ outputStream.writeUTF(id);
+ outputStream.writeByte(refPad);
+ writeIntArray(alleleOffsets, outputStream, true);
+ outputStream.writeFloat(qual);
+ outputStream.writeUTF(info);
+ outputStream.writeInt(filterOffset);
+
+ int nGenotypes = genotypes.size();
+ int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes;
+ outputStream.writeInt(nGenotypes);
+ outputStream.writeInt(expectedSizeOfGenotypes);
+ int obsSizeOfGenotypes = 0;
+ for ( GCFGenotype g : genotypes )
+ obsSizeOfGenotypes += g.write(outputStream);
+ if ( obsSizeOfGenotypes != expectedSizeOfGenotypes )
+ throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes);
+
+ outputStream.writeInt(RECORD_TERMINATOR);
+ return outputStream.size() - startSize;
+ }
+
+ public VariantContext decode(final String source, final GCFHeader header) {
+ final String contig = header.getString(chromOffset);
+ alleleMap = header.getAlleles(alleleOffsets);
+ double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual);
+ Set filters = header.getFilters(filterOffset);
+ Map attributes = new HashMap();
+ attributes.put("INFO", info);
+ Byte refPadByte = refPad == 0 ? null : refPad;
+ Map genotypes = decodeGenotypes(header);
+
+ return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte);
+ }
+
+ private Map decodeGenotypes(final GCFHeader header) {
+ if ( genotypes.isEmpty() )
+ return VariantContext.NO_GENOTYPES;
+ else {
+ Map map = new TreeMap();
+
+ for ( int i = 0; i < genotypes.size(); i++ ) {
+ final String sampleName = header.getSample(i);
+ final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap);
+ map.put(sampleName, g);
+ }
+
+ return map;
+ }
+ }
+
+ private List encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc) {
+ int nGenotypes = vc.getNSamples();
+ if ( nGenotypes > 0 ) {
+ List genotypes = new ArrayList(nGenotypes);
+ for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null);
+
+ for ( Genotype g : vc.getGenotypes().values() ) {
+ int i = GCFHeaderBuilder.encodeSample(g.getSampleName());
+ genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g));
+ }
+
+ return genotypes;
+ } else {
+ return Collections.emptyList();
+ }
+ }
+
+ public int getNAlleles() { return alleleOffsets.length; }
+
+
+ private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) {
+ StringBuilder s = new StringBuilder();
+
+ boolean first = true;
+ for ( Map.Entry field : vc.getAttributes().entrySet() ) {
+ String key = field.getKey();
+ if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) )
+ continue;
+ int stringIndex = GCFHeaderBuilder.encodeString(key);
+ String outputValue = StandardVCFWriter.formatVCFField(field.getValue());
+ if ( outputValue != null ) {
+ if ( ! first ) s.append(";");
+ s.append(stringIndex).append("=").append(outputValue);
+ first = false;
+ }
+ }
+
+ return s.toString();
+ }
+
+ protected final static int BUFFER_SIZE = 1048576; // 2**20
+
+ public static DataInputStream createDataInputStream(final InputStream stream) {
+ return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE));
+ }
+
+ public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException {
+ return new FileInputStream(file);
+ }
+
+ protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException {
+ return readIntArray(inputStream, inputStream.readInt());
+ }
+
+ protected final static int[] readIntArray(final DataInputStream inputStream, int size) throws IOException {
+ int[] array = new int[size];
+ for ( int i = 0; i < array.length; i++ )
+ array[i] = inputStream.readInt();
+ return array;
+ }
+
+ protected final static void writeIntArray(int[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
+ if ( writeSize ) outputStream.writeInt(array.length);
+ for ( int i : array )
+ outputStream.writeInt(i);
+ }
+
+ protected final static byte[] readByteArray(final DataInputStream inputStream) throws IOException {
+ return readByteArray(inputStream, inputStream.readInt());
+ }
+
+ protected final static byte[] readByteArray(final DataInputStream inputStream, int size) throws IOException {
+ byte[] array = new byte[size];
+ for ( int i = 0; i < array.length; i++ )
+ array[i] = inputStream.readByte();
+ return array;
+ }
+
+ protected final static void writeByteArray(byte[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
+ if ( writeSize ) outputStream.writeInt(array.length);
+ for ( byte i : array )
+ outputStream.writeByte(i);
+ }
+
+ protected final static byte qualToByte(double phredScaledQual) {
+ return (byte)Math.round(Math.min(phredScaledQual, 255));
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java
new file mode 100644
index 000000000..dd1fb091c
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.gcf;
+
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+import org.broadinstitute.sting.utils.variantcontext.Genotype;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * GATK binary VCF record
+ *
+ * @author Your Name
+ * @since Date created
+ */
+public class GCFGenotype {
+ private byte gq;
+ private int gt;
+ private int dp;
+ private int ad[];
+ private byte[] pl;
+
+ // todo -- what to do about phasing? Perhaps we shouldn't support it
+ // todo -- is the FL field generic or just a flag? Should we even support per sample filtering?
+
+ public GCFGenotype(final GCFHeaderBuilder GCFHeaderBuilder, final List allAlleles, Genotype genotype) {
+ gq = GCF.qualToByte(genotype.getPhredScaledQual());
+ gt = encodeAlleles(genotype.getAlleles(), allAlleles);
+
+ dp = genotype.getAttributeAsInt("DP", 0);
+
+ int nAlleles = allAlleles.size();
+ ad = new int[nAlleles];
+
+ int npls = nAllelesToNPls(nAlleles);
+ pl = new byte[npls];
+ }
+
+ private int nAllelesToNPls( int nAlleles ) {
+ return nAlleles*(nAlleles+1) / 2;
+ }
+
+ public GCFGenotype(GCF GCF, DataInputStream inputStream) throws IOException {
+ int gqInt = inputStream.readUnsignedByte();
+ gq = (byte)gqInt;
+ gt = inputStream.readInt();
+ dp = inputStream.readInt();
+ ad = GCF.readIntArray(inputStream, GCF.getNAlleles());
+ pl = GCF.readByteArray(inputStream, nAllelesToNPls(GCF.getNAlleles()));
+ }
+
+ // 2 alleles => 1 + 8 + 8 + 3 => 20
+ protected int sizeInBytes() {
+ return 1 // gq
+ + 4 * 2 // gt + dp
+ + 4 * ad.length // ad
+ + 1 * pl.length; // pl
+ }
+
+ public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List alleleIndex) {
+ final List alleles = decodeAlleles(gt, alleleIndex);
+ final double negLog10PError = gq / 10.0;
+ final Set filters = Collections.emptySet();
+ final Map attributes = new HashMap();
+ attributes.put("DP", dp);
+ attributes.put("AD", ad);
+ attributes.put("PL", pl);
+
+ return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false);
+ }
+
+ private static int encodeAlleles(List gtList, List allAlleles) {
+ final int nAlleles = gtList.size();
+ if ( nAlleles > 4 )
+ throw new IllegalArgumentException("encodeAlleles doesn't support more than 4 alt alleles, but I saw " + gtList);
+
+ int gtInt = 0;
+ for ( int i = 0; i < nAlleles ; i++ ) {
+ final int bitOffset = i * 8;
+ final int allelei = getAlleleIndex(gtList.get(i), allAlleles);
+ final int gti = (allelei + 1) << bitOffset;
+ gtInt = gtInt | gti;
+ }
+
+ return gtInt;
+ }
+
+ private static int getAlleleIndex(Allele q, List allAlleles) {
+ if ( q.isNoCall() )
+ return 254;
+ for ( int i = 0; i < allAlleles.size(); i++ )
+ if ( q.equals(allAlleles.get(i)) )
+ return i;
+ throw new IllegalStateException("getAlleleIndex passed allele not in map! allele " + q + " allAlleles " + allAlleles);
+ }
+
+ private static List decodeAlleles(int gtInt, List alleleIndex) {
+ List alleles = new ArrayList(4);
+
+ for ( int i = 0; i < 32; i += 8 ) {
+ final int gi = (gtInt & (0x000000FF << i)) >> i;
+ if ( gi != 0 ) {
+ final int allelei = gi - 1;
+ alleles.add( allelei == 254 ? Allele.NO_CALL : alleleIndex.get(allelei) );
+ } else {
+ break;
+ }
+ }
+
+ return alleles;
+ }
+
+ public int write(DataOutputStream outputStream) throws IOException {
+ int startSize = outputStream.size();
+ outputStream.writeByte(gq);
+ outputStream.writeInt(gt);
+ outputStream.writeInt(dp);
+ GCF.writeIntArray(ad, outputStream, false);
+ GCF.writeByteArray(pl, outputStream, false);
+ return outputStream.size() - startSize;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java
new file mode 100644
index 000000000..6d96eda56
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.gcf;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * [Short one sentence description of this walker]
+ *
+ *
+ * [Functionality of this walker]
+ *
+ *
+ * Input
+ *
+ * [Input description]
+ *
+ *
+ * Output
+ *
+ * [Output description]
+ *
+ *
+ * Examples
+ *
+ * java
+ * -jar GenomeAnalysisTK.jar
+ * -T $WalkerName
+ *
+ *
+ * @author Your Name
+ * @since Date created
+ */
+public class GCFHeader {
+ final protected static Logger logger = Logger.getLogger(GCFHeader.class);
+
+ public final static int GCF_VERSION = 1;
+ public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes();
+ public final static int FOOTER_START_MARKER = -1;
+ public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version
+
+ final int version;
+ long footerPosition;
+ final List alleles;
+ final List strings;
+ final List samples;
+ final List> filters;
+
+ public GCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) {
+ version = GCF_VERSION;
+ footerPosition = 0;
+ this.alleles = linearize(allelesIn);
+ this.strings = linearize(stringIn);
+ this.samples = linearize(samplesIn);
+ this.filters = null; // not used with this constructor
+ }
+
+ public GCFHeader(FileInputStream fileInputStream) throws IOException {
+ DataInputStream inputStream = new DataInputStream(fileInputStream);
+ byte[] headerTest = new byte[GCF_FILE_START_MARKER.length];
+ inputStream.read(headerTest);
+ if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) {
+ throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest));
+ } else {
+ version = inputStream.readInt();
+ logger.info("Read GCF version " + version);
+ footerPosition = inputStream.readLong();
+ logger.info("Read footer position of " + footerPosition);
+ long lastPos = fileInputStream.getChannel().position();
+ logger.info(" Last position is " + lastPos);
+
+ // seek to the footer
+ fileInputStream.getChannel().position(footerPosition);
+ if ( inputStream.readInt() != FOOTER_START_MARKER )
+ throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker");
+ alleles = stringsToAlleles(readStrings(inputStream));
+ strings = readStrings(inputStream);
+ samples = readStrings(inputStream);
+ logger.info(String.format("Allele map of %d elements", alleles.size()));
+ logger.info(String.format("String map of %d elements", strings.size()));
+ logger.info(String.format("Sample map of %d elements", samples.size()));
+ filters = initializeFilterCache();
+ fileInputStream.getChannel().position(lastPos);
+ }
+ }
+
+ public static int writeHeader(final DataOutputStream outputStream) throws IOException {
+ int startBytes = outputStream.size();
+ outputStream.write(GCF_FILE_START_MARKER);
+ outputStream.writeInt(GCF_VERSION);
+ outputStream.writeLong(0);
+ return outputStream.size() - startBytes;
+ }
+
+ public int writeFooter(final DataOutputStream outputStream) throws IOException {
+ int startBytes = outputStream.size();
+ outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding
+ write(outputStream, allelesToStrings(alleles));
+ write(outputStream, strings);
+ write(outputStream, samples);
+ return outputStream.size() - startBytes;
+ }
+
+ private void write(DataOutputStream outputStream, List l) throws IOException {
+ outputStream.writeInt(l.size());
+ for ( String elt : l ) outputStream.writeUTF(elt);
+ }
+
+ private List allelesToStrings(List alleles) {
+ List strings = new ArrayList(alleles.size());
+ for ( Allele allele : alleles ) strings.add(allele.toString());
+ return strings;
+ }
+
+ private List> initializeFilterCache() {
+ // required to allow offset -> set lookup
+ List> l = new ArrayList>(strings.size());
+ for ( int i = 0; i < strings.size(); i++ ) l.add(null);
+ return l;
+ }
+
+ private static List stringsToAlleles(final List strings) {
+ final List alleles = new ArrayList(strings.size());
+ for ( String string : strings ) {
+ boolean isRef = string.endsWith("*");
+ if ( isRef ) string = string.substring(0, string.length() - 1);
+ alleles.add(Allele.create(string, isRef));
+ }
+ return alleles;
+ }
+
+ private static List readStrings(final DataInputStream inputStream) throws IOException {
+ final int nStrings = inputStream.readInt();
+
+ final List strings = new ArrayList(nStrings);
+ for ( int i = 0; i < nStrings; i++ ) {
+ strings.add(inputStream.readUTF());
+ }
+
+ return strings;
+ }
+
+ private static List linearize(final Map map) {
+ final ArrayList l = new ArrayList(map.size());
+ for ( int i = 0; i < map.size(); i++ ) l.add(null);
+ for ( final Map.Entry elt : map.entrySet() )
+ l.set(elt.getValue(), elt.getKey());
+ return l;
+ }
+
+ public String getSample(final int offset) { return samples.get(offset); }
+ public String getString(final int offset) { return strings.get(offset); }
+ public Allele getAllele(final int offset) { return alleles.get(offset); }
+ public List getAlleles(final int[] offsets) {
+ final List alleles = new ArrayList(offsets.length);
+ for ( int i : offsets ) alleles.add(getAllele(i));
+ return alleles;
+ }
+
+ public Set getFilters(final int offset) {
+ Set cached = filters.get(offset);
+
+ if ( cached != null )
+ return cached;
+ else {
+ final String filterString = getString(offset);
+ if ( filterString.equals(VCFConstants.UNFILTERED) )
+ return null; // UNFILTERED records are represented by null
+ else {
+ Set set = VCFCodec.parseFilters(null, -1, filterString);
+ filters.set(offset, set); // remember the result
+ return set;
+ }
+ }
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java
new file mode 100644
index 000000000..40e01ec72
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.gcf;
+
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * [Short one sentence description of this walker]
+ *
+ *
+ * [Functionality of this walker]
+ *
+ *
+ * Input
+ *
+ * [Input description]
+ *
+ *
+ * Output
+ *
+ * [Output description]
+ *
+ *
+ * Examples
+ *
+ * java
+ * -jar GenomeAnalysisTK.jar
+ * -T $WalkerName
+ *
+ *
+ * @author Your Name
+ * @since Date created
+ */
+public class GCFHeaderBuilder {
+ Map alleles = new HashMap();
+ Map strings = new HashMap();
+ Map samples = new HashMap();
+
+ public GCFHeader createHeader() {
+ return new GCFHeader(alleles, strings, samples);
+ }
+
+ public int encodeString(final String chr) { return encode(strings, chr); }
+ public int encodeAllele(final Allele allele) { return encode(alleles, allele); }
+ public int encodeSample(final String sampleName) { return encode(samples, sampleName); }
+
+ private int encode(Map map, T key) {
+ Integer v = map.get(key);
+ if ( v == null ) {
+ v = map.size();
+ map.put(key, v);
+ }
+ return v;
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java
new file mode 100644
index 000000000..7ff6e27a2
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.gcf;
+
+import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+import java.io.*;
+
+/**
+ * GCFWriter implementing the VCFWriter interface
+ * @author Your Name
+ * @since Date created
+ */
+public class GCFWriter extends IndexingVCFWriter {
+ final boolean skipGenotypes;
+ final FileOutputStream fileOutputStream;
+ final DataOutputStream dataOutputStream;
+ final GCFHeaderBuilder gcfHeaderBuilder;
+ int nbytes = 0;
+ VCFHeader header = null;
+ File location;
+
+ // --------------------------------------------------------------------------------
+ //
+ // Constructors
+ //
+ // --------------------------------------------------------------------------------
+
+ public GCFWriter(File location, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
+ super(writerName(location, null), location, null, enableOnTheFlyIndexing);
+ this.location = location;
+ this.skipGenotypes = doNotWriteGenotypes;
+
+ // write the output
+ try {
+ fileOutputStream = new FileOutputStream(location);
+ dataOutputStream = createDataOutputStream(fileOutputStream);
+ gcfHeaderBuilder = new GCFHeaderBuilder();
+ } catch ( FileNotFoundException e ) {
+ throw new UserException.CouldNotCreateOutputFile(location, e);
+ }
+ }
+
+ // --------------------------------------------------------------------------------
+ //
+ // VCFWriter interface functions
+ //
+ // --------------------------------------------------------------------------------
+
+ @Override
+ public void writeHeader(VCFHeader header) {
+ this.header = header;
+ try {
+ nbytes += GCFHeader.writeHeader(dataOutputStream);
+ } catch ( IOException e ) {
+ throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e);
+ }
+ }
+
+ @Override
+ public void add(VariantContext vc) {
+ super.add(vc);
+ GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes);
+ try {
+ nbytes += gcf.write(dataOutputStream);
+ } catch ( IOException e ) {
+ throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e);
+ }
+ }
+
+ @Override
+ public void close() {
+ // todo -- write out VCF header lines
+ GCFHeader gcfHeader = gcfHeaderBuilder.createHeader();
+ try {
+ long headerPosition = nbytes;
+ nbytes += gcfHeader.writeFooter(dataOutputStream);
+ dataOutputStream.close();
+ //System.out.println("Writing forward reference to " + headerPosition);
+
+ RandomAccessFile raFile = new RandomAccessFile(location, "rw");
+ raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET);
+ raFile.writeLong(headerPosition);
+ raFile.close();
+ } catch ( IOException e ) {
+ throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e);
+ }
+
+ super.close();
+ }
+
+ private static final DataOutputStream createDataOutputStream(final OutputStream stream) {
+ return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE));
+ }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
index f551e1368..41cbbe59f 100644
--- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java
@@ -334,24 +334,44 @@ public class IntervalUtils {
}
/**
- * Splits an interval list into multiple files.
- * @param fileHeader The sam file header.
+ * Splits an interval list into multiple sublists.
* @param locs The genome locs to split.
* @param splits The stop points for the genome locs returned by splitFixedIntervals.
- * @param scatterParts The output interval lists to write to.
+ * @return A list of lists of genome locs, split according to splits
*/
- public static void scatterFixedIntervals(SAMFileHeader fileHeader, List locs, List splits, List scatterParts) {
- if (splits.size() != scatterParts.size())
- throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
- int fileIndex = 0;
+ public static List> splitIntervalsToSubLists(List locs, List splits) {
int locIndex = 1;
int start = 0;
+ List> sublists = new ArrayList>(splits.size());
for (Integer stop: splits) {
- IntervalList intervalList = new IntervalList(fileHeader);
+ List curList = new ArrayList();
for (int i = start; i < stop; i++)
- intervalList.add(toInterval(locs.get(i), locIndex++));
- intervalList.write(scatterParts.get(fileIndex++));
+ curList.add(locs.get(i));
start = stop;
+ sublists.add(curList);
+ }
+
+ return sublists;
+ }
+
+
+ /**
+ * Splits an interval list into multiple files.
+ * @param fileHeader The sam file header.
+ * @param splits Pre-divided genome locs returned by splitFixedIntervals.
+ * @param scatterParts The output interval lists to write to.
+ */
+ public static void scatterFixedIntervals(SAMFileHeader fileHeader, List> splits, List scatterParts) {
+ if (splits.size() != scatterParts.size())
+ throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
+
+ int fileIndex = 0;
+ int locIndex = 1;
+ for (final List split : splits) {
+ IntervalList intervalList = new IntervalList(fileHeader);
+ for (final GenomeLoc loc : split)
+ intervalList.add(toInterval(loc, locIndex++));
+ intervalList.write(scatterParts.get(fileIndex++));
}
}
@@ -361,17 +381,15 @@ public class IntervalUtils {
* @param numParts Number of parts to split the locs into.
* @return The stop points to split the genome locs.
*/
- public static List splitFixedIntervals(List locs, int numParts) {
+ public static List> splitFixedIntervals(List locs, int numParts) {
if (locs.size() < numParts)
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
- long locsSize = 0;
- for (GenomeLoc loc: locs)
- locsSize += loc.size();
- List splitPoints = new ArrayList();
+ final long locsSize = intervalSize(locs);
+ final List splitPoints = new ArrayList();
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
Collections.sort(splitPoints);
splitPoints.add(locs.size());
- return splitPoints;
+ return splitIntervalsToSubLists(locs, splitPoints);
}
private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) {
@@ -441,4 +459,11 @@ public class IntervalUtils {
return merged;
}
}
+
+ public static final long intervalSize(final List locs) {
+ long size = 0;
+ for ( final GenomeLoc loc : locs )
+ size += loc.size();
+ return size;
+ }
}
diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java
index 62bbb0307..5d3ef3086 100644
--- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java
@@ -118,31 +118,40 @@ public class ReadUtils {
/**
* This enum represents all the different ways in which a read can overlap an interval.
*
- * NO_OVERLAP:
+ * NO_OVERLAP_CONTIG:
+ * read and interval are in different contigs.
+ *
+ * NO_OVERLAP_LEFT:
+ * the read does not overlap the interval.
+ *
+ * |----------------| (interval)
+ * <----------------> (read)
+ *
+ * NO_OVERLAP_RIGHT:
* the read does not overlap the interval.
*
* |----------------| (interval)
* <----------------> (read)
*
- * LEFT_OVERLAP:
+ * OVERLAP_LEFT:
* the read starts before the beginning of the interval but ends inside of it
*
* |----------------| (interval)
* <----------------> (read)
*
- * RIGHT_OVERLAP:
+ * OVERLAP_RIGHT:
* the read starts inside the interval but ends outside of it
*
* |----------------| (interval)
* <----------------> (read)
*
- * FULL_OVERLAP:
+ * OVERLAP_LEFT_AND_RIGHT:
* the read starts before the interval and ends after the interval
*
* |-----------| (interval)
* <-------------------> (read)
*
- * CONTAINED:
+ * OVERLAP_CONTAINED:
* the read starts and ends inside the interval
*
* |----------------| (interval)
@@ -658,7 +667,7 @@ public class ReadUtils {
return ReadAndIntervalOverlap.OVERLAP_RIGHT;
}
- @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"})
+ @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"})
public static int getRefCoordSoftUnclippedStart(SAMRecord read) {
int start = read.getUnclippedStart();
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
@@ -670,9 +679,13 @@ public class ReadUtils {
return start;
}
- @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"})
+ @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"})
public static int getRefCoordSoftUnclippedEnd(SAMRecord read) {
int stop = read.getUnclippedStart();
+
+ if (readIsEntirelyInsertion(read))
+ return stop;
+
int shift = 0;
CigarOperator lastOperator = null;
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
@@ -686,6 +699,14 @@ public class ReadUtils {
return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ;
}
+ private static boolean readIsEntirelyInsertion(SAMRecord read) {
+ for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
+ if (cigarElement.getOperator() != CigarOperator.INSERTION)
+ return false;
+ }
+ return true;
+ }
+
/**
* Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before
* the alignment start of the read.
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java
index fdf3d97db..85d752003 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java
@@ -293,17 +293,8 @@ public class Genotype {
return commonInfo.getAttribute(key, defaultValue);
}
- public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); }
public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
- public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); }
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
- public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); }
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
- public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
-
- public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); }
- public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); }
- public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); }
- public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); }
}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java
index 3d162adb0..bf16cd1cf 100755
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java
@@ -1,6 +1,8 @@
package org.broadinstitute.sting.utils.variantcontext;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
+
import java.util.*;
@@ -204,27 +206,40 @@ public final class InferredGeneticContext {
return defaultValue;
}
-// public AttributedObject getAttributes(Collection