Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Ryan Poplin 2011-09-20 09:08:38 -04:00
commit efce2ece9d
47 changed files with 1394 additions and 462 deletions

View File

@ -163,6 +163,14 @@
<!-- Remove old versions of ivy jars AFTER the ivy:retrieve has been class loaded. -->
<delete file="${ivy.jar.dir}/ivy-2.0.0.jar"/>
<delete file="${ivy.jar.dir}/ivy-2.2.0-rc1.jar"/>
<!--
An old version of the ivy-1.4.1.xml does not contain /ivy-module/configuration/conf/@name="compile".
Easier to upgrade to 1.4.4 than try to deal with xmlproperty and conditional deletion in ant.
Just in case we remove explicit 1.4.4 and go back to 1.4.1, try to clean out the file for now.
-->
<delete file="${ivy.home}/cache/javax.mail/mail/ivy-1.4.1.xml"/>
<delete file="${ivy.home}/cache/javax.mail/mail/ivydata-1.4.1.properties"/>
<delete file="${ivy.home}/cache/javax.mail/mail/jars/mail-1.4.1.jar"/>
</target>
<target name="init.buildall">

View File

@ -15,10 +15,8 @@
<!-- Tribble -->
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
<dependency org="log4j" name="log4j" rev="1.2.15">
<!-- Don't include javax.mail here in default, only used in scala->default by commons-email -->
<exclude org="javax.mail" />
</dependency>
<dependency org="log4j" name="log4j" rev="1.2.15"/>
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
<dependency org="colt" name="colt" rev="1.2.0"/>
<dependency org="jboss" name="javassist" rev="3.7.ga"/>
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>

View File

@ -12,14 +12,14 @@ if ( onCMDLine ) {
inputFileName = args[1]
outputPDF = args[2]
} else {
#inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt"
inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
inputFileName = "~/Desktop/Q-30033@gsa1.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt"
#inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt"
outputPDF = NA
}
RUNTIME_UNITS = "(sec)"
ORIGINAL_UNITS_TO_SECONDS = 1/1000
RUNTIME_UNITS = "(hours)"
ORIGINAL_UNITS_TO_SECONDS = 1/1000/60/60
#
# Helper function to aggregate all of the jobs in the report across all tables
@ -33,7 +33,7 @@ allJobsFromReport <- function(report) {
#
# Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job
#
plotJobsGantt <- function(gatkReport, sortOverall) {
plotJobsGantt <- function(gatkReport, sortOverall, includeText) {
allJobs = allJobsFromReport(gatkReport)
if ( sortOverall ) {
title = "All jobs, by analysis, by start time"
@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) {
}
allJobs$index = 1:nrow(allJobs)
minTime = min(allJobs$startTime)
allJobs$relStartTime = allJobs$startTime - minTime
allJobs$relDoneTime = allJobs$doneTime - minTime
allJobs$relStartTime = (allJobs$startTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
allJobs$relDoneTime = (allJobs$doneTime - minTime) * ORIGINAL_UNITS_TO_SECONDS
allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts)
maxRelTime = max(allJobs$relDoneTime)
p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName))
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm")))
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + theme_bw()
p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm")))
if ( includeText )
p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2)
p <- p + xlim(0, maxRelTime * 1.1)
p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS))
p <- p + ylab("Job")
p <- p + ylab("Job number")
p <- p + opts(title=title)
print(p)
}
@ -157,8 +159,8 @@ if ( ! is.na(outputPDF) ) {
pdf(outputPDF, height=8.5, width=11)
}
plotJobsGantt(gatkReportData, T)
plotJobsGantt(gatkReportData, F)
plotJobsGantt(gatkReportData, T, F)
plotJobsGantt(gatkReportData, F, F)
plotProgressByTime(gatkReportData)
for ( group in gatkReportData ) {
plotGroup(group)

View File

@ -293,15 +293,16 @@ public class GATKRunReport {
* That is, postReport() is guaranteed not to fail for any reason.
*/
private File postReportToLocalDisk(File rootDir) {
String filename = getID() + ".report.xml.gz";
File file = new File(rootDir, filename);
try {
String filename = getID() + ".report.xml.gz";
File file = new File(rootDir, filename);
postReportToFile(file);
logger.debug("Wrote report to " + file);
return file;
} catch ( Exception e ) {
// we catch everything, and no matter what eat the error
exceptDuringRunReport("Couldn't read report file", e);
file.delete();
return null;
}
}
@ -312,6 +313,7 @@ public class GATKRunReport {
File localFile = postReportToLocalDisk(new File("./"));
logger.debug("Generating GATK report to AWS S3 based on local file " + localFile);
if ( localFile != null ) { // we succeeded in creating the local file
localFile.deleteOnExit();
try {
// stop us from printing the annoying, and meaningless, mime types warning
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
@ -336,14 +338,13 @@ public class GATKRunReport {
//logger.info("Uploading " + localFile + " to AWS bucket");
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
logger.debug("Uploaded to AWS: " + s3Object);
logger.info("Uploaded run statistics report to AWS S3");
} catch ( S3ServiceException e ) {
exceptDuringRunReport("S3 exception occurred", e);
} catch ( NoSuchAlgorithmException e ) {
exceptDuringRunReport("Couldn't calculate MD5", e);
} catch ( IOException e ) {
exceptDuringRunReport("Couldn't read report file", e);
} finally {
localFile.delete();
}
}
}

View File

@ -419,9 +419,10 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
if (dict == null) return;
SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary());
validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict);
// check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set
validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict);
for (SAMSequenceRecord seq : currentDict.getSequences()) {
if (dict.getSequence(seq.getSequenceName()) == null)
continue;
@ -435,6 +436,13 @@ public class RMDTrackBuilder { // extends PluginManager<FeatureCodec> {
}
}
public void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) {
for ( SAMSequenceRecord seq : dict.getSequences() ) {
final String contig = SequenceDictionaryPropertyPredicate + seq.getSequenceName();
final String length = String.valueOf(seq.getSequenceLength());
index.addProperty(contig,length);
}
}
public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) {
// if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation

View File

@ -26,7 +26,7 @@ public class SBByDepth extends AnnotationByDepth {
if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY))
return null;
double sBias = Double.valueOf(vc.getAttributeAsString(VCFConstants.STRAND_BIAS_KEY));
double sBias = vc.getAttributeAsDouble(VCFConstants.STRAND_BIAS_KEY, -1);
final Map<String, Genotype> genotypes = vc.getGenotypes();
if ( genotypes == null || genotypes.size() == 0 )

View File

@ -63,9 +63,12 @@ import java.util.*;
* <h2>Input</h2>
* <p>
* One or more bam files (with proper headers) to be analyzed for coverage statistics
* (Optional) A REFSEQ Rod to aggregate coverage to the gene level
* </p>
*
* <p>
* (Optional) A REFSEQ Rod to aggregate coverage to the gene level
* <p>
* (for information about creating the REFSEQ Rod, please consult the RefSeqCodec documentation)
*</p></p>
* <h2>Output</h2>
* <p>
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
@ -93,7 +96,7 @@ import java.util.*;
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantEval \
* -T DepthOfCoverage \
* -o file_name_base \
* -I input_bams.list
* [-geneList refSeq.sorted.txt] \

View File

@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel {
baseMatchArray = new double[MAX_CACHED_QUAL+1];
baseMismatchArray = new double[MAX_CACHED_QUAL+1];
for (int k=1; k <= MAX_CACHED_QUAL; k++) {
double baseProb = QualityUtils.qualToProb(k);
double baseProb = QualityUtils.qualToProb((byte)k);
baseMatchArray[k] = probToQual(baseProb);

View File

@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray {
public PhasingRead(int length, int mappingQual) {
super(length);
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual));
this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual));
this.baseProbs = new PreciseNonNegativeDouble[length];
Arrays.fill(this.baseProbs, null);

View File

@ -44,12 +44,12 @@ public class RefSeqDataParser {
String nameKeyToUseMultiplePrefix = nameKeyToUse + "_";
Map<String, String> entriesToNames = new HashMap<String, String>();
Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY);
if (numRecords != null) {
int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1);
if (numRecords != -1) {
boolean done = false;
if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1":
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
done = true;
@ -59,14 +59,14 @@ public class RefSeqDataParser {
if (!done) {
for (int i = 1; i <= numRecords; i++) {
String key = nameKeyToUseMultiplePrefix + i;
String name = vc.getAttributeAsStringNoException(key);
String name = vc.getAttributeAsString(key, null);
if (name != null)
entriesToNames.put(key, name);
}
}
}
else { // no entry with the # of records:
String name = vc.getAttributeAsStringNoException(nameKeyToUse);
String name = vc.getAttributeAsString(nameKeyToUse, null);
if (name != null) {
entriesToNames.put(nameKeyToUse, name);
}

View File

@ -110,12 +110,12 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
case SNP:
nVariantLoci++;
nSNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
break;
case MNP:
nVariantLoci++;
nMNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++;
break;
case INDEL:
nVariantLoci++;
@ -137,7 +137,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
String refStr = vc1.getReference().getBaseString().toUpperCase();
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null;
String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null;
// if (aaStr.equals(".")) {
// aaStr = refStr;
// }

View File

@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator {
}
public static Double getPQ(Genotype gt) {
return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY);
Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
return d == -1 ? null : d;
}
public static boolean topMatchesTop(AllelePair b1, AllelePair b2) {

View File

@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval
if ( eval.hasGenotypes() )
ac = eval.getChromosomeCount(eval.getAlternateAllele(0));
else if ( eval.hasAttribute("AC") ) {
ac = Integer.valueOf(eval.getAttributeAsString("AC"));
ac = eval.getAttributeAsInt("AC", -1);
}
if ( ac != -1 ) {

View File

@ -49,18 +49,14 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv
else nTv++;
}
String refStr = vc.getReference().getBaseString().toUpperCase();
String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase();
if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) {
BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]);
//System.out.println(refStr + " " + vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase() + " " + aaSubType);
if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSITION) {
nTiDerived++;
} else if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSVERSION) {
nTvDerived++;
if (vc.hasAttribute("ANCESTRALALLELE")) {
final String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", "null").toUpperCase();
if ( ! aaStr.equals(".") ) {
switch ( BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0] ) ) {
case TRANSITION: nTiDerived++; break;
case TRANSVERSION: nTvDerived++; break;
default: break;
}
}
}
}

View File

@ -131,7 +131,7 @@ public class ValidationReport extends VariantEvaluator implements StandardEval {
//// System.out.printf(" ac = %d%n", ac);
}
else
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY);
ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0);
return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO;
} else {
return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do

View File

@ -44,7 +44,7 @@ public class AlleleCount extends VariantStratifier {
if (eval != null) {
int AC = -1;
if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) {
AC = eval.getAttributeAsInt("AC");
AC = eval.getAttributeAsInt("AC", 0);
} else if ( eval.isVariant() ) {
for (Allele allele : eval.getAlternateAlleles())
AC = Math.max(AC, eval.getChromosomeCount(allele));

View File

@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier {
if (eval != null) {
try {
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3))));
relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3))));
} catch (Exception e) {
return relevantStates;
}

View File

@ -90,8 +90,8 @@ public class Degeneracy extends VariantStratifier {
Integer frame = null;
if (eval.hasAttribute("refseq.functionalClass")) {
aa = eval.getAttributeAsString("refseq.variantAA");
frame = eval.getAttributeAsInt("refseq.frame");
aa = eval.getAttributeAsString("refseq.variantAA", null);
frame = eval.getAttributeAsInt("refseq.frame", 0);
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
int annotationId = 1;
String key;
@ -99,7 +99,7 @@ public class Degeneracy extends VariantStratifier {
do {
key = String.format("refseq.functionalClass_%d", annotationId);
String newtype = eval.getAttributeAsString(key);
String newtype = eval.getAttributeAsString(key, null);
if ( newtype != null &&
( type == null ||
@ -109,13 +109,13 @@ public class Degeneracy extends VariantStratifier {
type = newtype;
String aakey = String.format("refseq.variantAA_%d", annotationId);
aa = eval.getAttributeAsString(aakey);
aa = eval.getAttributeAsString(aakey, null);
if (aa != null) {
String framekey = String.format("refseq.frame_%d", annotationId);
if (eval.hasAttribute(framekey)) {
frame = eval.getAttributeAsInt(framekey);
frame = eval.getAttributeAsInt(framekey, 0);
}
}
}

View File

@ -28,7 +28,7 @@ public class FunctionalClass extends VariantStratifier {
}
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
public List<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
ArrayList<String> relevantStates = new ArrayList<String>();
relevantStates.add("all");
@ -38,7 +38,7 @@ public class FunctionalClass extends VariantStratifier {
if (eval.hasAttribute("refseq.functionalClass")) {
try {
type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass"));
type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass", null));
} catch ( Exception e ) {} // don't error out if the type isn't supported
} else if (eval.hasAttribute("refseq.functionalClass_1")) {
int annotationId = 1;
@ -47,7 +47,7 @@ public class FunctionalClass extends VariantStratifier {
do {
key = String.format("refseq.functionalClass_%d", annotationId);
String newtypeStr = eval.getAttributeAsString(key);
String newtypeStr = eval.getAttributeAsString(key, null);
if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) {
try {
FunctionalType newType = FunctionalType.valueOf(newtypeStr);

View File

@ -115,7 +115,7 @@ public class VQSRCalibrationCurve {
if ( vc.isFiltered() )
return 0.0;
else if ( vc.hasAttribute(VQSRQualKey) ) {
double qual = vc.getAttributeAsDouble(VQSRQualKey);
double qual = vc.getAttributeAsDouble(VQSRQualKey, 0.0);
return probTrueVariant(qual);
} else {
throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc);
@ -143,7 +143,7 @@ public class VQSRCalibrationCurve {
for ( int i = 0; i < log10Likelihoods.length; i++) {
double p = Math.pow(10, log10Likelihoods[i]);
double q = alpha * p + (1-alpha) * noInfoPr;
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q);
if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey, 0.0), p, alpha, q);
updated[i] = Math.log10(q);
}

View File

@ -574,7 +574,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
// ok we have a comp VC and we need to match the AF spectrum of inputAFRodName.
// We then pick a variant with probability AF*desiredFraction
if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) {
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY);
String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY, null);
double af;
double afBoost = 1.0;

View File

@ -192,7 +192,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
if ( getters.containsKey(field) ) {
val = getters.get(field).get(vc);
} else if ( vc.hasAttribute(field) ) {
val = vc.getAttributeAsString(field);
val = vc.getAttributeAsString(field, null);
} else if ( isWildCard(field) ) {
Set<String> wildVals = new HashSet<String>();
for ( Map.Entry<String,Object> elt : vc.getAttributes().entrySet()) {
@ -309,6 +309,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } });
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });

View File

@ -306,7 +306,7 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Serializable, HasGenome
@Override
public int hashCode() {
return (int)( start << 16 + stop << 4 + contigIndex );
return start << 16 | stop << 4 | contigIndex;
}

View File

@ -9,14 +9,17 @@ import net.sf.samtools.SAMUtils;
* @author Kiran Garimella
*/
public class QualityUtils {
public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE;
public final static double MIN_REASONABLE_ERROR = 0.0001;
public final static byte MAX_REASONABLE_Q_SCORE = 40;
public final static byte MIN_USABLE_Q_SCORE = 6;
public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
private static double qualToErrorProbCache[] = new double[256];
static {
for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw((byte)i);
}
/**
* Private constructor. No instantiating this class!
*/
@ -33,10 +36,6 @@ public class QualityUtils {
return 1.0 - qualToErrorProb(qual);
}
static public double qualToProb(int qual) {
return qualToProb( (double)qual );
}
static public double qualToProb(double qual) {
return 1.0 - Math.pow(10.0, qual/(-10.0));
}
@ -48,10 +47,14 @@ public class QualityUtils {
* @param qual a quality score (0-40)
* @return a probability (0.0-1.0)
*/
static public double qualToErrorProb(byte qual) {
static public double qualToErrorProbRaw(byte qual) {
return Math.pow(10.0, ((double) qual)/-10.0);
}
static public double qualToErrorProb(byte qual) {
return qualToErrorProbCache[qual];
}
/**
* Convert a probability to a quality score. Note, this is capped at Q40.
*
@ -110,88 +113,4 @@ public class QualityUtils {
//return (byte) Math.min(qual, maxQual);
return (byte) Math.max(Math.min(qual, maxQual), 1);
}
/**
* Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field.
* Note: the highest probability this function can encode is 64%, so this function should never be used on the best base hypothesis.
* Another note: the probability encoded here gets rounded to the nearest 1%.
*
* @param baseIndex the base index
* @param prob the base probability
* @return a byte containing the index and the probability
*/
static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) {
byte compressedQual = 0;
compressedQual = (byte) baseIndex;
byte cprob = (byte) (100.0*prob);
byte qualmask = (byte) 252;
compressedQual += ((cprob << 2) & qualmask);
return compressedQual;
}
/**
* From a compressed base, extract the base index (0:A, 1:C, 2:G, 3:T)
*
* @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
* @return base index
*/
static public int compressedQualityToBaseIndex(byte compressedQual) {
return (int) (compressedQual & 0x3);
}
/**
* From a compressed base, extract the base probability
*
* @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
* @return the probability
*/
static public double compressedQualityToProb(byte compressedQual) {
// Because java natives are signed, extra care must be taken to avoid
// shifting a 1 into the sign bit in the implicit promotion of 2 to an int.
int x2 = ((int) compressedQual) & 0xff;
x2 = (x2 >>> 2);
return ((double) x2)/100.0;
}
/**
* Return the complement of a compressed quality
*
* @param compressedQual the compressed quality score (as returned by baseAndProbToCompressedQuality)
* @return the complementary compressed quality
*/
static public byte complementCompressedQuality(byte compressedQual) {
int baseIndex = compressedQualityToBaseIndex(compressedQual);
double prob = compressedQualityToProb(compressedQual);
return baseAndProbToCompressedQuality(BaseUtils.complementIndex(baseIndex), prob);
}
/**
* Return the reverse complement of a byte array of compressed qualities
*
* @param compressedQuals a byte array of compressed quality scores
* @return the reverse complement of the byte array
*/
static public byte[] reverseComplementCompressedQualityArray(byte[] compressedQuals) {
byte[] rcCompressedQuals = new byte[compressedQuals.length];
for (int pos = 0; pos < compressedQuals.length; pos++) {
rcCompressedQuals[compressedQuals.length - pos - 1] = complementCompressedQuality(compressedQuals[pos]);
}
return rcCompressedQuals;
}
/**
* Return the reverse of a byte array of qualities (compressed or otherwise)
* @param quals the array of bytes to be reversed
* @return the reverse of the quality array
*/
static public byte[] reverseQualityArray( byte[] quals ) {
return Utils.reverse(quals); // no sense in duplicating functionality
}
}

View File

@ -190,11 +190,21 @@ public class SampleUtils {
}
public static List<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
/**
* Returns a new set of samples, containing a final list of samples expanded from sampleArgs
*
* Each element E of sampleArgs can either be a literal sample name or a file. For each E,
* we try to read a file named E from disk, and if possible all lines from that file are expanded
* into unique sample names.
*
* @param sampleArgs a collection of literal sample names and/or names of files listing sample names
* @return the expanded set of unique sample names
*/
public static Set<String> getSamplesFromCommandLineInput(Collection<String> sampleArgs) {
if (sampleArgs != null) {
// Let's first go through the list and see if we were given any files. We'll add every entry in the file to our
// sample list set, and treat the entries as if they had been specified on the command line.
List<String> samplesFromFiles = new ArrayList<String>();
Set<String> samplesFromFiles = new HashSet<String>();
for (String SAMPLE_EXPRESSION : sampleArgs) {
File sampleFile = new File(SAMPLE_EXPRESSION);
@ -203,7 +213,7 @@ public class SampleUtils {
List<String> lines = reader.readLines();
for (String line : lines) {
samplesFromFiles.add(line);
samplesFromFiles.add(line.trim());
}
} catch (FileNotFoundException e) {
samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
@ -212,7 +222,8 @@ public class SampleUtils {
return samplesFromFiles;
}
return new ArrayList<String>();
return new HashSet<String>();
}
public static Set<String> getSamplesFromCommandLineInput(Collection<String> vcfSamples, Collection<String> sampleExpressions) {

View File

@ -12,19 +12,35 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.ArrayList;
/**
* TODO FOR CHRIS HARTL
* Allows for reading in RefSeq information
*
* <p>
* Codec Description
Parses a sorted UCSC RefSeq file (see below) into relevant features: the gene name, the unique gene name (if multiple transcripts get separate entries), exons, gene start/stop, coding start/stop,
* strandedness of transcription.
* </p>
*
* <p>
* See also: link to file specification
* Instructions for generating a RefSeq file for use with the RefSeq codec can be found on the Wiki here
* <a href="http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq">http://www.broadinstitute.org/gsa/wiki/index.php/RefSeq</a>
* </p>
* <h2> Usage </h2>
* The RefSeq Rod can be bound as any other rod, and is specified by REFSEQ, for example
* <pre>
* -refSeqBinding:REFSEQ /path/to/refSeq.txt
* </pre>
*
* You will need to consult individual walkers for the binding name ("refSeqBinding", above)
*
* <h2>File format example</h2>
* If you want to define your own file for use, the format is (tab delimited):
* bin, name, chrom, strand, transcription start, transcription end, coding start, coding end, num exons, exon starts, exon ends, id, alt. name, coding start status (complete/incomplete), coding end status (complete,incomplete)
* and exon frames, for example:
* <pre>
* 76 NM_001011874 1 - 3204562 3661579 3206102 3661429 3 3204562,3411782,3660632, 3207049,3411982,3661579, 0 Xkr4 cmpl cmpl 1,2,0,
* </pre>
* for more information see <a href="http://skip.ucsc.edu/cgi-bin/hgTables?hgsid=5651&hgta_doSchemaDb=mm8&hgta_doSchemaTable=refGene">here</a>
* <p>
* A BAM file containing <b>exactly one sample</b>.
*
* </p>
*
* @author Mark DePristo

View File

@ -227,7 +227,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
throw new UserException.MalformedVCF(message, lineNo);
}
private static void generateException(String message, int lineNo) {
protected static void generateException(String message, int lineNo) {
throw new UserException.MalformedVCF(message, lineNo);
}

View File

@ -0,0 +1,122 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.codecs.vcf;
import org.broad.tribble.Tribble;
import org.broad.tribble.TribbleException;
import org.broad.tribble.index.DynamicIndexCreator;
import org.broad.tribble.index.Index;
import org.broad.tribble.index.IndexFactory;
import org.broad.tribble.util.LittleEndianOutputStream;
import org.broad.tribble.util.PositionalStream;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
/**
* this class writes VCF files
*/
public abstract class IndexingVCFWriter implements VCFWriter {
final private String name;
private File indexFile = null;
private OutputStream outputStream;
private PositionalStream positionalStream = null;
private DynamicIndexCreator indexer = null;
private LittleEndianOutputStream idxStream = null;
protected IndexingVCFWriter(String name, File location, OutputStream output, boolean enableOnTheFlyIndexing) {
outputStream = output;
this.name = name;
if ( enableOnTheFlyIndexing ) {
indexFile = Tribble.indexFile(location);
try {
idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
//System.out.println("Creating index on the fly for " + location);
indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
indexer.initialize(location, indexer.defaultBinSize());
positionalStream = new PositionalStream(output);
outputStream = positionalStream;
} catch ( IOException ex ) {
// No matter what we keep going, since we don't care if we can't create the index file
idxStream = null;
indexer = null;
positionalStream = null;
indexFile = null;
}
}
}
public OutputStream getOutputStream() {
return outputStream;
}
public String getStreamName() {
return name;
}
public abstract void writeHeader(VCFHeader header);
/**
* attempt to close the VCF file
*/
public void close() {
// try to close the index stream (keep it separate to help debugging efforts)
if ( indexer != null ) {
try {
Index index = indexer.finalizeIndex(positionalStream.getPosition());
index.write(idxStream);
idxStream.close();
} catch (IOException e) {
throw new ReviewedStingException("Unable to close index for " + getStreamName(), e);
}
}
}
/**
* add a record to the file
*
* @param vc the Variant Context object
*/
public void add(VariantContext vc) {
// if we are doing on the fly indexing, add the record ***before*** we write any bytes
if ( indexer != null )
indexer.addFeature(vc, positionalStream.getPosition());
}
protected static final String writerName(File location, OutputStream stream) {
return location == null ? stream.toString() : location.getAbsolutePath();
}
/**
 * Opens a FileOutputStream for the given VCF location, converting the checked
 * FileNotFoundException into a ReviewedStingException.
 */
protected static OutputStream openOutputStream(File location) {
    final OutputStream out;
    try {
        out = new FileOutputStream(location);
    } catch (FileNotFoundException e) {
        throw new ReviewedStingException("Unable to create VCF file at location: " + location, e);
    }
    return out;
}
}

View File

@ -44,26 +44,19 @@ import java.util.*;
/**
* this class writes VCF files
*/
public class StandardVCFWriter implements VCFWriter {
public class StandardVCFWriter extends IndexingVCFWriter {
// the print stream we're writing to
final protected BufferedWriter mWriter;
// should we write genotypes or just sites?
final protected boolean doNotWriteGenotypes;
// the VCF header we're storing
protected VCFHeader mHeader = null;
// the print stream we're writing to
protected BufferedWriter mWriter;
protected PositionalStream positionalStream = null;
// were filters applied?
protected boolean filtersWereAppliedToContext = false;
// should we write genotypes or just sites?
protected boolean doNotWriteGenotypes = false;
protected DynamicIndexCreator indexer = null;
protected File indexFile = null;
LittleEndianOutputStream idxStream = null;
File location = null;
/**
* create a VCF writer, given a file to write to
*
@ -93,32 +86,22 @@ public class StandardVCFWriter implements VCFWriter {
* @param doNotWriteGenotypes do not write genotypes
*/
public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) {
mWriter = new BufferedWriter(new OutputStreamWriter(output));
this.doNotWriteGenotypes = doNotWriteGenotypes;
this(null, output, false, doNotWriteGenotypes);
}
public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
this.location = location;
if ( enableOnTheFlyIndexing ) {
indexFile = Tribble.indexFile(location);
try {
idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
//System.out.println("Creating index on the fly for " + location);
indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
indexer.initialize(location, indexer.defaultBinSize());
positionalStream = new PositionalStream(output);
output = positionalStream;
} catch ( IOException ex ) {
// No matter what we keep going, since we don't care if we can't create the index file
}
}
//mWriter = new BufferedWriter(new OutputStreamWriter(new PositionalStream(output)));
mWriter = new BufferedWriter(new OutputStreamWriter(output));
super(writerName(location, output), location, output, enableOnTheFlyIndexing);
mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
this.doNotWriteGenotypes = doNotWriteGenotypes;
}
// --------------------------------------------------------------------------------
//
// VCFWriter interface functions
//
// --------------------------------------------------------------------------------
@Override
public void writeHeader(VCFHeader header) {
mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header;
@ -158,44 +141,24 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.flush(); // necessary so that writing to an output stream will work
}
catch (IOException e) {
throw new TribbleException("IOException writing the VCF header to " + locationString(), e);
throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e);
}
}
private String locationString() {
return location == null ? mWriter.toString() : location.getAbsolutePath();
}
/**
* attempt to close the VCF file
*/
@Override
public void close() {
// try to close the vcf stream
try {
mWriter.flush();
mWriter.close();
} catch (IOException e) {
throw new TribbleException("Unable to close " + locationString() + " because of " + e.getMessage());
throw new ReviewedStingException("Unable to close " + getStreamName(), e);
}
// try to close the index stream (keep it separate to help debugging efforts)
if ( indexer != null ) {
try {
Index index = indexer.finalizeIndex(positionalStream.getPosition());
index.write(idxStream);
idxStream.close();
} catch (IOException e) {
throw new TribbleException("Unable to close index for " + locationString() + " because of " + e.getMessage());
}
}
}
protected static OutputStream openOutputStream(File location) {
try {
return new FileOutputStream(location);
} catch (FileNotFoundException e) {
throw new TribbleException("Unable to create VCF file at location: " + location);
}
super.close();
}
/**
@ -203,28 +166,17 @@ public class StandardVCFWriter implements VCFWriter {
*
* @param vc the Variant Context object
*/
@Override
public void add(VariantContext vc) {
add(vc, false);
}
/**
* add a record to the file
*
* @param vc the Variant Context object
* @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD)
*/
public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) {
if ( mHeader == null )
throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString());
throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName());
if ( doNotWriteGenotypes )
vc = VariantContext.modifyGenotypes(vc, null);
try {
vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles);
// if we are doing on the fly indexing, add the record ***before*** we write any bytes
if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition());
vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false);
super.add(vc);
Map<Allele, String> alleleMap = new HashMap<Allele, String>(vc.getAlleles().size());
alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
@ -275,7 +227,7 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.write(VCFConstants.FIELD_SEPARATOR);
// FILTER
String filters = vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (filtersWereAppliedToContext || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
String filters = getFilterString(vc, filtersWereAppliedToContext);
mWriter.write(filters);
mWriter.write(VCFConstants.FIELD_SEPARATOR);
@ -317,9 +269,22 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.write("\n");
mWriter.flush(); // necessary so that writing to an output stream will work
} catch (IOException e) {
throw new RuntimeException("Unable to write the VCF object to " + locationString());
throw new RuntimeException("Unable to write the VCF object to " + getStreamName());
}
}
// --------------------------------------------------------------------------------
//
// implementation functions
//
// --------------------------------------------------------------------------------
public static final String getFilterString(final VariantContext vc) {
return getFilterString(vc, false);
}
public static final String getFilterString(final VariantContext vc, boolean forcePASS) {
return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
}
private String getQualValue(double qual) {
@ -462,7 +427,7 @@ public class StandardVCFWriter implements VCFWriter {
mWriter.write(encoding);
}
private static String formatVCFField(Object val) {
public static String formatVCFField(Object val) {
String result;
if ( val == null )
result = VCFConstants.MISSING_VALUE_v4;
@ -524,12 +489,11 @@ public class StandardVCFWriter implements VCFWriter {
}
public static int countOccurrences(char c, String s) {
private static int countOccurrences(char c, String s) {
int count = 0;
for (int i = 0; i < s.length(); i++) {
count += s.charAt(i) == c ? 1 : 0;
}
return count;
}
}

View File

@ -105,34 +105,37 @@ public class VCFCodec extends AbstractVCFCodec {
* @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
*/
protected Set<String> parseFilters(String filterString) {
return parseFilters(filterHash, lineNo, filterString);
}
public static Set<String> parseFilters(final Map<String, LinkedHashSet<String>> cache, final int lineNo, final String filterString) {
// null for unfiltered
if ( filterString.equals(VCFConstants.UNFILTERED) )
return null;
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
return fFields;
return Collections.emptySet();
if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4");
generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
if ( filterString.length() == 0 )
generateException("The VCF specification requires a valid filter status");
generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
// do we have the filter string cached?
if ( filterHash.containsKey(filterString) )
return filterHash.get(filterString);
if ( cache != null && cache.containsKey(filterString) )
return Collections.unmodifiableSet(cache.get(filterString));
// empty set for passes filters
LinkedHashSet<String> fFields = new LinkedHashSet<String>();
// otherwise we have to parse and cache the value
if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
fFields.add(filterString);
else
fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
filterHash.put(filterString, fFields);
fFields = fFields;
if ( cache != null ) cache.put(filterString, fFields);
return fFields;
return Collections.unmodifiableSet(fFields);
}

View File

@ -0,0 +1,256 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
import java.util.*;
/**
 * A single GCF (GATK binary VCF) record: the site-level fields of a VariantContext
 * plus, optionally, its genotypes in a compact binary encoding.
 */
public class GCF {
    // sentinel trailing every record; lets the reader detect stream corruption
    private final static int RECORD_TERMINATOR = 123456789;

    private int chromOffset;            // offset of the contig name in the header string table
    private int start, stop;            // genomic start/stop of the record
    private String id;                  // the VCF ID field
    private List<Allele> alleleMap;     // alleles in order: reference first, then alternates
    private int alleleOffsets[];        // offsets into the header allele table, parallel to alleleMap
    private float qual;                 // the VariantContext's -log10(P(error))
    private byte refPad;                // reference padding base for indels; 0 when absent
    private String info;                // pre-rendered INFO string with keys replaced by string-table offsets
    private int filterOffset;           // offset of the rendered FILTER string in the header string table
    private List<GCFGenotype> genotypes = Collections.emptyList();

    /**
     * Encodes a VariantContext as a GCF record, registering every string, allele and
     * sample it references with the supplied header builder.
     *
     * @param GCFHeaderBuilder collects the shared dictionaries and assigns offsets
     * @param vc               the variant to encode
     * @param skipGenotypes    when true, genotype data is omitted entirely
     */
    public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boolean skipGenotypes) {
        chromOffset = GCFHeaderBuilder.encodeString(vc.getChr());
        start = vc.getStart();
        stop = vc.getEnd();
        refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0;
        id = vc.getID();

        // encode alleles: reference allele first, then each alternate in order
        alleleMap = new ArrayList<Allele>(vc.getNAlleles());
        alleleOffsets = new int[vc.getNAlleles()];
        alleleMap.add(vc.getReference());
        alleleOffsets[0] = GCFHeaderBuilder.encodeAllele(vc.getReference());
        for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
            alleleMap.add(vc.getAlternateAllele(i));
            alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i));
        }

        qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual());
        info = infoFieldString(vc, GCFHeaderBuilder);
        filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc));

        if ( ! skipGenotypes ) {
            genotypes = encodeGenotypes(GCFHeaderBuilder, vc);
        }
    }

    /**
     * Reads the next record from the stream.  Field order must mirror write().
     *
     * @throws EOFException when the footer marker is found instead of a record
     * @throws IOException  on any underlying read failure
     */
    public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException {
        chromOffset = inputStream.readInt();

        // have we reached the footer?
        if ( chromOffset == GCFHeader.FOOTER_START_MARKER )
            throw new EOFException();

        start = inputStream.readInt();
        stop = inputStream.readInt();
        id = inputStream.readUTF();
        refPad = inputStream.readByte();
        alleleOffsets = readIntArray(inputStream);
        qual = inputStream.readFloat();
        info = inputStream.readUTF();
        filterOffset = inputStream.readInt();

        int nGenotypes = inputStream.readInt();
        int sizeOfGenotypes = inputStream.readInt();
        if ( skipGenotypes ) {
            genotypes = Collections.emptyList();
            // the genotype payload size is written up front precisely so it can be skipped wholesale
            inputStream.skipBytes(sizeOfGenotypes);
        } else {
            genotypes = new ArrayList<GCFGenotype>(nGenotypes);
            for ( int i = 0; i < nGenotypes; i++ )
                genotypes.add(new GCFGenotype(this, inputStream));
        }

        int recordDone = inputStream.readInt();
        if ( recordDone != RECORD_TERMINATOR )
            throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key");
    }

    /**
     * Serializes this record to the stream.
     *
     * @return the number of bytes written
     */
    public int write(DataOutputStream outputStream) throws IOException {
        int startSize = outputStream.size();
        outputStream.writeInt(chromOffset);
        outputStream.writeInt(start);
        outputStream.writeInt(stop);
        outputStream.writeUTF(id);
        outputStream.writeByte(refPad);
        writeIntArray(alleleOffsets, outputStream, true);
        outputStream.writeFloat(qual);
        outputStream.writeUTF(info);
        outputStream.writeInt(filterOffset);

        int nGenotypes = genotypes.size();
        // genotype blocks are fixed-width per site, so total size = one block's size * count
        int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes;
        outputStream.writeInt(nGenotypes);
        outputStream.writeInt(expectedSizeOfGenotypes);

        int obsSizeOfGenotypes = 0;
        for ( GCFGenotype g : genotypes )
            obsSizeOfGenotypes += g.write(outputStream);

        // sanity check: readers rely on the recorded size to skip the genotype payload
        if ( obsSizeOfGenotypes != expectedSizeOfGenotypes )
            throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes);

        outputStream.writeInt(RECORD_TERMINATOR);
        return outputStream.size() - startSize;
    }

    /**
     * Reconstitutes a VariantContext, resolving all offsets through the file header.
     * NOTE(review): the INFO field is attached as one unparsed string under the key
     * "INFO" rather than being split back into individual attributes -- confirm that
     * downstream consumers expect this.
     */
    public VariantContext decode(final String source, final GCFHeader header) {
        final String contig = header.getString(chromOffset);
        alleleMap = header.getAlleles(alleleOffsets);
        double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual);
        Set<String> filters = header.getFilters(filterOffset);
        Map<String, Object> attributes = new HashMap<String, Object>();
        attributes.put("INFO", info);
        Byte refPadByte = refPad == 0 ? null : refPad;
        Map<String, Genotype> genotypes = decodeGenotypes(header);
        return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte);
    }

    // decodes the genotype blocks into a sample-name -> Genotype map; genotype i is
    // assumed to belong to header sample i (the ordering encodeGenotypes establishes)
    private Map<String, Genotype> decodeGenotypes(final GCFHeader header) {
        if ( genotypes.isEmpty() )
            return VariantContext.NO_GENOTYPES;
        else {
            Map<String, Genotype> map = new TreeMap<String, Genotype>();
            for ( int i = 0; i < genotypes.size(); i++ ) {
                final String sampleName = header.getSample(i);
                final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap);
                map.put(sampleName, g);
            }
            return map;
        }
    }

    // positions each genotype at its sample's dictionary offset, so the reader can
    // recover sample names from genotype order alone
    private List<GCFGenotype> encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc) {
        int nGenotypes = vc.getNSamples();
        if ( nGenotypes > 0 ) {
            List<GCFGenotype> genotypes = new ArrayList<GCFGenotype>(nGenotypes);
            for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null);

            for ( Genotype g : vc.getGenotypes().values() ) {
                int i = GCFHeaderBuilder.encodeSample(g.getSampleName());
                genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g));
            }

            return genotypes;
        } else {
            return Collections.emptyList();
        }
    }

    public int getNAlleles() { return alleleOffsets.length; }

    // renders the INFO column as "offset=value;..." with each key replaced by its
    // string-table offset; VariantContext's internal bookkeeping keys are dropped
    private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) {
        StringBuilder s = new StringBuilder();

        boolean first = true;
        for ( Map.Entry<String, Object> field : vc.getAttributes().entrySet() ) {
            String key = field.getKey();
            if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) )
                continue;

            int stringIndex = GCFHeaderBuilder.encodeString(key);
            String outputValue = StandardVCFWriter.formatVCFField(field.getValue());
            if ( outputValue != null ) {
                if ( ! first ) s.append(";");
                s.append(stringIndex).append("=").append(outputValue);
                first = false;
            }
        }

        return s.toString();
    }

    protected final static int BUFFER_SIZE = 1048576; // 2**20

    // wraps a raw stream with GCF-sized buffering
    public static DataInputStream createDataInputStream(final InputStream stream) {
        return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE));
    }

    public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException {
        return new FileInputStream(file);
    }

    // reads a size-prefixed int array
    protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException {
        return readIntArray(inputStream, inputStream.readInt());
    }

    // reads exactly size ints
    protected final static int[] readIntArray(final DataInputStream inputStream, int size) throws IOException {
        int[] array = new int[size];
        for ( int i = 0; i < array.length; i++ )
            array[i] = inputStream.readInt();
        return array;
    }

    // writes an int array, optionally length-prefixed
    protected final static void writeIntArray(int[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
        if ( writeSize ) outputStream.writeInt(array.length);
        for ( int i : array )
            outputStream.writeInt(i);
    }

    // reads a size-prefixed byte array
    protected final static byte[] readByteArray(final DataInputStream inputStream) throws IOException {
        return readByteArray(inputStream, inputStream.readInt());
    }

    // reads exactly size bytes
    protected final static byte[] readByteArray(final DataInputStream inputStream, int size) throws IOException {
        byte[] array = new byte[size];
        for ( int i = 0; i < array.length; i++ )
            array[i] = inputStream.readByte();
        return array;
    }

    // writes a byte array, optionally length-prefixed
    protected final static void writeByteArray(byte[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
        if ( writeSize ) outputStream.writeInt(array.length);
        for ( byte i : array )
            outputStream.writeByte(i);
    }

    // caps a phred-scaled quality at 255 and narrows it to a single byte
    protected final static byte qualToByte(double phredScaledQual) {
        return (byte)Math.round(Math.min(phredScaledQual, 255));
    }
}

View File

@ -0,0 +1,147 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.*;
/**
 * A single genotype block within a GCF record: GQ, packed GT, DP, per-allele
 * depths (AD) and genotype likelihoods (PL) in a fixed-width binary encoding.
 */
public class GCFGenotype {
    private byte gq;        // genotype quality, capped at 255; stored as one unsigned byte on disk
    private int gt;         // packed genotype: up to 4 allele indices, one per byte (see encodeAlleles)
    private int dp;         // read depth (the DP attribute)
    private int ad[];       // per-allele depths; length equals the site's allele count
    private byte[] pl;      // likelihoods; length is nAlleles*(nAlleles+1)/2

    // todo -- what to do about phasing? Perhaps we shouldn't support it
    // todo -- is the FL field generic or just a flag? Should we even support per sample filtering?

    /**
     * Encodes a Genotype for writing.  NOTE(review): ad and pl are allocated at the
     * correct sizes but left zero-filled -- the source genotype's AD/PL values are not
     * copied in; confirm whether that is intentional.
     */
    public GCFGenotype(final GCFHeaderBuilder GCFHeaderBuilder, final List<Allele> allAlleles, Genotype genotype) {
        gq = GCF.qualToByte(genotype.getPhredScaledQual());
        gt = encodeAlleles(genotype.getAlleles(), allAlleles);
        dp = genotype.getAttributeAsInt("DP", 0);
        int nAlleles = allAlleles.size();
        ad = new int[nAlleles];
        int npls = nAllelesToNPls(nAlleles);
        pl = new byte[npls];
    }

    // number of diploid genotype likelihoods for a site with nAlleles alleles
    private int nAllelesToNPls( int nAlleles ) {
        return nAlleles*(nAlleles+1) / 2;
    }

    /**
     * Reads one genotype block.  Field widths are derived from the enclosing record's
     * allele count, so no per-genotype size information is stored.
     */
    public GCFGenotype(GCF GCF, DataInputStream inputStream) throws IOException {
        int gqInt = inputStream.readUnsignedByte();
        gq = (byte)gqInt; // NOTE(review): values > 127 wrap negative when narrowed to byte -- confirm downstream handling
        gt = inputStream.readInt();
        dp = inputStream.readInt();
        ad = GCF.readIntArray(inputStream, GCF.getNAlleles());
        pl = GCF.readByteArray(inputStream, nAllelesToNPls(GCF.getNAlleles()));
    }

    // 2 alleles => 1 + 8 + 8 + 3 => 20
    protected int sizeInBytes() {
        return 1 // gq
                + 4 * 2 // gt + dp
                + 4 * ad.length // ad
                + 1 * pl.length; // pl
    }

    /**
     * Reconstitutes a Genotype; DP, AD and PL become genotype attributes and the
     * per-sample filter set is always empty.
     */
    public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List<Allele> alleleIndex) {
        final List<Allele> alleles = decodeAlleles(gt, alleleIndex);
        final double negLog10PError = gq / 10.0; // GQ is phred-scaled, so /10 recovers -log10(P(error))
        final Set<String> filters = Collections.emptySet();
        final Map<String, Object> attributes = new HashMap<String, Object>();
        attributes.put("DP", dp);
        attributes.put("AD", ad);
        attributes.put("PL", pl);
        return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false);
    }

    /**
     * Packs up to 4 allele indices into one int, one per byte, low byte first.
     * Each byte stores (alleleIndex + 1) so that a 0 byte means "no more alleles";
     * no-calls are stored via the sentinel index 254 (byte value 255).
     */
    private static int encodeAlleles(List<Allele> gtList, List<Allele> allAlleles) {
        final int nAlleles = gtList.size();

        if ( nAlleles > 4 )
            throw new IllegalArgumentException("encodeAlleles doesn't support more than 4 alt alleles, but I saw " + gtList);

        int gtInt = 0;
        for ( int i = 0; i < nAlleles ; i++ ) {
            final int bitOffset = i * 8;
            final int allelei = getAlleleIndex(gtList.get(i), allAlleles);
            final int gti = (allelei + 1) << bitOffset;
            gtInt = gtInt | gti;
        }

        return gtInt;
    }

    // index of q within allAlleles; no-calls map to the sentinel 254
    private static int getAlleleIndex(Allele q, List<Allele> allAlleles) {
        if ( q.isNoCall() )
            return 254;

        for ( int i = 0; i < allAlleles.size(); i++ )
            if ( q.equals(allAlleles.get(i)) )
                return i;

        throw new IllegalStateException("getAlleleIndex passed allele not in map! allele " + q + " allAlleles " + allAlleles);
    }

    // inverse of encodeAlleles: unpack one byte per allele until a 0 byte terminates
    private static List<Allele> decodeAlleles(int gtInt, List<Allele> alleleIndex) {
        List<Allele> alleles = new ArrayList<Allele>(4);

        for ( int i = 0; i < 32; i += 8 ) {
            final int gi = (gtInt & (0x000000FF << i)) >> i;
            if ( gi != 0 ) {
                final int allelei = gi - 1;
                alleles.add( allelei == 254 ? Allele.NO_CALL : alleleIndex.get(allelei) );
            } else {
                break;
            }
        }

        return alleles;
    }

    /**
     * Writes this genotype block; the byte count returned must equal sizeInBytes()
     * (checked by GCF.write()).
     */
    public int write(DataOutputStream outputStream) throws IOException {
        int startSize = outputStream.size();
        outputStream.writeByte(gq);
        outputStream.writeInt(gt);
        outputStream.writeInt(dp);
        GCF.writeIntArray(ad, outputStream, false);
        GCF.writeByteArray(pl, outputStream, false);
        return outputStream.size() - startSize;
    }
}

View File

@ -0,0 +1,205 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.io.*;
import java.util.*;
/**
 * Header and footer bookkeeping for a GCF file: the format version, the footer
 * position, and the shared allele/string/sample dictionaries that records
 * reference by integer offset.
 */
public class GCFHeader {
    final protected static Logger logger = Logger.getLogger(GCFHeader.class);

    public final static int GCF_VERSION = 1;
    // magic bytes at the very start of every GCF file
    public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes();
    // int sentinel written where a record's chromOffset would be, marking the footer
    public final static int FOOTER_START_MARKER = -1;
    // byte offset of the footer-position long: just past the marker and the int version
    public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version

    final int version;
    long footerPosition;
    final List<Allele> alleles;      // allele dictionary, indexed by offset
    final List<String> strings;      // string dictionary (contigs, filters, INFO keys), indexed by offset
    final List<String> samples;      // sample names, indexed by offset
    final List<Set<String>> filters; // lazy cache of parsed filter sets, parallel to strings

    /**
     * Creates a header for writing from the dictionaries accumulated by a
     * GCFHeaderBuilder.  The filter cache is unused on the write path.
     */
    public GCFHeader(final Map<Allele, Integer> allelesIn, final Map<String, Integer> stringIn, final Map<String, Integer> samplesIn) {
        version = GCF_VERSION;
        footerPosition = 0;
        this.alleles = linearize(allelesIn);
        this.strings = linearize(stringIn);
        this.samples = linearize(samplesIn);
        this.filters = null; // not used with this constructor
    }

    /**
     * Reads the header and the footer dictionaries of an on-disk GCF file, leaving the
     * stream positioned at the first record.  The stream is intentionally left open for
     * the caller to continue reading records.
     *
     * @throws IOException  on any read failure, including a truncated file
     * @throws UserException if the file does not start with GCF_FILE_START_MARKER or the
     *                       footer marker is missing
     */
    public GCFHeader(FileInputStream fileInputStream) throws IOException {
        DataInputStream inputStream = new DataInputStream(fileInputStream);
        byte[] headerTest = new byte[GCF_FILE_START_MARKER.length];
        // BUGFIX: use readFully() instead of read() -- read() may return fewer bytes than
        // requested (its return value was ignored), which could misreport a valid file
        // as corrupt; readFully() reads the whole marker or throws EOFException
        inputStream.readFully(headerTest);
        if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) {
            throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. Saw " + new String(headerTest));
        } else {
            version = inputStream.readInt();
            logger.info("Read GCF version " + version);
            footerPosition = inputStream.readLong();
            logger.info("Read footer position of " + footerPosition);
            long lastPos = fileInputStream.getChannel().position();
            logger.info(" Last position is " + lastPos);

            // seek to the footer; safe because DataInputStream adds no buffering here
            fileInputStream.getChannel().position(footerPosition);
            if ( inputStream.readInt() != FOOTER_START_MARKER )
                throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker");
            alleles = stringsToAlleles(readStrings(inputStream));
            strings = readStrings(inputStream);
            samples = readStrings(inputStream);
            logger.info(String.format("Allele map of %d elements", alleles.size()));
            logger.info(String.format("String map of %d elements", strings.size()));
            logger.info(String.format("Sample map of %d elements", samples.size()));
            filters = initializeFilterCache();

            // rewind to the first record so callers can stream the body
            fileInputStream.getChannel().position(lastPos);
        }
    }

    /**
     * Writes the fixed-size file prologue: magic marker, version, and a placeholder (0)
     * long for the footer position.  NOTE(review): presumably the writer later seeks to
     * HEADER_FORWARD_REFERENCE_OFFSET to patch in the real footer position -- confirm
     * against the writer code.
     *
     * @return the number of bytes written
     */
    public static int writeHeader(final DataOutputStream outputStream) throws IOException {
        int startBytes = outputStream.size();
        outputStream.write(GCF_FILE_START_MARKER);
        outputStream.writeInt(GCF_VERSION);
        outputStream.writeLong(0);
        return outputStream.size() - startBytes;
    }

    /**
     * Writes the footer: the footer marker followed by the allele, string and sample
     * dictionaries.
     *
     * @return the number of bytes written
     */
    public int writeFooter(final DataOutputStream outputStream) throws IOException {
        int startBytes = outputStream.size();
        outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding
        write(outputStream, allelesToStrings(alleles));
        write(outputStream, strings);
        write(outputStream, samples);
        return outputStream.size() - startBytes;
    }

    // writes a size-prefixed list of UTF strings (inverse of readStrings())
    private void write(DataOutputStream outputStream, List<String> l) throws IOException {
        outputStream.writeInt(l.size());
        for ( String elt : l ) outputStream.writeUTF(elt);
    }

    // renders alleles via Allele.toString(); stringsToAlleles() relies on the trailing
    // '*' of that rendering to mark the reference allele
    private List<String> allelesToStrings(List<Allele> alleles) {
        List<String> strings = new ArrayList<String>(alleles.size());
        for ( Allele allele : alleles ) strings.add(allele.toString());
        return strings;
    }

    // pre-fills the cache with nulls so getFilters() can use set(offset, value)
    private List<Set<String>> initializeFilterCache() {
        // required to allow offset -> set lookup
        List<Set<String>> l = new ArrayList<Set<String>>(strings.size());
        for ( int i = 0; i < strings.size(); i++ ) l.add(null);
        return l;
    }

    // inverse of allelesToStrings(): a trailing '*' marks the reference allele
    private static List<Allele> stringsToAlleles(final List<String> strings) {
        final List<Allele> alleles = new ArrayList<Allele>(strings.size());
        for ( String string : strings ) {
            boolean isRef = string.endsWith("*");
            if ( isRef ) string = string.substring(0, string.length() - 1);
            alleles.add(Allele.create(string, isRef));
        }
        return alleles;
    }

    // reads a size-prefixed list of UTF strings
    private static List<String> readStrings(final DataInputStream inputStream) throws IOException {
        final int nStrings = inputStream.readInt();
        final List<String> strings = new ArrayList<String>(nStrings);
        for ( int i = 0; i < nStrings; i++ ) {
            strings.add(inputStream.readUTF());
        }
        return strings;
    }

    // converts a value -> offset map into an offset-indexed list
    private static <T> List<T> linearize(final Map<T, Integer> map) {
        final ArrayList<T> l = new ArrayList<T>(map.size());
        for ( int i = 0; i < map.size(); i++ ) l.add(null);

        for ( final Map.Entry<T, Integer> elt : map.entrySet() )
            l.set(elt.getValue(), elt.getKey());

        return l;
    }

    public String getSample(final int offset) { return samples.get(offset); }
    public String getString(final int offset) { return strings.get(offset); }
    public Allele getAllele(final int offset) { return alleles.get(offset); }

    /** Resolves a record's allele offsets into Allele objects, preserving order. */
    public List<Allele> getAlleles(final int[] offsets) {
        final List<Allele> alleles = new ArrayList<Allele>(offsets.length);
        for ( int i : offsets ) alleles.add(getAllele(i));
        return alleles;
    }

    /**
     * Returns the parsed filter set for the filter string at the given offset, caching
     * parsed results.  UNFILTERED records are represented by a null return (and are
     * never cached).
     */
    public Set<String> getFilters(final int offset) {
        Set<String> cached = filters.get(offset);
        if ( cached != null )
            return cached;
        else {
            final String filterString = getString(offset);
            if ( filterString.equals(VCFConstants.UNFILTERED) )
                return null; // UNFILTERED records are represented by null
            else {
                Set<String> set = VCFCodec.parseFilters(null, -1, filterString);
                filters.set(offset, set); // remember the result
                return set;
            }
        }
    }
}

View File

@ -0,0 +1,80 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.HashMap;
import java.util.Map;
/**
* [Short one sentence description of this walker]
* <p/>
* <p>
* [Functionality of this walker]
* </p>
* <p/>
* <h2>Input</h2>
* <p>
* [Input description]
* </p>
* <p/>
* <h2>Output</h2>
* <p>
* [Output description]
* </p>
* <p/>
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T $WalkerName
* </pre>
*
* @author Your Name
* @since Date created
*/
/**
 * Incrementally builds the symbol tables (alleles, shared strings, and sample
 * names) needed to write a GCF header.
 *
 * Each {@code encode*} call interns its argument: the first time a value is
 * seen it is assigned the next sequential integer id (dense, first-seen
 * order); later calls with an equal value return the same id.  Once all
 * records have been encoded, {@link #createHeader()} turns the three tables
 * into a {@link GCFHeader}.
 */
public class GCFHeaderBuilder {
    // value -> integer id maps; ids are assigned densely in first-seen order
    Map<Allele, Integer> alleles = new HashMap<Allele, Integer>();
    Map<String, Integer> strings = new HashMap<String, Integer>();
    Map<String, Integer> samples = new HashMap<String, Integer>();

    /** Creates a GCFHeader from the tables accumulated so far. */
    public GCFHeader createHeader() {
        return new GCFHeader(alleles, strings, samples);
    }

    /** Interns {@code chr} in the string table and returns its integer id. */
    public int encodeString(final String chr) { return encode(strings, chr); }

    /** Interns {@code allele} in the allele table and returns its integer id. */
    public int encodeAllele(final Allele allele) { return encode(alleles, allele); }

    /** Interns {@code sampleName} in the sample table and returns its integer id. */
    public int encodeSample(final String sampleName) { return encode(samples, sampleName); }

    /**
     * Looks up {@code key} in {@code map}; if absent, assigns it the next id,
     * which equals the current map size so ids stay dense and sequential.
     */
    private <T> int encode(Map<T, Integer> map, T key) {
        Integer id = map.get(key);
        if ( id == null ) {
            id = map.size();
            map.put(key, id);
        }
        return id;
    }
}

View File

@ -0,0 +1,122 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.gcf;
import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.*;
/**
* GCFWriter implementing the VCFWriter interface
* @author Your Name
* @since Date created
*/
/**
 * A VCFWriter implementation that emits records in the binary GCF format.
 *
 * Records are streamed through a buffered DataOutputStream while a
 * {@link GCFHeaderBuilder} interns the alleles/strings/samples they
 * reference.  On {@link #close()} the accumulated symbol tables are written
 * as a footer, and the footer's byte offset is patched into the fixed
 * forward-reference slot at the front of the file so readers can seek
 * straight to it.
 */
public class GCFWriter extends IndexingVCFWriter {
    final boolean skipGenotypes;          // when true, genotype data is omitted from each record
    final FileOutputStream fileOutputStream;
    final DataOutputStream dataOutputStream;
    final GCFHeaderBuilder gcfHeaderBuilder;
    int nbytes = 0;                       // running count of bytes written; used as the footer's offset
    VCFHeader header = null;              // stored by writeHeader(); the VCF header lines themselves are not yet serialized (see close())
    File location;

    // --------------------------------------------------------------------------------
    //
    // Constructors
    //
    // --------------------------------------------------------------------------------

    /**
     * Creates a GCF writer targeting {@code location}.
     *
     * @param location               destination file
     * @param enableOnTheFlyIndexing passed through to IndexingVCFWriter
     * @param doNotWriteGenotypes    if true, genotypes are dropped from every record
     * @throws UserException.CouldNotCreateOutputFile if the file cannot be opened for writing
     */
    public GCFWriter(File location, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
        super(writerName(location, null), location, null, enableOnTheFlyIndexing);
        this.location = location;
        this.skipGenotypes = doNotWriteGenotypes;

        // open the output stream
        try {
            fileOutputStream = new FileOutputStream(location);
            dataOutputStream = createDataOutputStream(fileOutputStream);
            gcfHeaderBuilder = new GCFHeaderBuilder();
        } catch ( FileNotFoundException e ) {
            throw new UserException.CouldNotCreateOutputFile(location, e);
        }
    }

    // --------------------------------------------------------------------------------
    //
    // VCFWriter interface functions
    //
    // --------------------------------------------------------------------------------

    /**
     * Records the VCF header and writes the binary GCF preamble (which
     * presumably includes the forward-reference slot patched in close()).
     */
    @Override
    public void writeHeader(VCFHeader header) {
        this.header = header;
        try {
            nbytes += GCFHeader.writeHeader(dataOutputStream);
        } catch ( IOException e ) {
            throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e);
        }
    }

    /** Encodes {@code vc} against the shared symbol tables and appends it to the stream. */
    @Override
    public void add(VariantContext vc) {
        super.add(vc); // let the superclass maintain the on-the-fly index
        GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes);
        try {
            nbytes += gcf.write(dataOutputStream);
        } catch ( IOException e ) {
            throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e);
        }
    }

    /**
     * Writes the footer (the interned symbol tables), closes the data stream,
     * and patches the footer's byte offset into the forward-reference slot at
     * the head of the file.
     *
     * @throws ReviewedStingException if any of the I/O steps fail
     */
    @Override
    public void close() {
        // todo -- write out VCF header lines
        GCFHeader gcfHeader = gcfHeaderBuilder.createHeader();
        try {
            final long headerPosition = nbytes; // the footer starts at the current write position
            try {
                nbytes += gcfHeader.writeFooter(dataOutputStream);
            } finally {
                // close even if the footer write fails, so the descriptor isn't leaked
                dataOutputStream.close();
            }

            // patch the footer offset into the fixed slot at the front of the file
            RandomAccessFile raFile = new RandomAccessFile(location, "rw");
            try {
                raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET);
                raFile.writeLong(headerPosition);
            } finally {
                raFile.close();
            }
        } catch ( IOException e ) {
            throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e);
        }
        super.close();
    }

    /** Wraps {@code stream} in a buffered DataOutputStream sized for GCF records. */
    private static DataOutputStream createDataOutputStream(final OutputStream stream) {
        return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE));
    }
}

View File

@ -334,24 +334,44 @@ public class IntervalUtils {
}
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
* Splits an interval list into multiple sublists.
* @param locs The genome locs to split.
* @param splits The stop points for the genome locs returned by splitFixedIntervals.
* @param scatterParts The output interval lists to write to.
* @return A list of lists of genome locs, split according to splits
*/
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<GenomeLoc> locs, List<Integer> splits, List<File> scatterParts) {
if (splits.size() != scatterParts.size())
throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
int fileIndex = 0;
public static List<List<GenomeLoc>> splitIntervalsToSubLists(List<GenomeLoc> locs, List<Integer> splits) {
int locIndex = 1;
int start = 0;
List<List<GenomeLoc>> sublists = new ArrayList<List<GenomeLoc>>(splits.size());
for (Integer stop: splits) {
IntervalList intervalList = new IntervalList(fileHeader);
List<GenomeLoc> curList = new ArrayList<GenomeLoc>();
for (int i = start; i < stop; i++)
intervalList.add(toInterval(locs.get(i), locIndex++));
intervalList.write(scatterParts.get(fileIndex++));
curList.add(locs.get(i));
start = stop;
sublists.add(curList);
}
return sublists;
}
/**
* Splits an interval list into multiple files.
* @param fileHeader The sam file header.
* @param splits Pre-divided genome locs returned by splitFixedIntervals.
* @param scatterParts The output interval lists to write to.
*/
public static void scatterFixedIntervals(SAMFileHeader fileHeader, List<List<GenomeLoc>> splits, List<File> scatterParts) {
    // each sublist of splits is written to the file at the same position in scatterParts
    if (splits.size() != scatterParts.size())
        throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size()));
    int fileIndex = 0;
    int locIndex = 1; // presumably the interval's 1-based id; keeps incrementing ACROSS output files — TODO confirm against toInterval
    for (final List<GenomeLoc> split : splits) {
        IntervalList intervalList = new IntervalList(fileHeader);
        for (final GenomeLoc loc : split)
            intervalList.add(toInterval(loc, locIndex++));
        intervalList.write(scatterParts.get(fileIndex++));
    }
}
@ -361,17 +381,15 @@ public class IntervalUtils {
* @param numParts Number of parts to split the locs into.
* @return The stop points to split the genome locs.
*/
public static List<Integer> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
public static List<List<GenomeLoc>> splitFixedIntervals(List<GenomeLoc> locs, int numParts) {
if (locs.size() < numParts)
throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts));
long locsSize = 0;
for (GenomeLoc loc: locs)
locsSize += loc.size();
List<Integer> splitPoints = new ArrayList<Integer>();
final long locsSize = intervalSize(locs);
final List<Integer> splitPoints = new ArrayList<Integer>();
addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts);
Collections.sort(splitPoints);
splitPoints.add(locs.size());
return splitPoints;
return splitIntervalsToSubLists(locs, splitPoints);
}
private static void addFixedSplit(List<Integer> splitPoints, List<GenomeLoc> locs, long locsSize, int startIndex, int stopIndex, int numParts) {
@ -441,4 +459,11 @@ public class IntervalUtils {
return merged;
}
}
/** Sums {@code GenomeLoc.size()} over every element of {@code locs}; 0 for an empty list. */
public static final long intervalSize(final List<GenomeLoc> locs) {
    long total = 0L;
    for ( int i = 0; i < locs.size(); i++ ) {
        total += locs.get(i).size();
    }
    return total;
}
}

View File

@ -118,31 +118,40 @@ public class ReadUtils {
/**
* This enum represents all the different ways in which a read can overlap an interval.
*
* NO_OVERLAP:
* NO_OVERLAP_CONTIG:
* read and interval are in different contigs.
*
* NO_OVERLAP_LEFT:
* the read does not overlap the interval.
*
* |----------------| (interval)
* <----------------> (read)
*
* NO_OVERLAP_RIGHT:
* the read does not overlap the interval.
*
* |----------------| (interval)
* <----------------> (read)
*
* LEFT_OVERLAP:
* OVERLAP_LEFT:
* the read starts before the beginning of the interval but ends inside of it
*
* |----------------| (interval)
* <----------------> (read)
*
* RIGHT_OVERLAP:
* OVERLAP_RIGHT:
* the read starts inside the interval but ends outside of it
*
* |----------------| (interval)
* <----------------> (read)
*
* FULL_OVERLAP:
* OVERLAP_LEFT_AND_RIGHT:
* the read starts before the interval and ends after the interval
*
* |-----------| (interval)
* <-------------------> (read)
*
* CONTAINED:
* OVERLAP_CONTAINED:
* the read starts and ends inside the interval
*
* |----------------| (interval)
@ -658,7 +667,7 @@ public class ReadUtils {
return ReadAndIntervalOverlap.OVERLAP_RIGHT;
}
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"})
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"})
public static int getRefCoordSoftUnclippedStart(SAMRecord read) {
int start = read.getUnclippedStart();
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
@ -670,9 +679,13 @@ public class ReadUtils {
return start;
}
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"})
@Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"})
public static int getRefCoordSoftUnclippedEnd(SAMRecord read) {
int stop = read.getUnclippedStart();
if (readIsEntirelyInsertion(read))
return stop;
int shift = 0;
CigarOperator lastOperator = null;
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
@ -686,6 +699,14 @@ public class ReadUtils {
return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ;
}
/**
 * Returns true iff every element of the read's cigar is an insertion.
 * Note: vacuously true for a read whose cigar has no elements.
 */
private static boolean readIsEntirelyInsertion(SAMRecord read) {
    for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
        if (cigarElement.getOperator() != CigarOperator.INSERTION)
            return false;
    }
    return true;
}
/**
* Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before
* the alignment start of the read.

View File

@ -293,17 +293,8 @@ public class Genotype {
return commonInfo.getAttribute(key, defaultValue);
}
public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); }
public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); }
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); }
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); }
public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); }
public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); }
public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); }
}

View File

@ -1,6 +1,8 @@
package org.broadinstitute.sting.utils.variantcontext;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import java.util.*;
@ -204,27 +206,40 @@ public final class InferredGeneticContext {
return defaultValue;
}
// public AttributedObject getAttributes(Collection<Object> keys) {
// AttributedObject selected = new AttributedObject();
//
// for ( Object key : keys )
// selected.putAttribute(key, this.getAttribute(key));
//
// return selected;
// }
/**
 * Fetches the attribute stored under {@code key} as a String, returning
 * {@code defaultValue} when no value is present.  Values that are not
 * already Strings are rendered via String.valueOf.
 */
public String getAttributeAsString(String key, String defaultValue) {
    final Object value = getAttribute(key);
    if ( value == null )
        return defaultValue;
    return value instanceof String ? (String)value : String.valueOf(value);
}
public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
/**
 * Fetches the attribute stored under {@code key} as an int, returning
 * {@code defaultValue} when the attribute is absent or is the VCF missing
 * value marker.  Stored Integers are returned directly; any other value is
 * assumed to be a String and parsed (a non-Integer, non-String value throws
 * ClassCastException; an unparseable String throws NumberFormatException).
 */
public int getAttributeAsInt(String key, int defaultValue) {
    Object x = getAttribute(key);
    // Compare with equals(), not ==: a "." read out of a VCF line is a fresh
    // String rather than the interned VCFConstants.MISSING_VALUE_v4 constant,
    // so the previous reference comparison could silently miss it.
    if ( x == null || VCFConstants.MISSING_VALUE_v4.equals(x) ) return defaultValue;
    if ( x instanceof Integer ) return (Integer)x;
    return Integer.valueOf((String)x); // throws an exception if this isn't a string
}
public String getAttributeAsString(String key, String defaultValue) { return (String)getAttribute(key, defaultValue); }
public int getAttributeAsInt(String key, int defaultValue) { return (Integer)getAttribute(key, defaultValue); }
public double getAttributeAsDouble(String key, double defaultValue) { return (Double)getAttribute(key, defaultValue); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue){ return (Boolean)getAttribute(key, defaultValue); }
/**
 * Fetches the attribute stored under {@code key} as a double, returning
 * {@code defaultValue} when no value is present.  Stored Doubles are
 * returned directly; any other value is assumed to be a parseable String
 * (otherwise a ClassCastException or NumberFormatException is thrown).
 */
public double getAttributeAsDouble(String key, double defaultValue) {
    final Object value = getAttribute(key);
    if ( value == null )
        return defaultValue;
    if ( value instanceof Double )
        return (Double)value;
    return Double.valueOf((String)value);
}
public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); }
public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
/**
 * Fetches the attribute stored under {@code key} as a boolean, returning
 * {@code defaultValue} when no value is present.  Stored Booleans are
 * returned directly; any other value is assumed to be a String and parsed
 * with Boolean.valueOf (a non-Boolean, non-String value throws
 * ClassCastException).
 */
public boolean getAttributeAsBoolean(String key, boolean defaultValue) {
    final Object value = getAttribute(key);
    if ( value == null )
        return defaultValue;
    return value instanceof Boolean ? (Boolean)value : Boolean.valueOf((String)value);
}
// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null"
// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); }
// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); }
// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); }
// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} }
// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} }
// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); }
// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} }
}

View File

@ -666,21 +666,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati
return commonInfo.getAttribute(key, defaultValue);
}
public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); }
public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); }
public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); }
public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); }
public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); }
public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); }
public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); }
public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); }
public Integer getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); }
public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); }
public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); }
public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); }
// ---------------------------------------------------------------------------------------------------------
//
// Working with alleles

View File

@ -565,11 +565,11 @@ public class VariantContextUtils {
// special case DP (add it up) and ID (just preserve it)
//
if (vc.hasAttribute(VCFConstants.DEPTH_KEY))
depth += Integer.valueOf(vc.getAttributeAsString(VCFConstants.DEPTH_KEY));
depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0);
if (rsID == null && vc.hasID())
rsID = vc.getID();
if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) {
String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY);
String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null);
// lets see if the string contains a , separator
if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) {
List<String> alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR));
@ -1159,9 +1159,7 @@ public class VariantContextUtils {
for (String orAttrib : MERGE_OR_ATTRIBS) {
boolean attribVal = false;
for (VariantContext vc : vcList) {
Boolean val = vc.getAttributeAsBooleanNoException(orAttrib);
if (val != null)
attribVal = (attribVal || val);
attribVal = vc.getAttributeAsBoolean(orAttrib, false);
if (attribVal) // already true, so no reason to continue:
break;
}
@ -1171,7 +1169,7 @@ public class VariantContextUtils {
// Merge ID fields:
String iDVal = null;
for (VariantContext vc : vcList) {
String val = vc.getAttributeAsStringNoException(VariantContext.ID_KEY);
String val = vc.getAttributeAsString(VariantContext.ID_KEY, null);
if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) {
if (iDVal == null)
iDVal = val;
@ -1251,8 +1249,10 @@ public class VariantContextUtils {
public PhaseAndQuality(Genotype gt) {
this.isPhased = gt.isPhased();
if (this.isPhased)
this.PQ = gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY);
if (this.isPhased) {
this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1);
if ( this.PQ == -1 ) this.PQ = null;
}
}
}

View File

@ -22,34 +22,6 @@ public class IndexFactoryUnitTest {
File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf");
File outputFileIndex = Tribble.indexFile(outputFile);
/**
* test out scoring the indexes
*/
@Test
public void testScoreIndexes() {
    // NOTE(review): the entire body is commented out, so this @Test passes
    // vacuously.  The disabled code is also broken as written (it references
    // an undefined "LInearIndexlinearNormal" and calls add() on a Map) —
    // either fix and re-enable it or drop the @Test annotation.
    /*// make a list of indexes to score
    Map<Class,IndexCreator> creators = new HashMap<Class,IndexCreator>();
    // add a linear index with the default bin size
    LinearIndexCreator linearNormal = new LinearIndexCreator();
    linearNormal.initialize(inputFile, linearNormal.defaultBinSize());
    creators.add(LInearIndexlinearNormal);
    // create a tree index with a small index size
    IntervalIndexCreator treeSmallBin = new IntervalIndexCreator();
    treeSmallBin.initialize(inputFile, Math.max(200,treeSmallBin.defaultBinSize()/10));
    creators.add(treeSmallBin);
    List<Index> indexes = new ArrayList<Index>();
    for (IndexCreator creator : creators)
        indexes.add(creator.finalizeIndex(0));
    ArrayList<Double> scores = IndexFactory.scoreIndexes(0.5,indexes,100, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
    System.err.println("scores are : ");
    for (Double score : scores) {
        System.err.println(score);
    */
}
//
// test out scoring the indexes
//

View File

@ -30,6 +30,20 @@ public class IntervalUtilsUnitTest extends BaseTest {
private SAMFileHeader hg19Header;
private GenomeLocParser hg19GenomeLocParser;
private List<GenomeLoc> hg19ReferenceLocs;
private List<GenomeLoc> hg19exomeIntervals;
/** Varargs convenience overload for {@link #getLocs(List)}. */
private List<GenomeLoc> getLocs(String... intervals) {
    return getLocs(Arrays.asList(intervals));
}
/**
 * Parses each interval string with the hg18 GenomeLocParser; when called
 * with an empty list, falls back to the full set of hg18 reference locs.
 */
private List<GenomeLoc> getLocs(List<String> intervals) {
    if (intervals.size() == 0)
        return hg18ReferenceLocs;
    List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
    for (String interval: intervals)
        locs.add(hg18GenomeLocParser.parseGenomeLoc(interval));
    return locs;
}
@BeforeClass
public void init() {
@ -54,12 +68,69 @@ public class IntervalUtilsUnitTest extends BaseTest {
ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref);
hg19GenomeLocParser = new GenomeLocParser(seq);
hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ;
hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals), false));
}
catch(FileNotFoundException ex) {
throw new UserException.CouldNotReadInputFile(hg19Ref,ex);
}
}
// -------------------------------------------------------------------------------------
//
// tests to ensure the quality of the interval cuts of the interval cutting functions
//
// -------------------------------------------------------------------------------------
/**
 * Test-case descriptor for the fixed-interval splitting tests: split the
 * exome intervals into {@code parts} pieces and require each piece's size
 * to be within {@code maxAllowableVariance} (relative to the ideal size).
 */
private class IntervalSlicingTest extends TestDataProvider {
    public int parts;                   // number of pieces to split the intervals into
    public double maxAllowableVariance; // tolerated relative deviation from the ideal piece size

    private IntervalSlicingTest(final int parts, final double maxAllowableVariance) {
        super(IntervalSlicingTest.class);
        this.parts = parts;
        this.maxAllowableVariance = maxAllowableVariance;
    }

    public String toString() {
        return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance);
    }
}
@DataProvider(name = "intervalslicingdata")
public Object[][] createTrees() {
new IntervalSlicingTest(1, 0);
new IntervalSlicingTest(2, 1);
new IntervalSlicingTest(5, 1);
new IntervalSlicingTest(10, 1);
new IntervalSlicingTest(67, 1);
new IntervalSlicingTest(100, 1);
new IntervalSlicingTest(500, 1);
new IntervalSlicingTest(1000, 1);
return IntervalSlicingTest.getTests(IntervalSlicingTest.class);
}
@Test(enabled = true, dataProvider = "intervalslicingdata")
public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) {
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts);
long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals);
long idealSplitSize = totalSize / test.parts;
long sumOfSplitSizes = 0;
int counter = 0;
for ( final List<GenomeLoc> split : splits ) {
long splitSize = IntervalUtils.intervalSize(split);
double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize);
//logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma));
counter++;
sumOfSplitSizes += splitSize;
Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance));
}
Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals");
}
@Test(expectedExceptions=UserException.class)
public void testMergeListsBySetOperatorNoOverlap() {
// a couple of lists we'll use for the testing
@ -129,19 +200,6 @@ public class IntervalUtilsUnitTest extends BaseTest {
Assert.assertEquals((long)lengths.get("chrX"), 154913754);
}
private List<GenomeLoc> getLocs(String... intervals) {
return getLocs(Arrays.asList(intervals));
}
private List<GenomeLoc> getLocs(List<String> intervals) {
if (intervals.size() == 0)
return hg18ReferenceLocs;
List<GenomeLoc> locs = new ArrayList<GenomeLoc>();
for (String interval: intervals)
locs.add(hg18GenomeLocParser.parseGenomeLoc(interval));
return locs;
}
@Test
public void testParseIntervalArguments() {
Assert.assertEquals(getLocs().size(), 45);
@ -174,8 +232,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<File> files = testFiles("basic.", 3, ".intervals");
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3");
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -200,8 +258,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<File> files = testFiles("less.", 3, ".intervals");
List<GenomeLoc> locs = getLocs("chr1", "chr2", "chr3", "chr4");
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -228,8 +286,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
public void testScatterFixedIntervalsMoreFiles() {
List<File> files = testFiles("more.", 3, ".intervals");
List<GenomeLoc> locs = getLocs("chr1", "chr2");
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size()
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
}
@Test
public void testScatterFixedIntervalsStart() {
@ -242,8 +300,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<File> files = testFiles("split.", 3, ".intervals");
List<GenomeLoc> locs = getLocs(intervals);
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -270,8 +328,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<File> files = testFiles("split.", 3, ".intervals");
List<GenomeLoc> locs = getLocs(intervals);
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -298,8 +356,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
List<File> files = testFiles("split.", 3, ".intervals");
List<GenomeLoc> locs = getLocs(intervals);
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
List<GenomeLoc> locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false);
List<GenomeLoc> locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false);
@ -319,7 +377,7 @@ public class IntervalUtilsUnitTest extends BaseTest {
public void testScatterFixedIntervalsFile() {
List<File> files = testFiles("sg.", 20, ".intervals");
List<GenomeLoc> locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false);
List<Integer> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(locs, files.size());
int[] counts = {
125, 138, 287, 291, 312, 105, 155, 324,
@ -332,16 +390,13 @@ public class IntervalUtilsUnitTest extends BaseTest {
};
//String splitCounts = "";
for (int lastIndex = 0, i = 0; i < splits.size(); i++) {
int splitIndex = splits.get(i);
int splitCount = (splitIndex - lastIndex);
//splitCounts += ", " + splitCount;
lastIndex = splitIndex;
for (int i = 0; i < splits.size(); i++) {
int splitCount = splits.get(i).size();
Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i);
}
//System.out.println(splitCounts.substring(2));
IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files);
IntervalUtils.scatterFixedIntervals(hg18Header, splits, files);
int locIndex = 0;
for (int i = 0; i < files.size(); i++) {
@ -357,8 +412,8 @@ public class IntervalUtilsUnitTest extends BaseTest {
@Test
public void testScatterFixedIntervalsMax() {
List<File> files = testFiles("sg.", 85, ".intervals");
List<Integer> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size());
IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files);
List<List<GenomeLoc>> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size());
IntervalUtils.scatterFixedIntervals(hg19Header, splits, files);
for (int i = 0; i < files.size(); i++) {
String file = files.get(i).toString();

View File

@ -131,11 +131,11 @@ class GATKResourcesBundle extends QScript {
addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf",
"hapmap_3.3", b37, true, true))
addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/AFR+EUR+ASN+1KG.dindel_august_release_merged_pilot1.20110126.sites.vcf",
"1000G_indels_for_realignment", b37, true, false))
addResource(new Resource("/humgen/1kg/processing/official_release/phase1/ALL.wgs.VQSR_consensus_biallelic.20101123.indels.sites.vcf",
"1000G_biallelic.indels", b37, true, false))
addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Mills_Devine_Indels_2011/ALL.wgs.indels_mills_devine_hg19_leftAligned_collapsed_double_hit.sites.vcf",
"indels_mills_devine", b37, true, true))
"Mills_Devine_2hit.indels", b37, true, true))
//
// example call set for wiki tutorial
@ -300,9 +300,9 @@ class GATKResourcesBundle extends QScript {
bamFile = bamIn
}
class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRod with UNIVERSAL_GATK_ARGS {
class IndexVCF(@Input vcf: File, @Input ref: File) extends CountRODs with UNIVERSAL_GATK_ARGS {
//@Output val vcfIndex: File = swapExt(vcf.getParent, vcf, ".vcf", ".vcf.idx")
this.rodBind :+= RodBind(vcf.getName, "VCF", vcf)
this.rod :+= vcf
this.reference_sequence = ref
}
@ -313,7 +313,7 @@ class GATKResourcesBundle extends QScript {
}
class MakeDBSNP129(@Input dbsnp: File, @Input ref: File, @Output dbsnp129: File) extends SelectVariants with UNIVERSAL_GATK_ARGS {
this.rodBind :+= RodBind("variant", "VCF", dbsnp)
this.variant = dbsnp
this.select ++= List("\"dbSNPBuildID <= 129\"")
this.reference_sequence = ref
this.out = dbsnp129

View File

@ -1,65 +1,65 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.extensions.gatk
import java.io.File
import collection.JavaConversions._
import org.broadinstitute.sting.utils.interval.IntervalUtils
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
import net.sf.samtools.SAMFileHeader
import java.util.Collections
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser}
// Lazily-computed view of the genomic intervals requested for a run: parses the
// interval argument strings against the given reference FASTA and caches the
// derived header, locations, contig list, and scatter splits.
case class GATKIntervals(reference: File, intervals: List[String]) {
// Backing reference sequence; opened lazily on first use of any derived value.
private lazy val referenceDataSource = new ReferenceDataSource(reference)
// Memoization cache for getSplits: requested part-count -> split indices.
// NOTE(review): plain var Map, so not thread-safe — confirm single-threaded use.
private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]]
// Minimal SAM header carrying only the reference's sequence dictionary.
lazy val samFileHeader = {
val header = new SAMFileHeader
header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary)
header
}
// Sorted, unmodifiable list of the parsed intervals. If no intervals were
// given, falls back to one GenomeLoc per contig of the whole reference.
lazy val locs: java.util.List[GenomeLoc] = {
val parser = new GenomeLocParser(referenceDataSource.getReference)
val parsedLocs =
if (intervals.isEmpty)
GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList
else
IntervalUtils.parseIntervalArguments(parser, intervals, false)
Collections.sort(parsedLocs)
Collections.unmodifiableList(parsedLocs)
}
// Distinct contig names, in the order they first appear in the sorted locs.
lazy val contigs = locs.map(_.getContig).distinct.toList
// Returns the split indices partitioning locs into `size` parts, computing
// and caching the result on first request for a given size.
def getSplits(size: Int) = {
splitsBySize.getOrElse(size, {
val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
// Cache for subsequent calls with the same size.
splitsBySize += size -> splits
splits
})
}
}
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.queue.extensions.gatk
import java.io.File
import collection.JavaConversions._
import org.broadinstitute.sting.utils.interval.IntervalUtils
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource
import net.sf.samtools.SAMFileHeader
import java.util.Collections
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocSortedSet, GenomeLocParser}
// Lazily-computed view of the genomic intervals requested for a run: parses the
// interval argument strings against the given reference FASTA and caches the
// derived header, locations, and contig list. The per-size split caching that
// previously lived here is commented out below (splits are no longer cached).
case class GATKIntervals(reference: File, intervals: List[String]) {
// Backing reference sequence; opened lazily on first use of any derived value.
private lazy val referenceDataSource = new ReferenceDataSource(reference)
// private var splitsBySize = Map.empty[Int, java.util.List[java.lang.Integer]]
// Minimal SAM header carrying only the reference's sequence dictionary.
lazy val samFileHeader = {
val header = new SAMFileHeader
header.setSequenceDictionary(referenceDataSource.getReference.getSequenceDictionary)
header
}
// Sorted, unmodifiable list of the parsed intervals. If no intervals were
// given, falls back to one GenomeLoc per contig of the whole reference.
lazy val locs: java.util.List[GenomeLoc] = {
val parser = new GenomeLocParser(referenceDataSource.getReference)
val parsedLocs =
if (intervals.isEmpty)
GenomeLocSortedSet.createSetFromSequenceDictionary(samFileHeader.getSequenceDictionary).toList
else
IntervalUtils.parseIntervalArguments(parser, intervals, false)
Collections.sort(parsedLocs)
Collections.unmodifiableList(parsedLocs)
}
// Distinct contig names, in the order they first appear in the sorted locs.
lazy val contigs = locs.map(_.getContig).distinct.toList
// Disabled split-caching accessor, retained as commented-out code for reference.
// def getSplits(size: Int) = {
// splitsBySize.getOrElse(size, {
// val splits: java.util.List[java.lang.Integer] = IntervalUtils.splitFixedIntervals(locs, size)
// splitsBySize += size -> splits
// splits
// })
// }
}

View File

@ -37,7 +37,7 @@ class IntervalScatterFunction extends GATKScatterFunction with InProcessFunction
def run() {
val gi = GATKScatterFunction.getGATKIntervals(this.referenceSequence, this.intervals)
IntervalUtils.scatterFixedIntervals(gi.samFileHeader, gi.locs,
gi.getSplits(this.scatterOutputFiles.size), this.scatterOutputFiles)
val splits = IntervalUtils.splitFixedIntervals(gi.locs, this.scatterOutputFiles.size)
IntervalUtils.scatterFixedIntervals(gi.samFileHeader, splits, this.scatterOutputFiles)
}
}

View File

@ -53,8 +53,8 @@ class GATKIntervalsUnitTest {
val gi = new GATKIntervals(hg18Reference, List("chr1:1-1", "chr2:2-3", "chr3:3-5"))
Assert.assertEquals(gi.locs.toList, List(chr1, chr2, chr3))
Assert.assertEquals(gi.contigs, List("chr1", "chr2", "chr3"))
Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
// Assert.assertEquals(gi.getSplits(2).toList, List(2, 3))
// Assert.assertEquals(gi.getSplits(3).toList, List(1, 2, 3))
}
@Test(timeOut = 30000)
@ -65,7 +65,7 @@ class GATKIntervalsUnitTest {
// for(Item item: javaConvertedScalaList)
// This for loop is actually an O(N^2) operation as the iterator calls the
// O(N) javaConvertedScalaList.size() for each iteration of the loop.
Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
//Assert.assertEquals(gi.getSplits(gi.locs.size).size, 189894)
Assert.assertEquals(gi.contigs.size, 24)
}
@ -74,8 +74,8 @@ class GATKIntervalsUnitTest {
val gi = new GATKIntervals(hg18Reference, Nil)
Assert.assertEquals(gi.locs, hg18ReferenceLocs)
Assert.assertEquals(gi.contigs.size, hg18ReferenceLocs.size)
Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
// Assert.assertEquals(gi.getSplits(2).toList, List(10, 45))
// Assert.assertEquals(gi.getSplits(4).toList, List(5, 10, 16, 45))
}
@Test