diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..8623fa076 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +/*.bam +/*.bai +/*.bed +*.idx +*~ +/*.vcf +/*.txt +/*.csh +/.* +/*.pdf +/*.eval +*.ipr +*.iws +*.iml +.DS_Store +queueScatterGather +/foo* +/bar* +integrationtests/ +public/testdata/onTheFlyOutputTest.vcf diff --git a/build.xml b/build.xml index babf31506..232b074f6 100644 --- a/build.xml +++ b/build.xml @@ -28,6 +28,8 @@ + + @@ -35,18 +37,26 @@ + + + + + + + - + - + @@ -60,7 +70,7 @@ - + @@ -82,7 +92,7 @@ - + @@ -113,7 +123,7 @@ - + @@ -154,16 +164,18 @@ - + + + @@ -209,11 +221,11 @@ - + - + @@ -222,11 +234,11 @@ - + - + @@ -264,7 +276,7 @@ - + @@ -310,13 +322,13 @@ - + - + @@ -325,11 +337,11 @@ - + - @@ -339,9 +351,9 @@ - + - + @@ -360,14 +372,14 @@ - + - + - - + @@ -411,9 +423,9 @@ - + - + @@ -422,12 +434,12 @@ - + - + @@ -530,6 +542,11 @@ + + + + + @@ -537,7 +554,7 @@ - + @@ -549,6 +566,15 @@ + + + + + + + + + @@ -577,6 +603,14 @@ + + + + + + + + @@ -591,6 +625,14 @@ + + + + + + + + @@ -603,28 +645,7 @@ - @@ -641,6 +662,12 @@ + + + + + + @@ -680,20 +707,7 @@ - + @@ -778,10 +792,6 @@ - @@ -798,10 +808,6 @@ - @@ -849,6 +855,9 @@ + + + @@ -1185,19 +1194,18 @@ - - + - + - + diff --git a/public/R/plot_residualError_OtherCovariate.R b/public/R/scripts/org/broadinstitute/sting/analyzecovariates/plot_residualError_OtherCovariate.R similarity index 96% rename from public/R/plot_residualError_OtherCovariate.R rename to public/R/scripts/org/broadinstitute/sting/analyzecovariates/plot_residualError_OtherCovariate.R index a1385ff3f..15c6fc8f0 100644 --- a/public/R/plot_residualError_OtherCovariate.R +++ b/public/R/scripts/org/broadinstitute/sting/analyzecovariates/plot_residualError_OtherCovariate.R @@ -1,5 +1,7 @@ #!/bin/env Rscript +library(tools) + args <- commandArgs(TRUE) verbose = TRUE @@ -47,6 +49,9 @@ if( is.numeric(c$Covariate) ) { } dev.off() +if (exists('compactPDF')) { + compactPDF(outfile) +} # # Plot mean quality versus the 
covariate @@ -69,6 +74,10 @@ if( is.numeric(c$Covariate) ) { } dev.off() +if (exists('compactPDF')) { + compactPDF(outfile) +} + # # Plot histogram of the covariate # @@ -106,3 +115,7 @@ if( is.numeric(c$Covariate) ) { axis(2,axTicks(2), format(axTicks(2), scientific=F)) } dev.off() + +if (exists('compactPDF')) { + compactPDF(outfile) +} diff --git a/public/R/plot_residualError_QualityScoreCovariate.R b/public/R/scripts/org/broadinstitute/sting/analyzecovariates/plot_residualError_QualityScoreCovariate.R similarity index 94% rename from public/R/plot_residualError_QualityScoreCovariate.R rename to public/R/scripts/org/broadinstitute/sting/analyzecovariates/plot_residualError_QualityScoreCovariate.R index 81bc9460d..33eeb1f16 100644 --- a/public/R/plot_residualError_QualityScoreCovariate.R +++ b/public/R/scripts/org/broadinstitute/sting/analyzecovariates/plot_residualError_QualityScoreCovariate.R @@ -1,5 +1,7 @@ #!/bin/env Rscript +library(tools) + args <- commandArgs(TRUE) input = args[1] @@ -33,6 +35,10 @@ points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16) abline(0,1, lty=2) dev.off() +if (exists('compactPDF')) { + compactPDF(outfile) +} + # # Plot Q empirical histogram # @@ -52,6 +58,10 @@ points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1") axis(2,axTicks(2), format(axTicks(2), scientific=F)) dev.off() +if (exists('compactPDF')) { + compactPDF(outfile) +} + # # Plot Q reported histogram # @@ -68,3 +78,7 @@ plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yM points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1") axis(2,axTicks(2), format(axTicks(2), scientific=F)) dev.off() + +if (exists('compactPDF')) { + compactPDF(outfile) +} diff --git a/public/R/plot_Tranches.R b/public/R/scripts/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R similarity index 98% rename from public/R/plot_Tranches.R rename to 
public/R/scripts/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R index a79ddd3ab..d96add768 100755 --- a/public/R/plot_Tranches.R +++ b/public/R/scripts/org/broadinstitute/sting/gatk/walkers/variantrecalibration/plot_Tranches.R @@ -1,5 +1,7 @@ #!/bin/env Rscript +library(tools) + args <- commandArgs(TRUE) verbose = TRUE @@ -85,3 +87,7 @@ if ( ! is.null(sensitivity) ) { } dev.off() + +if (exists('compactPDF')) { + compactPDF(outfile) +} diff --git a/public/R/queueJobReport.R b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R similarity index 83% rename from public/R/queueJobReport.R rename to public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R index 31916361e..866766c2c 100644 --- a/public/R/queueJobReport.R +++ b/public/R/scripts/org/broadinstitute/sting/queue/util/queueJobReport.R @@ -12,20 +12,20 @@ if ( onCMDLine ) { inputFileName = args[1] outputPDF = args[2] } else { - #inputFileName = "~/Desktop/broadLocal/GATK/unstable/report.txt" - inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" + inputFileName = "~/Desktop/broadLocal/GATK/unstable/wgs.jobreport.txt" + #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/Q-25718@node1149.jobreport.txt" #inputFileName = "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/rodPerformanceGoals/history/report.082711.txt" outputPDF = NA } -RUNTIME_UNITS = "(sec)" -ORIGINAL_UNITS_TO_SECONDS = 1/1000 +RUNTIME_UNITS = "(hours)" +ORIGINAL_UNITS_TO_RUNTIME_UNITS = 1/1000/60/60 # # Helper function to aggregate all of the jobs in the report across all tables # allJobsFromReport <- function(report) { - names <- c("jobName", "startTime", "analysisName", "doneTime", "exechosts") + names <- c("jobName", "startTime", "analysisName", "doneTime", "exechosts", "runtime") sub <- lapply(report, function(table) table[,names]) do.call("rbind", sub) } @@ -33,7 +33,7 @@ allJobsFromReport <- function(report) { # # 
Creates segmentation plots of time (x) vs. job (y) with segments for the duration of the job # -plotJobsGantt <- function(gatkReport, sortOverall) { +plotJobsGantt <- function(gatkReport, sortOverall, includeText) { allJobs = allJobsFromReport(gatkReport) if ( sortOverall ) { title = "All jobs, by analysis, by start time" @@ -44,16 +44,18 @@ plotJobsGantt <- function(gatkReport, sortOverall) { } allJobs$index = 1:nrow(allJobs) minTime = min(allJobs$startTime) - allJobs$relStartTime = allJobs$startTime - minTime + allJobs$relStartTime = allJobs$startTime - minTime allJobs$relDoneTime = allJobs$doneTime - minTime allJobs$ganttName = paste(allJobs$jobName, "@", allJobs$exechosts) maxRelTime = max(allJobs$relDoneTime) p <- ggplot(data=allJobs, aes(x=relStartTime, y=index, color=analysisName)) - p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=2, arrow=arrow(length = unit(0.1, "cm"))) - p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2) + p <- p + theme_bw() + p <- p + geom_segment(aes(xend=relDoneTime, yend=index), size=1, arrow=arrow(length = unit(0.1, "cm"))) + if ( includeText ) + p <- p + geom_text(aes(x=relDoneTime, label=ganttName, hjust=-0.2), size=2) p <- p + xlim(0, maxRelTime * 1.1) - p <- p + xlab(paste("Start time (relative to first job)", RUNTIME_UNITS)) - p <- p + ylab("Job") + p <- p + xlab(paste("Start time, relative to first job", RUNTIME_UNITS)) + p <- p + ylab("Job number") p <- p + opts(title=title) print(p) } @@ -119,7 +121,7 @@ plotGroup <- function(groupTable) { if ( length(groupAnnotations) == 1 && dim(sub)[1] > 1 ) { # todo -- how do we group by annotations? p <- ggplot(data=sub, aes(x=runtime)) + geom_histogram() - p <- p + xlab("runtime in seconds") + ylab("No. of jobs") + p <- p + xlab(paste("runtime", RUNTIME_UNITS)) + ylab("No. 
of jobs") p <- p + opts(title=paste("Job runtime histogram for", name)) print(p) } @@ -139,9 +141,9 @@ print(paste("Project :", inputFileName)) convertUnits <- function(gatkReportData) { convertGroup <- function(g) { - g$runtime = g$runtime * ORIGINAL_UNITS_TO_SECONDS - g$startTime = g$startTime * ORIGINAL_UNITS_TO_SECONDS - g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_SECONDS + g$runtime = g$runtime * ORIGINAL_UNITS_TO_RUNTIME_UNITS + g$startTime = g$startTime * ORIGINAL_UNITS_TO_RUNTIME_UNITS + g$doneTime = g$doneTime * ORIGINAL_UNITS_TO_RUNTIME_UNITS g } lapply(gatkReportData, convertGroup) @@ -157,8 +159,8 @@ if ( ! is.na(outputPDF) ) { pdf(outputPDF, height=8.5, width=11) } -plotJobsGantt(gatkReportData, T) -plotJobsGantt(gatkReportData, F) +plotJobsGantt(gatkReportData, T, F) +plotJobsGantt(gatkReportData, F, F) plotProgressByTime(gatkReportData) for ( group in gatkReportData ) { plotGroup(group) diff --git a/public/R/src/gsalib/DESCRIPTION b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/DESCRIPTION similarity index 100% rename from public/R/src/gsalib/DESCRIPTION rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/DESCRIPTION diff --git a/public/R/src/gsalib/R/gsa.error.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.error.R similarity index 100% rename from public/R/src/gsalib/R/gsa.error.R rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.error.R diff --git a/public/R/src/gsalib/R/gsa.getargs.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.getargs.R similarity index 100% rename from public/R/src/gsalib/R/gsa.getargs.R rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.getargs.R diff --git a/public/R/src/gsalib/R/gsa.message.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.message.R similarity index 100% rename from public/R/src/gsalib/R/gsa.message.R rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.message.R diff --git 
a/public/R/src/gsalib/R/gsa.plot.venn.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.plot.venn.R similarity index 100% rename from public/R/src/gsalib/R/gsa.plot.venn.R rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.plot.venn.R diff --git a/public/R/src/gsalib/R/gsa.read.eval.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.eval.R similarity index 100% rename from public/R/src/gsalib/R/gsa.read.eval.R rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.eval.R diff --git a/public/R/src/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R similarity index 98% rename from public/R/src/gsalib/R/gsa.read.gatkreport.R rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R index 011b5240d..46bbf7eda 100644 --- a/public/R/src/gsalib/R/gsa.read.gatkreport.R +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R @@ -99,5 +99,5 @@ gsa.read.gatkreport <- function(filename) { .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows, tableEnv); } - gatkreport = as.list(tableEnv); + gatkreport = as.list(tableEnv, all.names=TRUE); } diff --git a/public/R/src/gsalib/R/gsa.read.squidmetrics.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.squidmetrics.R similarity index 100% rename from public/R/src/gsalib/R/gsa.read.squidmetrics.R rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.squidmetrics.R diff --git a/public/R/src/gsalib/R/gsa.read.vcf.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.vcf.R similarity index 100% rename from public/R/src/gsalib/R/gsa.read.vcf.R rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.vcf.R diff --git a/public/R/src/gsalib/R/gsa.warn.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.warn.R similarity index 100% rename from 
public/R/src/gsalib/R/gsa.warn.R rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.warn.R diff --git a/public/R/src/gsalib/Read-and-delete-me b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/Read-and-delete-me similarity index 100% rename from public/R/src/gsalib/Read-and-delete-me rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/Read-and-delete-me diff --git a/public/R/src/gsalib/data/tearsheetdrop.jpg b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/data/tearsheetdrop.jpg similarity index 100% rename from public/R/src/gsalib/data/tearsheetdrop.jpg rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/data/tearsheetdrop.jpg diff --git a/public/R/src/gsalib/man/gsa.error.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.error.Rd similarity index 100% rename from public/R/src/gsalib/man/gsa.error.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.error.Rd diff --git a/public/R/src/gsalib/man/gsa.getargs.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.getargs.Rd similarity index 100% rename from public/R/src/gsalib/man/gsa.getargs.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.getargs.Rd diff --git a/public/R/src/gsalib/man/gsa.message.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.message.Rd similarity index 100% rename from public/R/src/gsalib/man/gsa.message.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.message.Rd diff --git a/public/R/src/gsalib/man/gsa.plot.venn.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.plot.venn.Rd similarity index 100% rename from public/R/src/gsalib/man/gsa.plot.venn.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.plot.venn.Rd diff --git a/public/R/src/gsalib/man/gsa.read.eval.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.eval.Rd similarity index 100% rename from 
public/R/src/gsalib/man/gsa.read.eval.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.eval.Rd diff --git a/public/R/src/gsalib/man/gsa.read.gatkreport.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.gatkreport.Rd similarity index 100% rename from public/R/src/gsalib/man/gsa.read.gatkreport.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.gatkreport.Rd diff --git a/public/R/src/gsalib/man/gsa.read.squidmetrics.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.squidmetrics.Rd similarity index 100% rename from public/R/src/gsalib/man/gsa.read.squidmetrics.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.squidmetrics.Rd diff --git a/public/R/src/gsalib/man/gsa.read.vcf.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.vcf.Rd similarity index 100% rename from public/R/src/gsalib/man/gsa.read.vcf.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.read.vcf.Rd diff --git a/public/R/src/gsalib/man/gsa.warn.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.warn.Rd similarity index 100% rename from public/R/src/gsalib/man/gsa.warn.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsa.warn.Rd diff --git a/public/R/src/gsalib/man/gsalib-package.Rd b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd similarity index 100% rename from public/R/src/gsalib/man/gsalib-package.Rd rename to public/R/src/org/broadinstitute/sting/utils/R/gsalib/man/gsalib-package.Rd diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java index c6755e878..a342cf932 100644 --- a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java +++ b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidationWalker.java @@ 
-25,7 +25,6 @@ package org.broadinstitute.sting.alignment; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; import org.broadinstitute.sting.alignment.bwa.BWTFiles; import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; @@ -35,6 +34,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Iterator; @@ -72,12 +72,13 @@ public class AlignmentValidationWalker extends ReadWalker { /** * Aligns a read to the given reference. + * * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. * @param read Read to align. * @return Number of reads aligned by this map (aka 1). */ @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { //logger.info(String.format("examining read %s", read.getReadName())); byte[] bases = read.getReadBases(); diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java index 7064e637f..c8554573b 100644 --- a/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java +++ b/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; @@ -92,12 +93,13 @@ public class AlignmentWalker extends 
ReadWalker { /** * Aligns a read to the given reference. + * * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. * @param read Read to align. * @return Number of alignments found for this read. */ @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { SAMRecord alignedRead = aligner.align(read,header); out.addAlignment(alignedRead); return 1; diff --git a/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java b/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java index 57d92319f..d91b83e7a 100644 --- a/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java +++ b/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignmentsWalker.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.alignment; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.alignment.bwa.BWAConfiguration; import org.broadinstitute.sting.alignment.bwa.BWTFiles; import org.broadinstitute.sting.alignment.bwa.c.BWACAligner; @@ -34,6 +33,7 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; import java.util.Iterator; @@ -79,12 +79,13 @@ public class CountBestAlignmentsWalker extends ReadWalker { /** * Aligns a read to the given reference. + * * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null. * @param read Read to align. * @return Number of alignments found for this read. 
*/ @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { Iterator alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator(); if(alignmentIterator.hasNext()) { int numAlignments = alignmentIterator.next().length; diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java index 98f2a9b5c..a399867fa 100755 --- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java +++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java @@ -25,6 +25,9 @@ package org.broadinstitute.sting.analyzecovariates; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.CommandLineProgram; @@ -33,14 +36,16 @@ import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum; import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.R.RScriptExecutor; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.io.Resource; import org.broadinstitute.sting.utils.text.XReadLines; import java.io.*; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.Map; import 
java.util.regex.Pattern; @@ -71,15 +76,13 @@ import java.util.regex.Pattern; * * *

- * NOTE: For those running this tool externally from the Broad, it is crucial to note that both the -Rscript and -resources options - * must be changed from the default. -Rscript needs to point to your installation of Rscript (this is the scripting version of R, - * not the interactive version) while -resources needs to point to the folder holding the R scripts that are used. For those using - * this tool as part of the Binary Distribution the -resources should point to the resources folder that is part of the tarball. - * For those using this tool by building from the git repository the -resources should point to the R/ subdirectory of the Sting checkout. + * NOTE: Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). + * See http://www.r-project.org for more info on how to download and install R. * *

* See the GATK wiki for a tutorial and example recalibration accuracy plots. - * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration + * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration * *

Input

*

@@ -91,7 +94,6 @@ import java.util.regex.Pattern; * java -Xmx4g -jar AnalyzeCovariates.jar \ * -recalFile /path/to/recal.table.csv \ * -outputDir /path/to/output_dir/ \ - * -resources resources/ \ * -ignoreQ 5 * * @@ -101,6 +103,11 @@ import java.util.regex.Pattern; groupName = "AnalyzeCovariates", summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator") public class AnalyzeCovariates extends CommandLineProgram { + final private static Logger logger = Logger.getLogger(AnalyzeCovariates.class); + + private static final String PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE = "plot_residualError_QualityScoreCovariate.R"; + private static final String PLOT_RESDIUAL_ERROR_OTHER_COVARIATE = "plot_residualError_OtherCovariate.R"; + private static final String PLOT_INDEL_QUALITY_RSCRIPT = "plot_indelQuality.R"; ///////////////////////////// // Command Line Arguments @@ -114,11 +121,7 @@ public class AnalyzeCovariates extends CommandLineProgram { @Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false) private String RECAL_FILE = "output.recal_data.csv"; @Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false) - private String OUTPUT_DIR = "analyzeCovariates/"; - @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. 
For Broad users this is maybe /broad/software/free/Linux/redhat_5_x86_64/pkgs/r_2.12.0/bin/Rscript", required = false) - private String PATH_TO_RSCRIPT = "Rscript"; - @Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required = false) - private String PATH_TO_RESOURCES = "public/R/"; + private File OUTPUT_DIR = new File("analyzeCovariates"); @Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false) private int IGNORE_QSCORES_LESS_THAN = 5; @Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false) @@ -154,29 +157,26 @@ public class AnalyzeCovariates extends CommandLineProgram { protected int execute() { // create the output directory where all the data tables and plots will go - try { - Process p = Runtime.getRuntime().exec("mkdir " + OUTPUT_DIR); - } catch (IOException e) { - System.out.println("Couldn't create directory: " + OUTPUT_DIR); - System.out.println("User is responsible for making sure the output directory exists."); - } - if( !OUTPUT_DIR.endsWith("/") ) { OUTPUT_DIR = OUTPUT_DIR + "/"; } - if( !PATH_TO_RESOURCES.endsWith("/") ) { PATH_TO_RESOURCES = PATH_TO_RESOURCES + "/"; } + if (!OUTPUT_DIR.exists() && !OUTPUT_DIR.mkdirs()) + throw new UserException.BadArgumentValue("--output_dir/-outDir", "Unable to create output directory: " + OUTPUT_DIR); + + if (!RScriptExecutor.RSCRIPT_EXISTS) + Utils.warnUser(logger, "Rscript not found in environment path. 
Plots will not be generated."); // initialize all the data from the csv file and allocate the list of covariates - System.out.println("Reading in input csv file..."); + logger.info("Reading in input csv file..."); initializeData(); - System.out.println("...Done!"); + logger.info("...Done!"); // output data tables for Rscript to read in - System.out.println("Writing out intermediate tables for R..."); + logger.info("Writing out intermediate tables for R..."); writeDataTables(); - System.out.println("...Done!"); + logger.info("...Done!"); // perform the analysis using Rscript and output the plots - System.out.println("Calling analysis R scripts and writing out figures..."); + logger.info("Calling analysis R scripts and writing out figures..."); callRScripts(); - System.out.println("...Done!"); + logger.info("...Done!"); return 0; } @@ -287,37 +287,40 @@ public class AnalyzeCovariates extends CommandLineProgram { if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) { String readGroup = readGroupKey.toString(); RecalDatum readGroupDatum = (RecalDatum) dataManager.getCollapsedTable(0).data.get(readGroupKey); - System.out.print("Writing out data tables for read group: " + readGroup + "\twith " + readGroupDatum.getNumObservations() + " observations" ); - System.out.println("\tand aggregate residual error = " + String.format("%.3f", readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported())); + logger.info(String.format( + "Writing out data tables for read group: %s\twith %s observations\tand aggregate residual error = %.3f", + readGroup, readGroupDatum.getNumObservations(), + readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported())); // for each covariate for( int iii = 1; iii < requestedCovariates.size(); iii++ ) { Covariate cov = requestedCovariates.get(iii); // Create a PrintStream - PrintStream output = null; + File outputFile = new File(OUTPUT_DIR, 
readGroup + "." + cov.getClass().getSimpleName()+ ".dat"); + PrintStream output; try { - output = new PrintStream(new FileOutputStream(OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat")); - - } catch (FileNotFoundException e) { - System.err.println("Can't create file: " + OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat"); - System.exit(-1); + output = new PrintStream(FileUtils.openOutputStream(outputFile)); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(outputFile, e); } - // Output the header - output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases"); + try { + // Output the header + output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases"); - for( Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet()) { - output.print( covariateKey.toString() + "\t" ); // Covariate - RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey); - output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported - output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical - output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches - output.println( thisDatum.getNumObservations() ); // nBases + for( Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet()) { + output.print( covariateKey.toString() + "\t" ); // Covariate + RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey); + output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported + output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical + output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches + output.println( 
thisDatum.getNumObservations() ); // nBases + } + } finally { + // Close the PrintStream + IOUtils.closeQuietly(output); } - - // Close the PrintStream - output.close(); } } else { break; @@ -327,10 +330,6 @@ public class AnalyzeCovariates extends CommandLineProgram { } private void callRScripts() { - RScriptExecutor.RScriptArgumentCollection argumentCollection = - new RScriptExecutor.RScriptArgumentCollection(PATH_TO_RSCRIPT, Arrays.asList(PATH_TO_RESOURCES)); - RScriptExecutor executor = new RScriptExecutor(argumentCollection, true); - int numReadGroups = 0; // for each read group @@ -338,23 +337,32 @@ public class AnalyzeCovariates extends CommandLineProgram { if(++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) { String readGroup = readGroupKey.toString(); - System.out.println("Analyzing read group: " + readGroup); + logger.info("Analyzing read group: " + readGroup); // for each covariate for( int iii = 1; iii < requestedCovariates.size(); iii++ ) { Covariate cov = requestedCovariates.get(iii); - final String outputFilename = OUTPUT_DIR + readGroup + "." + cov.getClass().getSimpleName()+ ".dat"; + final File outputFile = new File(OUTPUT_DIR, readGroup + "." 
+ cov.getClass().getSimpleName()+ ".dat"); if (DO_INDEL_QUALITY) { - executor.callRScripts("plot_indelQuality.R", outputFilename, - cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(PLOT_INDEL_QUALITY_RSCRIPT, AnalyzeCovariates.class)); + // The second argument is the name of the covariate in order to make the plots look nice + executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]); + executor.exec(); } else { if( iii == 1 ) { // Analyze reported quality - executor.callRScripts("plot_residualError_QualityScoreCovariate.R", outputFilename, - IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE); // The third argument is the Q scores that should be turned pink in the plot because they were ignored + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE, AnalyzeCovariates.class)); + // The second argument is the Q scores that should be turned pink in the plot because they were ignored + executor.addArgs(outputFile, IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE); + executor.exec(); } else { // Analyze all other covariates - executor.callRScripts("plot_residualError_OtherCovariate.R", outputFilename, - cov.getClass().getSimpleName().split("Covariate")[0]); // The third argument is the name of the covariate in order to make the plots look nice + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_OTHER_COVARIATE, AnalyzeCovariates.class)); + // The second argument is the name of the covariate in order to make the plots look nice + executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]); + executor.exec(); } } } diff --git 
a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java index 351583c07..c0823e5c5 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java @@ -46,7 +46,7 @@ public class ArgumentMatch implements Iterable { /** * Maps indices of command line arguments to values paired with that argument. */ - public final SortedMap> indices = new TreeMap>(); + public final SortedMap> sites = new TreeMap>(); /** * An ordered, freeform collection of tags. @@ -72,32 +72,32 @@ public class ArgumentMatch implements Iterable { } /** - * A simple way of indicating that an argument with the given label and definition exists at this index. + * A simple way of indicating that an argument with the given label and definition exists at this site. * @param label Label of the argument match. Must not be null. * @param definition The associated definition, if one exists. May be null. - * @param index Position of the argument. Must not be null. + * @param site Position of the argument. Must not be null. * @param tags ordered freeform text tags associated with this argument. */ - public ArgumentMatch(final String label, final ArgumentDefinition definition, final int index, final Tags tags) { - this( label, definition, index, null, tags ); + public ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final Tags tags) { + this( label, definition, site, null, tags ); } /** - * A simple way of indicating that an argument with the given label and definition exists at this index. + * A simple way of indicating that an argument with the given label and definition exists at this site. * @param label Label of the argument match. Must not be null. * @param definition The associated definition, if one exists. May be null. - * @param index Position of the argument. 
Must not be null. + * @param site Position of the argument. Must not be null. * @param value Value for the argument at this position. * @param tags ordered freeform text tags associated with this argument. */ - private ArgumentMatch(final String label, final ArgumentDefinition definition, final int index, final String value, final Tags tags) { + private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final String value, final Tags tags) { this.label = label; this.definition = definition; ArrayList values = new ArrayList(); if( value != null ) values.add(value); - indices.put(index,values ); + sites.put(site,values ); this.tags = tags; } @@ -117,7 +117,7 @@ public class ArgumentMatch implements Iterable { ArgumentMatch otherArgumentMatch = (ArgumentMatch)other; return this.definition.equals(otherArgumentMatch.definition) && this.label.equals(otherArgumentMatch.label) && - this.indices.equals(otherArgumentMatch.indices) && + this.sites.equals(otherArgumentMatch.sites) && this.tags.equals(otherArgumentMatch.tags); } @@ -129,16 +129,17 @@ public class ArgumentMatch implements Iterable { * @param key Key which specifies the transform. * @return A variant of this ArgumentMatch with all keys transformed. 
*/ + @SuppressWarnings("unchecked") ArgumentMatch transform(Multiplexer multiplexer, Object key) { - SortedMap> newIndices = new TreeMap>(); - for(Map.Entry> index: indices.entrySet()) { + SortedMap> newIndices = new TreeMap>(); + for(Map.Entry> site: sites.entrySet()) { List newEntries = new ArrayList(); - for(String entry: index.getValue()) + for(String entry: site.getValue()) newEntries.add(multiplexer.transformArgument(key,entry)); - newIndices.put(index.getKey(),newEntries); + newIndices.put(site.getKey(),newEntries); } ArgumentMatch newArgumentMatch = new ArgumentMatch(label,definition); - newArgumentMatch.indices.putAll(newIndices); + newArgumentMatch.sites.putAll(newIndices); return newArgumentMatch; } @@ -157,9 +158,9 @@ public class ArgumentMatch implements Iterable { public Iterator iterator() { return new Iterator() { /** - * Iterate over each the available index. + * Iterate over each the available site. */ - private Iterator indexIterator = null; + private Iterator siteIterator = null; /** * Iterate over each available token. @@ -167,9 +168,9 @@ public class ArgumentMatch implements Iterable { private Iterator tokenIterator = null; /** - * The next index to return. Null if none remain. + * The next site to return. Null if none remain. */ - Integer nextIndex = null; + ArgumentMatchSite nextSite = null; /** * The next token to return. Null if none remain. @@ -177,7 +178,7 @@ public class ArgumentMatch implements Iterable { String nextToken = null; { - indexIterator = indices.keySet().iterator(); + siteIterator = sites.keySet().iterator(); prepareNext(); } @@ -186,7 +187,7 @@ public class ArgumentMatch implements Iterable { * @return True if there's another token waiting in the wings. False otherwise. */ public boolean hasNext() { - return nextToken != null; + return nextToken != null; } /** @@ -194,32 +195,32 @@ public class ArgumentMatch implements Iterable { * @return The next ArgumentMatch in the series. Should never be null. 
*/ public ArgumentMatch next() { - if( nextIndex == null || nextToken == null ) + if( nextSite == null || nextToken == null ) throw new IllegalStateException( "No more ArgumentMatches are available" ); - ArgumentMatch match = new ArgumentMatch( label, definition, nextIndex, nextToken, tags ); + ArgumentMatch match = new ArgumentMatch( label, definition, nextSite, nextToken, tags ); prepareNext(); return match; } /** * Initialize the next ArgumentMatch to return. If no ArgumentMatches are available, - * initialize nextIndex / nextToken to null. + * initialize nextSite / nextToken to null. */ private void prepareNext() { if( tokenIterator != null && tokenIterator.hasNext() ) { nextToken = tokenIterator.next(); } else { - nextIndex = null; + nextSite = null; nextToken = null; // Do a nested loop. While more data is present in the inner loop, grab that data. // Otherwise, troll the outer iterator looking for more data. - while( indexIterator.hasNext() ) { - nextIndex = indexIterator.next(); - if( indices.get(nextIndex) != null ) { - tokenIterator = indices.get(nextIndex).iterator(); + while( siteIterator.hasNext() ) { + nextSite = siteIterator.next(); + if( sites.get(nextSite) != null ) { + tokenIterator = sites.get(nextSite).iterator(); if( tokenIterator.hasNext() ) { nextToken = tokenIterator.next(); break; @@ -245,29 +246,29 @@ public class ArgumentMatch implements Iterable { * @param other The other match to merge into. */ public void mergeInto( ArgumentMatch other ) { - indices.putAll(other.indices); + sites.putAll(other.sites); } /** * Associate a value with this merge maapping. - * @param index index of the command-line argument to which this value is mated. + * @param site site of the command-line argument to which this value is mated. * @param value Text representation of value to add. 
*/ - public void addValue( int index, String value ) { - if( !indices.containsKey(index) || indices.get(index) == null ) - indices.put(index, new ArrayList() ); - indices.get(index).add(value); + public void addValue( ArgumentMatchSite site, String value ) { + if( !sites.containsKey(site) || sites.get(site) == null ) + sites.put(site, new ArrayList() ); + sites.get(site).add(value); } /** * Does this argument already have a value at the given site? * Arguments are only allowed to be single-valued per site, and * flags aren't allowed a value at all. - * @param index Index at which to check for values. + * @param site Site at which to check for values. * @return True if the argument has a value at the given site. False otherwise. */ - public boolean hasValueAtSite( int index ) { - return (indices.get(index) != null && indices.get(index).size() >= 1) || isArgumentFlag(); + public boolean hasValueAtSite( ArgumentMatchSite site ) { + return (sites.get(site) != null && sites.get(site).size() >= 1) || isArgumentFlag(); } /** @@ -276,9 +277,9 @@ public class ArgumentMatch implements Iterable { */ public List values() { List values = new ArrayList(); - for( int index: indices.keySet() ) { - if( indices.get(index) != null ) - values.addAll(indices.get(index)); + for( ArgumentMatchSite site: sites.keySet() ) { + if( sites.get(site) != null ) + values.addAll(sites.get(site)); } return values; } diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java new file mode 100644 index 000000000..8a4120101 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSite.java @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, 
including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.commandline; + +/** + * Which source and the index within the source where an argument match was found. + */ +public class ArgumentMatchSite implements Comparable { + private final ArgumentMatchSource source; + private final int index; + + public ArgumentMatchSite(ArgumentMatchSource source, int index) { + this.source = source; + this.index = index; + } + + public ArgumentMatchSource getSource() { + return source; + } + + public int getIndex() { + return index; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + ArgumentMatchSite that = (ArgumentMatchSite) o; + + return (index == that.index) && (source == null ? that.source == null : source.equals(that.source)); + } + + @Override + public int hashCode() { + int result = source != null ? source.hashCode() : 0; + // Generated by intellij. No other special reason to this implementation. 
-ks + result = 31 * result + index; + return result; + } + + @Override + public int compareTo(ArgumentMatchSite that) { + int comp = this.source.compareTo(that.source); + if (comp != 0) + return comp; + + // Both files are the same. + if (this.index == that.index) + return 0; + return this.index < that.index ? -1 : 1; + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java new file mode 100644 index 000000000..ed2700006 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.commandline; + +import java.io.File; + +/** + * Where an argument match originated, via the commandline or a file. 
+ */ +public class ArgumentMatchSource implements Comparable { + public static final ArgumentMatchSource COMMAND_LINE = new ArgumentMatchSource(ArgumentMatchSourceType.CommandLine, null); + + private final ArgumentMatchSourceType type; + private final File file; + + /** + * Creates an argument match source from the specified file. + * @param file File specifying the arguments. Must not be null. + */ + public ArgumentMatchSource(File file) { + this(ArgumentMatchSourceType.File, file); + } + + private ArgumentMatchSource(ArgumentMatchSourceType type, File file) { + if (type == ArgumentMatchSourceType.File && file == null) + throw new IllegalArgumentException("An argument match source of type File cannot have a null file."); + this.type = type; + this.file = file; + } + + public ArgumentMatchSourceType getType() { + return type; + } + + public File getFile() { + return file; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + ArgumentMatchSource that = (ArgumentMatchSource) o; + + return (type == that.type) && (file == null ? that.file == null : file.equals(that.file)); + } + + @Override + public int hashCode() { + int result = type != null ? type.hashCode() : 0; + result = 31 * result + (file != null ? file.hashCode() : 0); + return result; + } + + /** + * Compares two sources, putting the command line first, then files. + */ + @Override + public int compareTo(ArgumentMatchSource that) { + int comp = this.type.compareTo(that.type); + if (comp != 0) + return comp; + + File f1 = this.file; + File f2 = that.file; + + if ((f1 == null) ^ (f2 == null)) { + // If one of the files is null and the other is not + // put the null file first + return f1 == null ? -1 : 1; + } + + return f1 == null ? 
0 : f1.compareTo(f2); + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java new file mode 100644 index 000000000..3ff6e21d4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.commandline; + +/** + * Type of where an argument match originated, via the commandline or a file. 
+ */ +public enum ArgumentMatchSourceType { + CommandLine, File +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java index 52d3b8232..3da28c420 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatches.java @@ -37,7 +37,7 @@ public class ArgumentMatches implements Iterable { * Collection matches from argument definition to argument value. * Package protected access is deliberate. */ - Map argumentMatches = new TreeMap(); + Map argumentMatches = new TreeMap(); /** * Provide a place to put command-line argument values that don't seem to belong to @@ -80,7 +80,7 @@ public class ArgumentMatches implements Iterable { * @param site Site at which to check. * @return True if the site has a match. False otherwise. */ - boolean hasMatch( int site ) { + boolean hasMatch( ArgumentMatchSite site ) { return argumentMatches.containsKey( site ); } @@ -90,7 +90,7 @@ public class ArgumentMatches implements Iterable { * @return The match present at the given site. * @throws IllegalArgumentException if site does not contain a match. */ - ArgumentMatch getMatch( int site ) { + ArgumentMatch getMatch( ArgumentMatchSite site ) { if( !argumentMatches.containsKey(site) ) throw new IllegalArgumentException( "Site does not contain an argument: " + site ); return argumentMatches.get(site); @@ -107,6 +107,7 @@ public class ArgumentMatches implements Iterable { /** * Return all argument matches of this source. + * @param parsingEngine Parsing engine. * @param argumentSource Argument source to match. * @return List of all matches. */ @@ -167,6 +168,7 @@ public class ArgumentMatches implements Iterable { * TODO: Generify this. * @param multiplexer Multiplexer that controls the transformation process. * @param key Key which specifies the transform. + * @return new argument matches. 
*/ ArgumentMatches transform(Multiplexer multiplexer, Object key) { ArgumentMatches newArgumentMatches = new ArgumentMatches(); @@ -187,15 +189,15 @@ public class ArgumentMatches implements Iterable { for( ArgumentMatch argumentMatch: getUniqueMatches() ) { if( argumentMatch.definition == match.definition && argumentMatch.tags.equals(match.tags) ) { argumentMatch.mergeInto( match ); - for( int index: match.indices.keySet() ) - argumentMatches.put( index, argumentMatch ); + for( ArgumentMatchSite site: match.sites.keySet() ) + argumentMatches.put( site, argumentMatch ); definitionExists = true; } } if( !definitionExists ) { - for( int index: match.indices.keySet() ) - argumentMatches.put( index, match ); + for( ArgumentMatchSite site: match.sites.keySet() ) + argumentMatches.put( site, match ); } } diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java index d1d9cf7fe..31212a46f 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java @@ -336,6 +336,28 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + return parse(parsingEngine, source, type, matches, false); + } + + /** + * The actual argument parsing method. + * + * IMPORTANT NOTE: the createIntervalBinding argument is a bit of a hack, but after discussions with SE we've decided + * that it's the best way to proceed for now. IntervalBindings can either be proper RodBindings (hence the use of + * this parse() method) or can be Strings (representing raw intervals or the files containing them). 
If createIntervalBinding + * is true, we do not call parsingEngine.addRodBinding() because we don't want walkers to assume that these are the + * usual set of RodBindings. It also allows us in the future to be smart about tagging rods as intervals. One other + * side point is that we want to continue to allow the usage of non-Feature intervals so that users can theoretically + * continue to input them out of order (whereas Tribble Features are ordered). + * + * @param parsingEngine parsing engine + * @param source source + * @param type type to check + * @param matches matches + * @param createIntervalBinding should we attempt to create an IntervalBinding instead of a RodBinding? + * @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding. + */ + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches, boolean createIntervalBinding) { ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); String value = getArgumentValue( defaultDefinition, matches ); Class parameterType = JVMUtils.getParameterizedTypeClass(type); @@ -348,7 +370,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { if ( tags.getPositionalTags().size() > 2 ) { throw new UserException.CommandLineException( String.format("Unexpected number of positional tags for argument %s : %s. " + - "Rod bindings only suport -X:type and -X:name,type argument styles", + "Rod bindings only support -X:type and -X:name,type argument styles", value, source.field.getName())); } if ( tags.getPositionalTags().size() == 2 ) { // -X:name,type style @@ -378,7 +400,12 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { } } - if ( tribbleType == null ) + if ( tribbleType == null ) { + // IntervalBindings allow streaming conversion of Strings + if ( createIntervalBinding ) { + return new IntervalBinding(value); + } + if ( ! 
file.exists() ) { throw new UserException.CouldNotReadInputFile(file, "file does not exist"); } else if ( ! file.canRead() || ! file.isFile() ) { @@ -389,13 +416,20 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s", manager.userFriendlyListOfAvailableFeatures(parameterType))); } + } } } Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); - RodBinding result = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags); - parsingEngine.addTags(result,tags); - parsingEngine.addRodBinding(result); + Object result; + if ( createIntervalBinding ) { + result = ctor.newInstance(parameterType, name, value, tribbleType, tags); + } else { + RodBinding rbind = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags); + parsingEngine.addTags(rbind, tags); + parsingEngine.addRodBinding(rbind); + result = rbind; + } return result; } catch (InvocationTargetException e) { throw new UserException.CommandLineException( @@ -409,6 +443,39 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { } } +/** + * Parser for IntervalBinding objects + */ +class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { + /** + * We only want IntervalBinding class objects + * @param type The type to check. + * @return true if the provided class is an IntervalBinding.class + */ + @Override + public boolean supports( Class type ) { + return isIntervalBinding(type); + } + + public static boolean isIntervalBinding( Class type ) { + return IntervalBinding.class.isAssignableFrom(type); + } + + /** + * See note from RodBindingArgumentTypeDescriptor.parse(). + * + * @param parsingEngine parsing engine + * @param source source + * @param type type to check + * @param matches matches + * @return the IntervalBinding object. 
+ */ + @Override + public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) { + return new RodBindingArgumentTypeDescriptor().parse(parsingEngine, source, type, matches, true); + } +} + /** * Parse simple argument types: java primitives, wrapper classes, and anything that has * a simple String constructor. @@ -416,7 +483,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor { class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor { @Override public boolean supports( Class type ) { - if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) ) return false; + if ( RodBindingArgumentTypeDescriptor.isRodBinding(type) || IntervalBindingArgumentTypeDescriptor.isIntervalBinding(type) ) return false; if ( type.isPrimitive() ) return true; if ( type.isEnum() ) return true; if ( primitiveToWrapperMap.containsValue(type) ) return true; diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index d88e7030e..bed1e710e 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -35,10 +35,7 @@ import org.broadinstitute.sting.utils.help.ApplicationDetails; import org.broadinstitute.sting.utils.help.HelpFormatter; import java.io.IOException; -import java.util.Collection; -import java.util.Collections; -import java.util.EnumSet; -import java.util.Locale; +import java.util.*; public abstract class CommandLineProgram { @@ -155,6 +152,7 @@ public abstract class CommandLineProgram { * * @param clp the command line program to execute * @param args the command line arguments passed in + * @param dryRun dry run * @throws Exception when an exception occurs */ @SuppressWarnings("unchecked") @@ -176,6 +174,8 @@ public abstract class CommandLineProgram { ParsingEngine parser = 
clp.parser = new ParsingEngine(clp); parser.addArgumentSource(clp.getClass()); + Map> parsedArgs; + // process the args if (clp.canAddArgumentsDynamically()) { // if the command-line program can toss in extra args, fetch them and reparse the arguments. @@ -196,14 +196,14 @@ public abstract class CommandLineProgram { Class[] argumentSources = clp.getArgumentSources(); for (Class argumentSource : argumentSources) parser.addArgumentSource(clp.getArgumentSourceName(argumentSource), argumentSource); - parser.parse(args); + parsedArgs = parser.parse(args); if (isHelpPresent(parser)) printHelpAndExit(clp, parser); if ( ! dryRun ) parser.validate(); } else { - parser.parse(args); + parsedArgs = parser.parse(args); if ( ! dryRun ) { if (isHelpPresent(parser)) @@ -230,7 +230,7 @@ public abstract class CommandLineProgram { } // regardless of what happens next, generate the header information - HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), args); + HelpFormatter.generateHeaderInformation(clp.getApplicationDetails(), parsedArgs); // call the execute CommandLineProgram.result = clp.execute(); diff --git a/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java new file mode 100644 index 000000000..86ca6c2df --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/commandline/IntervalBinding.java @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and 
this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.commandline; + +import com.google.java.contract.Requires; +import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodec; +import org.broad.tribble.readers.AsciiLineReader; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec; +import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalUtils; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.*; + +/** + * An IntervalBinding representing a walker argument that gets bound to either a ROD track or interval string. + * + * The IntervalBinding is a formal GATK argument that bridges between a walker and + * the engine to construct intervals for traversal at runtime. The IntervalBinding can + * either be a RodBinding, a string of one or more intervals, or a file with interval strings. + * The GATK Engine takes care of initializing the binding when appropriate and determining intervals from it. + * + * Note that this class is immutable. 
+ */ +public final class IntervalBinding { + + private RodBinding featureIntervals; + private String stringIntervals; + + @Requires({"type != null", "rawName != null", "source != null", "tribbleType != null", "tags != null"}) + public IntervalBinding(Class type, final String rawName, final String source, final String tribbleType, final Tags tags) { + featureIntervals = new RodBinding(type, rawName, source, tribbleType, tags); + } + + @Requires({"intervalArgument != null"}) + public IntervalBinding(String intervalArgument) { + stringIntervals = intervalArgument; + } + + public String getSource() { + if ( featureIntervals != null ) + return featureIntervals.getSource(); + return stringIntervals; + } + + public List getIntervals(GenomeAnalysisEngine toolkit) { + List intervals; + + if ( featureIntervals != null ) { + intervals = new ArrayList(); + + //RMDTrackBuilder builder = new RMDTrackBuilder(toolkit.getReferenceDataSource().getReference().getSequenceDictionary(), + // toolkit.getGenomeLocParser(), + // toolkit.getArguments().unsafe); + + // TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files + + FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec(); + if ( codec instanceof ReferenceDependentFeatureCodec ) + ((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser()); + try { + FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource())); + AsciiLineReader lineReader = new AsciiLineReader(fis); + codec.readHeader(lineReader); + String line = lineReader.readLine(); + while ( line != null ) { + intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(codec.decodeLoc(line))); + line = lineReader.readLine(); + } + } catch (IOException e) { + throw new UserException("Problem reading the interval file " + featureIntervals.getSource() + "; " + e.getMessage()); + } + + } else { + intervals = 
IntervalUtils.parseIntervalArguments(toolkit.getGenomeLocParser(), stringIntervals); + } + + return intervals; + } +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java index fbf8c6516..0fac195e1 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.commandline; import com.google.java.contract.Requires; +import org.apache.commons.io.FileUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.JVMUtils; @@ -35,6 +36,8 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.ApplicationDetails; import org.broadinstitute.sting.utils.help.HelpFormatter; +import java.io.File; +import java.io.IOException; import java.lang.reflect.Field; import java.util.*; @@ -75,6 +78,7 @@ public class ParsingEngine { * The type of set used must be ordered (but not necessarily sorted). */ private static final Set STANDARD_ARGUMENT_TYPE_DESCRIPTORS = new LinkedHashSet( Arrays.asList(new SimpleArgumentTypeDescriptor(), + new IntervalBindingArgumentTypeDescriptor(), new RodBindingArgumentTypeDescriptor(), new CompoundArgumentTypeDescriptor(), new MultiplexArgumentTypeDescriptor()) ); @@ -100,6 +104,8 @@ public class ParsingEngine { if(clp != null) argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors()); argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS); + + addArgumentSource(ParsingEngineArgumentFiles.class); } /** @@ -148,21 +154,43 @@ public class ParsingEngine { * command-line arguments to the arguments that are actually * required. * @param tokens Tokens passed on the command line. + * @return The parsed arguments by file. 
*/ - public void parse( String[] tokens ) { + public SortedMap> parse( String[] tokens ) { argumentMatches = new ArgumentMatches(); + SortedMap> parsedArgs = new TreeMap>(); - int lastArgumentMatchSite = -1; + List cmdLineTokens = Arrays.asList(tokens); + parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs); - for( int i = 0; i < tokens.length; i++ ) { - String token = tokens[i]; + ParsingEngineArgumentFiles argumentFiles = new ParsingEngineArgumentFiles(); + + // Load the arguments ONLY into the argument files. + // Validation may optionally run on the rest of the arguments. + loadArgumentsIntoObject(argumentFiles); + + for (File file: argumentFiles.files) { + List fileTokens = getArguments(file); + parse(new ArgumentMatchSource(file), fileTokens, argumentMatches, parsedArgs); + } + + return parsedArgs; + } + + private void parse(ArgumentMatchSource matchSource, List tokens, + ArgumentMatches argumentMatches, SortedMap> parsedArgs) { + ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1); + + int i = 0; + for (String token: tokens) { // If the token is of argument form, parse it into its own argument match. // Otherwise, pair it with the most recently used argument discovered. 
+ ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i); if( isArgumentForm(token) ) { - ArgumentMatch argumentMatch = parseArgument( token, i ); + ArgumentMatch argumentMatch = parseArgument( token, site ); if( argumentMatch != null ) { argumentMatches.mergeInto( argumentMatch ); - lastArgumentMatchSite = i; + lastArgumentMatchSite = site; } } else { @@ -170,10 +198,31 @@ public class ParsingEngine { !argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite)) argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, token ); else - argumentMatches.MissingArgument.addValue( i, token ); + argumentMatches.MissingArgument.addValue( site, token ); } + i++; } + + parsedArgs.put(matchSource, tokens); + } + + private List getArguments(File file) { + try { + if (file.getAbsolutePath().endsWith(".list")) { + return getListArguments(file); + } + } catch (IOException e) { + throw new UserException.CouldNotReadInputFile(file, e); + } + throw new UserException.CouldNotReadInputFile(file, "file extension is not .list"); + } + + private List getListArguments(File file) throws IOException { + ArrayList argsList = new ArrayList(); + for (String line: FileUtils.readLines(file)) + argsList.addAll(Arrays.asList(Utils.escapeExpressions(line))); + return argsList; } public enum ValidationType { MissingRequiredArgument, @@ -494,7 +543,7 @@ public class ParsingEngine { * @param position The position of the token in question. * @return ArgumentMatch associated with this token, or null if no match exists. 
*/ - private ArgumentMatch parseArgument( String token, int position ) { + private ArgumentMatch parseArgument( String token, ArgumentMatchSite position ) { if( !isArgumentForm(token) ) throw new IllegalArgumentException( "Token is not recognizable as an argument: " + token ); @@ -579,9 +628,21 @@ class UnmatchedArgumentException extends ArgumentException { private static String formatArguments( ArgumentMatch invalidValues ) { StringBuilder sb = new StringBuilder(); - for( int index: invalidValues.indices.keySet() ) - for( String value: invalidValues.indices.get(index) ) { - sb.append( String.format("%nInvalid argument value '%s' at position %d.", value, index) ); + for( ArgumentMatchSite site: invalidValues.sites.keySet() ) + for( String value: invalidValues.sites.get(site) ) { + switch (site.getSource().getType()) { + case CommandLine: + sb.append( String.format("%nInvalid argument value '%s' at position %d.", + value, site.getIndex()) ); + break; + case File: + sb.append( String.format("%nInvalid argument value '%s' in file %s at position %d.", + value, site.getSource().getFile().getAbsolutePath(), site.getIndex()) ); + break; + default: + throw new RuntimeException( String.format("Unexpected argument match source type: %s", + site.getSource().getType())); + } if(value != null && Utils.dupString(' ',value.length()).equals(value)) sb.append(" Please make sure any line continuation backslashes on your command line are not followed by whitespace."); } @@ -634,4 +695,13 @@ class UnknownEnumeratedValueException extends ArgumentException { private static String formatArguments(ArgumentDefinition definition, String argumentPassed) { return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions)); } -} \ No newline at end of file +} + +/** + * Container class to store the list of argument files. + * The files will be parsed after the command line arguments. 
+ */ +class ParsingEngineArgumentFiles { + @Argument(fullName = "arg_file", shortName = "args", doc = "Reads arguments from the specified file", required = false) + public List files = new ArrayList(); +} diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java index a070cb5a1..452309e89 100755 --- a/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java +++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingMethod.java @@ -68,7 +68,7 @@ public abstract class ParsingMethod { * @return An argument match. Definition field will be populated if a match was found or * empty if no appropriate definition could be found. */ - public ArgumentMatch match( ArgumentDefinitions definitions, String token, int position ) { + public ArgumentMatch match( ArgumentDefinitions definitions, String token, ArgumentMatchSite position ) { // If the argument is valid, parse out the argument. Matcher matcher = pattern.matcher(token); @@ -102,9 +102,7 @@ public abstract class ParsingMethod { // Try to find a matching argument. If found, label that as the match. If not found, add the argument // with a null definition. 
- ArgumentMatch argumentMatch = new ArgumentMatch(argument,argumentDefinition,position,tags); - - return argumentMatch; + return new ArgumentMatch(argument,argumentDefinition,position,tags); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 5b9ebd99b..f8e87aa58 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -28,34 +28,30 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.*; import org.apache.log4j.Logger; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; +import org.broadinstitute.sting.gatk.samples.SampleDB; import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.stubs.Stub; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; -import org.broadinstitute.sting.gatk.refdata.utils.RMDIntervalGenerator; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; 
+import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.SequenceDictionaryUtils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; import java.util.*; @@ -92,7 +88,7 @@ public class GenomeAnalysisEngine { /** * Accessor for sample metadata */ - private SampleDataSource sampleDataSource = null; + private SampleDB sampleDB = null; /** * Accessor for sharded reference-ordered data. @@ -206,6 +202,9 @@ public class GenomeAnalysisEngine { // Prepare the data for traversal. initializeDataSources(); + // initialize sampleDB + initializeSampleDB(); + // initialize and validate the interval list initializeIntervals(); validateSuppliedIntervals(); @@ -222,12 +221,12 @@ public class GenomeAnalysisEngine { ShardStrategy shardStrategy = getShardStrategy(readsDataSource,microScheduler.getReference(),intervals); // execute the microscheduler, storing the results - Object result = microScheduler.execute(this.walker, shardStrategy); + return microScheduler.execute(this.walker, shardStrategy); //monitor.stop(); //logger.info(String.format("Maximum heap size consumed: %d",monitor.getMaxMemoryUsed())); - return result; + //return result; } /** @@ -259,13 +258,12 @@ public class GenomeAnalysisEngine { * @return A collection of available filters. 
*/ public Collection createFilters() { - Set filters = new HashSet(); - filters.addAll(WalkerManager.getReadFilters(walker,this.getFilterManager())); + final List filters = WalkerManager.getReadFilters(walker,this.getFilterManager()); if (this.getArguments().readGroupBlackList != null && this.getArguments().readGroupBlackList.size() > 0) filters.add(new ReadGroupBlackListFilter(this.getArguments().readGroupBlackList)); - for(String filterName: this.getArguments().readFilters) + for(final String filterName: this.getArguments().readFilters) filters.add(this.getFilterManager().createByName(filterName)); - return Collections.unmodifiableSet(filters); + return Collections.unmodifiableList(filters); } /** @@ -299,10 +297,14 @@ public class GenomeAnalysisEngine { else if(WalkerManager.getDownsamplingMethod(walker) != null) method = WalkerManager.getDownsamplingMethod(walker); else - method = argCollection.getDefaultDownsamplingMethod(); + method = GATKArgumentCollection.getDefaultDownsamplingMethod(); return method; } + protected void setDownsamplingMethod(DownsamplingMethod method) { + argCollection.setDownsamplingMethod(method); + } + public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); } public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); } @@ -381,18 +383,18 @@ public class GenomeAnalysisEngine { // If intervals is non-null and empty at this point, it means that the list of intervals to process // was filtered down to an empty set (eg., the user specified something like -L chr1 -XL chr1). Since // this was very likely unintentional, the user should be informed of this. Note that this is different - // from the case where intervals == null, which indicates either that there were no interval arguments, - // or that -L all was specified. + // from the case where intervals == null, which indicates that there were no interval arguments. 
if ( intervals != null && intervals.isEmpty() ) { - throw new ArgumentException("The given combination of -L and -XL options results in an empty set. " + - "No intervals to process."); + logger.warn("The given combination of -L and -XL options results in an empty set. No intervals to process."); } } /** * Get the sharding strategy given a driving data source. * + * @param readsDataSource readsDataSource * @param drivingDataSource Data on which to shard. + * @param intervals intervals * @return the sharding strategy */ protected ShardStrategy getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { @@ -429,7 +431,7 @@ public class GenomeAnalysisEngine { return new MonolithicShardStrategy(getGenomeLocParser(), readsDataSource,shardType,region); } - ShardStrategy shardStrategy = null; + ShardStrategy shardStrategy; ShardStrategyFactory.SHATTER_STRATEGY shardType; long SHARD_SIZE = 100000L; @@ -438,6 +440,8 @@ public class GenomeAnalysisEngine { if (walker instanceof RodWalker) SHARD_SIZE *= 1000; if (intervals != null && !intervals.isEmpty()) { + if (readsDataSource == null) + throw new IllegalArgumentException("readsDataSource is null"); if(!readsDataSource.isEmpty() && readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Locus walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); @@ -501,7 +505,8 @@ public class GenomeAnalysisEngine { */ private void initializeTempDirectory() { File tempDir = new File(System.getProperty("java.io.tmpdir")); - tempDir.mkdirs(); + if (!tempDir.exists() && !tempDir.mkdirs()) + throw new UserException.BadTmpDir("Unable to create directory"); } /** @@ -566,34 +571,23 @@ public class GenomeAnalysisEngine { protected void initializeIntervals() { // return if no interval arguments at all - if ((argCollection.intervals == null) && (argCollection.excludeIntervals == null) && (argCollection.RODToInterval == null)) + if ( argCollection.intervals == null && argCollection.excludeIntervals == null ) return; - // if '-L all' was specified, verify that it was the only -L specified and return if so. - if(argCollection.intervals != null) { - for(String interval: argCollection.intervals) { - if(interval.trim().equals("all")) { - if(argCollection.intervals.size() > 1) - throw new UserException("'-L all' was specified along with other intervals or interval lists; the GATK cannot combine '-L all' with other intervals."); - - // '-L all' was specified and seems valid. Return. - return; - } - } - } + // Note that the use of '-L all' is no longer supported. // if include argument isn't given, create new set of all possible intervals - GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? + GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ? 
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : - loadIntervals(argCollection.intervals, IntervalUtils.mergeIntervalLocations(getRODIntervals(), argCollection.intervalMerging))); + loadIntervals(argCollection.intervals, argCollection.intervalSetRule)); // if no exclude arguments, can return parseIntervalArguments directly - if (argCollection.excludeIntervals == null) + if ( argCollection.excludeIntervals == null ) intervals = includeSortedSet; - // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets + // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets else { - GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, null); + GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, IntervalSetRule.UNION); intervals = includeSortedSet.subtractRegions(excludeSortedSet); // logging messages only printed when exclude (-XL) arguments are given @@ -608,47 +602,26 @@ public class GenomeAnalysisEngine { /** * Loads the intervals relevant to the current execution - * @param argList String representation of arguments; might include 'all', filenames, intervals in samtools - * notation, or a combination of the above - * @param rodIntervals a list of ROD intervals to add to the returned set. Can be empty or null. + * @param argList argument bindings; might include filenames, intervals in samtools notation, or a combination of the above + * @param rule interval merging rule * @return A sorted, merged list of all intervals specified in this arg list. 
*/ - protected GenomeLocSortedSet loadIntervals( List argList, List rodIntervals ) { + protected GenomeLocSortedSet loadIntervals( List> argList, IntervalSetRule rule ) { - boolean allowEmptyIntervalList = (argCollection.unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST || - argCollection.unsafe == ValidationExclusion.TYPE.ALL); + List allIntervals = new ArrayList(0); + for ( IntervalBinding intervalBinding : argList ) { + List intervals = intervalBinding.getIntervals(this); - List nonRODIntervals = IntervalUtils.parseIntervalArguments(genomeLocParser, argList, allowEmptyIntervalList); - List allIntervals = IntervalUtils.mergeListsBySetOperator(rodIntervals, nonRODIntervals, argCollection.BTIMergeRule); + if ( intervals.isEmpty() ) { + logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed."); + } + + allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, rule); + } return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging); } - /** - * if we have a ROD specified as a 'rodToIntervalTrackName', convert its records to RODs - * @return ROD intervals as GenomeLocs - */ - private List getRODIntervals() { - Map rodNames = RMDIntervalGenerator.getRMDTrackNames(rodDataSources); - // Do we have any RODs that overloaded as interval lists with the 'rodToIntervalTrackName' flag? 
- List ret = new ArrayList(); - if (rodNames != null && argCollection.RODToInterval != null) { - String rodName = argCollection.RODToInterval; - - // check to make sure we have a rod of that name - if (!rodNames.containsKey(rodName)) - throw new UserException.CommandLineException("--rodToIntervalTrackName (-BTI) was passed the name '"+rodName+"', which wasn't given as a ROD name in the -B option"); - - for (String str : rodNames.keySet()) - if (str.equals(rodName)) { - logger.info("Adding interval list from track (ROD) named " + rodName); - RMDIntervalGenerator intervalGenerator = new RMDIntervalGenerator(rodNames.get(str)); - ret.addAll(intervalGenerator.toGenomeLocList()); - } - } - return ret; - } - /** * Add additional, externally managed IO streams for inputs. * @@ -692,12 +665,22 @@ public class GenomeAnalysisEngine { for (ReadFilter filter : filters) filter.initialize(this); - sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles); - // set the sequence dictionary of all of Tribble tracks to the sequence dictionary of our reference rodDataSources = getReferenceOrderedDataSources(referenceMetaDataFiles,referenceDataSource.getReference().getSequenceDictionary(),genomeLocParser,argCollection.unsafe); } + /** + * Entry-point function to initialize the samples database from input data and pedigree arguments + */ + private void initializeSampleDB() { + SampleDBBuilder sampleDBBuilder = new SampleDBBuilder(this, argCollection.pedigreeValidationType); + sampleDBBuilder.addSamplesFromSAMHeader(getSAMFileHeader()); + sampleDBBuilder.addSamplesFromSampleNames(SampleUtils.getUniqueSamplesFromRods(this)); + sampleDBBuilder.addSamplesFromPedigreeFiles(argCollection.pedigreeFiles); + sampleDBBuilder.addSamplesFromPedigreeStrings(argCollection.pedigreeStrings); + sampleDB = sampleDBBuilder.getFinalSampleDB(); + } + /** * Gets a unique identifier for the reader sourcing this read. * @param read Read to examine. 
@@ -716,106 +699,13 @@ public class GenomeAnalysisEngine { return getReadsDataSource().getSAMFile(id); } - /** - * Returns sets of samples present in the (merged) input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of sample names - * found in the corresponding bam file. - * - * @return Sets of samples in the merged input SAM stream, grouped by readers - */ - public List> getSamplesByReaders() { - Collection readers = getReadsDataSource().getReaderIDs(); - - List> sample_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set samples = new HashSet(1); - sample_sets.add(samples); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - samples.add(g.getSample()); - } - } - - return sample_sets; - - } - - /** - * Returns sets of libraries present in the (merged) input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of library names - * found in the corresponding bam file. 
- * - * @return Sets of libraries present in the (merged) input SAM stream, grouped by readers - */ - public List> getLibrariesByReaders() { - - - Collection readers = getReadsDataSource().getReaderIDs(); - - List> lib_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set libs = new HashSet(2); - lib_sets.add(libs); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - libs.add(g.getLibrary()); - } - } - - return lib_sets; - - } - - /** - * **** UNLESS YOU HAVE GOOD REASON TO, DO NOT USE THIS METHOD; USE getFileToReadGroupIdMapping() INSTEAD **** - * - * Returns sets of (remapped) read groups in input SAM stream, grouped by readers (i.e. underlying - * individual bam files). For instance: if GATK is run with three input bam files (three -I arguments), then the list - * returned by this method will contain 3 elements (one for each reader), with each element being a set of remapped read groups - * (i.e. as seen by read.getReadGroup().getReadGroupId() in the merged stream) that come from the corresponding bam file. 
- * - * @return sets of (merged) read group ids in order of input bams - */ - public List> getMergedReadGroupsByReaders() { - - - Collection readers = getReadsDataSource().getReaderIDs(); - - List> rg_sets = new ArrayList>(readers.size()); - - for (SAMReaderID r : readers) { - - Set groups = new HashSet(5); - rg_sets.add(groups); - - for (SAMReadGroupRecord g : getReadsDataSource().getHeader(r).getReadGroups()) { - if (getReadsDataSource().hasReadGroupCollisions()) { // Check if there were read group clashes with hasGroupIdDuplicates and if so: - // use HeaderMerger to translate original read group id from the reader into the read group id in the - // merged stream, and save that remapped read group id to associate it with specific reader - groups.add(getReadsDataSource().getReadGroupId(r, g.getReadGroupId())); - } else { - // otherwise, pass through the unmapped read groups since this is what Picard does as well - groups.add(g.getReadGroupId()); - } - } - } - - return rg_sets; - - } - /** * Now that all files are open, validate the sequence dictionaries of the reads vs. the reference vrs the reference ordered data (if available). * * @param reads Reads data source. * @param reference Reference data source. * @param rods a collection of the reference ordered data tracks + * @param manager manager */ private void validateSourcesAgainstReference(SAMDataSource reads, ReferenceSequenceFile reference, Collection rods, RMDTrackBuilder manager) { if ((reads.isEmpty() && (rods == null || rods.isEmpty())) || reference == null ) @@ -844,15 +734,22 @@ public class GenomeAnalysisEngine { /** * Gets a data source for the given set of reads. * + * @param argCollection arguments + * @param genomeLocParser parser + * @param refReader reader * @return A data source for the given set of reads. 
*/ private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { DownsamplingMethod method = getDownsamplingMethod(); + // Synchronize the method back into the collection so that it shows up when + // interrogating for the downsample method during command line recreation. + setDownsamplingMethod(method); + if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF) throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested."); - SAMDataSource dataSource = new SAMDataSource( + return new SAMDataSource( samReaderIDs, genomeLocParser, argCollection.useOriginalBaseQualities, @@ -868,14 +765,12 @@ public class GenomeAnalysisEngine { refReader, argCollection.defaultBaseQualities, !argCollection.disableLowMemorySharding); - return dataSource; } /** * Opens a reference sequence file paired with an index. Only public for testing purposes * * @param refFile Handle to a reference sequence file. Non-null. - * @return A thread-safe file wrapper. */ public void setReferenceDataSource(File refFile) { this.referenceDataSource = new ReferenceDataSource(refFile); @@ -929,6 +824,26 @@ public class GenomeAnalysisEngine { return readsDataSource.getHeader(reader); } + /** + * Returns an ordered list of the unmerged SAM file headers known to this engine. 
+ * @return list of header for each input SAM file, in command line order + */ + public List getSAMFileHeaders() { + final List headers = new ArrayList(); + for ( final SAMReaderID id : getReadsDataSource().getReaderIDs() ) { + headers.add(getReadsDataSource().getHeader(id)); + } + return headers; + } + + /** + * Gets the master sequence dictionary for this GATK engine instance + * @return a never-null dictionary listing all of the contigs known to this engine instance + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return getReferenceDataSource().getReference().getSequenceDictionary(); + } + /** * Returns data source object encapsulating all essential info and handlers used to traverse * reads; header merger, individual file readers etc can be accessed through the returned data source object. @@ -939,8 +854,6 @@ public class GenomeAnalysisEngine { return this.readsDataSource; } - - /** * Sets the collection of GATK main application arguments. * @@ -1027,140 +940,14 @@ public class GenomeAnalysisEngine { return readsDataSource == null ? 
null : readsDataSource.getCumulativeReadMetrics(); } - public SampleDataSource getSampleMetadata() { - return this.sampleDataSource; - } + // ------------------------------------------------------------------------------------- + // + // code for working with Samples database + // + // ------------------------------------------------------------------------------------- - /** - * Get a sample by its ID - * If an alias is passed in, return the main sample object - * @param id sample id - * @return sample Object with this ID - */ - public Sample getSampleById(String id) { - return sampleDataSource.getSampleById(id); - } - - /** - * Get the sample for a given read group - * Must first look up ID for read group - * @param readGroup of sample - * @return sample object with ID from the read group - */ - public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { - return sampleDataSource.getSampleByReadGroup(readGroup); - } - - /** - * Get a sample for a given read - * Must first look up read group, and then sample ID for that read group - * @param read of sample - * @return sample object of this read - */ - public Sample getSampleByRead(SAMRecord read) { - return getSampleByReadGroup(read.getReadGroup()); - } - - /** - * Get number of sample objects - * @return size of samples map - */ - public int sampleCount() { - return sampleDataSource.sampleCount(); - } - - /** - * Return all samples with a given family ID - * Note that this isn't terribly efficient (linear) - it may be worth adding a new family ID data structure for this - * @param familyId family ID - * @return Samples with the given family ID - */ - public Set getFamily(String familyId) { - return sampleDataSource.getFamily(familyId); - } - - /** - * Returns all children of a given sample - * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient - * @param sample parent sample - * @return children of the given sample - */ - public Set getChildren(Sample 
sample) { - return sampleDataSource.getChildren(sample); - } - - /** - * Gets all the samples - * @return - */ - public Collection getSamples() { - return sampleDataSource.getSamples(); - } - - /** - * Takes a list of sample names and returns their corresponding sample objects - * - * @param sampleNameList List of sample names - * @return Corresponding set of samples - */ - public Set getSamples(Collection sampleNameList) { - return sampleDataSource.getSamples(sampleNameList); - } - - - /** - * Returns a set of samples that have any value (which could be null) for a given property - * @param key Property key - * @return Set of samples with the property - */ - public Set getSamplesWithProperty(String key) { - return sampleDataSource.getSamplesWithProperty(key); - } - - /** - * Returns a set of samples that have a property with a certain value - * Value must be a string for now - could add a similar method for matching any objects in the future - * - * @param key Property key - * @param value String property value - * @return Set of samples that match key and value - */ - public Set getSamplesWithProperty(String key, String value) { - return sampleDataSource.getSamplesWithProperty(key, value); - - } - - /** - * Returns a set of sample objects for the sample names in a variant context - * - * @param context Any variant context - * @return a set of the sample objects - */ - public Set getSamplesByVariantContext(VariantContext context) { - Set samples = new HashSet(); - for (String sampleName : context.getSampleNames()) { - samples.add(sampleDataSource.getOrCreateSample(sampleName)); - } - return samples; - } - - /** - * Returns all samples that were referenced in the SAM file - */ - public Set getSAMFileSamples() { - return sampleDataSource.getSAMFileSamples(); - } - - /** - * Return a subcontext restricted to samples with a given property key/value - * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering - * 
@param context VariantContext to filter - * @param key property key - * @param value property value (must be string) - * @return subcontext - */ - public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { - return sampleDataSource.subContextFromSampleProperty(context, key, value); + public SampleDB getSampleDB() { + return this.sampleDB; } public Map getApproximateCommandLineArguments(Object... argumentProviders) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java index 7cb615f7f..ceaa30f01 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java +++ b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Collections; import java.util.HashMap; import java.util.Map; +import java.util.TreeMap; /** * Holds a bunch of basic information about the traversal. 
@@ -102,8 +103,12 @@ public class ReadMetrics implements Cloneable { counter.put(filter.getClass(), c + 1L); } - public Map getCountsByFilter() { - return Collections.unmodifiableMap(counter); + public Map getCountsByFilter() { + final TreeMap sortedCounts = new TreeMap(); + for(Map.Entry counterEntry: counter.entrySet()) { + sortedCounts.put(counterEntry.getKey().getSimpleName(),counterEntry.getValue()); + } + return sortedCounts; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java index 2f4dd06e2..e0c2ce72a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java @@ -29,13 +29,11 @@ package org.broadinstitute.sting.gatk.arguments; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.simpleframework.xml.*; /** * @author ebanks * @version 1.0 */ -@Root public class DbsnpArgumentCollection { /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index fd39d46b0..8078a1ea4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -26,34 +26,26 @@ package org.broadinstitute.sting.gatk.arguments; import net.sf.samtools.SAMFileReader; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Input; +import org.broadinstitute.sting.commandline.IntervalBinding; import 
org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; -import org.simpleframework.xml.*; -import org.simpleframework.xml.core.Persister; -import org.simpleframework.xml.stream.Format; -import org.simpleframework.xml.stream.HyphenStyle; import java.io.File; -import java.io.InputStream; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * @author aaron * @version 1.0 */ -@Root public class GATKArgumentCollection { /* our version number */ @@ -64,58 +56,58 @@ public class GATKArgumentCollection { public GATKArgumentCollection() { } - @ElementMap(entry = "analysis_argument", key = "key", attribute = true, inline = true, required = false) public Map walkerArgs = new HashMap(); // parameters and their defaults - @ElementList(required = false) @Input(fullName = "input_file", shortName = "I", doc = "SAM or BAM file(s)", required = false) public List samFiles = new ArrayList(); - // parameters and their defaults - @ElementList(required = false) - @Argument(fullName = "sample_metadata", shortName = "SM", doc = "Sample file(s) in JSON format", required = false) - public List sampleFiles = new ArrayList(); - - @Element(required = false) @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; - @Element(required = false) @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? 
Standard is the default, can be verbose or NO_ET so nothing is posted to the run repository", required = false) public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD; - @ElementList(required = false) - @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually.", required = false) + @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); - @ElementList(required = false) - @Input(fullName = "intervals", shortName = "L", doc = "A list of genomic intervals over which to operate. Can be explicitly specified on the command line or in a file.", required = false) - public List intervals = null; + /** + * Using this option one can instruct the GATK engine to traverse over only part of the genome. This argument can be specified multiple times. + * One may use samtools-style intervals either explicitly (e.g. -L chr1 or -L chr1:100-200) or listed in a file (e.g. -L myFile.intervals). + * Additionally, one may specify a rod file to traverse over the positions for which there is a record in the file (e.g. -L file.vcf). + */ + @Input(fullName = "intervals", shortName = "L", doc = "One or more genomic intervals over which to operate. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> intervals = null; - @ElementList(required = false) - @Input(fullName = "excludeIntervals", shortName = "XL", doc = "A list of genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file.", required = false) - public List excludeIntervals = null; + /** + * Using this option one can instruct the GATK engine NOT to traverse over certain parts of the genome. This argument can be specified multiple times. 
+ * One may use samtools-style intervals either explicitly (e.g. -XL chr1 or -XL chr1:100-200) or listed in a file (e.g. -XL myFile.intervals). + * Additionally, one may specify a rod file to skip over the positions for which there is a record in the file (e.g. -XL file.vcf). + */ + @Input(fullName = "excludeIntervals", shortName = "XL", doc = "One or more genomic intervals to exclude from processing. Can be explicitly specified on the command line or in a file (including a rod file)", required = false) + public List> excludeIntervals = null; + + /** + * How should the intervals specified by multiple -L or -XL arguments be combined? Using this argument one can, for example, traverse over all of the positions + * for which there is a record in a VCF but just in chromosome 20 (-L chr20 -L file.vcf -isr INTERSECTION). + */ + @Argument(fullName = "interval_set_rule", shortName = "isr", doc = "Indicates the set merging approach the interval parser should use to combine the various -L or -XL inputs", required = false) + public IntervalSetRule intervalSetRule = IntervalSetRule.UNION; + + /** + * Should abutting (but not overlapping) intervals be treated as separate intervals? 
+ */ + @Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false) + public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; - @Element(required = false) @Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false) public File referenceFile = null; @Deprecated @Hidden - @ElementList(required = false) @Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :, ", required = false) public ArrayList RODBindings = new ArrayList(); - @Element(required = false) - @Argument(fullName = "rodToIntervalTrackName", shortName = "BTI", doc = "Indicates that the named track should be converted into an interval list, to drive the traversal", required = false) - public String RODToInterval = null; - - @Element(required = false) - @Argument(fullName = "BTI_merge_rule", shortName = "BTIMR", doc = "Indicates the merging approach the interval parser should use to combine the BTI track with other -L options", required = false) - public IntervalSetRule BTIMergeRule = IntervalSetRule.UNION; - - @Element(required = false) @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) public boolean nonDeterministicRandomSeed = false; @@ -128,22 +120,19 @@ public class GATKArgumentCollection { private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE; private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000; - @Element(required = false) - @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. 
Reads will be selected randomly to be removed from the pile based on the method described here.", required = false) + @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false) public DownsampleType downsamplingType = null; - @Element(required = false) @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; - @Element(required = false) @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false) public Integer downsampleCoverage = null; /** * Gets the downsampling method explicitly specified by the user. If the user didn't specify - * a default downsampling mechanism, return null. - * @return The explicitly specified downsampling mechanism, or null if none exists. + * a default downsampling mechanism, return the default. + * @return The explicitly specified downsampling mechanism, or the default if none exists. */ public DownsamplingMethod getDownsamplingMethod() { if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null) @@ -153,16 +142,26 @@ public class GATKArgumentCollection { return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction); } + /** + * Set the downsampling method stored in the argument collection so that it is read back out when interrogating the command line arguments. + * @param method The downsampling mechanism. 
+ */ + public void setDownsamplingMethod(DownsamplingMethod method) { + if (method == null) + throw new IllegalArgumentException("method is null"); + downsamplingType = method.type; + downsampleCoverage = method.toCoverage; + downsampleFraction = method.toFraction; + } + // -------------------------------------------------------------------------------------------------------------- // // BAQ arguments // // -------------------------------------------------------------------------------------------------------------- - @Element(required = false) @Argument(fullName = "baq", shortName="baq", doc="Type of BAQ calculation to apply in the engine", required = false) public BAQ.CalculationMode BAQMode = BAQ.CalculationMode.OFF; - @Element(required = false) @Argument(fullName = "baqGapOpenPenalty", shortName="baqGOP", doc="BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) public double BAQGOP = BAQ.DEFAULT_GOP; @@ -171,7 +170,6 @@ public class GATKArgumentCollection { // performance log arguments // // -------------------------------------------------------------------------------------------------------------- - @Element(required = false) @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false) public File performanceLog = null; @@ -184,67 +182,117 @@ public class GATKArgumentCollection { return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null); } - @Element(required = false) @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false) public Boolean useOriginalBaseQualities = false; @Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base 
quality scores", required=false) public byte defaultBaseQualities = -1; - @Element(required = false) @Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false) public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT; - @Element(required = false) @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) public ValidationExclusion.TYPE unsafe; - /** How many threads should be allocated to this analysis. */ - @Element(required = false) - @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) + @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis", required = false) public int numberOfThreads = 1; - /** What rule should we use when merging intervals */ - @Element(required = false) - @Argument(fullName = "interval_merging", shortName = "im", doc = "What interval merging rule should we use.", required = false) - public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL; - - @ElementList(required = false) - @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line.", required = false) + @Input(fullName = "read_group_black_list", shortName="rgbl", doc="Filters out read groups matching : or a .txt file containing the filter strings one per line", required = false) public List readGroupBlackList = null; // -------------------------------------------------------------------------------------------------------------- // - // distributed GATK arguments + // PED (pedigree) support // // 
-------------------------------------------------------------------------------------------------------------- - @Element(required=false) - @Argument(fullName="processingTracker",shortName="C",doc="A lockable, shared file for coordinating distributed GATK runs",required=false) - @Hidden - public File processingTrackerFile = null; - @Element(required=false) - @Argument(fullName="restartProcessingTracker",shortName="RPT",doc="Should we delete the processing tracker file at startup?",required=false) - @Hidden - public boolean restartProcessingTracker = false; + /** + *

Reads PED file-formatted tabular text files describing meta-data about the samples being + * processed in the GATK.

+ * + * + * + *

The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory:

+ * + *
    + *
  • Family ID
  • + *
  • Individual ID
  • + *
  • Paternal ID
  • + *
  • Maternal ID
  • + *
  • Sex (1=male; 2=female; other=unknown)
  • + *
  • Phenotype
  • + *
+ * + *

The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: GATK will automatically detect which type + * (i.e. based on whether a value other than 0, 1, 2 or the missing genotype code is observed).

+ * + *

If an individual's sex is unknown, then any character other than 1 or 2 can be used.

+ * + *

You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore.

+ * + *

Affection status should be coded:

+ * + *
    + *
  • -9 missing
  • + *
  • 0 missing
  • + *
  • 1 unaffected
  • + *
  • 2 affected
  • + *
+ * + *

If any value outside of -9,0,1,2 is detected than the samples are assumed + * to phenotype values are interpreted as string phenotype values. In this case -9 uniquely + * represents the missing value.

+ * + *

Genotypes (column 7 onwards) cannot be specified to the GATK.

+ * + *

For example, here are two individuals (one row = one person):

+ * + *
+     *   FAM001  1  0 0  1  2
+     *   FAM001  2  0 0  1  2
+     * 
+ * + *

Each -ped argument can be tagged with NO_FAMILY_ID, NO_PARENTS, NO_SEX, NO_PHENOTYPE to + * tell the GATK PED parser that the corresponding fields are missing from the ped file.

+ * + *

Note that most GATK walkers do not use pedigree information. Walkers that require pedigree + * data should clearly indicate so in their arguments and will throw errors if required pedigree + * information is missing.

+ */ + @Argument(fullName="pedigree", shortName = "ped", doc="Pedigree files for samples",required=false) + public List pedigreeFiles = Collections.emptyList(); - @Element(required=false) - @Argument(fullName="processingTrackerStatusFile",shortName="CSF",doc="If provided, a detailed accounting of the state of the process tracker is written to this file. For debugging, only",required=false) - @Hidden - public File processingTrackerStatusFile = null; + /** + * Inline PED records (see -ped argument). Each -pedString STRING can contain one or more + * valid PED records (see -ped) separated by semi-colons. Supports all tags for each pedString + * as -ped supports + */ + @Argument(fullName="pedigreeString", shortName = "pedString", doc="Pedigree string for samples",required=false) + public List pedigreeStrings = Collections.emptyList(); - @Element(required=false) - @Argument(fullName="processingTrackerID",shortName="CID",doc="If provided, an integer ID (starting at 1) indicating a unique id for this process within the distributed GATK group",required=false) - @Hidden - public int processTrackerID = -1; + /** + * How strict should we be in parsing the PED files? + */ + @Argument(fullName="pedigreeValidationType", shortName = "pedValidationType", doc="How strict should we be in validating the pedigree information?",required=false) + public PedigreeValidationType pedigreeValidationType = PedigreeValidationType.STRICT; + + // -------------------------------------------------------------------------------------------------------------- + // + // BAM indexing and sharding arguments + // + // -------------------------------------------------------------------------------------------------------------- - @Element(required = false) @Argument(fullName="allow_intervals_with_unindexed_bam",doc="Allow interval processing with an unsupported BAM. NO INTEGRATION TESTS are available. 
Use at your own risk.",required=false) @Hidden public boolean allowIntervalsWithUnindexedBAM = false; - @Element(required = false) - @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality.",required=false) + @Argument(fullName="disable_experimental_low_memory_sharding",doc="Disable experimental low-memory sharding functionality",required=false) public boolean disableLowMemorySharding = false; // -------------------------------------------------------------------------------------------------------------- @@ -253,69 +301,6 @@ public class GATKArgumentCollection { // // -------------------------------------------------------------------------------------------------------------- - /** - * marshal the data out to a object - * - * @param collection the GATKArgumentCollection to load into - * @param outputFile the file to write to - */ - public static void marshal(GATKArgumentCollection collection, String outputFile) { - Serializer serializer = new Persister(new Format(new HyphenStyle())); - File result = new File(outputFile); - try { - serializer.write(collection, result); - } catch (Exception e) { - throw new ReviewedStingException("Failed to marshal the data to the file " + outputFile, e); - } - } - - /** - * marshal the data out to a object - * - * @param collection the GATKArgumentCollection to load into - * @param outputFile the stream to write to - */ - public static void marshal(GATKArgumentCollection collection, PrintStream outputFile) { - Serializer serializer = new Persister(new Format(new HyphenStyle())); - try { - serializer.write(collection, outputFile); - } catch (Exception e) { - throw new ReviewedStingException("Failed to marshal the data to the file " + outputFile, e); - } - } - - /** - * unmashall the object from a configuration file - * - * @param filename the filename to marshal from - */ - public static GATKArgumentCollection unmarshal(String filename) { - Serializer serializer = 
new Persister(new Format(new HyphenStyle())); - File source = new File(filename); - try { - GATKArgumentCollection example = serializer.read(GATKArgumentCollection.class, source); - return example; - } catch (Exception e) { - throw new ReviewedStingException("Failed to marshal the data from file " + filename, e); - } - } - - /** - * unmashall the object from a configuration file - * - * @param file the inputstream to marshal from - */ - public static GATKArgumentCollection unmarshal(InputStream file) { - Serializer serializer = new Persister(new Format(new HyphenStyle())); - try { - GATKArgumentCollection example = serializer.read(GATKArgumentCollection.class, file); - return example; - } catch (Exception e) { - throw new ReviewedStingException("Failed to marshal the data from file " + file.toString(), e); - } - } - - /** * test equality between two arg collections. This function defines the statement: * "not fun to write" @@ -363,7 +348,7 @@ public class GATKArgumentCollection { if (!other.referenceFile.equals(this.referenceFile)) { return false; } - if (!other.intervals.equals(this.intervals)) { + if ((other.intervals == null && this.intervals != null) || !other.intervals.equals(this.intervals)) { return false; } if (!other.excludeIntervals.equals(this.excludeIntervals)) { @@ -386,39 +371,21 @@ public class GATKArgumentCollection { if (other.intervalMerging != this.intervalMerging) { return false; } - if ((other.RODToInterval == null && RODToInterval != null) || - (other.RODToInterval != null && !other.RODToInterval.equals(RODToInterval))) { - return false; - } if (other.phoneHomeType != this.phoneHomeType) { return false; } - if (BTIMergeRule != other.BTIMergeRule) + if (intervalSetRule != other.intervalSetRule) return false; - if ( BAQMode != other.BAQMode) return false; + if ( BAQMode != other.BAQMode ) return false; if ( BAQGOP != other.BAQGOP ) return false; if ((other.performanceLog == null && this.performanceLog != null) || (other.performanceLog != null && 
!other.performanceLog.equals(this.performanceLog))) return false; - if ((other.processingTrackerFile == null && this.processingTrackerFile != null) || - (other.processingTrackerFile != null && !other.processingTrackerFile.equals(this.processingTrackerFile))) - return false; - - if ((other.processingTrackerStatusFile == null && this.processingTrackerStatusFile != null) || - (other.processingTrackerStatusFile != null && !other.processingTrackerStatusFile.equals(this.processingTrackerStatusFile))) - return false; - - if ( restartProcessingTracker != other.restartProcessingTracker ) - return false; - - if ( processTrackerID != other.processTrackerID ) - return false; - if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM) return false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java index 654770fe7..4c0257e6a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardVariantContextInputArgumentCollection.java @@ -28,13 +28,11 @@ package org.broadinstitute.sting.gatk.arguments; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.simpleframework.xml.Root; /** * @author ebanks * @version 1.0 */ -@Root public class StandardVariantContextInputArgumentCollection { /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java index 0d5a23f1d..577f7929a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/arguments/ValidationExclusion.java @@ -37,7 +37,6 @@ public class ValidationExclusion { public enum TYPE { ALLOW_UNINDEXED_BAM, // allow bam files that do not have an index; we'll traverse them using monolithic shard - ALLOW_EMPTY_INTERVAL_LIST, // allow the user to pass in an empty interval list ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java index 17e4a0743..57416d111 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java @@ -25,12 +25,12 @@ package org.broadinstitute.sting.gatk.contexts; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.List; @@ -130,7 +130,7 @@ public class AlignmentContext implements HasGenomeLocation { */ @Deprecated //todo: unsafe and tailored for current usage only; both pileups can be null or worse, bot can be not null in theory - public List getReads() { return ( basePileup.getReads() ); } + public List getReads() { return ( basePileup.getReads() ); } /** * Are there any reads associated with this locus? 
@@ -138,7 +138,7 @@ public class AlignmentContext implements HasGenomeLocation { * @return */ public boolean hasReads() { - return basePileup != null && basePileup.size() > 0 ; + return basePileup != null && basePileup.getNumberOfElements() > 0 ; } /** @@ -146,7 +146,7 @@ public class AlignmentContext implements HasGenomeLocation { * @return */ public int size() { - return basePileup.size(); + return basePileup.getNumberOfElements(); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java index 1f9a7d705..4e75f3ddb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContextUtils.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.contexts; import net.sf.samtools.SAMReadGroupRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -76,14 +75,6 @@ public class AlignmentContextUtils { return splitContextBySampleName(context, null); } - public static Map splitContextBySample(AlignmentContext context) { - Map m = new HashMap(); - for ( Map.Entry entry : splitContextBySampleName(context, null).entrySet() ) { - m.put(new Sample(entry.getKey()), entry.getValue()); - } - return m; - } - /** * Splits the given AlignmentContext into a StratifiedAlignmentContext per sample, but referencd by sample name instead * of sample object. 
@@ -97,11 +88,11 @@ public class AlignmentContextUtils { GenomeLoc loc = context.getLocation(); HashMap contexts = new HashMap(); - for(String sample: context.getPileup().getSampleNames()) { - ReadBackedPileup pileupBySample = context.getPileup().getPileupForSampleName(sample); + for(String sample: context.getPileup().getSamples()) { + ReadBackedPileup pileupBySample = context.getPileup().getPileupForSample(sample); // Don't add empty pileups to the split context. - if(pileupBySample.size() == 0) + if(pileupBySample.getNumberOfElements() == 0) continue; if(sample != null) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java index e92599494..a6731ee18 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/AllLocusView.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.gatk.datasources.providers; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.iterators.GenomeLocusIterator; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collections; import java.util.List; @@ -132,7 +132,7 @@ public class AllLocusView extends LocusView { * @param site Site at which to create the blank locus context. * @return empty context. 
*/ - private final static List EMPTY_PILEUP_READS = Collections.emptyList(); + private final static List EMPTY_PILEUP_READS = Collections.emptyList(); private final static List EMPTY_PILEUP_OFFSETS = Collections.emptyList(); private AlignmentContext createEmptyLocus( GenomeLoc site ) { return new AlignmentContext(site,new ReadBackedPileupImpl(site, EMPTY_PILEUP_READS, EMPTY_PILEUP_OFFSETS)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java index ba6321121..bf5f33dc3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LowMemoryIntervalSharder.java @@ -59,7 +59,7 @@ public class LowMemoryIntervalSharder implements Iterator { */ public FilePointer next() { FilePointer current = wrappedIterator.next(); - while(wrappedIterator.hasNext() && current.minus(wrappedIterator.peek()) == 0) + while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0) current = current.combine(parser,wrappedIterator.next()); return current; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index 572970349..8452aadfd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -43,6 +43,7 @@ import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQSamIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import 
org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import java.io.File; import java.lang.reflect.InvocationTargetException; @@ -57,6 +58,8 @@ import java.util.*; * Converts shards to SAM iterators over the specified region */ public class SAMDataSource { + final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); + /** Backing support for reads. */ protected final ReadProperties readProperties; @@ -235,6 +238,12 @@ public class SAMDataSource { for(SAMFileReader reader: readers.values()) { // Get the sort order, forcing it to coordinate if unsorted. SAMFileHeader header = reader.getFileHeader(); + + if ( header.getReadGroups().isEmpty() ) { + throw new UserException.MalformedBAM(readers.getReaderID(reader).samFile, + "SAM file doesn't have any read groups defined in the header. The GATK no longer supports SAM files without read groups"); + } + SAMFileHeader.SortOrder sortOrder = header.getSortOrder() != SAMFileHeader.SortOrder.unsorted ? header.getSortOrder() : SAMFileHeader.SortOrder.coordinate; // Validate that all input files are sorted in the same order. 
@@ -638,7 +647,9 @@ public class SAMDataSource { BAQ.QualityMode qmode, IndexedFastaSequenceFile refReader, byte defaultBaseQualities) { - wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); + if ( useOriginalBaseQualities || defaultBaseQualities >= 0 ) + // only wrap if we are replacing the original qualitiies or using a default base quality + wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); // NOTE: this (and other filtering) should be done before on-the-fly sorting // as there is no reason to sort something that we will end of throwing away @@ -750,6 +761,7 @@ public class SAMDataSource { public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { for(SAMReaderID readerID: readerIDs) { SAMFileReader reader = new SAMFileReader(readerID.samFile); + reader.setSAMRecordFactory(factory); reader.enableFileSource(true); reader.enableIndexMemoryMapping(false); if(!enableLowMemorySharding) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java index 24d8bc6c5..673df6dfa 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/utilities/FindLargeShards.java @@ -97,7 +97,7 @@ public class FindLargeShards extends CommandLineProgram { // intervals GenomeLocSortedSet intervalSortedSet = null; if(intervals != null) - intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals, true), IntervalMergingRule.ALL); + intervalSortedSet = IntervalUtils.sortAndMergeIntervals(genomeLocParser, IntervalUtils.parseIntervalArguments(genomeLocParser, intervals), IntervalMergingRule.ALL); else 
{ intervalSortedSet = new GenomeLocSortedSet(genomeLocParser); for(SAMSequenceRecord entry: refReader.getSequenceDictionary().getSequences()) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java deleted file mode 100644 index 433e0af40..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/PropertyDefinition.java +++ /dev/null @@ -1,30 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Aug 12, 2010 - * Time: 2:09:16 PM - */ -public class PropertyDefinition { - - String property; - - String[] values; - - public String getProperty() { - return property; - } - - public void setProperty(String property) { - this.property = property; - } - - public String[] getValues() { - return values; - } - - public void setValues(String[] values) { - this.values = values; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java deleted file mode 100644 index ca8756684..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/Sample.java +++ /dev/null @@ -1,203 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - - -import org.broadinstitute.sting.utils.exceptions.StingException; - -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -/** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Jul 26, 2010 - * Time: 3:31:38 PM - */ -public class Sample implements java.io.Serializable { - - private final String id; - - private boolean hasSampleFileEntry = false; // true if this sample has an entry in a sample file - - private boolean hasSAMFileEntry = false; // true if this sample has an entry in the SAM file - - private HashMap properties = new HashMap(); - - private HashMap relationships = new HashMap(); - - public enum Gender { - MALE, - FEMALE, - UNKNOWN - } - - public Sample(String id) { -/* if (id == null) { - throw new StingException("Error creating sample: sample ID cannot be null"); - }*/ - this.id = id; - } - - public String getId() { - return this.id; - } - - public Map getProperties() { - return properties; - } - - public void setProperties(Map properties) { - this.properties = (HashMap) properties; - } - - public Map getRelationships() { - return Collections.unmodifiableMap(this.relationships); - } - - public void setSampleFileEntry(boolean value) { - this.hasSampleFileEntry = value; - } - - public boolean hasSAMFileEntry() { - return this.hasSAMFileEntry; - } - - public void setSAMFileEntry(boolean value) { - this.hasSAMFileEntry = value; - } - - public boolean hasSampleFileEntry() { - return this.hasSampleFileEntry; - } - - /** - * Get one property - * @param key key of property - * @return value of property as generic object - */ - public Object getProperty(String key) { - return properties.get(key); - } - - /** - * Set a property - * If property already exists, it is overwritten - * @param key key of property - * @param value object to be stored in properties array - */ - public void setProperty(String key, Object value) { - - if (relationships.containsKey(key)) { - throw new StingException("The same key cannot exist as a property and a relationship"); - } - - if (key.equals("gender") && value.getClass() != Gender.class) { - throw new StingException("'gender' property must be of type Sample.Gender"); - } - - if 
(key.equals("population") && value.getClass() != String.class) { - throw new StingException("'population' property must be of type String"); - } - - properties.put(key, value); - } - - /** - * Get one relationship - * @param key of relationship - * @return Sample object that this relationship points to - */ - public Sample getRelationship(String key) { - return relationships.get(key); - } - - /** - * Set one relationship - * If already set, it is overwritten - * @param key key of the relationship - * @param value Sample object this relationship points to - */ - public void setRelationship(String key, Sample value) { - if (properties.containsKey(key)) { - throw new StingException("The same key cannot exist as a property and a relationship"); - } - relationships.put(key, value); - } - - /** - * Get the sample's mother - * @return sample object with relationship mother, if exists, or null - */ - public Sample getMother() { - return getRelationship("mother"); - } - - /** - * Get the sample's father - * @return sample object with relationship father, if exists, or null - */ - public Sample getFather() { - return getRelationship("father"); - } - - /** - * Get gender of the sample - * @return property of key "gender" - must be of type Gender - */ - public Gender getGender() { - return (Gender) properties.get("gender"); - } - - public String getPopulation() { - return (String) properties.get("population"); - } - - public String getFamilyId() { - return (String) properties.get("familyId"); - } - - /** - * @return True if sample is male, false if female, unknown, or null - */ - public boolean isMale() { - return properties.get("gender") == Gender.MALE; - } - - /** - * @return True if sample is female, false if male, unknown or null - */ - public boolean isFemale() { - return properties.get("gender") == Gender.MALE; - } - - /** - * - * @param key property key - * @return true if sample has this property (even if its value is null) - */ - public boolean hasProperty(String key) 
{ - return properties.containsKey(key); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Sample sample = (Sample) o; - - if (hasSAMFileEntry != sample.hasSAMFileEntry) return false; - if (hasSampleFileEntry != sample.hasSampleFileEntry) return false; - if (id != null ? !id.equals(sample.id) : sample.id != null) return false; - if (properties != null ? !properties.equals(sample.properties) : sample.properties != null) return false; - if (relationships != null ? !relationships.equals(sample.relationships) : sample.relationships != null) - return false; - - return true; - } - - @Override - public int hashCode() { - return id != null ? id.hashCode() : "".hashCode(); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java deleted file mode 100644 index ce749cb83..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleAlias.java +++ /dev/null @@ -1,31 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -/** - * Created by IntelliJ IDEA. 
- * User: brett - * Date: Aug 13, 2010 - * Time: 5:13:46 PM - */ -public class SampleAlias { - - String mainId; - - String[] otherIds; - - public String getMainId() { - return mainId; - } - - public void setMainId(String mainId) { - this.mainId = mainId; - } - - public String[] getOtherIds() { - return otherIds; - } - - public void setOtherIds(String[] otherIds) { - this.otherIds = otherIds; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java deleted file mode 100644 index 067bf3f72..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSource.java +++ /dev/null @@ -1,590 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.yaml.snakeyaml.TypeDescription; -import org.yaml.snakeyaml.Yaml; -import org.yaml.snakeyaml.constructor.Constructor; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Jul 26, 2010 - * Time: 3:30:09 PM - * - * This class stores and manages sample metadata. This data is encoded in a sample file, which can be included - * in the GATK by the "--samples" argument. This class reads and parses those files. - * - * Although there are a set of public methods for accessing sample data, they aren't used by walkers - they are really - * only used by GenomeAnalysisEngine. An instance of GenomeAnalysisEngine has one SampleDataSource. 
When a walker - * wants to access sample data, it asks GenomeAnalysis to fetch this data from its SampleDataSource. - * - */ -public class SampleDataSource { - - /** - * SAMFileHeader that has been created for this analysis. - */ - private SAMFileHeader header; - - /** - * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so - * this is stored as a HashMap. - */ - private final HashMap samples = new HashMap(); - - /** - * Samples can have "aliases", because sometimes the same sample is referenced by different IDs in different - * datasets. If this is the case, one ID is the "primary ID" and others are "aliases". - * - * This maps ID => primary ID for all samples ID strings - both primary IDs and aliases. - */ - private HashMap sampleAliases = new HashMap(); - - /** - * While loading sample files, we must be aware of "special" properties and relationships that are always allowed - */ - public static final String[] specialProperties = new String[] {"familyId", "population", "gender"}; - public static final String[] specialRelationships = new String[] {"mother", "father"}; - - /** - * Constructor takes both a SAM header and sample files because the two must be integrated. 
- * @param header SAMFileHeader that has been created for this analysis - * @param sampleFiles Sample files that were included on the command line - */ - public SampleDataSource(SAMFileHeader header, List sampleFiles) { - this(); - this.header = header; - // create empty sample object for each sample referenced in the SAM header - for (String sampleName : SampleUtils.getSAMFileSamples(header)) { - if (!hasSample(sampleName)) { - Sample newSample = new Sample(sampleName); - newSample.setSAMFileEntry(true); - samples.put(sampleName, newSample); - } - } - - // add files consecutively - if (sampleFiles != null) { - for (File file : sampleFiles) { - addFile(file); - } - } - } - - public SampleDataSource() { - samples.put(null, new Sample(null)); - } - - /** - * Hallucinates sample objects for all the samples in the SAM file and stores them - */ - public void addSamplesFromSAMHeader(SAMFileHeader header) { - for (String sampleName : SampleUtils.getSAMFileSamples(header)) { - if (!hasSample(sampleName)) { - Sample newSample = new Sample(sampleName); - newSample.setSAMFileEntry(true); - samples.put(sampleName, newSample); - } - } - } - - /** - * Parse one sample file and integrate it with samples that are already there - * Fail quickly if we find any errors in the file - */ - public void addFile(File sampleFile) { - - BufferedReader reader; - try { - reader = new BufferedReader(new FileReader(sampleFile)); - } - catch (IOException e) { - throw new StingException("Could not open sample file " + sampleFile.getAbsolutePath(), e); - } - - // set up YAML reader - a "Constructor" creates java object from YAML and "Loader" loads the file - Constructor con = new Constructor(SampleFileParser.class); - TypeDescription desc = new TypeDescription(SampleFileParser.class); - desc.putListPropertyType("propertyDefinitions", PropertyDefinition.class); - desc.putListPropertyType("sampleAliases", SampleAlias.class); - con.addTypeDescription(desc); - Yaml yaml = new Yaml(con); - - // 
SampleFileParser stores an object representation of a sample file - this is what we'll parse - SampleFileParser parser; - try { - parser = (SampleFileParser) yaml.load(reader); - } - catch (Exception e) { - throw new StingException("There was a syntactic error with the YAML in sample file " + sampleFile.getAbsolutePath(), e); - } - - // check to see which validation options were built into the file - boolean restrictProperties = parser.getAllowedProperties() != null; - boolean restrictRelationships = parser.getAllowedRelationships() != null; - boolean restrictPropertyValues = parser.getPropertyDefinitions() != null; - - // propertyValues stores the values that are allowed for a given property - HashMap propertyValues = null; - if (restrictPropertyValues) { - propertyValues = new HashMap(); - for (PropertyDefinition def : parser.getPropertyDefinitions()) { - HashSet set = new HashSet(); - for (String value : def.getValues()) { - set.add(value); - } - propertyValues.put(def.getProperty(), set); - } - } - - // make sure the aliases are valid - validateAliases(parser); - - // loop through each sample in the file - a SampleParser stores an object that will become a Sample - for (SampleParser sampleParser : parser.getSamples()) { - - try { - // step 1: add the sample if it doesn't already exist - Sample sample = getSampleById(sampleParser.getId()); - if (sample == null) { - sample = new Sample(sampleParser.getId()); - } - addSample(sample); - sample.setSampleFileEntry(true); - - // step 2: add the properties - if (sampleParser.getProperties() != null) { - for (String property : sampleParser.getProperties().keySet()) { - - // check that property is allowed - if (restrictProperties) { - if (!isPropertyValid(property, parser.getAllowedProperties())) { - throw new StingException(property + " is an invalid property. 
It is not included in the list " + - "of allowed properties."); - } - } - - // next check that the value is allowed - if (restrictPropertyValues) { - if (!isValueAllowed(property, sampleParser.getProperties().get(property), propertyValues)) { - throw new StingException("The value of property '" + property + "' is invalid. " + - "It is not included in the list of allowed values for this property."); - } - } - - // next check that there isn't already a conflicting property there - if (sample.getProperty(property) != null && - sample.getProperty(property) != sampleParser.getProperties().get(property)) - { - throw new StingException(property + " is a conflicting property!"); - } - - // checks are passed - now add the property! - saveProperty(sample, property, sampleParser.getProperties().get(property)); - } - } - - // step 3: add the relationships - if (sampleParser.getRelationships() != null) { - for (String relationship : sampleParser.getRelationships().keySet()) { - String relativeId = sampleParser.getRelationships().get(relationship); - if (relativeId == null) { - throw new StingException("The relationship cannot be null"); - } - - // first check that it's not invalid - if (restrictRelationships) { - if (!isRelationshipValid(relationship, parser.getAllowedRelationships())) { - throw new StingException(relationship + " is an invalid relationship"); - } - } - - // next check that there isn't already a conflicting property there - if (sample.getRelationship(relationship) != null) { - if (sample.getRelationship(relationship).getId() != sampleParser.getProperties().get(relationship)) { - throw new StingException(relationship + " is a conflicting relationship!"); - } - // if the relationship is already set - and consistent with what we're reading now - no need to continue - else { - continue; - } - } - - // checks are passed - now save the relationship - saveRelationship(sample, relationship, relativeId); - } - } - } catch (Exception e) { - throw new StingException("An 
error occurred while loading this sample from the sample file: " + - sampleParser.getId(), e); - } - } - - } - - private boolean isValueAllowed(String key, Object value, HashMap valuesList) { - - // if the property values weren't specified for this property, then any value is okay - if (!valuesList.containsKey(key)) { - return true; - } - - // if this property has enumerated values, it must be a string - else if (value.getClass() != String.class) - return false; - - // is the value specified or not? - else if (!valuesList.get(key).contains(value)) - return false; - - return true; - } - - /** - * Makes sure that the aliases are valid - * Checks that 1) no string is used as both a main ID and an alias; - * 2) no alias is used more than once - * @param parser - */ - private void validateAliases(SampleFileParser parser) { - - // no aliases sure validate - if (parser.getSampleAliases() == null) - return; - - HashSet mainIds = new HashSet(); - HashSet otherIds = new HashSet(); - - for (SampleAlias sampleAlias : parser.getSampleAliases()) { - mainIds.add(sampleAlias.getMainId()); - for (String otherId : sampleAlias.getOtherIds()) { - if (mainIds.contains(otherId)) - throw new StingException(String.format("The aliases in your sample file are invalid - the alias %s cannot " + - "be both a main ID and an other ID", otherId)); - - if (!otherIds.add(otherId)) - throw new StingException(String.format("The aliases in your sample file are invalid - %s is listed as an " + - "alias more than once.", otherId)); - } - } - } - - private boolean isPropertyValid(String property, String[] allowedProperties) { - - // is it a special property that is always allowed? - for (String allowedProperty : specialProperties) { - if (property.equals(allowedProperty)) - return true; - } - - // is it in the allowed properties list? 
- for (String allowedProperty : allowedProperties) { - if (property.equals(allowedProperty)) - return true; - } - - return false; - } - - private boolean isRelationshipValid(String relationship, String[] allowedRelationships) { - - // is it a special relationship that is always allowed? - for (String allowedRelationship : specialRelationships) { - if (relationship.equals(allowedRelationship)) - return true; - } - - // is it in the allowed properties list? - for (String allowedRelationship : allowedRelationships) { - if (relationship.equals(allowedRelationship)) - return true; - } - - return false; - } - - /** - * Saves a property as the correct type - * @param key property key - * @param value property value, as read from YAML parser - * @return property value to be stored - */ - private void saveProperty(Sample sample, String key, Object value) { - - // convert gender to the right type, if it was stored as a String - if (key.equals("gender")) { - if (((String) value).toLowerCase().equals("male")) { - value = Sample.Gender.MALE; - } - else if (((String) value).toLowerCase().equals("female")) { - value = Sample.Gender.FEMALE; - } - else if (((String) value).toLowerCase().equals("unknown")) { - value = Sample.Gender.UNKNOWN; - } - else if (value != null) { - throw new StingException("'gender' property must be male, female, or unknown."); - } - } - try { - sample.setProperty(key, value); - } - catch (Exception e) { - throw new StingException("Could not save property " + key, e); - } - } - - /** - * Saves a relationship as the correct type - * @param key relationship key - * @param relativeId sample ID string of the relative - * @return relationship value to be stored - */ - private void saveRelationship(Sample sample, String key, String relativeId) { - - // get the reference that we'll store as the value - Sample relative = getSampleById(relativeId); - - // create sample object for the relative, if necessary - if (relative == null) { - relative = new 
Sample(relativeId); - addSample(relative); - } - sample.setRelationship(key, relative); - } - - - - /** - * Filter a sample name in case it is an alias - * @param sampleId to be filtered - * @return ID of sample that stores data for this alias - */ - private String aliasFilter(String sampleId) { - if (!sampleAliases.containsKey(sampleId)) - return sampleId; - else - return sampleAliases.get(sampleId); - } - - /** - * Add a sample to the collection - * @param sample to be added - */ - private void addSample(Sample sample) { - samples.put(sample.getId(), sample); - } - - /** - * Check if sample with this ID exists - * Note that this will return true if name passed in is an alias - * @param id ID of sample to be checked - * @return true if sample exists; false if not - */ - public boolean hasSample(String id) { - return samples.get(aliasFilter(id)) != null; - } - - /** - * Get a sample by its ID - * If an alias is passed in, return the main sample object - * @param id - * @return sample Object with this ID - */ - public Sample getSampleById(String id) { - return samples.get(aliasFilter(id)); - } - - /** - * Get the sample for a given read group - * Must first look up ID for read group - * @param readGroup of sample - * @return sample object with ID from the read group - */ - public Sample getSampleByReadGroup(SAMReadGroupRecord readGroup) { - String nameFromReadGroup = readGroup.getSample(); - return getSampleById(nameFromReadGroup); - } - - /** - * Get a sample for a given read - * Must first look up read group, and then sample ID for that read group - * @param read of sample - * @return sample object of this read - */ - public Sample getSampleByRead(SAMRecord read) { - return getSampleByReadGroup(read.getReadGroup()); - } - - /** - * Get number of sample objects - * @return size of samples map - */ - public int sampleCount() { - return samples.size(); - } - - /** - * Return all samples with a given family ID - * Note that this isn't terribly efficient (linear) - it 
may be worth adding a new family ID data structure for this - * @param familyId - * @return - */ - public Set getFamily(String familyId) { - HashSet familyMembers = new HashSet(); - - for (Sample sample : samples.values()) { - if (sample.getFamilyId() != null) { - if (sample.getFamilyId().equals(familyId)) - familyMembers.add(sample); - } - } - return familyMembers; - } - - /** - * Returns all children of a given sample - * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient - * @param sample - * @return - */ - public Set getChildren(Sample sample) { - HashSet children = new HashSet(); - for (Sample familyMember : getFamily(sample.getFamilyId())) { - if (familyMember.getMother() == sample || familyMember.getFather() == sample) { - children.add(familyMember); - } - } - return children; - } - - public Set getSamples() { - HashSet set = new HashSet(); - set.addAll(samples.values()); - return set; - } - - /** - * Takes a collection of sample names and returns their corresponding sample objects - * Note that, since a set is returned, if you pass in a list with duplicates names there will not be any duplicates in the returned set - * @param sampleNameList Set of sample names - * @return Corresponding set of samples - */ - public Set getSamples(Collection sampleNameList) { - HashSet samples = new HashSet(); - for (String name : sampleNameList) { - try { - samples.add(getSampleById(name)); - } - catch (Exception e) { - throw new StingException("Could not get sample with the following ID: " + name, e); - } - } - return samples; - } - - /** - * Returns a set of samples that have any value (which could be null) for a given property - * @param key Property key - * @return Set of samples with the property - */ - public Set getSamplesWithProperty(String key) { - HashSet toReturn = new HashSet(); - for (Sample s : samples.values()) { - if (s.hasProperty(key)) - toReturn.add(s); - } - return toReturn; - } - - /** - * Returns a set 
of samples that have a property with a certain value - * Value must be a string for now - could add a similar method for matching any objects in the future - * - * @param key Property key - * @param value String property value - * @return Set of samples that match key and value - */ - public Set getSamplesWithProperty(String key, String value) { - Set toReturn = getSamplesWithProperty(key); - for (Sample s : toReturn) { - if (!s.getProperty(key).equals(value)) - toReturn.remove(s); - } - return toReturn; - } - - public Sample getOrCreateSample(String id) { - Sample sample = getSampleById(id); - if (sample == null) { - sample = new Sample(id); - addSample(sample); - } - return sample; - } - - /** - * Returns all samples that were referenced in the SAM file - */ - public Set getSAMFileSamples() { - Set toReturn = new HashSet(); - for (Sample sample : samples.values()) { - if (sample.hasSAMFileEntry()) - toReturn.add(sample); - } - return toReturn; - } - - /** - * Returns a set of sample objects for the sample names in a variant context - * - * @param context Any variant context - * @return a set of the sample objects - */ - public Set getSamplesByVariantContext(VariantContext context) { - Set samples = new HashSet(); - for (String sampleName : context.getSampleNames()) { - samples.add(getOrCreateSample(sampleName)); - } - return samples; - } - - - /** - * Return a subcontext restricted to samples with a given property key/value - * Gets the sample names from key/value and relies on VariantContext.subContextFromGenotypes for the filtering - * @param context VariantContext to filter - * @param key property key - * @param value property value (must be string) - * @return subcontext - */ - public VariantContext subContextFromSampleProperty(VariantContext context, String key, String value) { - - Set samplesWithProperty = new HashSet(); - for (String sampleName : context.getSampleNames()) { - Sample s = samples.get(sampleName); - if (s != null && s.hasProperty(key) && 
s.getProperty(key).equals(value)) - samplesWithProperty.add(sampleName); - } - Map genotypes = context.getGenotypes(samplesWithProperty); - return context.subContextFromGenotypes(genotypes.values()); - } - - public static SampleDataSource createEmptyDataSource() { - SAMFileHeader header = new SAMFileHeader(); - return new SampleDataSource(header, null); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java deleted file mode 100644 index a362af663..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleFileParser.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Aug 12, 2010 - * Time: 1:30:44 PM - */ -public class SampleFileParser { - - private SampleAlias[] sampleAliases; - - private String[] allowedProperties; - - private String[] allowedRelationships; - - private PropertyDefinition[] propertyDefinitions; - - private SampleParser[] samples; - - public PropertyDefinition[] getPropertyDefinitions() { - return propertyDefinitions; - } - - public void setPropertyDefinitions(PropertyDefinition[] propertyDefinitions) { - this.propertyDefinitions = propertyDefinitions; - } - - public SampleFileParser() { - - } - - public String[] getAllowedProperties() { - return allowedProperties; - } - - public void setAllowedProperties(String[] allowedProperties) { - this.allowedProperties = allowedProperties; - } - - public SampleParser[] getSamples() { - return samples; - } - - public void setSamples(SampleParser[] samples) { - this.samples = samples; - } - - public String[] getAllowedRelationships() { - return allowedRelationships; - } - - public void setAllowedRelationships(String[] allowedRelationships) { - this.allowedRelationships = allowedRelationships; - } - - public SampleAlias[] 
getSampleAliases() { - return sampleAliases; - } - - public void setSampleAliases(SampleAlias[] sampleAliases) { - this.sampleAliases = sampleAliases; - } - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java deleted file mode 100644 index f5e07ca29..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/sample/SampleParser.java +++ /dev/null @@ -1,43 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -import java.util.HashMap; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Aug 13, 2010 - * Time: 2:09:43 PM - */ -public class SampleParser { - - private String id; - - private HashMap properties; - - private HashMap relationships; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public HashMap getProperties() { - return properties; - } - - public void setProperties(HashMap properties) { - this.properties = properties; - } - - public HashMap getRelationships() { - return relationships; - } - - public void setRelationships(HashMap relationships) { - this.relationships = relationships; - } - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java index ae98874c1..162baed00 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java @@ -85,12 +85,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar */ protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse ) { super(engine, walker, reads, 
reference, rods); - this.threadPool = Executors.newFixedThreadPool(nThreadsToUse); - - if (engine.getArguments().processingTrackerFile != null) { - throw new UserException.BadArgumentValue("-C", "Distributed GATK calculations currently not supported in multi-threaded mode. Complain to Mark depristo@broadinstitute.org to implement and test this code path"); - } } public Object execute( Walker walker, ShardStrategy shardStrategy ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 09ab4bd44..deafcd0cc 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.SampleUtils; import java.util.Collection; @@ -56,7 +57,8 @@ public class LinearMicroScheduler extends MicroScheduler { traversalEngine.startTimersIfNecessary(); if(shard.getShardType() == Shard.ShardType.LOCUS) { LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), getReadIterator(shard), shard.getGenomeLocs(), engine.getSampleMetadata()); + WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), + getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); for(WindowMaker.WindowMakerIterator iterator: windowMaker) { ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),iterator.getLocus(),iterator,reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, 
accumulator.getReduceInit()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java index 2b6488ada..badd39860 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java @@ -62,16 +62,17 @@ public class ShardTraverser implements Callable { Object accumulator = walker.reduceInit(); LocusWalker lWalker = (LocusWalker)walker; - WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),microScheduler.getReadIterator(shard),shard.getGenomeLocs(), microScheduler.engine.getSampleMetadata()); // todo: microScheduler.engine is protected - is it okay to user it here? - ShardDataProvider dataProvider = null; + WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(), + microScheduler.getReadIterator(shard), + shard.getGenomeLocs(), + microScheduler.engine.getSampleDB().getSampleNames()); // todo: microScheduler.engine is protected - is it okay to user it here? 
for(WindowMaker.WindowMakerIterator iterator: windowMaker) { - dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),microScheduler.getEngine().getGenomeLocParser(),iterator.getLocus(),iterator,microScheduler.reference,microScheduler.rods); + final ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),microScheduler.getEngine().getGenomeLocParser(),iterator.getLocus(),iterator,microScheduler.reference,microScheduler.rods); accumulator = traversalEngine.traverse( walker, dataProvider, accumulator ); dataProvider.close(); } - if (dataProvider != null) dataProvider.close(); windowMaker.close(); outputMergeTask = outputTracker.closeStorage(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java index 43ea46002..d1f5d80da 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java @@ -4,7 +4,6 @@ import net.sf.picard.util.PeekableIterator; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; @@ -12,6 +11,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; @@ -63,17 +63,20 @@ public class WindowMaker implements Iterable, I * the given intervals. 
* @param iterator The data source for this window. * @param intervals The set of intervals over which to traverse. - * @param sampleData SampleDataSource that we can reference reads with + * @param sampleNames The complete set of sample names in the reads in shard */ - public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, SampleDataSource sampleData ) { + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) { this.sourceInfo = shard.getReadProperties(); this.readIterator = iterator; - - this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser,sampleData)); + this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames)); this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null; } + public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals ) { + this(shard, genomeLocParser, iterator, intervals, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + } + public Iterator iterator() { return this; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java index 74deace9a..11bbf9e4c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MalformedReadFilter.java @@ -27,7 +27,9 @@ package org.broadinstitute.sting.gatk.filters; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceRecord; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.UserException; /** * Filter out malformed 
reads. @@ -37,14 +39,25 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; */ public class MalformedReadFilter extends ReadFilter { private SAMFileHeader header; - + + @Argument(fullName = "filter_mismatching_base_and_quals", shortName = "filterMBQ", doc = "if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required = false) + boolean filterMismatchingBaseAndQuals = false; + @Override public void initialize(GenomeAnalysisEngine engine) { this.header = engine.getSAMFileHeader(); } public boolean filterOut(SAMRecord read) { - return !checkInvalidAlignmentStart(read) || + // slowly changing the behavior to blow up first and filtering out if a parameter is explicitly provided + if (!checkMismatchingBasesAndQuals(read)) { + if (!filterMismatchingBaseAndQuals) + throw new UserException.MalformedBAM(read, "BAM file has a read with mismatching number of bases and base qualities. Offender: " + read.getReadName() +" [" + read.getReadLength() + " bases] [" +read.getBaseQualities().length +"] quals"); + else + return true; + } + + return !checkInvalidAlignmentStart(read) || !checkInvalidAlignmentEnd(read) || !checkAlignmentDisagreesWithHeader(this.header,read) || !checkCigarDisagreesWithAlignment(read); @@ -108,4 +121,13 @@ public class MalformedReadFilter extends ReadFilter { return false; return true; } + + /** + * Check if the read has the same number of bases and base qualities + * @param read the read to validate + * @return true if they have the same number. False otherwise. 
+ */ + private static boolean checkMismatchingBasesAndQuals(SAMRecord read) { + return (read.getReadLength() == read.getBaseQualities().length); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java new file mode 100755 index 000000000..a56af56d1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java @@ -0,0 +1,23 @@ +package org.broadinstitute.sting.gatk.filters; + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.commandline.Argument; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 9/19/11 + * Time: 4:09 PM + * To change this template use File | Settings | File Templates. + */ +public class ReadNameFilter extends ReadFilter { + @Argument(fullName = "readName", shortName = "rn", doc="Filter out all reads except those with this read name", required=true) + private String readName; + + public boolean filterOut(final SAMRecord rec) { + return ! 
rec.getReadName().equals(readName); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java index ebb4cbe66..4ca7b935f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VCFWriterStorage.java @@ -46,7 +46,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { else if ( stub.getOutputStream() != null ) { this.file = null; this.stream = stub.getOutputStream(); - writer = new StandardVCFWriter(stream, stub.doNotWriteGenotypes()); + writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes()); } else throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream."); @@ -71,7 +71,7 @@ public class VCFWriterStorage implements Storage, VCFWriter { } // The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it. 
- return new StandardVCFWriter(file, this.stream, indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes()); + return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java index 936243f9d..82cb43634 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterStub.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.io.stubs; +import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -150,6 +151,15 @@ public class VCFWriterStub implements Stub, VCFWriter { return isCompressed; } + /** + * Gets the master sequence dictionary from the engine associated with this stub + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + public SAMSequenceDictionary getMasterSequenceDictionary() { + return engine.getMasterSequenceDictionary(); + } + /** * Should we tell the VCF writer not to write genotypes? * @return true if the writer should not write genotypes. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index e13c5a764..ee3ea63eb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -35,26 +35,23 @@ import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.ReservoirDownsampler; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileupImpl; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.*; /** Iterator that traverses a SAM File, accumulating information on a per-locus basis */ public class LocusIteratorByState extends LocusIterator { -// private static long discarded_bases = 0L; -// private static long observed_bases = 0L; - /** our log, which we want to capture anything from this class */ private static Logger logger = Logger.getLogger(LocusIteratorByState.class); @@ -69,7 +66,7 @@ public class LocusIteratorByState extends LocusIterator { * 
Used to create new GenomeLocs. */ private final GenomeLocParser genomeLocParser; - private final ArrayList samples; + private final ArrayList samples; private final ReadStateManager readStates; static private class SAMRecordState { @@ -278,15 +275,27 @@ public class LocusIteratorByState extends LocusIterator { // // ----------------------------------------------------------------------------------------------------------------- - public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, SampleDataSource sampleData ) { + public LocusIteratorByState(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples ) { this.readInfo = readInformation; this.genomeLocParser = genomeLocParser; + this.samples = new ArrayList(samples); + this.readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod()); - // get the list of samples - this.samples = new ArrayList(sampleData.getSamples()); - - readStates = new ReadStateManager(samIterator,readInformation.getDownsamplingMethod()); - + // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when + // there's no read data. So we need to throw this error only when samIterator.hasNext() is true + if ( this.samples.isEmpty() && samIterator.hasNext() ) { + throw new IllegalArgumentException("samples list must not be empty"); + } + } + + /** + * For testing only. Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list + * for the system. 
+ */ + public final static Collection sampleListForSAMWithoutReadGroups() { + List samples = new ArrayList(); + samples.add(null); + return samples; } public Iterator iterator() { @@ -303,19 +312,6 @@ public class LocusIteratorByState extends LocusIterator { //if ( DEBUG ) System.out.printf("hasNext() = %b%n", r); } - public void printState() { - for(Sample sample: samples) { - Iterator iterator = readStates.iterator(sample); - while(iterator.hasNext()) { - SAMRecordState state = iterator.next(); - logger.debug(String.format("printState():")); - SAMRecord read = state.getRead(); - int offset = state.getReadOffset(); - logger.debug(String.format(" read: %s(%d)=%s, cigar=%s", read.getReadName(), offset, (char)read.getReadBases()[offset], read.getCigarString())); - } - } - } - private GenomeLoc getLocation() { return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser); } @@ -355,14 +351,14 @@ public class LocusIteratorByState extends LocusIterator { // In this case, the subsequent call to next() will emit the normal pileup at the current base // and shift the position. 
if (readInfo.generateExtendedEvents() && hasExtendedEvents) { - Map fullExtendedEventPileup = new HashMap(); + Map fullExtendedEventPileup = new HashMap(); // get current location on the reference and decrement it by 1: the indels we just stepped over // are associated with the *previous* reference base GenomeLoc loc = genomeLocParser.incPos(getLocation(),-1); boolean hasBeenSampled = false; - for(Sample sample: samples) { + for(final String sample: samples) { Iterator iterator = readStates.iterator(sample); List indelPile = new ArrayList(readStates.size(sample)); hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample); @@ -382,10 +378,7 @@ public class LocusIteratorByState extends LocusIterator { maxDeletionLength = Math.max(maxDeletionLength,state.getEventLength()); } else nInsertions++; - indelPile.add ( new ExtendedEventPileupElement(state.getRead(), - state.getReadEventStartOffset(), - state.getEventLength(), - state.getEventBases()) ); + indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadEventStartOffset(), state.getEventLength(), state.getEventBases()) ); } else { // HACK: The readahead mechanism for LocusIteratorByState will effectively read past the current position @@ -407,9 +400,7 @@ public class LocusIteratorByState extends LocusIterator { // we count such reads (with a longer deletion spanning over a deletion at the previous base we are // about to report) only if includeReadsWithDeletionAtLoci is true. 
size++; - indelPile.add ( new ExtendedEventPileupElement(state.getRead(), - state.getReadOffset()-1, - -1) // length=-1 --> noevent + indelPile.add ( new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset()-1, -1) // length=-1 --> noevent ); } } @@ -426,10 +417,10 @@ public class LocusIteratorByState extends LocusIterator { nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled); } else { GenomeLoc location = getLocation(); - Map fullPileup = new HashMap(); + Map fullPileup = new HashMap(); boolean hasBeenSampled = false; - for(Sample sample: samples) { + for(final String sample: samples) { Iterator iterator = readStates.iterator(sample); List pile = new ArrayList(readStates.size(sample)); hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample); @@ -447,12 +438,12 @@ public class LocusIteratorByState extends LocusIterator { continue; } else { //observed_bases++; - pile.add(new PileupElement(state.getRead(), state.getReadOffset())); + pile.add(new PileupElement((GATKSAMRecord) state.getRead(), state.getReadOffset())); size++; } } else if ( readInfo.includeReadsWithDeletionAtLoci() && state.getCurrentCigarOperator() != CigarOperator.N ) { size++; - pile.add(new PileupElement(state.getRead(), -1)); + pile.add(new PileupElement((GATKSAMRecord) state.getRead(), -1)); nDeletions++; } @@ -495,7 +486,7 @@ public class LocusIteratorByState extends LocusIterator { } private void updateReadStates() { - for(Sample sample: samples) { + for(final String sample: samples) { Iterator it = readStates.iterator(sample); while ( it.hasNext() ) { SAMRecordState state = it.next(); @@ -522,7 +513,7 @@ public class LocusIteratorByState extends LocusIterator { private final PeekableIterator iterator; private final DownsamplingMethod downsamplingMethod; private final SamplePartitioner samplePartitioner; - private final Map readStatesBySample = new HashMap(); + 
private final Map readStatesBySample = new HashMap(); private final int targetCoverage; private int totalReadStates = 0; @@ -540,9 +531,9 @@ public class LocusIteratorByState extends LocusIterator { } Map readSelectors = new HashMap(); - for(Sample sample: samples) { + for(final String sample: samples) { readStatesBySample.put(sample,new PerSampleReadStateManager()); - readSelectors.put(sample.getId(),downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); + readSelectors.put(sample,downsamplingMethod.type == DownsampleType.BY_SAMPLE ? new NRandomReadSelector(null,targetCoverage) : new AllReadsSelector()); } samplePartitioner = new SamplePartitioner(readSelectors); @@ -554,7 +545,7 @@ public class LocusIteratorByState extends LocusIterator { * @param sample The sample. * @return Iterator over the reads associated with that sample. */ - public Iterator iterator(final Sample sample) { + public Iterator iterator(final String sample) { return new Iterator() { private Iterator wrappedIterator = readStatesBySample.get(sample).iterator(); @@ -590,7 +581,7 @@ public class LocusIteratorByState extends LocusIterator { * @param sample The sample. * @return Total number of reads in the given sample. */ - public int size(final Sample sample) { + public int size(final String sample) { return readStatesBySample.get(sample).size(); } @@ -600,12 +591,12 @@ public class LocusIteratorByState extends LocusIterator { * @param sample Sample, downsampled independently. * @return Integer stop of the furthest undownsampled region. 
*/ - public int getDownsamplingExtent(final Sample sample) { + public int getDownsamplingExtent(final String sample) { return readStatesBySample.get(sample).getDownsamplingExtent(); } public SAMRecordState getFirst() { - for(Sample sample: samples) { + for(final String sample: samples) { PerSampleReadStateManager reads = readStatesBySample.get(sample); if(!reads.isEmpty()) return reads.peek(); @@ -639,8 +630,8 @@ public class LocusIteratorByState extends LocusIterator { } samplePartitioner.complete(); - for(Sample sample: samples) { - ReadSelector aggregator = samplePartitioner.getSelectedReads(sample.getId()); + for(final String sample: samples) { + ReadSelector aggregator = samplePartitioner.getSelectedReads(sample); Collection newReads = new ArrayList(aggregator.getSelectedReads()); @@ -1072,6 +1063,3 @@ class SamplePartitioner implements ReadSelector { } } - - - diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java index 2f30d12a8..9a89d2086 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadFormattingIterator.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.gatk.iterators; import net.sf.samtools.SAMRecord; import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** * An iterator which does post-processing of a read, including potentially wrapping @@ -78,7 +77,30 @@ public class ReadFormattingIterator implements StingSAMIterator { * no next exists. */ public SAMRecord next() { - return new GATKSAMRecord(wrappedIterator.next(), useOriginalBaseQualities, defaultBaseQualities); + SAMRecord rec = wrappedIterator.next(); + + // if we are using default quals, check if we need them, and add if necessary. + // 1. we need if reads are lacking or have incomplete quality scores + // 2. 
we add if defaultBaseQualities has a positive value + if (defaultBaseQualities >= 0) { + byte reads [] = rec.getReadBases(); + byte quals [] = rec.getBaseQualities(); + if (quals == null || quals.length < reads.length) { + byte new_quals [] = new byte [reads.length]; + for (int i=0; i entry : index.getProperties().entrySet()) { + if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) + dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), + Integer.valueOf(entry.getValue()))); + } + return dict; + } + + /** + * create the sequence dictionary with the contig list; a backup approach + * @param index the index file to use + * @param dict the sequence dictionary to add contigs to + * @return the filled-in sequence dictionary + */ + static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { + LinkedHashSet seqNames = index.getSequenceNames(); + if (seqNames == null) { + return dict; + } + for (String name : seqNames) { + SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); + dict.addSequence(seq); + } + return dict; + } + + public static void setIndexSequenceDictionary(Index index, SAMSequenceDictionary dict) { + for ( SAMSequenceRecord seq : dict.getSequences() ) { + final String contig = IndexDictionaryUtils.SequenceDictionaryPropertyPredicate + seq.getSequenceName(); + final String length = String.valueOf(seq.getSequenceLength()); + index.addProperty(contig,length); + } + } + + public static void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict, + final ValidationExclusion.TYPE validationExclusionType ) { + // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation + if (trackDict == null || trackDict.size() == 0) + logger.info("Track " + trackName + " doesn't have a 
sequence dictionary built in, skipping dictionary validation"); + else { + Set trackSequences = new TreeSet(); + for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) + trackSequences.add(dictionaryEntry.getSequenceName()); + SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java index 06d05912a..edb514984 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilder.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.gatk.refdata.tracks; import net.sf.samtools.SAMSequenceDictionary; -import net.sf.samtools.SAMSequenceRecord; import org.apache.log4j.Logger; import org.broad.tribble.FeatureCodec; import org.broad.tribble.FeatureSource; @@ -41,7 +40,6 @@ import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet.RMDStorageType; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.SequenceDictionaryUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -52,16 +50,11 @@ import org.broadinstitute.sting.utils.instrumentation.Sizeof; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -import java.util.LinkedHashSet; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - /** - * - * @author aaron + * + * @author aaron * ` * Class RMDTrackBuilder * @@ -76,9 +69,6 @@ public class RMDTrackBuilder { // extends 
PluginManager { private final static Logger logger = Logger.getLogger(RMDTrackBuilder.class); public final static boolean MEASURE_TRIBBLE_QUERY_PERFORMANCE = false; - // a constant we use for marking sequence dictionary entries in the Tribble index property list - public static final String SequenceDictionaryPropertyPredicate = "DICT:"; - // private sequence dictionary we use to set our tracks with private SAMSequenceDictionary dict = null; @@ -150,7 +140,7 @@ public class RMDTrackBuilder { // extends PluginManager { final FeatureManager.FeatureDescriptor descriptor = getFeatureManager().getByCodec(codecClass); if (descriptor == null) - throw new ReviewedStingException("Unable to find type name for codex class " + codecClass.getName()); + throw new ReviewedStingException("Unable to find type name for codec class " + codecClass.getName()); return createInstanceOfTrack(new RMDTriplet("anonymous",descriptor.getName(),inputFile.getAbsolutePath(),RMDStorageType.FILE,new Tags())); } @@ -210,13 +200,19 @@ public class RMDTrackBuilder { // extends PluginManager { try { logger.info(String.format(" Index for %s has size in bytes %d", inputFile, Sizeof.getObjectGraphSize(index))); } catch (ReviewedStingException e) { } - sequenceDictionary = getSequenceDictionaryFromProperties(index); + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); // if we don't have a dictionary in the Tribble file, and we've set a dictionary for this builder, set it in the file if they match if (sequenceDictionary.size() == 0 && dict != null) { File indexFile = Tribble.indexFile(inputFile); - setIndexSequenceDictionary(inputFile,index,dict,indexFile,true); - sequenceDictionary = getSequenceDictionaryFromProperties(index); + validateAndUpdateIndexSequenceDictionary(inputFile, index, dict); + try { // re-write the index + writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); + } catch (IOException e) { + logger.warn("Unable to update index with the 
sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); + } + + sequenceDictionary = IndexDictionaryUtils.getSequenceDictionaryFromProperties(index); } if ( MEASURE_TRIBBLE_QUERY_PERFORMANCE ) @@ -363,88 +359,31 @@ public class RMDTrackBuilder { // extends PluginManager { // this can take a while, let them know what we're doing logger.info("Creating Tribble index in memory for file " + inputFile); Index idx = IndexFactory.createIndex(inputFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - setIndexSequenceDictionary(inputFile, idx, dict, null, false); + validateAndUpdateIndexSequenceDictionary(inputFile, idx, dict); return idx; } - - // --------------------------------------------------------------------------------------------------------- - // static functions to work with the sequence dictionaries of indexes - // --------------------------------------------------------------------------------------------------------- - - /** - * get the sequence dictionary from the track, if available. 
If not, make it from the contig list that is always in the index - * @param index the index file to use - * @return a SAMSequenceDictionary if available, null if unavailable - */ - public static SAMSequenceDictionary getSequenceDictionaryFromProperties(Index index) { - SAMSequenceDictionary dict = new SAMSequenceDictionary(); - for (Map.Entry entry : index.getProperties().entrySet()) { - if (entry.getKey().startsWith(SequenceDictionaryPropertyPredicate)) - dict.addSequence(new SAMSequenceRecord(entry.getKey().substring(SequenceDictionaryPropertyPredicate.length() , entry.getKey().length()), - Integer.valueOf(entry.getValue()))); - } - return dict; - } - - /** - * create the sequence dictionary with the contig list; a backup approach - * @param index the index file to use - * @param dict the sequence dictionary to add contigs to - * @return the filled-in sequence dictionary - */ - private static SAMSequenceDictionary createSequenceDictionaryFromContigList(Index index, SAMSequenceDictionary dict) { - LinkedHashSet seqNames = index.getSequenceNames(); - if (seqNames == null) { - return dict; - } - for (String name : seqNames) { - SAMSequenceRecord seq = new SAMSequenceRecord(name, 0); - dict.addSequence(seq); - } - return dict; - } - /** * set the sequence dictionary of the track. This function checks that the contig listing of the underlying file is compatible. * (that each contig in the index is in the sequence dictionary). * @param inputFile for proper error message formatting. * @param dict the sequence dictionary * @param index the index file - * @param indexFile the index file - * @param rewriteIndex should we rewrite the index when we're done? 
- * */ - public void setIndexSequenceDictionary(File inputFile, Index index, SAMSequenceDictionary dict, File indexFile, boolean rewriteIndex) { - if (dict == null) return; - - SAMSequenceDictionary currentDict = createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); - validateTrackSequenceDictionary(inputFile.getAbsolutePath(),currentDict,dict); + public void validateAndUpdateIndexSequenceDictionary(final File inputFile, final Index index, final SAMSequenceDictionary dict) { + if (dict == null) throw new ReviewedStingException("BUG: dict cannot be null"); // check that every contig in the RMD contig list is at least in the sequence dictionary we're being asked to set - for (SAMSequenceRecord seq : currentDict.getSequences()) { - if (dict.getSequence(seq.getSequenceName()) == null) - continue; - index.addProperty(SequenceDictionaryPropertyPredicate + dict.getSequence(seq.getSequenceName()).getSequenceName(), String.valueOf(dict.getSequence(seq.getSequenceName()).getSequenceLength())); - } - // re-write the index - if (rewriteIndex) try { - writeIndexToDisk(index,indexFile,new FSLockWithShared(indexFile)); - } catch (IOException e) { - logger.warn("Unable to update index with the sequence dictionary for file " + indexFile + "; this will not effect your run of the GATK"); - } + final SAMSequenceDictionary currentDict = IndexDictionaryUtils.createSequenceDictionaryFromContigList(index, new SAMSequenceDictionary()); + validateTrackSequenceDictionary(inputFile.getAbsolutePath(), currentDict, dict); + + // actually update the dictionary in the index + IndexDictionaryUtils.setIndexSequenceDictionary(index, dict); } - - public void validateTrackSequenceDictionary(String trackName, SAMSequenceDictionary trackDict, SAMSequenceDictionary referenceDict) { - // if the sequence dictionary is empty (as well as null which means it doesn't have a dictionary), skip validation - if (trackDict == null || trackDict.size() == 0) - logger.info("Track " + trackName + 
" doesn't have a sequence dictionary built in, skipping dictionary validation"); - else { - Set trackSequences = new TreeSet(); - for (SAMSequenceRecord dictionaryEntry : trackDict.getSequences()) - trackSequences.add(dictionaryEntry.getSequenceName()); - SequenceDictionaryUtils.validateDictionaries(logger, validationExclusionType, trackName, trackDict, "reference", referenceDict); - } + public void validateTrackSequenceDictionary(final String trackName, + final SAMSequenceDictionary trackDict, + final SAMSequenceDictionary referenceDict ) { + IndexDictionaryUtils.validateTrackSequenceDictionary(trackName, trackDict, referenceDict, validationExclusionType); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java deleted file mode 100644 index a7666981c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/utils/RMDIntervalGenerator.java +++ /dev/null @@ -1,57 +0,0 @@ -package org.broadinstitute.sting.gatk.refdata.utils; - -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.utils.GenomeLoc; - -import java.util.*; - - -/** - * - * @author aaron - * - * Class RMDIntervalGenerator - * - * Creates an interval list, given an RMDTrack - */ -public class RMDIntervalGenerator { - public ReferenceOrderedDataSource dataSource; - - /** - * create a interval representation of a ROD track - * @param dataSource the track - */ - public RMDIntervalGenerator(ReferenceOrderedDataSource dataSource) { - if (dataSource == null) throw new IllegalArgumentException("Data source cannot be null"); - this.dataSource = dataSource; - } - - /** - * create a genome location list from the interval track - * @return a list of genome locations - */ - public List toGenomeLocList() { - Iterator iter = dataSource.seek((GenomeLoc)null); - List locations = new ArrayList(); - while 
(iter.hasNext()) { - RODRecordList feature = iter.next(); - GenomeLoc loc = feature.getLocation(); - if (loc != null) locations.add(loc); - } - return locations; - } - - /** - * return a map of reference meta data track names to RODS - * @param sources the reference ordered data sources to get the names from - * @return a map of reference meta data names to RODS - */ - public static Map getRMDTrackNames(List sources) { - // get a list of the current rod names we're working with - Map rodNames = new HashMap(); - for (ReferenceOrderedDataSource rod : sources) { - rodNames.put(rod.getName(),rod); - } - return rodNames; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java old mode 100644 new mode 100755 index a33631c85..a73123b6c --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportColumns.java @@ -24,12 +24,14 @@ package org.broadinstitute.sting.gatk.report; +import org.broadinstitute.sting.utils.collections.Pair; + import java.util.*; /** * Tracks a linked list of GATKReportColumn in order by name. 
*/ -public class GATKReportColumns extends LinkedHashMap { +public class GATKReportColumns extends LinkedHashMap implements Iterable { private List columnNames = new ArrayList(); /** @@ -52,4 +54,14 @@ public class GATKReportColumns extends LinkedHashMap { columnNames.add(key); return super.put(key, value); } + + @Override + public Iterator iterator() { + return new Iterator() { + int offset = 0; + public boolean hasNext() { return offset < columnNames.size() ; } + public GATKReportColumn next() { return getByIndex(offset++); } + public void remove() { throw new UnsupportedOperationException("Cannot remove from a GATKReportColumn iterator"); } + }; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 3e3aa29a7..2fd5ad7e3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -286,6 +286,10 @@ public class GATKReportTable { } } + public boolean containsKey(Object primaryKey) { + return primaryKeyColumn.contains(primaryKey); + } + /** * Set the value for a given position in the table * diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java old mode 100644 new mode 100755 diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java new file mode 100644 index 000000000..83e31f672 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Affection.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * 
restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +/** + * Categorical sample trait for association and analysis + * + * Samples can have unknown status, be affected or unaffected by the + * categorical trait, or they can be marked as actually having an + * other trait value (stored in an associated value in the Sample class) + * + * @author Mark DePristo + * @since Sept. 
2011 + */ +public enum Affection { + /** Status is unknown */ + UNKNOWN, + /** Suffers from the disease */ + AFFECTED, + /** Unaffected by the disease */ + UNAFFECTED, + /** An "other" trait: value of the trait is stored elsewhere and is an arbitrary string */ + OTHER +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java new file mode 100644 index 000000000..6fb44804a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Gender.java @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.samples; + +/** +* ENUM of possible human genders: male, female, or unknown +*/ +public enum Gender { + MALE, + FEMALE, + UNKNOWN +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java new file mode 100644 index 000000000..c442409fb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedReader.java @@ -0,0 +1,310 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.samples; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; + +import java.io.*; +import java.util.*; + +/** + * Reads PED file-formatted tabular text files + * + * See http://www.broadinstitute.org/mpg/tagger/faq.html + * See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped + * + * The "ped" file format refers to the widely-used format for linkage pedigree data. + * Each line describes a single (diploid) individual in the following format: + * + * family_ID individual_ID father_ID mother_ID gender phenotype genotype_1 genotype_2 ... + * + * If your data lacks pedigree information (for example, unrelated case/control individuals), + * set the father_ID and mother_ID to 0. sex denotes the individual's gender with 1=male and 2=female. + * phenotype refers to the affected status (for association studies) where 0=unknown, 1=unaffected, 2=affected. + * Finally, each genotype is written as two (=diploid) integer numbers (separated by whitespace), + * where 1=A, 2=C, 3=G, 4=T. No header lines are allowed and all columns must be separated by whitespace. + * Check out the information at the PLINK website on the "ped" file format. + * + * The PED file is a white-space (space or tab) delimited file: the first six columns are mandatory: + * Family ID + * Individual ID + * Paternal ID + * Maternal ID + * Sex (1=male; 2=female; other=unknown) + * Phenotype + * + * The IDs are alphanumeric: the combination of family and individual ID should uniquely identify a person. + * A PED file must have 1 and only 1 phenotype in the sixth column. The phenotype can be either a + * quantitative trait or an affection status column: PLINK will automatically detect which type + * (i.e. 
based on whether a value other than 0, 1, 2 or the missing genotype code is observed). + * Note that the GATK actually supports arbitrary values for quantitative trait -- not just doubles -- + * and are actually representing these values as strings instead of doubles + * + * NOTE Quantitative traits with decimal points must be coded with a period/full-stop character and + * not a comma, i.e. 2.394 not 2,394 + * + * If an individual's sex is unknown, then any character other than 1 or 2 can be used. + * When new files are created (PED, FAM, or other which contain sex) then the original coding will be + * preserved. However, these individuals will be dropped from any analyses (i.e. phenotype set to missing also) + * and an error message will arise if an analysis that uses family information is requested and an + * individual of 'unknown' sex is specified as a father or mother. + * + * + * HINT You can add a comment to a PED or MAP file by starting the line with a # character. The rest of that + * line will be ignored. Do not start any family IDs with this character therefore. + * + * Affection status, by default, should be coded: + * -9 missing + * 0 missing + * 1 unaffected + * 2 affected + * + * If your file is coded 0/1 to represent unaffected/affected, then use the --1 flag: + * plink --file mydata --1 which will specify a disease phenotype coded: + * + * -9 missing + * 0 unaffected + * 1 affected + * + * The missing phenotype value for quantitative traits is, by default, -9 (this can also be used for + * disease traits as well as 0). It can be reset by including the --missing-phenotype option: + * + * Genotypes (column 7 onwards) should also be white-space delimited; they can be any character + * (e.g. 1,2,3,4 or A,C,G,T or anything else) except 0 which is, by default, the missing genotype + * character. All markers should be biallelic. All SNPs (whether haploid or not) must have two + * alleles specified. Either Both alleles should be missing (i.e. 
0) or neither. + * + * No header row should be given. For example, here are two individuals typed for 3 SNPs (one row = one person): + * + * FAM001 1 0 0 1 2 A A G G A C + * FAM001 2 0 0 1 2 A A A G 0 0 + * ... + * + * Note that the GATK does not support genotypes in a PED file. + * + * @author Mark DePristo + * @since 2011 + */ +public class PedReader { + private static Logger logger = Logger.getLogger(PedReader.class); + final static private Set CATAGORICAL_TRAIT_VALUES = new HashSet(Arrays.asList("-9", "0", "1", "2")); + final static private String commentMarker = "#"; + + /** + * An enum that specifies which, if any, of the standard PED fields are + * missing from the input records. For example, suppose we have the full record: + * + * "fam1 kid dad mom 1 2" + * + * indicating a male affected child. This can be parsed with the -ped x.ped argument + * to the GATK. Suppose we only have: + * + * "fam1 kid 1" + * + * we can parse the reduced version of this record with -ped:NO_PARENTS,NO_PHENOTYPE x.ped + */ + public enum MissingPedField { + /** + * The PED records do not have the first (FAMILY_ID) argument. The family id + * will be set to null / empty. + */ + NO_FAMILY_ID, + + /** + * The PED records do not have either the paternal or maternal IDs, so + * the corresponding IDs are set to null. + */ + NO_PARENTS, + + /** + * The PED records do not have the GENDER field, so the sex of each + * sample will be set to UNKNOWN. + */ + NO_SEX, + + /** + * The PED records do not have the PHENOTYPE field, so the phenotype + * of each sample will be set to UNKNOWN. 
+ */ + NO_PHENOTYPE + } + + protected enum Field { + FAMILY_ID, INDIVIDUAL_ID, PATERNAL_ID, MATERNAL_ID, GENDER, PHENOTYPE + } + + // phenotype + private final static String MISSING_VALUE1 = "-9"; + private final static String MISSING_VALUE2 = "0"; + private final static String PHENOTYPE_UNAFFECTED = "1"; + private final static String PHENOTYPE_AFFECTED = "2"; + + // Sex + private final static String SEX_MALE = "1"; + private final static String SEX_FEMALE = "2"; + // other=unknown + + public PedReader() { } + + public final List parse(File source, EnumSet missingFields, SampleDB sampleDB) throws FileNotFoundException { + logger.info("Reading PED file " + source + " with missing fields: " + missingFields); + return parse(new FileReader(source), missingFields, sampleDB); + } + + public final List parse(final String source, EnumSet missingFields, SampleDB sampleDB) { + logger.warn("Reading PED string: \"" + source + "\" with missing fields: " + missingFields); + return parse(new StringReader(source.replace(";", String.format("%n"))), missingFields, sampleDB); + } + + public final List parse(Reader reader, EnumSet missingFields, SampleDB sampleDB) { + final List lines = new XReadLines(reader).readLines(); + + // What are the record offsets? + final int familyPos = missingFields.contains(MissingPedField.NO_FAMILY_ID) ? -1 : 0; + final int samplePos = familyPos + 1; + final int paternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : samplePos + 1; + final int maternalPos = missingFields.contains(MissingPedField.NO_PARENTS) ? -1 : paternalPos + 1; + final int sexPos = missingFields.contains(MissingPedField.NO_SEX) ? -1 : Math.max(maternalPos, samplePos) + 1; + final int phenotypePos = missingFields.contains(MissingPedField.NO_PHENOTYPE) ? 
-1 : Math.max(sexPos, Math.max(maternalPos, samplePos)) + 1; + final int nExpectedFields = MathUtils.arrayMaxInt(Arrays.asList(samplePos, paternalPos, maternalPos, sexPos, phenotypePos)) + 1; + + // go through once and determine properties + int lineNo = 1; + boolean isQT = false; + final List splits = new ArrayList(lines.size()); + for ( final String line : lines ) { + if ( line.startsWith(commentMarker)) continue; + if ( line.trim().equals("") ) continue; + + final String[] parts = line.split("\\s+"); + + if ( parts.length != nExpectedFields ) + throw new UserException.MalformedFile(reader.toString(), "Bad PED line " + lineNo + ": wrong number of fields"); + + if ( phenotypePos != -1 ) { + isQT = isQT || ! CATAGORICAL_TRAIT_VALUES.contains(parts[phenotypePos]); + } + + splits.add(parts); + lineNo++; + } + logger.info("Phenotype is other? " + isQT); + + // now go through and parse each record + lineNo = 1; + final List samples = new ArrayList(splits.size()); + for ( final String[] parts : splits ) { + String familyID = null, individualID, paternalID = null, maternalID = null; + Gender sex = Gender.UNKNOWN; + String quantitativePhenotype = Sample.UNSET_QT; + Affection affection = Affection.UNKNOWN; + + if ( familyPos != -1 ) familyID = maybeMissing(parts[familyPos]); + individualID = parts[samplePos]; + if ( paternalPos != -1 ) paternalID = maybeMissing(parts[paternalPos]); + if ( maternalPos != -1 ) maternalID = maybeMissing(parts[maternalPos]); + + if ( sexPos != -1 ) { + if ( parts[sexPos].equals(SEX_MALE) ) sex = Gender.MALE; + else if ( parts[sexPos].equals(SEX_FEMALE) ) sex = Gender.FEMALE; + else sex = Gender.UNKNOWN; + } + + if ( phenotypePos != -1 ) { + if ( isQT ) { + if ( parts[phenotypePos].equals(MISSING_VALUE1) ) + affection = Affection.UNKNOWN; + else { + affection = Affection.OTHER; + quantitativePhenotype = parts[phenotypePos]; + } + } else { + if ( parts[phenotypePos].equals(MISSING_VALUE1) ) affection = Affection.UNKNOWN; + else if ( 
parts[phenotypePos].equals(MISSING_VALUE2) ) affection = Affection.UNKNOWN; + else if ( parts[phenotypePos].equals(PHENOTYPE_UNAFFECTED) ) affection = Affection.UNAFFECTED; + else if ( parts[phenotypePos].equals(PHENOTYPE_AFFECTED) ) affection = Affection.AFFECTED; + else throw new ReviewedStingException("Unexpected phenotype type " + parts[phenotypePos] + " at line " + lineNo); + } + } + + final Sample s = new Sample(individualID, sampleDB, familyID, paternalID, maternalID, sex, affection, quantitativePhenotype); + samples.add(s); + sampleDB.addSample(s); + lineNo++; + } + + for ( final Sample sample : new ArrayList(samples) ) { + Sample dad = maybeAddImplicitSample(sampleDB, sample.getPaternalID(), sample.getFamilyID(), Gender.MALE); + if ( dad != null ) samples.add(dad); + + Sample mom = maybeAddImplicitSample(sampleDB, sample.getMaternalID(), sample.getFamilyID(), Gender.FEMALE); + if ( mom != null ) samples.add(mom); + } + + return samples; + } + + private final static String maybeMissing(final String string) { + if ( string.equals(MISSING_VALUE1) || string.equals(MISSING_VALUE2) ) + return null; + else + return string; + } + + private final Sample maybeAddImplicitSample(SampleDB sampleDB, final String id, final String familyID, final Gender gender) { + if ( id != null && sampleDB.getSample(id) == null ) { + Sample s = new Sample(id, sampleDB, familyID, null, null, gender, Affection.UNKNOWN, Sample.UNSET_QT); + sampleDB.addSample(s); + return s; + } else + return null; + } + + /** + * Parses a list of tags from the command line, assuming it comes from the GATK Engine + * tags, and returns the corresponding EnumSet. 
+ * + * @param arg the actual engine arg, used for the UserException if there's an error + * @param tags a list of string tags that should be converted to the MissingPedField value + * @return + */ + public static final EnumSet parseMissingFieldTags(final Object arg, final List tags) { + final EnumSet missingFields = EnumSet.noneOf(MissingPedField.class); + + for ( final String tag : tags ) { + try { + missingFields.add(MissingPedField.valueOf(tag)); + } catch ( IllegalArgumentException e ) { + throw new UserException.BadArgumentValue(arg.toString(), "Unknown tag " + tag + " allowed values are " + MissingPedField.values()); + } + } + + return missingFields; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java new file mode 100644 index 000000000..bbf857820 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/PedigreeValidationType.java @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +/** +* +*/ +public enum PedigreeValidationType { + /** + * Require if a pedigree file is provided at all samples in the VCF or BAM files have a corresponding + * entry in the pedigree file(s). + */ + STRICT, + + /** + * Do not enforce any overlap between the VCF/BAM samples and the pedigree data + * */ + SILENT +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java new file mode 100644 index 000000000..b39fdd79d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Sample.java @@ -0,0 +1,222 @@ +package org.broadinstitute.sting.gatk.samples; + + +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.util.HashMap; +import java.util.Map; + +/** + * + */ +public class Sample implements Comparable { // implements java.io.Serializable { + final private String familyID, paternalID, maternalID; + final private Gender gender; + final private String otherPhenotype; + final private Affection affection; + final private String ID; + final private SampleDB infoDB; + final private Map properties = new HashMap(); + + public final static String UNSET_QT = null; + + public Sample(final String ID, final SampleDB infoDB, + final String familyID, final String paternalID, final String maternalID, + final Gender gender, final Affection affection, final String otherPhenotype) { + this.familyID = familyID; + this.paternalID = paternalID; + this.maternalID = maternalID; + this.gender = gender; + this.otherPhenotype = otherPhenotype; + this.affection = affection; + this.ID = ID; + this.infoDB = infoDB; + } + + protected 
Sample(final String ID, + final String familyID, final String paternalID, final String maternalID, + final Gender gender, final Affection affection, final String otherPhenotype) { + this(ID, null, familyID, paternalID, maternalID, gender, affection, otherPhenotype); + } + + protected Sample(final String ID, + final String familyID, final String paternalID, final String maternalID, + final Gender gender, final Affection affection) { + this(ID, null, familyID, paternalID, maternalID, gender, affection, UNSET_QT); + } + + + public Sample(final String ID, final SampleDB infoDB, + final String familyID, final String paternalID, final String maternalID, final Gender gender) { + this(ID, infoDB, familyID, paternalID, maternalID, gender, Affection.UNKNOWN, UNSET_QT); + } + + public Sample(final String ID, final SampleDB infoDB, final Affection affection, final String otherPhenotype) { + this(ID, infoDB, null, null, null, Gender.UNKNOWN, affection, otherPhenotype); + } + + public Sample(String id, SampleDB infoDB) { + this(id, infoDB, null, null, null, + Gender.UNKNOWN, Affection.UNKNOWN, UNSET_QT); + } + + // ------------------------------------------------------------------------------------- + // + // standard property getters + // + // ------------------------------------------------------------------------------------- + + public String getID() { + return ID; + } + + public String getFamilyID() { + return familyID; + } + + public String getPaternalID() { + return paternalID; + } + + public String getMaternalID() { + return maternalID; + } + + public Affection getAffection() { + return affection; + } + + public boolean hasOtherPhenotype() { + return affection == Affection.OTHER; + } + + public String getOtherPhenotype() { + return otherPhenotype; + } + + /** + * Get the sample's mother + * @return sample object with relationship mother, if exists, or null + */ + public Sample getMother() { + return infoDB.getSample(maternalID); + } + + /** + * Get the sample's father + 
* @return sample object with relationship father, if exists, or null + */ + public Sample getFather() { + return infoDB.getSample(paternalID); + } + + /** + * Get gender of the sample + * @return property of key "gender" - must be of type Gender + */ + public Gender getGender() { + return gender; + } + + @Override + public int compareTo(final Sample sample) { + return ID.compareTo(sample.getID()); + } + + @Override + public String toString() { + return String.format("Sample %s fam=%s dad=%s mom=%s gender=%s affection=%s qt=%s props=%s", + getID(), getFamilyID(), getPaternalID(), getMaternalID(), getGender(), getAffection(), + getOtherPhenotype(), properties); + } + +// // ------------------------------------------------------------------------------------- +// // +// // code for working with additional -- none standard -- properites +// // +// // ------------------------------------------------------------------------------------- +// +// public Map getExtraProperties() { +// return Collections.unmodifiableMap(properties); +// } +// +// /** +// * Get one property +// * @param key key of property +// * @return value of property as generic object +// */ +// public Object getExtraPropertyValue(final String key) { +// return properties.get(key); +// } +// +// /** +// * +// * @param key property key +// * @return true if sample has this property (even if its value is null) +// */ +// public boolean hasExtraProperty(String key) { +// return properties.containsKey(key); +// } + + @Override + public int hashCode() { + return ID.hashCode(); + } + + @Override + public boolean equals(final Object o) { + if(o == null) + return false; + if(o instanceof Sample) { + Sample otherSample = (Sample)o; + return ID.equals(otherSample.ID) && + equalOrNull(familyID, otherSample.familyID) && + equalOrNull(paternalID, otherSample.paternalID) && + equalOrNull(maternalID, otherSample.maternalID) && + equalOrNull(gender, otherSample.gender) && + equalOrNull(otherPhenotype, 
otherSample.otherPhenotype) && + equalOrNull(affection, otherSample.affection) && + equalOrNull(properties, otherSample.properties); + } + return false; + } + + private final static boolean equalOrNull(final Object o1, final Object o2) { + if ( o1 == null ) + return o2 == null; + else + return o2 == null ? false : o1.equals(o2); + } + + private final static T mergeValues(final String name, final String field, final T o1, final T o2, final T emptyValue) { + if ( o1 == null || o1.equals(emptyValue) ) { + // take o2 if both are null, otherwise keep o2 + return o2 == null ? null : o2; + } else { + if ( o2 == null || o2.equals(emptyValue) ) + return o1; // keep o1, since it's a real value + else { + // both o1 and o2 have a value + if ( o1 == o2 ) + return o1; + else + throw new UserException("Inconsistent values detected for " + name + " for field " + field + " value1 " + o1 + " value2 " + o2); + } + } + } + + public final static Sample mergeSamples(final Sample prev, final Sample next) { + if ( prev.equals(next) ) + return next; + else { + return new Sample(prev.getID(), prev.infoDB, + mergeValues(prev.getID(), "Family_ID", prev.getFamilyID(), next.getFamilyID(), null), + mergeValues(prev.getID(), "Paternal_ID", prev.getPaternalID(), next.getPaternalID(), null), + mergeValues(prev.getID(), "Material_ID", prev.getMaternalID(), next.getMaternalID(), null), + mergeValues(prev.getID(), "Gender", prev.getGender(), next.getGender(), Gender.UNKNOWN), + mergeValues(prev.getID(), "Affection", prev.getAffection(), next.getAffection(), Affection.UNKNOWN), + mergeValues(prev.getID(), "OtherPhenotype", prev.getOtherPhenotype(), next.getOtherPhenotype(), UNSET_QT)); + //mergeValues(prev.getID(), "ExtraProperties", prev.getExtraProperties(), next.getExtraProperties(), Collections.emptyMap())); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java new file mode 100644 index 
000000000..ee0873c6e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -0,0 +1,183 @@ +package org.broadinstitute.sting.gatk.samples; + +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.variantcontext.Genotype; + +import java.util.*; + +/** + * + */ +public class SampleDB { + /** + * This is where Sample objects are stored. Samples are usually accessed by their ID, which is unique, so + * this is stored as a HashMap. + */ + private final HashMap samples = new HashMap(); + + /** + * Constructor takes both a SAM header and sample files because the two must be integrated. + */ + public SampleDB() { + + } + + /** + * Protected function to add a single sample to the database + * + * @param sample to be added + */ + protected SampleDB addSample(Sample sample) { + Sample prev = samples.get(sample.getID()); + if ( prev != null ) + sample = Sample.mergeSamples(prev, sample); + samples.put(sample.getID(), sample); + return this; + } + + // -------------------------------------------------------------------------------- + // + // Functions for getting a sample from the DB + // + // -------------------------------------------------------------------------------- + + /** + * Get a sample by its ID + * If an alias is passed in, return the main sample object + * @param id + * @return sample Object with this ID, or null if this does not exist + */ + public Sample getSample(String id) { + return samples.get(id); + } + + /** + * + * @param read + * @return sample Object with this ID, or null if this does not exist + */ + public Sample getSample(final SAMRecord read) { + return getSample(read.getReadGroup()); + } + + /** + * + * @param rg + * @return sample Object with this ID, or null if this does not exist + */ + public Sample getSample(final SAMReadGroupRecord rg) { + return getSample(rg.getSample()); + } + + 
/** + * @param g Genotype + * @return sample Object with this ID, or null if this does not exist + */ + public Sample getSample(final Genotype g) { + return getSample(g.getSampleName()); + } + + // -------------------------------------------------------------------------------- + // + // Functions for accessing samples in the DB + // + // -------------------------------------------------------------------------------- + + /** + * Get number of sample objects + * @return size of samples map + */ + public int sampleCount() { + return samples.size(); + } + + public Set getSamples() { + return new HashSet(samples.values()); + } + + public Collection getSampleNames() { + return Collections.unmodifiableCollection(samples.keySet()); + } + + + /** + * Takes a collection of sample names and returns their corresponding sample objects + * Note that, since a set is returned, if you pass in a list with duplicates names there will not be any duplicates in the returned set + * @param sampleNameList Set of sample names + * @return Corresponding set of samples + */ + public Set getSamples(Collection sampleNameList) { + HashSet samples = new HashSet(); + for (String name : sampleNameList) { + try { + samples.add(getSample(name)); + } + catch (Exception e) { + throw new StingException("Could not get sample with the following ID: " + name, e); + } + } + return samples; + } + + // -------------------------------------------------------------------------------- + // + // Higher level pedigree functions + // + // -------------------------------------------------------------------------------- + + /** + * Returns a sorted set of the family IDs in all samples (excluding null ids) + * @return + */ + public final Set getFamilyIDs() { + return getFamilies().keySet(); + } + + /** + * Returns a map from family ID -> set of family members for all samples with + * non-null family ids + * + * @return + */ + public final Map> getFamilies() { + final Map> families = new TreeMap>(); + + for ( final 
Sample sample : samples.values() ) { + final String famID = sample.getFamilyID(); + if ( famID != null ) { + if ( ! families.containsKey(famID) ) + families.put(famID, new TreeSet()); + families.get(famID).add(sample); + } + } + + return families; + } + + /** + * Return all samples with a given family ID + * @param familyId + * @return + */ + public Set getFamily(String familyId) { + return getFamilies().get(familyId); + } + + /** + * Returns all children of a given sample + * See note on the efficiency of getFamily() - since this depends on getFamily() it's also not efficient + * @param sample + * @return + */ + public Set getChildren(Sample sample) { + final HashSet children = new HashSet(); + for ( final Sample familyMember : getFamily(sample.getFamilyID())) { + if ( familyMember.getMother() == sample || familyMember.getFather() == sample ) { + children.add(familyMember); + } + } + return children; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java new file mode 100644 index 000000000..44a8600b0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.samples; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * + */ +public class SampleDBBuilder { + PedigreeValidationType validationStrictness; + final SampleDB sampleDB = new SampleDB(); + final GenomeAnalysisEngine engine; + + Set samplesFromDataSources = new HashSet(); + Set samplesFromPedigrees = new HashSet(); + + /** for testing only */ + protected SampleDBBuilder(PedigreeValidationType validationStrictness) { + engine = null; + this.validationStrictness = validationStrictness; + } + + /** + * Constructor takes both a SAM header and sample files because the two must be integrated. 
+ */ + public SampleDBBuilder(GenomeAnalysisEngine engine, PedigreeValidationType validationStrictness) { + this.engine = engine; + this.validationStrictness = validationStrictness; + } + + /** + * Hallucinates sample objects for all the samples in the SAM file and stores them + */ + public SampleDBBuilder addSamplesFromSAMHeader(final SAMFileHeader header) { + addSamplesFromSampleNames(SampleUtils.getSAMFileSamples(header)); + return this; + } + + public SampleDBBuilder addSamplesFromSampleNames(final Collection sampleNames) { + for (final String sampleName : sampleNames) { + if (sampleDB.getSample(sampleName) == null) { + final Sample newSample = new Sample(sampleName, sampleDB); + sampleDB.addSample(newSample); + samplesFromDataSources.add(newSample); // keep track of data source samples + } + } + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeFiles(final List pedigreeFiles) { + for (final File pedFile : pedigreeFiles) { + Collection samples = addSamplesFromPedigreeArgument(pedFile); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + public SampleDBBuilder addSamplesFromPedigreeStrings(final List pedigreeStrings) { + for (final String pedString : pedigreeStrings) { + Collection samples = addSamplesFromPedigreeArgument(pedString); + samplesFromPedigrees.addAll(samples); + } + + return this; + } + + /** + * Parse one sample file and integrate it with samples that are already there + * Fail quickly if we find any errors in the file + */ + private Collection addSamplesFromPedigreeArgument(File sampleFile) { + final PedReader reader = new PedReader(); + + try { + return reader.parse(sampleFile, getMissingFields(sampleFile), sampleDB); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(sampleFile, e); + } + } + + private Collection addSamplesFromPedigreeArgument(final String string) { + final PedReader reader = new PedReader(); + return reader.parse(string, getMissingFields(string), sampleDB); + 
} + + public SampleDB getFinalSampleDB() { + validate(); + return sampleDB; + } + + public EnumSet getMissingFields(final Object engineArg) { + if ( engine == null ) + return EnumSet.noneOf(PedReader.MissingPedField.class); + else { + final List posTags = engine.getTags(engineArg).getPositionalTags(); + return PedReader.parseMissingFieldTags(engineArg, posTags); + } + } + + // -------------------------------------------------------------------------------- + // + // Validation + // + // -------------------------------------------------------------------------------- + + protected final void validate() { + if ( validationStrictness == PedigreeValidationType.SILENT ) + return; + else { + // check that samples in data sources are all annotated, if anything is annotated + if ( ! samplesFromPedigrees.isEmpty() && ! samplesFromDataSources.isEmpty() ) { + final Set sampleNamesFromPedigrees = new HashSet(); + for ( final Sample pSample : samplesFromPedigrees ) + sampleNamesFromPedigrees.add(pSample.getID()); + + for ( final Sample dsSample : samplesFromDataSources ) + if ( ! 
sampleNamesFromPedigrees.contains(dsSample.getID()) ) + throw new UserException("Sample " + dsSample.getID() + " found in data sources but not in pedigree files with STRICT pedigree validation"); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index c6321e2ad..fd691735f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -364,8 +364,8 @@ public abstract class TraversalEngine,Provide // count up the number of skipped reads by summing over all filters long nSkippedReads = 0L; - for ( Map.Entry countsByFilter: cumulativeMetrics.getCountsByFilter().entrySet()) - nSkippedReads += countsByFilter.getValue(); + for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) + nSkippedReads += countsByFilter; logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", elapsed, elapsed / 60, elapsed / 3600)); if ( cumulativeMetrics.getNumReadsSeen() > 0 ) @@ -373,10 +373,10 @@ public abstract class TraversalEngine,Provide nSkippedReads, cumulativeMetrics.getNumReadsSeen(), 100.0 * MathUtils.ratio(nSkippedReads,cumulativeMetrics.getNumReadsSeen()))); - for ( Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { + for ( Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { long count = filterCounts.getValue(); logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", - count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), Utils.getClassName(filterCounts.getKey()))); + count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); } if ( performanceLog != null ) performanceLog.close(); diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 046003154..3f349d86d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.iterators.PushbackIterator; import org.broadinstitute.sting.gatk.walkers.DuplicateWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; @@ -57,9 +58,9 @@ public class TraverseDuplicates extends TraversalEngine readsAtLoc(final SAMRecord read, PushbackIterator iter) { + private List readsAtLoc(final GATKSAMRecord read, PushbackIterator iter) { GenomeLoc site = engine.getGenomeLocParser().createGenomeLoc(read); - ArrayList l = new ArrayList(); + ArrayList l = new ArrayList(); l.add(read); for (SAMRecord read2 : iter) { @@ -70,7 +71,7 @@ public class TraverseDuplicates extends TraversalEngine extends TraversalEngine> uniqueReadSets(List reads) { - Set> readSets = new LinkedHashSet>(); + protected Set> uniqueReadSets(List reads) { + Set> readSets = new LinkedHashSet>(); // for each read, find duplicates, and either add the read to its duplicate list or start a new one - for ( SAMRecord read : reads ) { - List readSet = findDuplicateReads(read, readSets); + for ( GATKSAMRecord read : reads ) { + List readSet = findDuplicateReads(read, readSets); if ( readSet == null ) { - readSets.add(new ArrayList(Arrays.asList(read))); // copy so I can add to the list + readSets.add(new ArrayList(Arrays.asList(read))); // copy so I can add to the list } else { readSet.add(read); } @@ -110,13 +111,13 @@ public class TraverseDuplicates extends TraversalEngine findDuplicateReads(SAMRecord read, Set> readSets ) { + 
protected List findDuplicateReads(GATKSAMRecord read, Set> readSets ) { if ( read.getReadPairedFlag() ) { // paired final GenomeLoc readMateLoc = engine.getGenomeLocParser().createGenomeLoc(read.getMateReferenceName(), read.getMateAlignmentStart(), read.getMateAlignmentStart()); - for (List reads : readSets) { - SAMRecord key = reads.get(0); + for (List reads : readSets) { + GATKSAMRecord key = reads.get(0); // read and key start at the same place, and either the this read and the key // share a mate location or the read is flagged as a duplicate @@ -131,8 +132,8 @@ public class TraverseDuplicates extends TraversalEngine reads : readSets) { - SAMRecord key = reads.get(0); + for (List reads : readSets) { + GATKSAMRecord key = reads.get(0); boolean v = (! key.getReadPairedFlag()) && read.getAlignmentStart() == key.getAlignmentStart() && ( key.getDuplicateReadFlag() || read.getDuplicateReadFlag() ) && read.getReadLength() == key.getReadLength(); //System.out.printf("%s %s %b %b %d %d %d %d => %b%n", // read.getReadPairedFlag(), key.getReadPairedFlag(), read.getDuplicateReadFlag(), key.getDuplicateReadFlag(), @@ -179,7 +180,7 @@ public class TraverseDuplicates extends TraversalEngine> readSets = uniqueReadSets(readsAtLoc(read, iter)); + Set> readSets = uniqueReadSets(readsAtLoc((GATKSAMRecord) read, iter)); if ( DEBUG ) logger.debug(String.format("*** TraverseDuplicates.traverse at %s with %d read sets", site, readSets.size())); // Jump forward in the reference to this locus location diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java index 27bbd50d5..24b8ac986 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReads.java @@ -13,6 +13,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import 
org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -100,9 +101,9 @@ public class TraverseReads extends TraversalEngine,Read // if the read is mapped, create a metadata tracker ReadMetaDataTracker tracker = (read.getReferenceIndex() >= 0) ? rodView.getReferenceOrderedDataForRead(read) : null; - final boolean keepMeP = walker.filter(refContext, read); + final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); if (keepMeP) { - M x = walker.map(refContext, read, tracker); // the tracker can be null + M x = walker.map(refContext, (GATKSAMRecord) read, tracker); // the tracker can be null sum = walker.reduce(x, sum); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index 6989f45b2..d1148cbd5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.picard.reference.ReferenceSequence; import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFileFactory; -import net.sf.samtools.SAMRecord; import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; @@ -43,6 +42,7 @@ import org.broadinstitute.sting.utils.clipreads.ClippingOp; import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation; import org.broadinstitute.sting.utils.clipreads.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; 
import java.io.File; @@ -292,11 +292,12 @@ public class ClipReadsWalker extends ReadWalker strandAwarePositions(SAMRecord read, int start, int stop) { + private Pair strandAwarePositions(GATKSAMRecord read, int start, int stop) { if (read.getReadNegativeStrandFlag()) return new Pair(read.getReadLength() - stop - 1, read.getReadLength() - start - 1); else @@ -374,7 +375,7 @@ public class ClipReadsWalker extends ReadWalker p : cyclesToClip) { // iterate over each cycle range @@ -416,7 +417,7 @@ public class ClipReadsWalker extends ReadWalker clipSeqs) { + public ReadClipperWithData(GATKSAMRecord read, List clipSeqs) { super(read); data = new ClippingData(clipSeqs); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java index e2db1dc52..905ecf273 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.gatk.walkers; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.List; import java.util.Set; @@ -20,11 +20,11 @@ import java.util.Set; @ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class}) public abstract class DuplicateWalker extends Walker { // Do we actually want to operate on the context? 
- public boolean filter(GenomeLoc loc, AlignmentContext context, Set> readSets ) { + public boolean filter(GenomeLoc loc, AlignmentContext context, Set> readSets ) { return true; // We are keeping all the reads } - public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set> readSets ); + public abstract MapType map(GenomeLoc loc, AlignmentContext context, Set> readSets ); // Given result of map function public abstract ReduceType reduceInit(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/FindReadsWithNamesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/FindReadsWithNamesWalker.java index 56287df31..7f9269725 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/FindReadsWithNamesWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/FindReadsWithNamesWalker.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.text.XReadLines; import java.io.File; @@ -71,21 +72,23 @@ public class FindReadsWithNamesWalker extends ReadWalker { private FlagStat myStat = new FlagStat(); - public Integer map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) { + public Integer map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { myStat.readCount++; if (read.getReadFailsVendorQualityCheckFlag()) { myStat.QC_failure++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java index 8152f74c2..e94d01d5a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java @@ -17,7 +17,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; */ @By(DataSource.READS) @Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES}) -@PartitionBy(PartitionType.INTERVAL) +@PartitionBy(PartitionType.LOCUS) @ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class}) public abstract class LocusWalker extends Walker { // Do we actually want to operate on the context? diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java index 361e222c2..f0d92ef8a 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PartitionType.java @@ -34,6 +34,12 @@ public enum PartitionType { */ NONE, + /** + * The walker inputs can be chunked down to individual + * reads. + */ + READ, + /** * The walker inputs can be chunked down to the * per-locus level. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java index 4f072e88c..ac69738d3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java @@ -40,6 +40,7 @@ import java.util.TreeSet; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. @@ -136,11 +137,12 @@ public class PrintReadsWalker extends ReadWalker { /** * The reads filter function. 
+ * * @param ref the reference bases that correspond to our read, if a reference was provided * @param read the read itself, as a SAMRecord * @return true if the read passes the filter, false if it doesn't */ - public boolean filter(ReferenceContext ref, SAMRecord read) { + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { // check the read group if ( readGroup != null ) { SAMReadGroupRecord myReadGroup = read.getReadGroup(); @@ -180,11 +182,12 @@ public class PrintReadsWalker extends ReadWalker { /** * The reads map function. + * * @param ref the reference bases that correspond to our read, if a reference was provided * @param read the read itself, as a SAMRecord * @return the read itself */ - public SAMRecord map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) { + public SAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { return read; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java index db2038aa3..8933bd73e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** * Created by IntelliJ IDEA. @@ -12,7 +13,7 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; * To change this template use File | Settings | File Templates. 
*/ @Requires({DataSource.READS, DataSource.REFERENCE_BASES}) -@PartitionBy(PartitionType.CONTIG) +@PartitionBy(PartitionType.READ) public abstract class ReadWalker extends Walker { public boolean requiresOrderedReads() { return false; } @@ -20,11 +21,11 @@ public abstract class ReadWalker extends Walker { return toolkit; } + /** + * Gets the master sequence dictionary for this walker + * @link GenomeAnalysisEngine.getMasterSequenceDictionary + * @return + */ + protected SAMSequenceDictionary getMasterSequenceDictionary() { + return getToolkit().getMasterSequenceDictionary(); + } + + protected SampleDB getSampleDB() { + return getToolkit().getSampleDB(); + } + + protected Sample getSample(final String id) { + return getToolkit().getSampleDB().getSample(id); + } + /** * (conceptual static) method that states whether you want to see reads piling up at a locus * that contain a deletion at the locus. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index e501258c5..e5f75f06d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -92,7 +92,7 @@ public class AlleleBalance extends InfoFieldAnnotation { continue; } // todo -- actually care about indel length from the pileup (agnostic at the moment) - int refCount = indelPileup.size(); + int refCount = indelPileup.getNumberOfElements(); int altCount = vc.isSimpleInsertion() ? 
indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions(); if ( refCount + altCount == 0 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 75c4037d5..820fd248a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -47,7 +47,7 @@ public class AlleleBalanceBySample extends GenotypeAnnotation implements Experim if (!g.isHet()) return null; - Set altAlleles = vc.getAlternateAlleles(); + Collection altAlleles = vc.getAlternateAlleles(); if ( altAlleles.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java deleted file mode 100755 index 353fd1c2c..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AnnotationByDepth.java +++ /dev/null @@ -1,32 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.utils.variantcontext.Genotype; - -import java.util.Map; - -/** - * Abstract base class for all annotations that are normalized by depth - */ -public abstract class AnnotationByDepth extends InfoFieldAnnotation { - - - protected int annotationByVariantDepth(final Map genotypes, Map stratifiedContexts) { - int depth = 0; - for ( Map.Entry genotype : genotypes.entrySet() ) { - - // we care only about variant calls - if ( genotype.getValue().isHomRef() ) - continue; - - AlignmentContext context = stratifiedContexts.get(genotype.getKey()); - if ( context != null ) - depth += 
context.size(); - } - - return depth; - } - - -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 864be55b7..8098de5b1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -41,7 +41,7 @@ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnno int depth = 0; for ( Map.Entry sample : stratifiedContexts.entrySet() ) - depth += sample.getValue().size(); + depth += sample.getValue().hasBasePileup() ? sample.getValue().getBasePileup().depthOfCoverage() : sample.getValue().getExtendedEventPileup().depthOfCoverage(); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%d", depth)); return map; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 393eb549c..2d1d1978c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -205,7 +205,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat for ( Map.Entry sample : stratifiedContexts.entrySet() ) { for (PileupElement p : sample.getValue().getBasePileup()) { - if ( p.isDeletion() ) // ignore deletions + if ( p.isDeletion() || p.isReducedRead() ) // ignore deletions and reduced reads continue; if ( p.getRead().getMappingQuality() < 20 || p.getQual() < 20 ) @@ -258,6 +258,8 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat continue; for (final PileupElement p: pileup) { + if ( p.isReducedRead() ) // ignore reduced reads + continue; if ( 
p.getRead().getMappingQuality() < 20) continue; if (indelLikelihoodMap.containsKey(p)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index df6da3b85..c142109fa 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -34,12 +34,12 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnot import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.genotype.Haplotype; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java new file mode 100755 index 000000000..bd0d4e3fb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -0,0 +1,58 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MendelianViolation; +import org.broadinstitute.sting.utils.codecs.vcf.VCFFilterHeaderLine; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; +import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 9/14/11 + * Time: 12:24 PM + * To change this template use File | Settings | File Templates. 
+ */ +public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation { + + private MendelianViolation mendelianViolation = null; + + public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + if ( mendelianViolation == null ) { + if ( walker instanceof VariantAnnotator && ((VariantAnnotator) walker).familyStr != null) { + mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).familyStr, ((VariantAnnotator)walker).minGenotypeQualityP ); + } + else { + throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid Family String file (-family) on the command line."); + } + } + + Map toRet = new HashMap(1); + boolean hasAppropriateGenotypes = vc.hasGenotype(mendelianViolation.getSampleChild()) && vc.getGenotype(mendelianViolation.getSampleChild()).hasLikelihoods() && + vc.hasGenotype(mendelianViolation.getSampleDad()) && vc.getGenotype(mendelianViolation.getSampleDad()).hasLikelihoods() && + vc.hasGenotype(mendelianViolation.getSampleMom()) && vc.getGenotype(mendelianViolation.getSampleMom()).hasLikelihoods(); + if ( hasAppropriateGenotypes ) + toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc)); + + return toRet; + } + + // return the descriptions used for the VCF INFO meta field + public List getKeyNames() { return Arrays.asList("MVLR"); } + + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 552289309..ffc852903 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.annotator; -import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -21,7 +21,7 @@ import java.util.Map; * * Low scores are indicative of false positive calls and artifacts. */ -public class QualByDepth extends AnnotationByDepth implements StandardAnnotation { +public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation { public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { if ( stratifiedContexts.size() == 0 ) @@ -43,14 +43,13 @@ public class QualByDepth extends AnnotationByDepth implements StandardAnnotation if ( context == null ) continue; - depth += context.size(); + depth += context.hasBasePileup() ? 
context.getBasePileup().depthOfCoverage() : context.getExtendedEventPileup().depthOfCoverage(); } if ( depth == 0 ) return null; - int qDepth = annotationByVariantDepth(genotypes, stratifiedContexts); - double QD = 10.0 * vc.getNegLog10PError() / (double)qDepth; + double QD = 10.0 * vc.getNegLog10PError() / (double)depth; Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", QD)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java index 772541eb6..168fbdc49 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadDepthAndAllelicFractionBySample.java @@ -79,7 +79,7 @@ public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation { alleleCounts.put(allele.getBases()[0], 0); ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - int totalDepth = pileup.size(); + int totalDepth = pileup.getNumberOfElements(); Map map = new HashMap(); map.put(getKeyNames().get(0), totalDepth); // put total depth in right away @@ -119,7 +119,7 @@ public class ReadDepthAndAllelicFractionBySample extends GenotypeAnnotation { ReadBackedExtendedEventPileup pileup = stratifiedContext.getExtendedEventPileup(); if ( pileup == null ) return null; - int totalDepth = pileup.size(); + int totalDepth = pileup.getNumberOfElements(); Map map = new HashMap(); map.put(getKeyNames().get(0), totalDepth); // put total depth in right away diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java deleted file mode 100755 index efe96f226..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SBByDepth.java +++ /dev/null @@ 
-1,59 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.annotator; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; -import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * SB annotation value by depth of alt containing samples - */ -public class SBByDepth extends AnnotationByDepth { - - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; - - if (!vc.hasAttribute(VCFConstants.STRAND_BIAS_KEY)) - return null; - - double sBias = Double.valueOf(vc.getAttributeAsString(VCFConstants.STRAND_BIAS_KEY)); - - final Map genotypes = vc.getGenotypes(); - if ( genotypes == null || genotypes.size() == 0 ) - return null; - - int sDepth = annotationByVariantDepth(genotypes, stratifiedContexts); - if ( sDepth == 0 ) - return null; - - - - double SbyD = (-sBias / (double)sDepth); - if (SbyD > 0) - SbyD = Math.log10(SbyD); - else - SbyD = -1000; - - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", SbyD)); - return map; - } - - public List getKeyNames() { return Arrays.asList("SBD"); } - - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Strand Bias by Depth")); } - - - -} \ No newline at end of file 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index ff409484d..ee08cfa3b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -42,7 +42,7 @@ import java.util.List; import java.util.Map; /** - * List all of the samples in the info field + * List all of the polymorphic samples. */ public class SampleList extends InfoFieldAnnotation { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index f747fbc2e..66d2ad318 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -43,7 +43,7 @@ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAn if (pileup != null) { deletions += pileup.getNumberOfDeletions(); - depth += pileup.size(); + depth += pileup.getNumberOfElements(); } } Map map = new HashMap(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index f6a1c4f31..c9937f3d6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -132,6 +132,13 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false) protected List annotationsToUse = new ArrayList(); + /** + * Note that this argument has higher priority than the -A 
or -G arguments, + * so annotations will be excluded even if they are explicitly included with the other options. + */ + @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) + protected List annotationsToExclude = new ArrayList(); + /** * See the -list argument to view available groups. */ @@ -148,6 +155,9 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false) protected List expressionsToUse = new ArrayList(); + /** + * Note that the -XL argument can be used along with this one to exclude annotations. + */ @Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations (not for the faint of heart)", required=false) protected Boolean USE_ALL_ANNOTATIONS = false; @@ -162,6 +172,12 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false) protected boolean indelsOnly = false; + @Argument(fullName="family_string",shortName="family",required=false,doc="A family string of the form mom+dad=child for use with the mendelian violation ratio annotation") + public String familyStr = null; + + @Argument(fullName="MendelViolationGenotypeQualityThreshold",shortName="mvq",required=false,doc="The genotype quality treshold in order to annotate mendelian violation ratio") + public double minGenotypeQualityP = 0.0; + private VariantAnnotatorEngine engine; private Collection indelBufferContext; @@ -203,9 +219,9 @@ public class VariantAnnotator extends RodWalker implements Ann } if ( USE_ALL_ANNOTATIONS ) - engine = new VariantAnnotatorEngine(this, getToolkit()); + engine = new VariantAnnotatorEngine(annotationsToExclude, this, getToolkit()); else - engine = 
new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, this, getToolkit()); + engine = new VariantAnnotatorEngine(annotationGroupsToUse, annotationsToUse, annotationsToExclude, this, getToolkit()); engine.initializeExpressions(expressionsToUse); // setup the header fields @@ -216,6 +232,8 @@ public class VariantAnnotator extends RodWalker implements Ann if ( isUniqueHeaderLine(line, hInfo) ) hInfo.add(line); } + for ( String expression : expressionsToUse ) + hInfo.add(new VCFInfoHeaderLine(expression, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Value transferred from another external VCF resource")); engine.invokeAnnotationInitializationMethods(hInfo); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index e5effe6d8..e4bc0d5d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -73,19 +73,20 @@ public class VariantAnnotatorEngine { } // use this constructor if you want all possible annotations - public VariantAnnotatorEngine(AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { + public VariantAnnotatorEngine(List annotationsToExclude, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { this.walker = walker; this.toolkit = toolkit; requestedInfoAnnotations = AnnotationInterfaceManager.createAllInfoFieldAnnotations(); requestedGenotypeAnnotations = AnnotationInterfaceManager.createAllGenotypeAnnotations(); + excludeAnnotations(annotationsToExclude); initializeDBs(); } // use this constructor if you want to select specific annotations (and/or interfaces) - public VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { + public 
VariantAnnotatorEngine(List annotationGroupsToUse, List annotationsToUse, List annotationsToExclude, AnnotatorCompatibleWalker walker, GenomeAnalysisEngine toolkit) { this.walker = walker; this.toolkit = toolkit; - initializeAnnotations(annotationGroupsToUse, annotationsToUse); + initializeAnnotations(annotationGroupsToUse, annotationsToUse, annotationsToExclude); initializeDBs(); } @@ -96,10 +97,30 @@ public class VariantAnnotatorEngine { requestedExpressions.add(new VAExpression(expression, walker.getResourceRodBindings())); } - private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse) { + private void initializeAnnotations(List annotationGroupsToUse, List annotationsToUse, List annotationsToExclude) { AnnotationInterfaceManager.validateAnnotations(annotationGroupsToUse, annotationsToUse); requestedInfoAnnotations = AnnotationInterfaceManager.createInfoFieldAnnotations(annotationGroupsToUse, annotationsToUse); requestedGenotypeAnnotations = AnnotationInterfaceManager.createGenotypeAnnotations(annotationGroupsToUse, annotationsToUse); + excludeAnnotations(annotationsToExclude); + } + + private void excludeAnnotations(List annotationsToExclude) { + if ( annotationsToExclude.size() == 0 ) + return; + + List tempRequestedInfoAnnotations = new ArrayList(requestedInfoAnnotations.size()); + for ( InfoFieldAnnotation annotation : requestedInfoAnnotations ) { + if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) ) + tempRequestedInfoAnnotations.add(annotation); + } + requestedInfoAnnotations = tempRequestedInfoAnnotations; + + List tempRequestedGenotypeAnnotations = new ArrayList(requestedGenotypeAnnotations.size()); + for ( GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + if ( !annotationsToExclude.contains(annotation.getClass().getSimpleName()) ) + tempRequestedGenotypeAnnotations.add(annotation); + } + requestedGenotypeAnnotations = tempRequestedGenotypeAnnotations; } private void initializeDBs() { 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 87695077d..b722220f9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Gender; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.variantrecalibration.VQSRCalibrationCurve; import org.broadinstitute.sting.utils.GenomeLoc; @@ -247,7 +248,7 @@ public class ProduceBeagleInputWalker extends RodWalker { Map preferredGenotypes = preferredVC.getGenotypes(); Map otherGenotypes = goodSite(otherVC) ? 
otherVC.getGenotypes() : null; for ( String sample : samples ) { - boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getToolkit().getSampleById(sample).isMale(); + boolean isMaleOnChrX = CHECK_IS_MALE_ON_CHR_X && getSample(sample).getGender() == Gender.MALE; Genotype genotype; boolean isValidation; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 32875a098..1dfc6fea0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -227,9 +227,8 @@ public class CallableLociWalker extends LocusWalker bases on the reference, or on the subset of the reference - * specified by the intervals provided. Moving to the next contig on the reference will always restart the - * count anew, even if the count of bases in the last chunk on the previous contig did not reach specified . 
- */ -public class CoarseCoverageWalker extends ReadWalker { - @Output - public PrintStream out; - - @Argument(fullName="granularity", shortName="G", doc="Granularity", required=true) - public Integer N; - - @Argument(fullName="dontZeroMissingContigs", shortName="Z", doc="If provided, we won't emit 0 counts for all sites on contigs skipped", required=true) - public boolean dontZeroMissingContigs; - - private int chunkStart = 1; // start of the current chunk we are counting reads for - private int contig = 0; // current contig we are on - private int count = 0; // number of reads overlapping with the current chunk - private static String zeroString = "0"; - - @Override - public void initialize() { - chunkStart = 1; - contig = 0; - count = 0; - } - - @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - - if ( read.getReadUnmappedFlag() || - read.getDuplicateReadFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getMappingQuality() == 0 ) return 0; - - if ( read.getReferenceIndex() != contig ) { - // we jumped onto another contig - out.printf("%d%n", count); // print old count - count = 0; - - // if we skipped one or more contigs completely, make sure we print 0 counts over all of them: - for ( contig++ ; contig < read.getReferenceIndex() ; contig++) { - if ( ! 
dontZeroMissingContigs ) { - int contigSize = read.getHeader().getSequence(contig).getSequenceLength(); - for ( int k = 1 ; k < contigSize ; k+=N ) out.println(zeroString); - } - } - // by now we scrolled to the right contig - - chunkStart = 1; // reset chunk start - } - - // if our read is past the boundary of the current chunk, print old count(s) - // (for the current chunk and all chunks we may have skipped altogether) and reinitialize: - while ( chunkStart+N < read.getAlignmentStart() ) { - out.printf("%d%n", count); // print old count - count = 0; - chunkStart += N; - } - count++; - return 1; - } - - @Override - public Integer reduce(Integer value, Integer sum) { - return value+sum; - } - - @Override - public Integer reduceInit() { - return 0; - } - - @Override - public void onTraversalDone(Integer result) { - out.printf("%d%n", count); // print count from the last chunk - super.onTraversalDone(result); - - } - - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java index 664c319ab..cbbb3d43f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageWalker.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; @@ -112,6 +113,7 @@ import java.util.*; // todo -- allow for user to set linear binning (default is logarithmic) // todo 
-- formatting --> do something special for end bins in getQuantile(int[] foo), this gets mushed into the end+-1 bins for now @By(DataSource.REFERENCE) +@PartitionBy(PartitionType.INTERVAL) public class DepthOfCoverageWalker extends LocusWalker>, CoveragePartitioner> implements TreeReducible { @Output @Multiplex(value=DoCOutputMultiplexer.class,arguments={"partitionTypes","refSeqGeneList","omitDepthOutput","omitIntervals","omitSampleSummary","omitLocusTable"}) @@ -281,20 +283,14 @@ public class DepthOfCoverageWalker extends LocusWalker getSamplesFromToolKit(DoCOutputType.Partition type) { HashSet partition = new HashSet(); if ( type == DoCOutputType.Partition.sample ) { - for ( Set sampleSet : getToolkit().getSamplesByReaders() ) { - for ( String s : sampleSet ) { - partition.add(s); - } - } + partition.addAll(SampleUtils.getSAMFileSamples(getToolkit())); } else if ( type == DoCOutputType.Partition.readgroup ) { for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { partition.add(rg.getSample()+"_rg_"+rg.getReadGroupId()); } } else if ( type == DoCOutputType.Partition.library ) { - for ( Set libraries : getToolkit().getLibrariesByReaders() ) { - for ( String l : libraries ) { - partition.add(l); - } + for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + partition.add(rg.getLibrary()); } } else if ( type == DoCOutputType.Partition.center ) { for ( SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java index 5c2a967b9..17b17764b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java @@ -42,12 +42,12 @@ import java.util.List; * *

Input

*

- * One or more BAM files. + * A reference file *

* *

Output

*

- * GC content calculations per interval. + * GC content calculations per interval. *

* *

Examples

@@ -56,7 +56,6 @@ import java.util.List; * -R ref.fasta \ * -T GCContentByInterval \ * -o output.txt \ - * -I input.bam \ * -L input.intervals * * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java index 0f1cea2e1..2d88baf3f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java @@ -1,13 +1,13 @@ package org.broadinstitute.sting.gatk.walkers.diagnostics; import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; import java.util.List; @@ -69,12 +69,12 @@ public class ReadLengthDistribution extends ReadWalker { } - public boolean filter(ReferenceContext ref, SAMRecord read) { + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { return ( !read.getReadPairedFlag() || read.getReadPairedFlag() && read.getFirstOfPairFlag()); } @Override - public Integer map(ReferenceContext referenceContext, SAMRecord samRecord, ReadMetaDataTracker readMetaDataTracker) { + public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, ReadMetaDataTracker readMetaDataTracker) { GATKReportTable table = report.getTable("ReadLengthDistribution"); int length = Math.abs(samRecord.getReadLength()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java index 5889d19e5..04437fdd1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsWalker.java @@ -219,10 +219,10 @@ public class DiffObjectsWalker extends RodWalker { @Override public void onTraversalDone(Integer sum) { - out.printf("Reading master file %s%n", masterFile); + //out.printf("Reading master file %s%n", masterFile); DiffElement master = diffEngine.createDiffableFromFile(masterFile, MAX_OBJECTS_TO_READ); out.printf(" Read %d objects%n", master.size()); - out.printf("Reading test file %s%n", testFile); + //out.printf("Reading test file %s%n", testFile); DiffElement test = diffEngine.createDiffableFromFile(testFile, MAX_OBJECTS_TO_READ); out.printf(" Read %d objects%n", test.size()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java index 70f3c6a1a..35a9fe31d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.PrintStream; +import java.util.List; import java.util.Map; import java.util.Set; @@ -67,16 +68,12 @@ public abstract class AlleleFrequencyCalculationModel implements Cloneable { /** * Must be overridden by concrete subclasses - * @param tracker rod data - * @param ref reference context * @param GLs genotype likelihoods * @param Alleles Alleles corresponding to GLs * @param log10AlleleFrequencyPriors priors * @param log10AlleleFrequencyPosteriors 
array (pre-allocated) to store results */ - protected abstract void getLog10PNonRef(RefMetaDataTracker tracker, - ReferenceContext ref, - Map GLs, Set Alleles, + protected abstract void getLog10PNonRef(Map GLs, List Alleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors); diff --git a/public/java/src/org/broadinstitute/sting/utils/genotype/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/utils/genotype/DiploidGenotype.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java index 1c2cfe2e1..b5987963f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/genotype/DiploidGenotype.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotype.java @@ -23,7 +23,7 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.sting.utils.genotype; +package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.BaseUtils; @@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.BaseUtils; * Time: 6:46:09 PM * To change this template use File | Settings | File Templates. 
*/ -public enum DiploidGenotype { +enum DiploidGenotype { AA ('A', 'A'), AC ('A', 'C'), AG ('A', 'G'), diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java index 696a74de8..d8c911092 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidIndelGenotypePriors.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; /** * Created by IntelliJ IDEA. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 5f6865d04..666fe88a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -27,14 +27,16 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; -import org.broadinstitute.sting.utils.pileup.FragmentPileup; import org.broadinstitute.sting.utils.pileup.PileupElement; import 
org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import java.util.List; + import static java.lang.Math.log10; import static java.lang.Math.pow; @@ -260,35 +262,42 @@ public class DiploidSNPGenotypeLikelihoods implements Cloneable { int n = 0; // for each fragment, add to the likelihoods - FragmentPileup fpile = new FragmentPileup(pileup); + FragmentCollection fpile = pileup.toFragments(); - for ( PileupElement p : fpile.getOneReadPileup() ) + for ( PileupElement p : fpile.getSingletonReads() ) n += add(p, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - for ( FragmentPileup.TwoReadPileupElement twoRead : fpile.getTwoReadPileup() ) - n += add(twoRead, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + for ( List overlappingPair : fpile.getOverlappingPairs() ) + n += add(overlappingPair, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); return n; } + public int add(PileupElement elt, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { byte obsBase = elt.getBase(); if ( elt.isReducedRead() ) { // reduced read representation - byte qual = elt.getReducedQual(); - add(obsBase, qual, (byte)0, (byte)0, elt.getReducedCount()); // fast calculation of n identical likelihoods - return elt.getReducedCount(); // we added nObs bases here + byte qual = elt.getQual(); + if ( BaseUtils.isRegularBase( elt.getBase() )) { + add(obsBase, qual, (byte)0, (byte)0, elt.getRepresentativeCount()); // fast calculation of n identical likelihoods + return elt.getRepresentativeCount(); // we added nObs bases here + } else // odd bases or deletions => don't use them + return 0; } else { byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); return qual > 0 ? 
add(obsBase, qual, (byte)0, (byte)0, 1) : 0; } } - public int add(FragmentPileup.TwoReadPileupElement twoRead, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { - final byte observedBase1 = twoRead.getFirst().getBase(); - final byte qualityScore1 = qualToUse(twoRead.getFirst(), ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); - final byte observedBase2 = twoRead.getSecond().getBase(); - final byte qualityScore2 = qualToUse(twoRead.getSecond(), ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + public int add(List overlappingPair, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) { + final PileupElement p1 = overlappingPair.get(0); + final PileupElement p2 = overlappingPair.get(1); + + final byte observedBase1 = p1.getBase(); + final byte qualityScore1 = qualToUse(p1, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); + final byte observedBase2 = p2.getBase(); + final byte qualityScore2 = qualToUse(p2, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual); if ( qualityScore1 == 0 ) { if ( qualityScore2 == 0 ) // abort early if we didn't see any good bases diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java index b9ed17d3e..71854591f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypePriors.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import java.util.Arrays; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index 6ae437b27..1c2d82ab7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -29,90 +29,37 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import sun.reflect.generics.reflectiveObjects.NotImplementedException; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; +import java.util.*; public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // // code for testing purposes // private final static boolean DEBUG = false; - private final static boolean PRINT_LIKELIHOODS = false; - private final static int N_CYCLES = 1; - private SimpleTimer timerExpt = new SimpleTimer("linearExactBanded"); - private SimpleTimer timerGS = new SimpleTimer("linearExactGS"); - private final static boolean COMPARE_TO_GS = false; - - public enum ExactCalculation { - N2_GOLD_STANDARD, - LINEAR_EXPERIMENTAL - } - private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - - private boolean SIMPLE_GREEDY_GENOTYPER = false; - + private final boolean SIMPLE_GREEDY_GENOTYPER = false; private final static double SUM_GL_THRESH_NOCALL = -0.001; // if sum(gl) is bigger than this threshold, 
we treat GL's as non-informative and will force a no-call. - final private ExactCalculation calcToUse; protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { super(UAC, N, logger, verboseWriter); - calcToUse = UAC.EXACT_CALCULATION_TYPE; } - public void getLog10PNonRef(RefMetaDataTracker tracker, - ReferenceContext ref, - Map GLs, Setalleles, + public void getLog10PNonRef(Map GLs, List alleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors) { - // todo -- REMOVE ME AFTER TESTING - // todo -- REMOVE ME AFTER TESTING - // todo -- REMOVE ME AFTER TESTING - double[] gsPosteriors; - if ( COMPARE_TO_GS ) // due to annoying special values in incoming array, we have to clone up here - gsPosteriors = log10AlleleFrequencyPosteriors.clone(); - - int idxAA = GenotypeType.AA.ordinal(); - int idxAB = GenotypeType.AB.ordinal(); - int idxBB = GenotypeType.BB.ordinal(); - - // todo -- remove me after testing - if ( N_CYCLES > 1 ) { - for ( int i = 0; i < N_CYCLES; i++) { - timerGS.restart(); - linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone(), idxAA, idxAB, idxBB); - timerGS.stop(); - - timerExpt.restart(); - linearExactBanded(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors.clone()); - timerExpt.stop(); - } - - System.out.printf("good = %.2f, expt = %.2f, delta = %.2f%n", - timerGS.getElapsedTime(), timerExpt.getElapsedTime(), timerExpt.getElapsedTime()-timerGS.getElapsedTime()); - } - - int lastK = -1; - - int numAlleles = alleles.size(); + final int numAlleles = alleles.size(); + final double[][] posteriorCache = numAlleles > 2 ? new double[numAlleles-1][] : null; + final double[] bestAFguess = numAlleles > 2 ? 
new double[numAlleles-1] : null; int idxDiag = numAlleles; int incr = numAlleles - 1; - - double[][] posteriorCache = new double[numAlleles-1][]; - double[] bestAFguess = new double[numAlleles-1]; - for (int k=1; k < numAlleles; k++) { // multi-allelic approximation, part 1: Ideally // for each alt allele compute marginal (suboptimal) posteriors - @@ -121,24 +68,17 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // So, for example, with 2 alt alleles, likelihoods have AA,AB,AC,BB,BC,CC. // 3 alt alleles: AA,AB,AC,AD BB BC BD CC CD DD - idxAA = 0; - idxAB = k; + final int idxAA = 0; + final int idxAB = k; // yy is always element on the diagonal. // 2 alleles: BBelement 2 // 3 alleles: BB element 3. CC element 5 // 4 alleles: - idxBB = idxDiag; + final int idxBB = idxDiag; idxDiag += incr--; - // todo - possible cleanup - switch ( calcToUse ) { - case N2_GOLD_STANDARD: - lastK = gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB); - break; - case LINEAR_EXPERIMENTAL: - lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB); - break; - } + final int lastK = linearExact(GLs, log10AlleleFrequencyPriors, log10AlleleFrequencyPosteriors, idxAA, idxAB, idxBB); + if (numAlleles > 2) { posteriorCache[k-1] = log10AlleleFrequencyPosteriors.clone(); bestAFguess[k-1] = (double)MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors); @@ -153,39 +93,14 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { log10AlleleFrequencyPosteriors[k] = (posteriorCache[mostLikelyAlleleIdx][k]); } - // todo -- REMOVE ME AFTER TESTING - // todo -- REMOVE ME AFTER TESTING - // todo -- REMOVE ME AFTER TESTING - if ( COMPARE_TO_GS ) { - gdaN2GoldStandard(GLs, log10AlleleFrequencyPriors, gsPosteriors, idxAA, idxAB, idxBB); - - double log10thisPVar = Math.log10(MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors)[0]); - double 
log10gsPVar = Math.log10(MathUtils.normalizeFromLog10(gsPosteriors)[0]); - boolean eq = (log10thisPVar == Double.NEGATIVE_INFINITY && log10gsPVar == Double.NEGATIVE_INFINITY) || MathUtils.compareDoubles(log10thisPVar, log10gsPVar, 1e-4) == 0; - - if ( ! eq || PRINT_LIKELIHOODS ) { - System.out.printf("----------------------------------------%n"); - for (int k=0; k < log10AlleleFrequencyPosteriors.length; k++) { - double x = log10AlleleFrequencyPosteriors[k]; - System.out.printf(" %d\t%.2f\t%.2f\t%b%n", k, - x < -1e10 ? Double.NEGATIVE_INFINITY : x, gsPosteriors[k], - log10AlleleFrequencyPosteriors[k] == gsPosteriors[k]); - } - System.out.printf("MAD_AC\t%d\t%d\t%.2f\t%.2f\t%.6f%n", - ref.getLocus().getStart(), lastK, log10thisPVar, log10gsPVar, log10thisPVar - log10gsPVar); - } - } - } private static final ArrayList getGLs(Map GLs) { ArrayList genotypeLikelihoods = new ArrayList(); - //int j = 0; genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy for ( Genotype sample : GLs.values() ) { if ( sample.hasLikelihoods() ) { - //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); double[] gls = sample.getLikelihoods().getAsVector(); if (MathUtils.sum(gls) < SUM_GL_THRESH_NOCALL) @@ -240,84 +155,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - // now with banding - public int linearExactBanded(Map GLs, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors) { - throw new NotImplementedException(); -// final int numSamples = GLs.size(); -// final int numChr = 2*numSamples; -// final double[][] genotypeLikelihoods = getGLs(GLs); -// -// final ExactACCache logY = new ExactACCache(numSamples+1); -// logY.getkMinus0()[0] = 0.0; // the zero case -// -// double maxLog10L = Double.NEGATIVE_INFINITY; -// boolean done = false; -// int lastK = -1; -// final int BAND_SIZE = 10; -// -// for (int k=0; k <= numChr && ! 
done; k++ ) { -// final double[] kMinus0 = logY.getkMinus0(); -// int jStart = Math.max(k - BAND_SIZE, 1); -// int jStop = Math.min(k + BAND_SIZE, numSamples); -// -// if ( k == 0 ) { // special case for k = 0 -// for ( int j=1; j <= numSamples; j++ ) { -// kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods[j][GenotypeType.AA.ordinal()]; -// } -// } else { // k > 0 -// final double[] kMinus1 = logY.getkMinus1(); -// final double[] kMinus2 = logY.getkMinus2(); -// Arrays.fill(kMinus0,0); -// -// for ( int j = jStart; j <= jStop; j++ ) { -// final double[] gl = genotypeLikelihoods[j]; -// final double logDenominator = log10Cache[2*j] + log10Cache[2*j-1]; -// -// double aa = Double.NEGATIVE_INFINITY; -// double ab = Double.NEGATIVE_INFINITY; -// if (k < 2*j-1) -// aa = log10Cache[2*j-k] + log10Cache[2*j-k-1] + kMinus0[j-1] + gl[GenotypeType.AA.ordinal()]; -// -// if (k < 2*j) -// ab = log10Cache[2*k] + log10Cache[2*j-k]+ kMinus1[j-1] + gl[GenotypeType.AB.ordinal()]; -// -// double log10Max; -// if (k > 1) { -// final double bb = log10Cache[k] + log10Cache[k-1] + kMinus2[j-1] + gl[GenotypeType.BB.ordinal()]; -// log10Max = approximateLog10SumLog10(aa, ab, bb); -// } else { -// // we know we aren't considering the BB case, so we can use an optimized log10 function -// log10Max = approximateLog10SumLog10(aa, ab); -// } -// -// // finally, update the L(j,k) value -// kMinus0[j] = log10Max - logDenominator; -// -// String offset = Utils.dupString(' ',k); -// System.out.printf("%s%3d %3d %.2f%n", offset, k, j, kMinus0[j]); -// } -// } -// -// // update the posteriors vector -// final double log10LofK = kMinus0[jStop]; -// log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k]; -// -// // can we abort early? 
-// lastK = k; -// maxLog10L = Math.max(maxLog10L, log10LofK); -// if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { -// if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); -// done = true; -// } -// -// logY.rotate(); -// } -// -// return lastK; - } - public int linearExact(Map GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { @@ -605,82 +442,6 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { return calls; } - // ------------------------------------------------------------------------------------- - // - // Gold standard, but O(N^2), implementation. - // - // TODO -- remove me for clarity in this code - // - // ------------------------------------------------------------------------------------- - public int gdaN2GoldStandard(Map GLs, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { - int numSamples = GLs.size(); - int numChr = 2*numSamples; - - double[][] logYMatrix = new double[1+numSamples][1+numChr]; - - for (int i=0; i <=numSamples; i++) - for (int j=0; j <=numChr; j++) - logYMatrix[i][j] = Double.NEGATIVE_INFINITY; - - //YMatrix[0][0] = 1.0; - logYMatrix[0][0] = 0.0; - int j=0; - - for ( Map.Entry sample : GLs.entrySet() ) { - j++; - - if ( !sample.getValue().hasLikelihoods() ) - continue; - - //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); - double[] genotypeLikelihoods = sample.getValue().getLikelihoods().getAsVector(); - //double logDenominator = Math.log10(2.0*j*(2.0*j-1)); - double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - - // special treatment for k=0: iteration reduces to: - //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; - logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA]; - - 
for (int k=1; k <= 2*j; k++ ) { - - //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; - double logNumerator[]; - logNumerator = new double[3]; - if (k < 2*j-1) - logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] + - genotypeLikelihoods[idxAA]; - else - logNumerator[0] = Double.NEGATIVE_INFINITY; - - - if (k < 2*j) - logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + - genotypeLikelihoods[idxAB]; - else - logNumerator[1] = Double.NEGATIVE_INFINITY; - - if (k > 1) - logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] + - genotypeLikelihoods[idxBB]; - else - logNumerator[2] = Double.NEGATIVE_INFINITY; - - double logNum = MathUtils.softMax(logNumerator); - - //YMatrix[j][k] = num/den; - logYMatrix[j][k] = logNum - logDenominator; - } - - } - - for (int k=0; k <= numChr; k++) - log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; - - return numChr; - } - private final static void printLikelihoods(int numChr, double[][] logYMatrix, double[] log10AlleleFrequencyPriors) { int j = logYMatrix.length - 1; System.out.printf("-----------------------------------%n"); @@ -689,5 +450,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { System.out.printf(" %4d\t%8.2f\t%8.2f\t%8.2f%n", k, logYMatrix[j][k], log10AlleleFrequencyPriors[k], posterior); } } - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 60dfe4fe7..489e963e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ 
-53,7 +53,7 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { } public enum GENOTYPING_MODE { - /** the default; the Unified Genotyper will choose the most likely alternate allele */ + /** the Unified Genotyper will choose the most likely alternate allele */ DISCOVERY, /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */ GENOTYPE_GIVEN_ALLELES diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java index 10b646d63..27842a8bf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GridSearchAFEstimation.java @@ -52,9 +52,7 @@ public class GridSearchAFEstimation extends AlleleFrequencyCalculationModel { AFMatrix = new AlleleFrequencyMatrix(N); } - protected void getLog10PNonRef(RefMetaDataTracker tracker, - ReferenceContext ref, - Map GLs, Setalleles, + protected void getLog10PNonRef(Map GLs, List alleles, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors) { initializeAFMatrix(GLs); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index ec5eefd60..aea63b61d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -30,14 +30,11 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.genotype.Haplotype; import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup; @@ -72,9 +69,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood // gdebug removeme // todo -cleanup - private HaplotypeIndelErrorModel model; - private boolean useOldWrongHorribleHackedUpLikelihoodModel = false; -// private GenomeLoc lastSiteVisited; private ArrayList alleleList; @@ -85,26 +79,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); - if (UAC.GSA_PRODUCTION_ONLY == false) { - pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE); - useOldWrongHorribleHackedUpLikelihoodModel = false; - } - else { - useOldWrongHorribleHackedUpLikelihoodModel = true; - double INSERTION_START_PROBABILITY = 1e-3; - - double INSERTION_END_PROBABILITY = 0.5; - - double ALPHA_DELETION_PROBABILITY = 1e-3; - - - model = new HaplotypeIndelErrorModel(3, INSERTION_START_PROBABILITY, - 
INSERTION_END_PROBABILITY,ALPHA_DELETION_PROBABILITY,UAC.INDEL_HAPLOTYPE_SIZE, false, UAC.OUTPUT_DEBUG_INDEL_INFO); - } - pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY,UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.DO_CONTEXT_DEPENDENT_PENALTIES, UAC.dovit, UAC.GET_GAP_PENALTIES_FROM_DATA, UAC.INDEL_RECAL_FILE); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.BANDED_INDEL_COMPUTATION); alleleList = new ArrayList(); getAlleleListFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; minIndelCountForGenotyping = UAC.MIN_INDEL_COUNT_FOR_GENOTYPING; @@ -123,10 +99,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood GenomeLoc loc = ref.getLocus(); ArrayList aList = new ArrayList(); - if (DEBUG) { - System.out.println("'''''''''''''''''''''"); - System.out.println("Loc:"+loc.toString()); - } HashMap consensusIndelStrings = new HashMap(); int insCount = 0, delCount = 0; @@ -160,12 +132,12 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood continue; } - if (DEBUG && p.isIndel()) { +/* if (DEBUG && p.isIndel()) { System.out.format("Read: %s, cigar: %s, aln start: %d, aln end: %d, p.len:%d, Type:%s, EventBases:%s\n", read.getReadName(),read.getCigar().toString(),read.getAlignmentStart(),read.getAlignmentEnd(), p.getEventLength(),p.getType().toString(), p.getEventBases()); } - + */ String indelString = p.getEventBases(); if (p.isInsertion()) { @@ -235,7 +207,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } - if (DEBUG) { +/* if (DEBUG) { int icount = indelPileup.getNumberOfInsertions(); int dcount = indelPileup.getNumberOfDeletions(); if (icount + dcount > 0) @@ -249,7 +221,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } System.out.println(); } - } + } */ } int maxAlleleCnt = 0; @@ -260,8 +232,8 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood 
maxAlleleCnt = curCnt; bestAltAllele = s; } - if (DEBUG) - System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); +// if (DEBUG) +// System.out.format("Key:%s, number: %d\n",s,consensusIndelStrings.get(s) ); } //gdebug- if (maxAlleleCnt < minIndelCountForGenotyping) @@ -384,20 +356,13 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood } } } - int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); - int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1; - int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1; - if (useOldWrongHorribleHackedUpLikelihoodModel) { - numPrefBases = 20; - hsize=80; - } - if (DEBUG) - System.out.format("hsize: %d eventLength: %d refSize: %d, locStart: %d numpr: %d\n",hsize,eventLength, - (int)ref.getWindow().size(), loc.getStart(), numPrefBases); - //System.out.println(eventLength); - haplotypeMap = Haplotype.makeHaplotypeListFromAlleles( alleleList, loc.getStart(), - ref, hsize, numPrefBases); + final int eventLength = altAllele.getBaseString().length() - refAllele.getBaseString().length(); + final int hsize = (int)ref.getWindow().size()-Math.abs(eventLength)-1; + final int numPrefBases= ref.getLocus().getStart()-ref.getWindow().getStart()+1; + + haplotypeMap = Haplotype.makeHaplotypeListFromAlleles(alleleList, loc.getStart(), + ref, hsize, numPrefBases); // For each sample, get genotype likelihoods based on pileup // compute prior likelihoods on haplotypes, and initialize haplotype likelihood matrix with them. 
@@ -414,13 +379,7 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood pileup = context.getBasePileup(); if (pileup != null ) { - double[] genotypeLikelihoods; - - if (useOldWrongHorribleHackedUpLikelihoodModel) - genotypeLikelihoods = model.computeReadHaplotypeLikelihoods( pileup, haplotypeMap); - else - genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); - + final double[] genotypeLikelihoods = pairModel.computeReadHaplotypeLikelihoods( pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), alleleList, diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java index 3652763de..4f378b24a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/MultiallelicGenotypeLikelihoods.java @@ -4,6 +4,7 @@ import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.ArrayList; +import java.util.List; /** * Created by IntelliJ IDEA. 
@@ -15,11 +16,11 @@ import java.util.ArrayList; public class MultiallelicGenotypeLikelihoods { private String sample; private double[] GLs; - private ArrayList alleleList; + private List alleleList; private int depth; public MultiallelicGenotypeLikelihoods(String sample, - ArrayList A, + List A, double[] log10Likelihoods, int depth) { /* Check for consistency between likelihood vector and number of alleles */ int numAlleles = A.size(); @@ -40,7 +41,7 @@ public class MultiallelicGenotypeLikelihoods { return GLs; } - public ArrayList getAlleles() { + public List getAlleles() { return alleleList; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 6905ce4a4..9bdc754e9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -31,9 +31,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; @@ -122,8 +122,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC aList.add(refAllele); aList.add(altAllele); double[] dlike = new 
double[]{likelihoods[refGenotype.ordinal()],likelihoods[hetGenotype.ordinal()],likelihoods[homGenotype.ordinal()]} ; + + // normalize in log space so that max element is zero. GLs.put(sample.getKey(), new MultiallelicGenotypeLikelihoods(sample.getKey(), - aList, dlike, getFilteredDepth(pileup))); + aList, MathUtils.normalizeFromLog10(dlike, false, true), getFilteredDepth(pileup))); } return refAllele; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java index 500b11360..d88e55687 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UGCallVariants.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 7b8045581..07d9892a1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -31,8 +31,6 @@ import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.io.File; - public class 
UnifiedArgumentCollection { @@ -103,18 +101,13 @@ public class UnifiedArgumentCollection { @Argument(fullName = "assume_single_sample_reads", shortName = "single_sample", doc = "The single sample that we should assume is represented in the input bam (and therefore associate with all reads regardless of whether they have read groups)", required = false) public String ASSUME_SINGLE_SAMPLE = null; - // TODO -- delete me - @Hidden - @Argument(fullName = "abort_at_too_much_coverage", doc = "Don't call a site if the downsampled coverage is greater than this value", required = false) - public int COVERAGE_AT_WHICH_TO_ABORT = -1; - - // control the various parameters to be used + /** + * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base + * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value. + */ @Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false) public int MIN_BASE_QUALTY_SCORE = 17; - @Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false) - public int MIN_MAPPING_QUALTY_SCORE = 20; - @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; @@ -143,47 +136,28 @@ public class UnifiedArgumentCollection { @Hidden @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) public int INDEL_HAPLOTYPE_SIZE = 80; - @Hidden - @Argument(fullName = "doContextDependentGapPenalties", shortName = "doCDP", doc = "Vary gap penalties by context", required = false) - 
public boolean DO_CONTEXT_DEPENDENT_PENALTIES = true; + //gdebug+ // experimental arguments, NOT TO BE USED BY ANYONE WHOSE INITIALS AREN'T GDA!!! +// @Hidden +// @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false) +// public boolean GET_GAP_PENALTIES_FROM_DATA = false; +// +// @Hidden +// @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE") +// public File INDEL_RECAL_FILE = new File("indel.recal_data.csv"); @Hidden - @Argument(fullName = "getGapPenaltiesFromData", shortName = "dataGP", doc = "Vary gap penalties by context - EXPERIMENTAL, DO NO USE", required = false) - public boolean GET_GAP_PENALTIES_FROM_DATA = false; - - @Hidden - @Argument(fullName="indel_recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file - EXPERIMENTAL, DO NO USE") - public File INDEL_RECAL_FILE = new File("indel.recal_data.csv"); + @Argument(fullName = "bandedIndel", shortName = "bandedIndel", doc = "Banded Indel likelihood computation", required = false) + public boolean BANDED_INDEL_COMPUTATION = false; @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; - @Hidden - @Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false) - public boolean dovit = false; - - @Hidden - @Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false) - public boolean GSA_PRODUCTION_ONLY = false; - - @Hidden - @Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false) - public ExactAFCalculationModel.ExactCalculation 
EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL; - @Hidden @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false) public boolean IGNORE_SNP_ALLELES = false; - @Deprecated - @Argument(fullName="output_all_callable_bases", shortName="all_bases", doc="Please use --output_mode EMIT_ALL_SITES instead" ,required=false) - private Boolean ALL_BASES_DEPRECATED = false; - - @Deprecated - @Argument(fullName="genotype", shortName="genotype", doc="Please use --output_mode EMIT_ALL_CONFIDENT_SITES instead" ,required=false) - private Boolean GENOTYPE_DEPRECATED = false; - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! public UnifiedArgumentCollection clone() { @@ -191,7 +165,6 @@ public class UnifiedArgumentCollection { uac.GLmodel = GLmodel; uac.AFmodel = AFmodel; - uac.EXACT_CALCULATION_TYPE = EXACT_CALCULATION_TYPE; uac.heterozygosity = heterozygosity; uac.PCR_error = PCR_error; uac.GenotypingMode = GenotypingMode; @@ -201,7 +174,6 @@ public class UnifiedArgumentCollection { uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; - uac.MIN_MAPPING_QUALTY_SCORE = MIN_MAPPING_QUALTY_SCORE; uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; @@ -209,17 +181,11 @@ public class UnifiedArgumentCollection { uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; - uac.DO_CONTEXT_DEPENDENT_PENALTIES = DO_CONTEXT_DEPENDENT_PENALTIES; uac.alleles = alleles; - uac.GET_GAP_PENALTIES_FROM_DATA = GET_GAP_PENALTIES_FROM_DATA; - 
uac.INDEL_RECAL_FILE = INDEL_RECAL_FILE; // todo- arguments to remove - uac.COVERAGE_AT_WHICH_TO_ABORT = COVERAGE_AT_WHICH_TO_ABORT; - uac.dovit = dovit; - uac.GSA_PRODUCTION_ONLY = GSA_PRODUCTION_ONLY; uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - + uac.BANDED_INDEL_COMPUTATION = BANDED_INDEL_COMPUTATION; return uac; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 9fdf65015..72dc217e1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -149,6 +149,13 @@ public class UnifiedGenotyper extends LocusWalker annotationsToUse = new ArrayList(); + /** + * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments, + * so annotations will be excluded even if they are explicitly included with the other options. + */ + @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) + protected List annotationsToExclude = new ArrayList(); + /** * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. 
*/ @@ -210,7 +217,7 @@ public class UnifiedGenotyper extends LocusWalker(samples); // note that, because we cap the base quality by the mapping quality, minMQ cannot be less than minBQ this.UAC = UAC.clone(); - this.UAC.MIN_MAPPING_QUALTY_SCORE = Math.max(UAC.MIN_MAPPING_QUALTY_SCORE, UAC.MIN_BASE_QUALTY_SCORE); this.logger = logger; this.verboseWriter = verboseWriter; @@ -146,9 +145,6 @@ public class UnifiedGenotyperEngine { * @return the VariantCallContext object */ public VariantCallContext calculateLikelihoodsAndGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { - if ( UAC.COVERAGE_AT_WHICH_TO_ABORT > 0 && rawContext.size() > UAC.COVERAGE_AT_WHICH_TO_ABORT ) - return null; - final GenotypeLikelihoodsCalculationModel.Model model = getCurrentGLModel(tracker, refContext, rawContext ); if( model == null ) { return (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
generateEmptyContext(tracker, refContext, null, rawContext) : null); @@ -325,7 +321,7 @@ public class UnifiedGenotyperEngine { // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); // find the most likely frequency int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); @@ -383,7 +379,7 @@ public class UnifiedGenotyperEngine { // the overall lod VariantContext vcOverall = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, vc.getAlternateAllele(0), false, model); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vcOverall.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //double overallLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double overallLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); @@ -391,7 +387,7 @@ public class UnifiedGenotyperEngine { // the forward lod VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, vc.getAlternateAllele(0), false, model); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), 
log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vcForward.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double forwardLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double forwardLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); @@ -400,7 +396,7 @@ public class UnifiedGenotyperEngine { // the reverse lod VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, vc.getAlternateAllele(0), false, model); clearAFarray(log10AlleleFrequencyPosteriors.get()); - afcm.get().getLog10PNonRef(tracker, refContext, vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vcReverse.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get(), true); double reverseLog10PofNull = log10AlleleFrequencyPosteriors.get()[0]; double reverseLog10PofF = MathUtils.log10sumLog10(log10AlleleFrequencyPosteriors.get(), 1); @@ -423,7 +419,7 @@ public class UnifiedGenotyperEngine { int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc); - Set myAlleles = vc.getAlleles(); + Set myAlleles = new HashSet(vc.getAlleles()); // strip out the alternate allele if it's a ref call if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { myAlleles = new HashSet(1); @@ -447,7 +443,79 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); } - private int calculateEndPos(Set alleles, Allele refAllele, GenomeLoc loc) { + // A barebones entry point 
to the exact model when there is no tracker or stratified contexts available -- only GLs + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenomeLoc loc, final GenotypeLikelihoodsCalculationModel.Model model) { + + // initialize the data for this thread if that hasn't been done yet + if ( afcm.get() == null ) { + log10AlleleFrequencyPosteriors.set(new double[N+1]); + afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); + } + + // estimate our confidence in a reference call and return + if ( vc.getNSamples() == 0 ) + return null; + + // 'zero' out the AFs (so that we don't have to worry if not all samples have reads at this position) + clearAFarray(log10AlleleFrequencyPosteriors.get()); + afcm.get().getLog10PNonRef(vc.getGenotypes(), vc.getAlleles(), getAlleleFrequencyPriors(model), log10AlleleFrequencyPosteriors.get()); + + // find the most likely frequency + int bestAFguess = MathUtils.maxElementIndex(log10AlleleFrequencyPosteriors.get()); + + // calculate p(f>0) + double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10AlleleFrequencyPosteriors.get()); + double sum = 0.0; + for (int i = 1; i <= N; i++) + sum += normalizedPosteriors[i]; + double PofF = Math.min(sum, 1.0); // deal with precision errors + + double phredScaledConfidence; + if ( bestAFguess != 0 || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { + phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); + if ( Double.isInfinite(phredScaledConfidence) ) + phredScaledConfidence = -10.0 * log10AlleleFrequencyPosteriors.get()[0]; + } else { + phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); + if ( Double.isInfinite(phredScaledConfidence) ) { + sum = 0.0; + for (int i = 1; i <= N; i++) { + if ( log10AlleleFrequencyPosteriors.get()[i] == AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED ) + break; + sum += log10AlleleFrequencyPosteriors.get()[i]; + } 
+ phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); + } + } + + // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero + if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestAFguess) ) { + // technically, at this point our confidence in a reference call isn't accurately estimated + // because it didn't take into account samples with no data, so let's get a better estimate + return null; + } + + // create the genotypes + Map genotypes = afcm.get().assignGenotypes(vc, log10AlleleFrequencyPosteriors.get(), bestAFguess); + + // *** note that calculating strand bias involves overwriting data structures, so we do that last + HashMap attributes = new HashMap(); + + int endLoc = calculateEndPos(vc.getAlleles(), vc.getReference(), loc); + + Set myAlleles = new HashSet(vc.getAlleles()); + // strip out the alternate allele if it's a ref call + if ( bestAFguess == 0 && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY ) { + myAlleles = new HashSet(1); + myAlleles.add(vc.getReference()); + } + VariantContext vcCall = new VariantContext("UG_call", loc.getContig(), loc.getStart(), endLoc, + myAlleles, genotypes, phredScaledConfidence/10.0, passesCallThreshold(phredScaledConfidence) ? 
null : filter, attributes, vc.getReferenceBaseForIndel()); + + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); + } + + private int calculateEndPos(Collection alleles, Allele refAllele, GenomeLoc loc) { // TODO - temp fix until we can deal with extended events properly // for indels, stop location is one more than ref allele length boolean isSNP = true, hasNullAltAllele = false; @@ -494,10 +562,10 @@ public class UnifiedGenotyperEngine { if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) { // regular pileup in this case - ReadBackedPileup pileup = rawContext.getBasePileup() .getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); + ReadBackedPileup pileup = rawContext.getBasePileup() .getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); // don't call when there is no coverage - if ( pileup.size() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) + if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) return null; // stratify the AlignmentContext and cut by sample @@ -511,10 +579,10 @@ public class UnifiedGenotyperEngine { ReadBackedExtendedEventPileup rawPileup = rawContext.getExtendedEventPileup(); // filter the context based on min mapping quality - ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_MAPPING_QUALTY_SCORE); + ReadBackedExtendedEventPileup pileup = rawPileup.getMappingFilteredPileup(UAC.MIN_BASE_QUALTY_SCORE); // don't call when there is no coverage - if ( pileup.size() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) + if ( pileup.getNumberOfElements() == 0 && UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES ) return null; // stratify the AlignmentContext and cut by sample @@ -530,7 +598,7 @@ public class UnifiedGenotyperEngine { for( final PileupElement p : rawContext.getBasePileup() ) { if( p.isDeletion() ) { numDeletions++; } } - if( ((double) numDeletions) / ((double) 
rawContext.getBasePileup().size()) > UAC.MAX_DELETION_FRACTION ) { + if( ((double) numDeletions) / ((double) rawContext.getBasePileup().getNumberOfElements()) > UAC.MAX_DELETION_FRACTION ) { return null; } } @@ -544,6 +612,21 @@ public class UnifiedGenotyperEngine { AFs[i] = AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; } + private final static double[] binomialProbabilityDepthCache = new double[10000]; + static { + for ( int i = 1; i < binomialProbabilityDepthCache.length; i++ ) { + binomialProbabilityDepthCache[i] = MathUtils.binomialProbability(0, i, 0.5); + } + } + + private final double getRefBinomialProb(final int depth) { + if ( depth < binomialProbabilityDepthCache.length ) + return binomialProbabilityDepthCache[depth]; + else + return MathUtils.binomialProbability(0, depth, 0.5); + } + + private VariantCallContext estimateReferenceConfidence(VariantContext vc, Map contexts, double theta, boolean ignoreCoveredSamples, double initialPofRef) { if ( contexts == null ) return null; @@ -562,12 +645,12 @@ public class UnifiedGenotyperEngine { if (isCovered) { AlignmentContext context = contexts.get(sample); if (context.hasBasePileup()) - depth = context.getBasePileup().size(); + depth = context.getBasePileup().depthOfCoverage(); else if (context.hasExtendedEventPileup()) - depth = context.getExtendedEventPileup().size(); + depth = context.getExtendedEventPileup().depthOfCoverage(); } - P_of_ref *= 1.0 - (theta / 2.0) * MathUtils.binomialProbability(0, depth, 0.5); + P_of_ref *= 1.0 - (theta / 2.0) * getRefBinomialProb(depth); } return new VariantCallContext(vc, QualityUtils.phredScaleErrorRate(1.0 - P_of_ref) >= UAC.STANDARD_CONFIDENCE_FOR_CALLING, false); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java index adb7c4c38..3dd51fa7d 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/ConstrainedMateFixingManager.java @@ -9,6 +9,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; @@ -180,8 +181,8 @@ public class ConstrainedMateFixingManager { addRead(newRead, readWasModified, true); } - public void addReads(List newReads, Set modifiedReads) { - for ( SAMRecord newRead : newReads ) + public void addReads(List newReads, Set modifiedReads) { + for ( GATKSAMRecord newRead : newReads ) addRead(newRead, modifiedReads.contains(newRead), false); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index e68aa31e0..3b3f54b05 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -26,9 +26,9 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.genotype.Haplotype; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -73,7 +73,7 @@ public class HaplotypeIndelErrorModel { baseMatchArray = new double[MAX_CACHED_QUAL+1]; baseMismatchArray = new double[MAX_CACHED_QUAL+1]; for (int k=1; k <= MAX_CACHED_QUAL; k++) { - double baseProb = 
QualityUtils.qualToProb(k); + double baseProb = QualityUtils.qualToProb((byte)k); baseMatchArray[k] = probToQual(baseProb); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 36e4db1c5..ba031c497 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -30,9 +30,9 @@ import net.sf.samtools.*; import net.sf.samtools.util.RuntimeIOException; import net.sf.samtools.util.SequenceUtil; import net.sf.samtools.util.StringUtil; +import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; @@ -46,11 +46,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.interval.IntervalUtils; -import org.broadinstitute.sting.utils.interval.NwayIntervalMergingIterator; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.NWaySAMFileWriter; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.text.TextFormattingUtils; @@ -138,14 +135,14 @@ public class 
IndelRealigner extends ReadWalker { * Any number of VCF files representing known indels to be used for constructing alternate consenses. * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored. */ - @Input(fullName="known", shortName = "known", doc="Input VCF file(s) with known indels", required=false) + @Input(fullName="knownAlleles", shortName = "known", doc="Input VCF file(s) with known indels", required=false) public List> known = Collections.emptyList(); /** * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s). */ @Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) - protected String intervalsFile = null; + protected IntervalBinding intervalsFile = null; /** * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number @@ -230,14 +227,6 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam") protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false; - /** - * For expert users only! This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception. - * Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory. - */ - @Advanced - @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="The target intervals are not sorted") - protected boolean TARGET_NOT_SORTED = false; - /** * Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that * the read came from. 
There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'), @@ -263,6 +252,12 @@ public class IndelRealigner extends ReadWalker { doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.") protected boolean NO_PG_TAG = false; + @Hidden + @Argument(fullName="keepPGTags", shortName="keepPG", required=false, + doc="Keep older PG tags left in the bam header by previous runs of this tool (by default, all these "+ + "historical tags will be replaced by the latest tag generated in the current run).") + protected boolean KEEP_ALL_PG_RECORDS = false; + @Hidden @Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY") protected String OUT_INDELS = null; @@ -287,10 +282,10 @@ public class IndelRealigner extends ReadWalker { // the reads and known indels that fall into the current interval private final ReadBin readsToClean = new ReadBin(); - private final ArrayList readsNotToClean = new ArrayList(); + private final ArrayList readsNotToClean = new ArrayList(); private final ArrayList knownIndelsToTry = new ArrayList(); private final HashSet indelRodsSeen = new HashSet(); - private final HashSet readsActuallyCleaned = new HashSet(); + private final HashSet readsActuallyCleaned = new HashSet(); private static final int MAX_QUAL = 99; @@ -367,48 +362,23 @@ public class IndelRealigner extends ReadWalker { catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile,ex); } - - if ( !TARGET_NOT_SORTED ) { - NwayIntervalMergingIterator merger = new NwayIntervalMergingIterator(IntervalMergingRule.OVERLAPPING_ONLY); - List rawIntervals = new ArrayList(); - // separate argument on semicolon first - for (String fileOrInterval : intervalsFile.split(";")) { - // if 
it's a file, add items to raw interval list - if (IntervalUtils.isIntervalFile(fileOrInterval)) { - merger.add(new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new java.io.File(fileOrInterval), IntervalMergingRule.OVERLAPPING_ONLY ) ); - } else { - rawIntervals.add(getToolkit().getGenomeLocParser().parseGenomeLoc(fileOrInterval)); - } - } - if ( ! rawIntervals.isEmpty() ) merger.add(rawIntervals.iterator()); - // prepare to read intervals one-by-one, as needed (assuming they are sorted). - intervals = merger; - } else { - // read in the whole list of intervals for cleaning - boolean allowEmptyIntervalList = (getToolkit().getArguments().unsafe == ValidationExclusion.TYPE.ALLOW_EMPTY_INTERVAL_LIST || - getToolkit().getArguments().unsafe == ValidationExclusion.TYPE.ALL); - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), - IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(intervalsFile),allowEmptyIntervalList), - IntervalMergingRule.OVERLAPPING_ONLY); - intervals = locs.iterator(); - } + intervals = intervalsFile.getIntervals(getToolkit()).iterator(); + currentInterval = intervals.hasNext() ? 
intervals.next() : null; writerToUse = writer; if ( N_WAY_OUT != null ) { - // Map args = getToolkit().getArguments().walkerArgs; boolean createIndex = true; - // if ( args.containsKey("disable_bam_indexing") ) { System.out.println("NO INDEXING!!"); System.exit(1); createIndex = false; } - if ( N_WAY_OUT.toUpperCase().endsWith(".MAP") ) { - writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT),SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s); + writerToUse = new NWaySAMFileWriter(getToolkit(),loadFileNameMap(N_WAY_OUT), + SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); } else { - writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, createIndex, generateMD5s); + writerToUse = new NWaySAMFileWriter(getToolkit(),N_WAY_OUT,SAMFileHeader.SortOrder.coordinate,true, + createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); } - } else { // set up the output writer @@ -448,18 +418,12 @@ public class IndelRealigner extends ReadWalker { private void setupWriter(SAMFileHeader header) { if ( !NO_PG_TAG ) { - final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); - final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); - try { - final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); - programRecord.setProgramVersion(version); - } catch (MissingResourceException e) {} - programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); + final SAMProgramRecord programRecord = createProgramRecord(); List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) { - if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) ) + if ( !record.getId().startsWith(PROGRAM_RECORD_NAME) || KEEP_ALL_PG_RECORDS ) 
newRecords.add(record); } newRecords.add(programRecord); @@ -470,6 +434,20 @@ public class IndelRealigner extends ReadWalker { writer.setPresorted(true); } + + private SAMProgramRecord createProgramRecord() { + if ( NO_PG_TAG ) return null; + + final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_NAME); + final ResourceBundle headerInfo = TextFormattingUtils.loadResourceBundle("StingText"); + try { + final String version = headerInfo.getString("org.broadinstitute.sting.gatk.version"); + programRecord.setProgramVersion(version); + } catch (MissingResourceException e) {} + programRecord.setCommandLine(getToolkit().createApproximateCommandLineArgumentString(getToolkit(), this)); + return programRecord; + } + private void emit(final SAMRecord read) { // check to see whether the read was modified by looking at the temporary tag @@ -492,7 +470,7 @@ public class IndelRealigner extends ReadWalker { readsActuallyCleaned.clear(); } - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { if ( currentInterval == null ) { emit(read); return 0; @@ -558,7 +536,7 @@ public class IndelRealigner extends ReadWalker { // TODO -- it would be nice if we could use indels from 454 reads as alternate consenses } - private void cleanAndCallMap(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker, GenomeLoc readLoc) { + private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker, GenomeLoc readLoc) { if ( readsToClean.size() > 0 ) { GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); if ( manager.canMoveReads(earliestPossibleMove) ) @@ -574,7 +552,7 @@ public class IndelRealigner extends ReadWalker { } while ( currentInterval != null && (readLoc == null || currentInterval.isBefore(readLoc)) ); } 
catch (ReviewedStingException e) { - throw new UserException.MissortedFile(new File(intervalsFile), " *** Are you sure that your interval file is sorted? If not, you must use the --targetIntervalsAreNotSorted argument. ***", e); + throw new UserException.MissortedFile(new File(intervalsFile.getSource()), " *** Are you sure that your interval file is sorted? If not, you must use the --targetIntervalsAreNotSorted argument. ***", e); } sawReadInCurrentInterval = false; @@ -679,14 +657,14 @@ public class IndelRealigner extends ReadWalker { private void clean(ReadBin readsToClean) { - final List reads = readsToClean.getReads(); + final List reads = readsToClean.getReads(); if ( reads.size() == 0 ) return; byte[] reference = readsToClean.getReference(referenceReader); int leftmostIndex = readsToClean.getLocation().getStart(); - final ArrayList refReads = new ArrayList(); // reads that perfectly match ref + final ArrayList refReads = new ArrayList(); // reads that perfectly match ref final ArrayList altReads = new ArrayList(); // reads that don't perfectly match final LinkedList altAlignmentsToTest = new LinkedList(); // should we try to make an alt consensus from the read? final Set altConsenses = new LinkedHashSet(); // list of alt consenses @@ -838,8 +816,9 @@ public class IndelRealigner extends ReadWalker { // however we don't have enough info to use the proper MAQ scoring system. // For now, we will just arbitrarily add 10 to the mapping quality. [EB, 6/7/2010]. 
// TODO -- we need a better solution here - SAMRecord read = aRead.getRead(); - read.setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + 10, 254)); + GATKSAMRecord read = aRead.getRead(); + if ( read.getMappingQuality() != 255 ) // 255 == Unknown, so don't modify it + read.setMappingQuality(Math.min(aRead.getRead().getMappingQuality() + 10, 254)); // before we fix the attribute tags we first need to make sure we have enough of the reference sequence int neededBasesToLeft = leftmostIndex - read.getAlignmentStart(); @@ -897,8 +876,8 @@ public class IndelRealigner extends ReadWalker { } } - private long determineReadsThatNeedCleaning(final List reads, - final ArrayList refReadsToPopulate, + private long determineReadsThatNeedCleaning(final List reads, + final ArrayList refReadsToPopulate, final ArrayList altReadsToPopulate, final LinkedList altAlignmentsToTest, final Set altConsenses, @@ -907,7 +886,7 @@ public class IndelRealigner extends ReadWalker { long totalRawMismatchSum = 0L; - for ( final SAMRecord read : reads ) { + for ( final GATKSAMRecord read : reads ) { // we can not deal with screwy records if ( read.getCigar().numCigarElements() == 0 ) { @@ -1395,7 +1374,7 @@ public class IndelRealigner extends ReadWalker { } private class AlignedRead { - private final SAMRecord read; + private final GATKSAMRecord read; private byte[] readBases = null; private byte[] baseQuals = null; private Cigar newCigar = null; @@ -1403,12 +1382,12 @@ public class IndelRealigner extends ReadWalker { private int mismatchScoreToReference = 0; private long alignerMismatchScore = 0; - public AlignedRead(SAMRecord read) { + public AlignedRead(GATKSAMRecord read) { this.read = read; mismatchScoreToReference = 0; } - public SAMRecord getRead() { + public GATKSAMRecord getRead() { return read; } @@ -1592,7 +1571,7 @@ public class IndelRealigner extends ReadWalker { private class ReadBin implements HasGenomeLocation { - private final ArrayList reads = new ArrayList(); + private 
final ArrayList reads = new ArrayList(); private byte[] reference = null; private GenomeLoc loc = null; @@ -1600,7 +1579,7 @@ public class IndelRealigner extends ReadWalker { // Return false if we can't process this read bin because the reads are not correctly overlapping. // This can happen if e.g. there's a large known indel with no overlapping reads. - public void add(SAMRecord read) { + public void add(GATKSAMRecord read) { GenomeLoc locForRead = getToolkit().getGenomeLocParser().createGenomeLoc(read); if ( loc == null ) @@ -1611,7 +1590,7 @@ public class IndelRealigner extends ReadWalker { reads.add(read); } - public List getReads() { return reads; } + public List getReads() { return reads; } public byte[] getReference(IndexedFastaSequenceFile referenceReader) { // set up the reference if we haven't done so yet diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index 17d5a8e9b..7490262f2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** @@ -88,7 +89,7 @@ public class LeftAlignIndels extends ReadWalker { writer.addAlignment(read); } - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { // we can not deal with screwy records if ( read.getCigar().numCigarElements() == 0 ) { emit(read); diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 2d7969230..319f41d53 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -28,62 +28,25 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.genotype.Haplotype; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; -import java.io.File; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; -/*import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.Covariate; -import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDataManager; -import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalDatum; -import org.broadinstitute.sting.oneoffprojects.walkers.IndelCountCovariates.RecalibrationArgumentCollection; -*/ - public class PairHMMIndelErrorModel { - - public static final int BASE_QUAL_THRESHOLD = 20; - - private static final int MATCH_OFFSET = 0; - private static final int X_OFFSET = 1; - private static final int Y_OFFSET = 2; - - private static final int DIAG = 0; - private static final int UP = 1; - private static final int LEFT = 2; - - private static final int 
DIAG_GOTO_M = 0; - private static final int DIAG_GOTO_X = 1; - private static final int DIAG_GOTO_Y = 2; - - private static final int UP_GOTO_M = 4; - private static final int UP_GOTO_X = 5; - private static final int UP_GOTO_Y = 6; - - private static final int LEFT_GOTO_M = 8; - private static final int LEFT_GOTO_X = 9; - private static final int LEFT_GOTO_Y = 10; - - private static final int[] ACTIONS_M = {DIAG_GOTO_M, DIAG_GOTO_X, DIAG_GOTO_Y}; - private static final int[] ACTIONS_X = {UP_GOTO_M, UP_GOTO_X, UP_GOTO_Y}; - private static final int[] ACTIONS_Y = {LEFT_GOTO_M, LEFT_GOTO_X, LEFT_GOTO_Y}; - - - private final double logGapOpenProbability; - private final double logGapContinuationProbability; - private boolean DEBUG = false; + private boolean bandedLikelihoods = false; private static final int MAX_CACHED_QUAL = 127; @@ -100,36 +63,13 @@ public class PairHMMIndelErrorModel { private static final double MIN_GAP_CONT_PENALTY = 10.0; private static final double GAP_PENALTY_HRUN_STEP = 1.0; // each increase in hrun decreases gap penalty by this. 
- - private boolean doViterbi = false; - - private final boolean useAffineGapModel = true; - private boolean doContextDependentPenalties = false; - private final double[] GAP_OPEN_PROB_TABLE; private final double[] GAP_CONT_PROB_TABLE; - private boolean getGapPenaltiesFromFile = false; - - private int SMOOTHING = 1; - private int MAX_QUALITY_SCORE = 50; - private int PRESERVE_QSCORES_LESS_THAN = 5; - ///////////////////////////// // Private Member Variables ///////////////////////////// -//copy+ -/* private RecalDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps - private final ArrayList requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation - private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); - private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); - protected static final String EOF_MARKER = "EOF"; - private long numReadsWithMalformedColorSpace = 0; - private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); - private NestedHashMap qualityScoreByFullCovariateKey = new NestedHashMap(); // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. 
- */ -//copy- + static { LOG_ONE_HALF= -Math.log10(2.0); END_GAP_COST = LOG_ONE_HALF; @@ -145,155 +85,22 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit,boolean gpf, File RECAL_FILE) { - - this(indelGOP, indelGCP, deb, doCDP, dovit); - this.getGapPenaltiesFromFile = gpf; - - // read data from recal file - // gdebug - start copy from TableRecalibrationWalker -/* if (gpf) { - boolean sawEOF = false; - boolean REQUIRE_EOF = false; - - int lineNumber = 0; - boolean foundAllCovariates = false; - // Get a list of all available covariates - final List> classes = new PluginManager(Covariate.class).getPlugins(); - - try { - for ( String line : new XReadLines(RECAL_FILE) ) { - lineNumber++; - if ( EOF_MARKER.equals(line) ) { - sawEOF = true; - } else if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() ) { - ; // Skip over the comment lines, (which start with '#') - } - // Read in the covariates that were used from the input file - else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data - if( foundAllCovariates ) { - throw new UserException.MalformedFile( RECAL_FILE, "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE ); - } else { // Found the covariate list in input file, loop through all of them and instantiate them - String[] vals = line.split(","); - for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. 
The last three items are nObservations, nMismatch, and Qempirical - boolean foundClass = false; - for( Class covClass : classes ) { - if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) { - foundClass = true; - try { - Covariate covariate = (Covariate)covClass.newInstance(); - requestedCovariates.add( covariate ); - } catch (Exception e) { - throw new DynamicClassResolutionException(covClass, e); - } - - } - } - - if( !foundClass ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." ); - } - } - } - - } else { // Found a line of data - if( !foundAllCovariates ) { - foundAllCovariates = true; - - // At this point all the covariates should have been found and initialized - if( requestedCovariates.size() < 2 ) { - throw new UserException.MalformedFile(RECAL_FILE, "Malformed input recalibration csv file. Covariate names can't be found in file: " + RECAL_FILE ); - } - - final boolean createCollapsedTables = true; - - // Initialize any covariate member variables using the shared argument collection - for( Covariate cov : requestedCovariates ) { - cov.initialize( RAC ); - } - // Initialize the data hashMaps - dataManager = new RecalDataManager( createCollapsedTables, requestedCovariates.size() ); - - } - addCSVData(RECAL_FILE, line); // Parse the line and add the data to the HashMap - } - } - - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(RECAL_FILE, "Can not find input file", e); - } catch ( NumberFormatException e ) { - throw new UserException.MalformedFile(RECAL_FILE, "Error parsing recalibration data at line " + lineNumber + ". 
Perhaps your table was generated by an older version of CovariateCounterWalker."); - } - - if ( !sawEOF ) { - final String errorMessage = "No EOF marker was present in the recal covariates table; this could mean that the file is corrupted or was generated with an old version of the CountCovariates tool."; - if ( REQUIRE_EOF ) - throw new UserException.MalformedFile(RECAL_FILE, errorMessage); - } - - if( dataManager == null ) { - throw new UserException.MalformedFile(RECAL_FILE, "Can't initialize the data manager. Perhaps the recal csv file contains no data?"); - } - - // Create the tables of empirical quality scores that will be used in the sequential calculation - dataManager.generateEmpiricalQualities( SMOOTHING, MAX_QUALITY_SCORE ); - } - // debug end copy - */ - } - /** - * For each covariate read in a value and parse it. Associate those values with the data itself (num observation and num mismatches) - */ - /* - private void addCSVData(final File file, final String line) { - final String[] vals = line.split(","); - - // Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly - if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical - throw new UserException.MalformedFile(file, "Malformed input recalibration file. 
Found data line with too many fields: " + line + - " --Perhaps the read group string contains a comma and isn't being parsed correctly."); - } - - final Object[] key = new Object[requestedCovariates.size()]; - Covariate cov; - int iii; - for( iii = 0; iii < requestedCovariates.size(); iii++ ) { - cov = requestedCovariates.get( iii ); - key[iii] = cov.getValue( vals[iii] ); - } - - // Create a new datum using the number of observations, number of mismatches, and reported quality score - final RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 ); - // Add that datum to all the collapsed tables which will be used in the sequential calculation - dataManager.addToAllTables( key, datum, PRESERVE_QSCORES_LESS_THAN ); - } - -*/ - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP, boolean dovit) { - this(indelGOP, indelGCP, deb, doCDP); - this.doViterbi = dovit; - } - - public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) { - - - this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob - this.logGapContinuationProbability = -indelGCP/10.0; // QUAL to log prob - this.doContextDependentPenalties = doCDP; + public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean bandedLikelihoods) { this.DEBUG = deb; - + this.bandedLikelihoods = bandedLikelihoods; // fill gap penalty table, affine naive model: this.GAP_CONT_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; this.GAP_OPEN_PROB_TABLE = new double[MAX_HRUN_GAP_IDX]; + double gop = -indelGOP/10.0; + double gcp = -indelGCP/10.0; + for (int i = 0; i < START_HRUN_GAP_IDX; i++) { - GAP_OPEN_PROB_TABLE[i] = logGapOpenProbability; - GAP_CONT_PROB_TABLE[i] = logGapContinuationProbability; + GAP_OPEN_PROB_TABLE[i] = gop; + GAP_CONT_PROB_TABLE[i] = gcp; } - double gop = logGapOpenProbability; - double gcp = logGapContinuationProbability; double step = 
GAP_PENALTY_HRUN_STEP/10.0; double maxGOP = -MIN_GAP_OPEN_PENALTY/10.0; // phred to log prob @@ -313,132 +120,6 @@ public class PairHMMIndelErrorModel { } - private double computeReadLikelihoodGivenHaplotype(byte[] haplotypeBases, byte[] readBases, byte[] readQuals) { - final int X_METRIC_LENGTH = readBases.length+1; - final int Y_METRIC_LENGTH = haplotypeBases.length+1; - - // initialize path metric and traceback memories for likelihood computation - double[][] pathMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - int[][] bestMetricArray = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - pathMetricArray[0][0]= 0;//Double.NEGATIVE_INFINITY; - - for (int i=1; i < X_METRIC_LENGTH; i++) { - pathMetricArray[i][0] = 0; - bestMetricArray[i][0] = UP; - } - - for (int j=1; j < Y_METRIC_LENGTH; j++) { - pathMetricArray[0][j] = 0;//logGapOpenProbability + (j-1) * logGapContinuationProbability; - bestMetricArray[0][j] = LEFT; - } - - for (int indI=1; indI < X_METRIC_LENGTH; indI++) { - for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) { - - byte x = readBases[indI-1]; - byte y = haplotypeBases[indJ-1]; - byte qual = readQuals[indI-1]; - - double bestMetric = 0.0; - int bestMetricIdx = 0; - - // compute metric for match/mismatch - // workaround for reads whose bases quality = 0, - if (qual < 1) - qual = 1; - - if (qual > MAX_CACHED_QUAL) - qual = MAX_CACHED_QUAL; - - double pBaseRead = (x == y)? 
baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; - double[] metrics = new double[3]; - - metrics[DIAG] = pathMetricArray[indI-1][indJ-1] + pBaseRead; - metrics[UP] = pathMetricArray[indI-1][indJ] + logGapOpenProbability;//(end?0.0:logGapOpenProbability); - metrics[LEFT] = pathMetricArray[indI][indJ-1] + logGapOpenProbability;//(end?0.0:logGapOpenProbability); - - if (doViterbi) { - bestMetricIdx = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestMetricIdx]; - } - else - bestMetric = MathUtils.softMax(metrics); - - pathMetricArray[indI][indJ] = bestMetric; - bestMetricArray[indI][indJ] = bestMetricIdx; - - } - } - - - double bestMetric=0.0; - int bestMetricIdx=0,bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1; - - for (int i=0; i < X_METRIC_LENGTH; i ++ ) { - int j= Y_METRIC_LENGTH-1; - - if (pathMetricArray[i][j] > bestMetric) { - bestMetric = pathMetricArray[i][j]; - bestI = i; - bestJ = j; - } - } - for (int j=0; j < Y_METRIC_LENGTH; j++ ) { - int i= X_METRIC_LENGTH-1; - if (pathMetricArray[i][j] >= bestMetric) { - bestMetric = pathMetricArray[i][j]; - bestI = i; - bestJ = j; - } - } - - if (DEBUG && doViterbi) { - - String haplotypeString = new String (haplotypeBases); - String readString = new String(readBases); - - - int i = bestI; - int j = bestJ; - - - System.out.println("Simple NW"); - - while (i >0 || j >0) { - bestMetricIdx = bestMetricArray[i][j]; - System.out.print(bestMetricIdx); - if (bestMetricIdx == UP) { - // insert gap in Y - haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j); - i--; - } else if (bestMetricIdx == LEFT) { - readString = readString.substring(0,i)+"-"+readString.substring(i); - j--; - } - else { - i--; j--; - } - } - - - - - System.out.println("\nAlignment: "); - System.out.println("R:"+readString); - System.out.println("H:"+haplotypeString); - System.out.println(); - - - } - if (DEBUG) - System.out.format("Likelihood: %5.4f\n", bestMetric); - - return bestMetric; - - - } - 
static private void getContextHomopolymerLength(final byte[] refBytes, int[] hrunArray) { // compute forward hrun length, example: // AGGTGACCCCCCTGAGAG @@ -472,221 +153,199 @@ public class PairHMMIndelErrorModel { } + private void updateCell(final int indI, final int indJ, final int X_METRIC_LENGTH, final int Y_METRIC_LENGTH, byte[] readBases, byte[] readQuals, byte[] haplotypeBases, + double[] currentGOP, double[] currentGCP, double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { + if (indI > 0 && indJ > 0) { + final int im1 = indI -1; + final int jm1 = indJ - 1; + // update current point + final byte x = readBases[im1]; + final byte y = haplotypeBases[jm1]; + final byte qual = readQuals[im1] < 1 ? 1 : (readQuals[im1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[im1]); + + final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; + + matchMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead, + YMetricArray[im1][jm1] + pBaseRead); + + final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; + final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; + + XMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); + + // update Y array + final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; + final double d2 = indI == X_METRIC_LENGTH-1 ? 
END_GAP_COST : currentGCP[jm1]; + YMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); + } + } + private double computeReadLikelihoodGivenHaplotypeAffineGaps(byte[] haplotypeBases, byte[] readBases, byte[] readQuals, - double[] currentGOP, double[] currentGCP) { + double[] currentGOP, double[] currentGCP, int indToStart, + double[][] matchMetricArray, double[][] XMetricArray, double[][] YMetricArray) { final int X_METRIC_LENGTH = readBases.length+1; final int Y_METRIC_LENGTH = haplotypeBases.length+1; - // initialize path metric and traceback memories for likelihood computation - double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - int[][] bestActionArrayM = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - int[][] bestActionArrayX = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - int[][] bestActionArrayY = new int[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + if (indToStart == 0) { + // default initialization for all arrays - double c,d; - matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY; + for (int i=0; i < X_METRIC_LENGTH; i++) { + Arrays.fill(matchMetricArray[i],Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[i],Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[i],Double.NEGATIVE_INFINITY); + } - for (int i=1; i < X_METRIC_LENGTH; i++) { - //initialize first column - matchMetricArray[i][0] = Double.NEGATIVE_INFINITY; - YMetricArray[i][0] = Double.NEGATIVE_INFINITY; - XMetricArray[i][0] = END_GAP_COST*(i);//logGapOpenProbability + (i-1)*logGapContinuationProbability; + for (int i=1; i < X_METRIC_LENGTH; i++) { + //initialize first column + XMetricArray[i][0] = END_GAP_COST*(i); + } - bestActionArrayX[i][0] = bestActionArrayY[i][0] = bestActionArrayM[i][0] = UP_GOTO_X; + for (int j=1; j < Y_METRIC_LENGTH; j++) { + // initialize 
first row + YMetricArray[0][j] = END_GAP_COST*(j); + } + matchMetricArray[0][0]= END_GAP_COST;//Double.NEGATIVE_INFINITY; + XMetricArray[0][0]= YMetricArray[0][0] = 0; } - for (int j=1; j < Y_METRIC_LENGTH; j++) { - // initialize first row - matchMetricArray[0][j] = Double.NEGATIVE_INFINITY; - XMetricArray[0][j] = Double.NEGATIVE_INFINITY; - YMetricArray[0][j] = END_GAP_COST*(j);//logGapOpenProbability + (j-1) * logGapContinuationProbability; - bestActionArrayY[0][j] = bestActionArrayM[0][j] = bestActionArrayX[0][j] = LEFT_GOTO_Y; + if (bandedLikelihoods) { + final double DIAG_TOL = 20; // means that max - min element in diags have to be > this number for banding to take effect. + + final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH -1; + final int elemsInDiag = Math.min(X_METRIC_LENGTH, Y_METRIC_LENGTH); + + int idxWithMaxElement = 0; + + for (int diag=indToStart; diag < numDiags; diag++) { + // compute default I and J start positions at edge of diagonals + int indI = 0; + int indJ = diag; + if (diag >= Y_METRIC_LENGTH ) { + indI = diag-(Y_METRIC_LENGTH-1); + indJ = Y_METRIC_LENGTH-1; + } + + // first pass: from max element to edge + int idxLow = idxWithMaxElement; + + // reset diag max value before starting + double maxElementInDiag = Double.NEGATIVE_INFINITY; + // set indI, indJ to correct values + indI += idxLow; + indJ -= idxLow; + if (indI >= X_METRIC_LENGTH || indJ < 0) { + idxLow--; + indI--; + indJ++; + } + + + for (int el = idxLow; el < elemsInDiag; el++) { + updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, + currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); + // update max in diagonal + final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); + + // check if we've fallen off diagonal value by threshold + if (bestMetric > maxElementInDiag) { + maxElementInDiag = bestMetric; + idxWithMaxElement = el; + } + else if 
(bestMetric < maxElementInDiag - DIAG_TOL && idxWithMaxElement > 0) + break; // done w/current diagonal + + indI++; + if (indI >=X_METRIC_LENGTH ) + break; + indJ--; + if (indJ <= 0) + break; + } + if (idxLow > 0) { + // now do second part in opposite direction + indI = 0; + indJ = diag; + if (diag >= Y_METRIC_LENGTH ) { + indI = diag-(Y_METRIC_LENGTH-1); + indJ = Y_METRIC_LENGTH-1; + } + + indI += idxLow-1; + indJ -= idxLow-1; + for (int el = idxLow-1; el >= 0; el--) { + + updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, + currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); + // update max in diagonal + final double bestMetric = MathUtils.max(matchMetricArray[indI][indJ], XMetricArray[indI][indJ], YMetricArray[indI][indJ]); + + // check if we've fallen off diagonal value by threshold + if (bestMetric > maxElementInDiag) { + maxElementInDiag = bestMetric; + idxWithMaxElement = el; + } + else if (bestMetric < maxElementInDiag - DIAG_TOL) + break; // done w/current diagonal + + indJ++; + if (indJ >= Y_METRIC_LENGTH ) + break; + indI--; + if (indI <= 0) + break; + } + } + // if (DEBUG) + // System.out.format("Max:%4.1f el:%d\n",maxElementInDiag, idxWithMaxElement); + } } + else { + // simplified rectangular version of update loop + for (int indI=1; indI < X_METRIC_LENGTH; indI++) { + for (int indJ=indToStart+1; indJ < Y_METRIC_LENGTH; indJ++) { + updateCell(indI, indJ, X_METRIC_LENGTH, Y_METRIC_LENGTH, readBases, readQuals, haplotypeBases, + currentGOP, currentGCP, matchMetricArray, XMetricArray, YMetricArray); - for (int indI=1; indI < X_METRIC_LENGTH; indI++) { - int im1 = indI-1; - for (int indJ=1; indJ < Y_METRIC_LENGTH; indJ++) { - int jm1 = indJ-1; - byte x = readBases[im1]; - byte y = haplotypeBases[jm1]; - byte qual = readQuals[im1]; - - double bestMetric = 0.0; - int bestMetricIdx = 0; - - // compute metric for match/mismatch - // workaround for reads whose bases quality = 0, - if (qual < 1) - 
qual = 1; - - if (qual > MAX_CACHED_QUAL) - qual = MAX_CACHED_QUAL; - - double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; - - - double[] metrics = new double[3]; - - - if (doViterbi) { - // update match array - metrics[MATCH_OFFSET] = matchMetricArray[im1][jm1] + pBaseRead; - metrics[X_OFFSET] = XMetricArray[im1][jm1] + pBaseRead; - metrics[Y_OFFSET] = YMetricArray[im1][jm1] + pBaseRead; - - bestMetricIdx = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestMetricIdx]; } - else - bestMetric = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead, - YMetricArray[im1][jm1] + pBaseRead); - - matchMetricArray[indI][indJ] = bestMetric; - bestActionArrayM[indI][indJ] = ACTIONS_M[bestMetricIdx]; - - // update X array - // State X(i,j): X(1:i) aligned to a gap in Y(1:j). - // When in last column of X, ie X(1:i) aligned to full Y, we don't want to penalize gaps - - //c = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]); - //d = (indJ==Y_METRIC_LENGTH-1? END_GAP_COST: currentGCP[jm1]); - if (getGapPenaltiesFromFile) { - c = currentGOP[im1]; - d = logGapContinuationProbability; - - } else { - c = currentGOP[jm1]; - d = currentGCP[jm1]; - } - if (indJ == Y_METRIC_LENGTH-1) - c = d = END_GAP_COST; - - if (doViterbi) { - metrics[MATCH_OFFSET] = matchMetricArray[im1][indJ] + c; - metrics[X_OFFSET] = XMetricArray[im1][indJ] + d; - metrics[Y_OFFSET] = Double.NEGATIVE_INFINITY; //YMetricArray[indI-1][indJ] + logGapOpenProbability; - - bestMetricIdx = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestMetricIdx]; - } - else - bestMetric = MathUtils.softMax(matchMetricArray[im1][indJ] + c, XMetricArray[im1][indJ] + d); - - XMetricArray[indI][indJ] = bestMetric; - bestActionArrayX[indI][indJ] = ACTIONS_X[bestMetricIdx]; - - // update Y array - //c = (indI==X_METRIC_LENGTH-1? END_GAP_COST: currentGOP[jm1]); - //d = (indI==X_METRIC_LENGTH-1? 
END_GAP_COST: currentGCP[jm1]); - if (getGapPenaltiesFromFile) { - c = currentGOP[im1]; - d = logGapContinuationProbability; - } - else { - c = currentGOP[jm1]; - d = currentGCP[jm1]; - } - if (indI == X_METRIC_LENGTH-1) - c = d = END_GAP_COST; - - - - if (doViterbi) { - metrics[MATCH_OFFSET] = matchMetricArray[indI][jm1] + c; - metrics[X_OFFSET] = Double.NEGATIVE_INFINITY; //XMetricArray[indI][indJ-1] + logGapOpenProbability; - metrics[Y_OFFSET] = YMetricArray[indI][jm1] + d; - - bestMetricIdx = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestMetricIdx]; - } - else - bestMetric = MathUtils.softMax(matchMetricArray[indI][jm1] + c, YMetricArray[indI][jm1] + d); - - YMetricArray[indI][indJ] = bestMetric; - bestActionArrayY[indI][indJ] = ACTIONS_Y[bestMetricIdx]; - - - } } - double bestMetric; - double metrics[] = new double[3]; - int bestTable=0, bestI=X_METRIC_LENGTH - 1, bestJ=Y_METRIC_LENGTH - 1; - metrics[MATCH_OFFSET] = matchMetricArray[bestI][bestJ]; - metrics[X_OFFSET] = XMetricArray[bestI][bestJ]; - metrics[Y_OFFSET] = YMetricArray[bestI][bestJ]; - if (doViterbi) { - bestTable = MathUtils.maxElementIndex(metrics); - bestMetric = metrics[bestTable]; + + + final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1; + final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ], + XMetricArray[bestI][bestJ], + YMetricArray[bestI][bestJ]); + + /* + if (DEBUG) { + PrintStream outx, outy, outm, outs; + double[][] sumMetrics = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + try { + outx = new PrintStream("datax.txt"); + outy = new PrintStream("datay.txt"); + outm = new PrintStream("datam.txt"); + outs = new PrintStream("datas.txt"); + double metrics[] = new double[3]; + for (int indI=0; indI < X_METRIC_LENGTH; indI++) { + for (int indJ=0; indJ < Y_METRIC_LENGTH; indJ++) { + metrics[0] = matchMetricArray[indI][indJ]; + metrics[1] = XMetricArray[indI][indJ]; + metrics[2] = YMetricArray[indI][indJ]; + //sumMetrics[indI][indJ] = 
MathUtils.softMax(metrics); + outx.format("%4.1f ", metrics[1]); + outy.format("%4.1f ", metrics[2]); + outm.format("%4.1f ", metrics[0]); + outs.format("%4.1f ", MathUtils.softMax(metrics)); + } + outx.println(); outm.println();outy.println(); outs.println(); + } + outm.close(); outx.close(); outy.close(); + } catch (java.io.IOException e) { throw new UserException("bla");} } - else - bestMetric = MathUtils.softMax(metrics); - - // Do traceback (needed only for debugging!) - if (DEBUG && doViterbi) { - - int bestAction; - int i = bestI; - int j = bestJ; - - - System.out.println("Affine gap NW"); - - - String haplotypeString = new String (haplotypeBases); - String readString = new String(readBases); - - - while (i >0 || j >0) { - if (bestTable == X_OFFSET) { - // insert gap in Y - haplotypeString = haplotypeString.substring(0,j)+"-"+haplotypeString.substring(j); - bestAction = bestActionArrayX[i][j]; - } - else if (bestTable == Y_OFFSET) { - readString = readString.substring(0,i)+"-"+readString.substring(i); - bestAction = bestActionArrayY[i][j]; - - } - else { - bestAction = bestActionArrayM[i][j]; - } - System.out.print(bestAction); - - - // bestAction contains action to take at next step - // encoding of bestAction: upper 2 bits = direction, lower 2 bits = next table - - // bestTable and nextDirection for next step - bestTable = bestAction & 0x3; - int nextDirection = bestAction >> 2; - if (nextDirection == UP) { - i--; - } else if (nextDirection == LEFT) { - j--; - } else { // if (nextDirection == DIAG) - i--; j--; - } - - } - - - - - System.out.println("\nAlignment: "); - System.out.println("R:"+readString); - System.out.println("H:"+haplotypeString); - System.out.println(); - - - } - if (DEBUG) - System.out.format("Likelihood: %5.4f\n", bestMetric); + */ return bestMetric; @@ -707,50 +366,38 @@ public class PairHMMIndelErrorModel { } } public synchronized double[] computeReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, - 
ReferenceContext ref, int eventLength, - HashMap> indelLikelihoodMap){ + ReferenceContext ref, int eventLength, + HashMap> indelLikelihoodMap){ int numHaplotypes = haplotypeMap.size(); - double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; - double readLikelihoods[][] = new double[pileup.getReads().size()][numHaplotypes]; + final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][numHaplotypes]; + final int readCounts[] = new int[pileup.getNumberOfElements()]; int readIdx=0; LinkedHashMap gapOpenProbabilityMap = new LinkedHashMap(); LinkedHashMap gapContProbabilityMap = new LinkedHashMap(); - if (DEBUG) { - System.out.println("Reference bases:"); - System.out.println(new String(ref.getBases())); + // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes. + // todo -- refactor into separate function + for (Allele a: haplotypeMap.keySet()) { + Haplotype haplotype = haplotypeMap.get(a); + byte[] haplotypeBases = haplotype.getBasesAsBytes(); + double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length]; + double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length]; + + // get homopolymer length profile for current haplotype + int[] hrunProfile = new int[haplotypeBases.length]; + getContextHomopolymerLength(haplotypeBases,hrunProfile); + fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); + + gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities); + gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities); + } - if (doContextDependentPenalties && !getGapPenaltiesFromFile) { - // will context dependent probabilities based on homopolymer run. Probabilities are filled based on total complete haplotypes. 
- - - for (Allele a: haplotypeMap.keySet()) { - Haplotype haplotype = haplotypeMap.get(a); - byte[] haplotypeBases = haplotype.getBasesAsBytes(); - double[] contextLogGapOpenProbabilities = new double[haplotypeBases.length]; - double[] contextLogGapContinuationProbabilities = new double[haplotypeBases.length]; - - // get homopolymer length profile for current haplotype - int[] hrunProfile = new int[haplotypeBases.length]; - getContextHomopolymerLength(haplotypeBases,hrunProfile); - if (DEBUG) { - System.out.println("Haplotype bases:"); - System.out.println(new String(haplotypeBases)); - for (int i=0; i < hrunProfile.length; i++) - System.out.format("%d",hrunProfile[i]); - System.out.println(); - } - fillGapProbabilities(hrunProfile, contextLogGapOpenProbabilities, contextLogGapContinuationProbabilities); - - gapOpenProbabilityMap.put(a,contextLogGapOpenProbabilities); - gapContProbabilityMap.put(a,contextLogGapContinuationProbabilities); - - } - } for (PileupElement p: pileup) { + // > 1 when the read is a consensus read representing multiple independent observations + readCounts[readIdx] = p.getRepresentativeCount(); // check if we've already computed likelihoods for this pileup element (i.e. 
for this read at this location) if (indelLikelihoodMap.containsKey(p)) { @@ -762,61 +409,14 @@ public class PairHMMIndelErrorModel { } else { //System.out.format("%d %s\n",p.getRead().getAlignmentStart(), p.getRead().getClass().getName()); - GATKSAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); + SAMRecord read = ReadUtils.hardClipAdaptorSequence(p.getRead()); if (read == null) continue; - if(ReadUtils.is454Read(read) && !getGapPenaltiesFromFile) { + if(ReadUtils.is454Read(read)) { continue; } - double[] recalQuals = null; - - /* - if (getGapPenaltiesFromFile) { - RecalDataManager.parseSAMRecord( read, RAC ); - - - recalQuals = new double[read.getReadLength()]; - - //compute all covariate values for this read - final Comparable[][] covariateValues_offset_x_covar = - RecalDataManager.computeCovariates((GATKSAMRecord) read, requestedCovariates); - // For each base in the read - for( int offset = 0; offset < read.getReadLength(); offset++ ) { - - final Object[] fullCovariateKey = covariateValues_offset_x_covar[offset]; - - Byte qualityScore = (Byte) qualityScoreByFullCovariateKey.get(fullCovariateKey); - if(qualityScore == null) - { - qualityScore = performSequentialQualityCalculation( fullCovariateKey ); - qualityScoreByFullCovariateKey.put(qualityScore, fullCovariateKey); - } - - recalQuals[offset] = -((double)qualityScore)/10.0; - } - - // for each read/haplotype combination, compute likelihoods, ie -10*log10(Pr(R | Hi)) - // = sum_j(-10*log10(Pr(R_j | Hi) since reads are assumed to be independent - if (DEBUG) { - System.out.format("\n\nStarting read:%s S:%d US:%d E:%d UE:%d C:%s\n",read.getReadName(), - read.getAlignmentStart(), - read.getUnclippedStart(), read.getAlignmentEnd(), read.getUnclippedEnd(), - read.getCigarString()); - - byte[] bases = read.getReadBases(); - for (int k = 0; k < recalQuals.length; k++) { - System.out.format("%c",bases[k]); - } - System.out.println(); - - for (int k = 0; k < recalQuals.length; k++) { - 
System.out.format("%.0f ",recalQuals[k]); - } - System.out.println(); - } - } */ // get bases of candidate haplotypes that overlap with reads final int trailingBases = 3; @@ -910,18 +510,16 @@ public class PairHMMIndelErrorModel { // ok, we now figured out total number of clipped bases on both ends. // Figure out where we want to place the haplotype to score read against - if (DEBUG) - System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", - numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); - + /* + if (DEBUG) + System.out.format("numStartClippedBases: %d numEndClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", + numStartClippedBases, numEndClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), start, stop, read.getReadLength()); + */ LinkedHashMap readEl = new LinkedHashMap(); if (numStartClippedBases + numEndClippedBases >= unclippedReadBases.length) { - if (DEBUG) - System.out.println("BAD READ!!"); - int j=0; for (Allele a: haplotypeMap.keySet()) { readEl.put(a,0.0); @@ -930,25 +528,20 @@ public class PairHMMIndelErrorModel { } else { - byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases, + final byte[] readBases = Arrays.copyOfRange(unclippedReadBases,numStartClippedBases, unclippedReadBases.length-numEndClippedBases); - byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases, + final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartClippedBases, unclippedReadBases.length-numEndClippedBases); - double[] recalCDP = null; - if (getGapPenaltiesFromFile) { - recalCDP = Arrays.copyOfRange(recalQuals,numStartClippedBases, - unclippedReadBases.length-numEndClippedBases); - - } - - if (DEBUG) { - System.out.println("Read bases:"); - System.out.println(new String(readBases)); - } - int j=0; + + // initialize path 
metric and traceback memories for likelihood computation + double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; + byte[] previousHaplotypeSeen = null; + double[] previousGOP = null; + double[] previousGCP = null; + int startIdx; for (Allele a: haplotypeMap.keySet()) { @@ -963,36 +556,41 @@ public class PairHMMIndelErrorModel { long indStart = start - haplotype.getStartPosition(); long indStop = stop - haplotype.getStartPosition(); - byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(), + final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBasesAsBytes(), (int)indStart, (int)indStop); + double readLikelihood; + if (matchMetricArray == null) { + final int X_METRIC_LENGTH = readBases.length+1; + final int Y_METRIC_LENGTH = haplotypeBases.length+1; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + } + final double[] currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); + final double[] currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); + if (previousHaplotypeSeen == null) + startIdx = 0; + else { + final int s1 = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); + final int s2 = computeFirstDifferingPosition(currentContextGOP, previousGOP); + final int s3 = computeFirstDifferingPosition(currentContextGCP, previousGCP); + startIdx = Math.min(Math.min(s1, s2), s3); + } + previousHaplotypeSeen = haplotypeBases.clone(); + previousGOP = currentContextGOP.clone(); + previousGCP = currentContextGCP.clone(); + + + readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, + currentContextGOP, currentContextGCP, startIdx, matchMetricArray, XMetricArray, YMetricArray); if (DEBUG) { - System.out.println("Haplotype to test:"); - 
System.out.println(new String(haplotypeBases)); + System.out.println("H:"+new String(haplotypeBases)); + System.out.println("R:"+new String(readBases)); + System.out.format("L:%4.2f\n",readLikelihood); + System.out.format("StPos:%d\n", startIdx); } - - Double readLikelihood = 0.0; - if (useAffineGapModel) { - - double[] currentContextGOP = null; - double[] currentContextGCP = null; - - if (doContextDependentPenalties) { - - if (getGapPenaltiesFromFile) { - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, recalCDP, null); - - } else { - currentContextGOP = Arrays.copyOfRange(gapOpenProbabilityMap.get(a), (int)indStart, (int)indStop); - currentContextGCP = Arrays.copyOfRange(gapContProbabilityMap.get(a), (int)indStart, (int)indStop); - readLikelihood = computeReadLikelihoodGivenHaplotypeAffineGaps(haplotypeBases, readBases, readQuals, currentContextGOP, currentContextGCP); - } - } - - } - else - readLikelihood = computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals); - readEl.put(a,readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } @@ -1004,7 +602,7 @@ public class PairHMMIndelErrorModel { if (DEBUG) { System.out.println("\nLikelihood summary"); - for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { + for (readIdx=0; readIdx < pileup.getNumberOfElements(); readIdx++) { System.out.format("Read Index: %d ",readIdx); for (int i=0; i < readLikelihoods[readIdx].length; i++) System.out.format("L%d: %f ",i,readLikelihoods[readIdx][i]); @@ -1012,123 +610,63 @@ public class PairHMMIndelErrorModel { } } + + return getHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); + } + + private int computeFirstDifferingPosition(byte[] b1, byte[] b2) { + if (b1.length != b2.length) + return 0; // sanity check + + for (int i=0; i < b1.length; i++ ){ + if ( b1[i]!= b2[i]) + return i; + } + return b1.length; + } + + private int computeFirstDifferingPosition(double[] b1, double[] 
b2) { + if (b1.length != b2.length) + return 0; // sanity check + + for (int i=0; i < b1.length; i++ ){ + if ( b1[i]!= b2[i]) + return i; + } + return b1.length; + } + + private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { + final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; + + // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix for (int i=0; i < numHaplotypes; i++) { for (int j=i; j < numHaplotypes; j++){ // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] // L(Hi, Hj) = sum_reads ( Pr(R|Hi)/2 + Pr(R|Hj)/2) //readLikelihoods[k][j] has log10(Pr(R_k) | H[j] ) - for (readIdx=0; readIdx < pileup.getReads().size(); readIdx++) { - + for (int readIdx = 0; readIdx < readLikelihoods.length; readIdx++) { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) // First term is approximated by Jacobian log with table lookup. 
if (Double.isInfinite(readLikelihoods[readIdx][i]) && Double.isInfinite(readLikelihoods[readIdx][j])) continue; - haplotypeLikehoodMatrix[i][j] += ( MathUtils.softMax(readLikelihoods[readIdx][i], - readLikelihoods[readIdx][j]) + LOG_ONE_HALF); - + final double li = readLikelihoods[readIdx][i]; + final double lj = readLikelihoods[readIdx][j]; + final int readCount = readCounts[readIdx]; + haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF); } - - } } - return getHaplotypeLikelihoods(haplotypeLikehoodMatrix); - - } - - public static double[] getHaplotypeLikelihoods(double[][] haplotypeLikehoodMatrix) { - int hSize = haplotypeLikehoodMatrix.length; - double[] genotypeLikelihoods = new double[hSize*(hSize+1)/2]; - + final double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2]; int k=0; - double maxElement = Double.NEGATIVE_INFINITY; - for (int j=0; j < hSize; j++) { + for (int j=0; j < numHaplotypes; j++) { for (int i=0; i <= j; i++){ genotypeLikelihoods[k++] = haplotypeLikehoodMatrix[i][j]; - if (haplotypeLikehoodMatrix[i][j] > maxElement) - maxElement = haplotypeLikehoodMatrix[i][j]; } } - // renormalize - for (int i=0; i < genotypeLikelihoods.length; i++) - genotypeLikelihoods[i] -= maxElement; - - return genotypeLikelihoods; + // renormalize so that max element is zero. + return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); } - - /** - * Implements a serial recalibration of the reads using the combinational table. - * First, we perform a positional recalibration, and then a subsequent dinuc correction. 
- * - * Given the full recalibration table, we perform the following preprocessing steps: - * - * - calculate the global quality score shift across all data [DeltaQ] - * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift - * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual - * - The final shift equation is: - * - * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... ) - * @param key The list of Comparables that were calculated from the covariates - * @return A recalibrated quality score as a byte - */ - /* - private byte performSequentialQualityCalculation( final Object... key ) { - - final byte qualFromRead = (byte)Integer.parseInt(key[1].toString()); - final Object[] readGroupCollapsedKey = new Object[1]; - final Object[] qualityScoreCollapsedKey = new Object[2]; - final Object[] covariateCollapsedKey = new Object[3]; - - // The global quality shift (over the read group only) - readGroupCollapsedKey[0] = key[0]; - final RecalDatum globalRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(0).get( readGroupCollapsedKey )); - double globalDeltaQ = 0.0; - if( globalRecalDatum != null ) { - final double globalDeltaQEmpirical = globalRecalDatum.getEmpiricalQuality(); - final double aggregrateQReported = globalRecalDatum.getEstimatedQReported(); - globalDeltaQ = globalDeltaQEmpirical - aggregrateQReported; - } - - // The shift in quality between reported and empirical - qualityScoreCollapsedKey[0] = key[0]; - qualityScoreCollapsedKey[1] = key[1]; - final RecalDatum qReportedRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(1).get( qualityScoreCollapsedKey )); - double deltaQReported = 0.0; - if( qReportedRecalDatum != null ) { - final double deltaQReportedEmpirical = qReportedRecalDatum.getEmpiricalQuality(); - deltaQReported = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; - } - - // The 
shift in quality due to each covariate by itself in turn - double deltaQCovariates = 0.0; - double deltaQCovariateEmpirical; - covariateCollapsedKey[0] = key[0]; - covariateCollapsedKey[1] = key[1]; - for( int iii = 2; iii < key.length; iii++ ) { - covariateCollapsedKey[2] = key[iii]; // The given covariate - final RecalDatum covariateRecalDatum = ((RecalDatum)dataManager.getCollapsedTable(iii).get( covariateCollapsedKey )); - if( covariateRecalDatum != null ) { - deltaQCovariateEmpirical = covariateRecalDatum.getEmpiricalQuality(); - deltaQCovariates += ( deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported) ); - } - } - - final double newQuality = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; - return QualityUtils.boundQual( (int)Math.round(newQuality), (byte)MAX_QUALITY_SCORE ); - - // Verbose printouts used to validate with old recalibrator - //if(key.contains(null)) { - // System.out.println( key + String.format(" => %d + %.2f + %.2f + %.2f + %.2f = %d", - // qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte)); - //} - //else { - // System.out.println( String.format("%s %s %s %s => %d + %.2f + %.2f + %.2f + %.2f = %d", - // key.get(0).toString(), key.get(3).toString(), key.get(2).toString(), key.get(1).toString(), qualFromRead, globalDeltaQ, deltaQReported, deltaQPos, deltaQDinuc, newQualityByte) ); - //} - - //return newQualityByte; - - } -*/ } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java deleted file mode 100755 index 2c89b907b..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2010. 
- * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.indels; - -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.BadMateFilter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.By; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; -import org.broadinstitute.sting.utils.interval.IntervalMergingRule; -import org.broadinstitute.sting.utils.sam.ReadUtils; - -import java.io.File; -import java.util.Iterator; - -@By(DataSource.READS) -// walker to count realigned reads -public class RealignedReadCounter extends ReadWalker { - - public static final String ORIGINAL_CIGAR_TAG = "OC"; - public static final String ORIGINAL_POSITION_TAG = "OP"; - - @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) - protected String intervalsFile = null; - - // the intervals input by the user - private Iterator intervals = null; - - // the current interval in the list - private GenomeLoc currentInterval = null; - - private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0; - private boolean intervalWasUpdated = false; - - public void initialize() { - // prepare to read intervals one-by-one, as needed (assuming they are sorted). - intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); - currentInterval = intervals.hasNext() ? 
intervals.next() : null; - } - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - if ( currentInterval == null ) { - return 0; - } - - GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read); - // hack to get around unmapped reads having screwy locations - if ( readLoc.getStop() == 0 ) - readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); - - if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) ) - return 0; - - if ( readLoc.overlapsP(currentInterval) ) { - if ( doNotTryToClean(read) ) - return 0; - - if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) { - String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG); - // deal with an old bug - if ( read.getCigar().toString().equals(newCigar) ) { - //System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar); - return 0; - } - - if ( !intervalWasUpdated ) { - intervalWasUpdated = true; - updatedIntervals++; - affectedBases += 20 + getIndelSize(read); - } - updatedReads++; - - } - } else { - do { - intervalWasUpdated = false; - currentInterval = intervals.hasNext() ? 
intervals.next() : null; - } while ( currentInterval != null && currentInterval.isBefore(readLoc) ); - } - - return 0; - } - - private int getIndelSize(SAMRecord read) { - for ( CigarElement ce : read.getCigar().getCigarElements() ) { - if ( ce.getOperator() == CigarOperator.I ) - return 0; - if ( ce.getOperator() == CigarOperator.D ) - return ce.getLength(); - } - logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar()); - return 0; - } - - private boolean doNotTryToClean(SAMRecord read) { - return read.getReadUnmappedFlag() || - read.getNotPrimaryAlignmentFlag() || - read.getReadFailsVendorQualityCheckFlag() || - read.getMappingQuality() == 0 || - read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || - (BadMateFilter.hasBadMate(read)); - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer value, Integer sum) { - return sum + value; - } - - public void onTraversalDone(Integer result) { - System.out.println(updatedIntervals + " intervals were updated"); - System.out.println(updatedReads + " reads were updated"); - System.out.println(affectedBases + " bases were affected"); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index bede50a0b..424e05c20 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -50,6 +50,7 @@ import java.io.PrintStream; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.TreeSet; /** * Emits intervals for the Local Indel Realigner to target for realignment. 
@@ -103,7 +104,7 @@ import java.util.List; @Allows(value={DataSource.READS, DataSource.REFERENCE}) @By(DataSource.REFERENCE) @BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) -public class RealignerTargetCreator extends RodWalker { +public class RealignerTargetCreator extends RodWalker implements TreeReducible { /** * The target intervals for realignment. @@ -227,7 +228,7 @@ public class RealignerTargetCreator extends RodWalker 0.0 && mismatchThreshold <= 1.0 && - pileup.size() >= minReadsAtLocus && + pileup.getNumberOfElements() >= minReadsAtLocus && (double)mismatchQualities / (double)totalQualities >= mismatchThreshold ) hasPointEvent = true; } @@ -251,43 +252,125 @@ public class RealignerTargetCreator extends RodWalker= right.loc.getStart(); + } + + @com.google.java.contract.Requires({"left != null", "right != null"}) + static private Event mergeEvents(Event left, Event right) { + left.merge(right); + return left; + } + private enum EVENT_TYPE { POINT_EVENT, INDEL_EVENT, BOTH } + class EventPair { + public Event left, right; + public TreeSet intervals = new TreeSet(); + + public EventPair(Event left, Event right) { + this.left = left; + this.right = right; + } + + public EventPair(Event left, Event right, TreeSet set1, TreeSet set2) { + this.left = left; + this.right = right; + intervals.addAll(set1); + intervals.addAll(set2); + } + } + class Event { public int furthestStopPos; - public GenomeLoc loc; - public int eventStartPos; + private GenomeLoc loc; + private int eventStartPos; private int eventStopPos; private EVENT_TYPE type; private ArrayList pointEvents = new ArrayList(); @@ -332,6 +415,10 @@ public class RealignerTargetCreator extends RodWalker= 0 && eventStopPos - eventStartPos < maxIntervalSize; } - public String toString() { - return String.format("%s:%d-%d", loc.getContig(), eventStartPos, eventStopPos); + public GenomeLoc getLoc() { + return getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(), eventStartPos, eventStopPos); } 
} } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java index 8bba8eac2..414ffa09c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetectorWalker.java @@ -26,22 +26,16 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.Tags; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; -import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; -import org.broadinstitute.sting.utils.codecs.refseq.Transcript; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; -import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; @@ -51,16 +45,19 @@ import org.broadinstitute.sting.gatk.walkers.ReadWalker; import 
org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec; +import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature; +import org.broadinstitute.sting.utils.codecs.refseq.Transcript; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.collections.CircularArray; import org.broadinstitute.sting.utils.collections.PrimitivePair; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.interval.OverlappingIntervalIterator; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -134,17 +131,9 @@ public class SomaticIndelDetectorWalker extends ReadWalker { // boolean FORMAT_VCF = false; @Hidden - @Argument(fullName = "genotype_intervals", shortName = "genotype", + @Input(fullName = "genotype_intervals", shortName = "genotype", doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or not", required = false) - public String genotypeIntervalsFile = null; - - @Hidden - @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, - doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ - "if the list turns out to be unsorted, it will throw an exception. 
"+ - "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ - "to sort and keep it in memory (increases memory usage!).") - protected boolean GENOTYPE_NOT_SORTED = false; + public IntervalBinding genotypeIntervalsFile = null; @Hidden @Argument(fullName="unpaired", shortName="unpaired", @@ -265,7 +254,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker { Set headerInfo = new HashSet(); // first, the basic info - headerInfo.add(new VCFHeaderLine("source", "IndelGenotyperV2")); + headerInfo.add(new VCFHeaderLine("source", "SomaticIndelDetector")); headerInfo.add(new VCFHeaderLine("reference", getToolkit().getArguments().referenceFile.getName())); // FORMAT and INFO fields @@ -283,10 +272,10 @@ public class SomaticIndelDetectorWalker extends ReadWalker { args.addAll(getToolkit().getFilters()); Map commandLineArgs = getToolkit().getApproximateCommandLineArguments(args); for ( Map.Entry commandLineArg : commandLineArgs.entrySet() ) - headerInfo.add(new VCFHeaderLine(String.format("IGv2_%s", commandLineArg.getKey()), commandLineArg.getValue())); + headerInfo.add(new VCFHeaderLine(String.format("SID_%s", commandLineArg.getKey()), commandLineArg.getValue())); // also, the list of input bams for ( String fileName : getToolkit().getArguments().samFiles ) - headerInfo.add(new VCFHeaderLine("IGv2_bam_file_used", fileName)); + headerInfo.add(new VCFHeaderLine("SID_bam_file_used", fileName)); return headerInfo; } @@ -366,16 +355,9 @@ public class SomaticIndelDetectorWalker extends ReadWalker { } if ( genotypeIntervalsFile != null ) { - if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) { - // prepare to read intervals one-by-one, as needed (assuming they are sorted). 
- genotypeIntervalIterator = new IntervalFileMergingIterator(getToolkit().getGenomeLocParser(), - new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); - } else { - // read in the whole list of intervals for cleaning - GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), - IntervalUtils.parseIntervalArguments(getToolkit().getGenomeLocParser(),Arrays.asList(genotypeIntervalsFile),true), IntervalMergingRule.OVERLAPPING_ONLY); - genotypeIntervalIterator = locs.iterator(); - } + // read in the whole list of intervals for cleaning + GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals(getToolkit().getGenomeLocParser(), genotypeIntervalsFile.getIntervals(getToolkit()), IntervalMergingRule.OVERLAPPING_ONLY); + genotypeIntervalIterator = locs.iterator(); // wrap intervals requested for genotyping inside overlapping iterator, so that we actually // genotype only on the intersections of the requested intervals with the -L intervals @@ -392,7 +374,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker { location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); - normalSamples = getToolkit().getSamplesByReaders().get(0); + normalSamples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeaders().get(0)); try { // we already checked that bedOutput and output_file are not set simultaneously @@ -413,7 +395,7 @@ public class SomaticIndelDetectorWalker extends ReadWalker { @Override - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java index a56c9e21e..63fb33295 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java @@ -37,7 +37,7 @@ public class PhasingRead extends BaseArray { public PhasingRead(int length, int mappingQual) { super(length); - this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb(mappingQual)); + this.mappingProb = new PreciseNonNegativeDouble(QualityUtils.qualToProb((byte)mappingQual)); this.baseProbs = new PreciseNonNegativeDouble[length]; Arrays.fill(this.baseProbs, null); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 17a6e20f1..68fbe8ce2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -30,7 +30,6 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; @@ -76,9 +75,8 @@ import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFr * -T ReadBackedPhasing * -R reference.fasta * -I reads.bam - * --variant:vcf SNPs.vcf - * -BTI variant - * -BTIMR INTERSECTION + * --variant SNPs.vcf + * -L SNPs.vcf * -o phased_SNPs.vcf * --phaseQualityThresh 20.0 * @@ -260,10 +258,10 @@ public 
class ReadBackedPhasingWalker extends RodWalker entriesToNames = new HashMap(); - Integer numRecords = vc.getAttributeAsIntegerNoException(NUM_RECORDS_KEY); - if (numRecords != null) { + int numRecords = vc.getAttributeAsInt(NUM_RECORDS_KEY, -1); + if (numRecords != -1) { boolean done = false; if (numRecords == 1) { // Check if perhaps the single record doesn't end with "_1": - String name = vc.getAttributeAsStringNoException(nameKeyToUse); + String name = vc.getAttributeAsString(nameKeyToUse, null); if (name != null) { entriesToNames.put(nameKeyToUse, name); done = true; @@ -59,14 +59,14 @@ public class RefSeqDataParser { if (!done) { for (int i = 1; i <= numRecords; i++) { String key = nameKeyToUseMultiplePrefix + i; - String name = vc.getAttributeAsStringNoException(key); + String name = vc.getAttributeAsString(key, null); if (name != null) entriesToNames.put(key, name); } } } else { // no entry with the # of records: - String name = vc.getAttributeAsStringNoException(nameKeyToUse); + String name = vc.getAttributeAsString(nameKeyToUse, null); if (name != null) { entriesToNames.put(nameKeyToUse, name); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java new file mode 100644 index 000000000..dbbd8e761 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMalesWalker.java @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above 
copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Gender; +import org.broadinstitute.sting.gatk.samples.Sample; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. + * Can also count the number of reads matching a given criterion using read filters (see the + * --read-filter command line argument). Simplest example of a read-backed analysis. + */ +@Requires({DataSource.READS, DataSource.REFERENCE}) +public class CountMalesWalker extends ReadWalker { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + Sample sample = getSampleDB().getSample(read); + return sample.getGender() == Gender.MALE ? 
1 : 0; + } + + public Integer reduceInit() { return 0; } + + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java index 9ce9c4eec..b5a2d183f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java @@ -1,11 +1,11 @@ package org.broadinstitute.sting.gatk.walkers.qc; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. 
@@ -38,7 +38,7 @@ import org.broadinstitute.sting.gatk.walkers.Requires; */ @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountReadsWalker extends ReadWalker { - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { return 1; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CycleQualityWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CycleQualityWalker.java index b5f5442cd..1cb1579d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CycleQualityWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CycleQualityWalker.java @@ -1,434 +1,434 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.utils.collections.PrimitivePair; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; - -import java.io.*; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Apr 9, 2010 - * Time: 12:16:41 PM - * To change this template use File | Settings | File Templates. - */ - -/** - * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. - * Can also count the number of reads matching a given criterion using read filters (see the - * --read-filter command line argument). Simplest example of a read-backed analysis. 
- */ -@Requires({DataSource.READS}) -public class CycleQualityWalker extends ReadWalker { - @Output - protected PrintStream out; - - @Argument(fullName="mappedOnly", shortName="mo", doc="when this flag is set (default), statistics will be collected "+ - "on mapped reads only, while unmapped reads will be discarded", required=false) - protected boolean MAPPED_ONLY = true; - @Argument(fullName="maxReadLength", shortName="rl", doc="maximum read length", required=false) - protected int MAX_READ_LENGTH = 500; - @Argument(fullName="out_prefix",shortName="p",doc="prefix for output report and statistics files",required=true) - protected String PREFIX = null; -// @Argument(fullName="html",shortName="html",doc="produce html-formatted output (starting with h3-level tags) rather than plain text",required=false) - protected boolean HTML = false; - @Argument(fullName="qualThreshold", shortName="Q",doc="flag as problematic all cycles with av. qualities below the threshold (applies only to the generated report)",required=false) - protected double QTHRESHOLD = 10.0; - @Argument(fullName="useBothQualities",shortName="bothQ",required=false,doc="Generate statistics both for currently set and for "+ - "original base qualities (OQ tag, must be present in the bam); two separate data files will be generated.") - protected boolean ASSESS_BOTH_QUALS = false; - - private Map cyclesByLaneMap = null; - private Map cyclesByLibraryMap = null; - private Map cyclesByLaneMapOrig = null; - private Map cyclesByLibraryMapOrig = null; - - public void initialize() { - if ( PREFIX == null ) throw new ReviewedStingException("Prefix for output file(s) must be specified"); - cyclesByLaneMap = new HashMap(); - cyclesByLibraryMap = new HashMap(); - cyclesByLaneMapOrig = new HashMap(); - cyclesByLibraryMapOrig = new HashMap(); - } - - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - - if ( AlignmentUtils.isReadUnmapped(read) ) return 0; - - SAMReadGroupRecord 
rg = read.getReadGroup(); - - if ( rg == null ) throw new UserException.ReadMissingReadGroup(read); - - String lane = read.getReadGroup().getPlatformUnit(); - String library = read.getReadGroup().getLibrary(); - - if ( lane == null ) throw new UserException.MalformedBAM(read, "Read "+read.getReadName()+" has no platform unit information"); - if ( library == null ) throw new UserException.MalformedBAM(read, "Read "+read.getReadName()+" has no library information"); - - int end = 0; - - if ( read.getReadPairedFlag() ) { - - if ( read.getFirstOfPairFlag() ) { - if ( read.getSecondOfPairFlag() ) - throw new UserException.MalformedBAM(read, "Read "+read.getReadName()+" has conflicting first/second in pair attributes"); - end = 1; - } else { - if ( ! read.getSecondOfPairFlag() ) - throw new UserException.MalformedBAM(read, "Read "+read.getReadName()+" has conflicting first/second in pair attributes"); - end = 2; - } - } - - CycleStats[] byLane = cyclesByLaneMap.get(lane); - CycleStats[] byLib = cyclesByLibraryMap.get(library); - - //byte [] quals = USE_ORIGINAL_QUALS ? AlignmentUtils.getOriginalQualsInCycleOrder(read) : AlignmentUtils.getQualsInCycleOrder(read); - - byte [] quals = AlignmentUtils.getQualsInCycleOrder(read); - - // if end == 0 (single end lane), we allocate array of length 1, otherwise we need two - // elements in the array in order to be able to collect statistics for each end in the pair independently - if ( byLane == null ) cyclesByLaneMap.put(lane,byLane = new CycleStats[(end==0?1:2)]); - if ( byLib == null ) cyclesByLibraryMap.put(library, byLib =new CycleStats[2]); - - if ( end != 0 ) end--; // we will now use 'end' as index into the array of stats - - if ( byLane[end] == null ) byLane[end] = new CycleStats(MAX_READ_LENGTH); - if ( byLib[end] == null ) byLib[end] =new CycleStats(MAX_READ_LENGTH); - byLane[end].add(quals); - byLib[end].add(quals); - - return 1; //To change body of implemented methods use File | Settings | File Templates. 
- } - - /** - * Provide an initial value for reduce computations. - * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - return 0; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(Integer value, Integer sum) { - return sum.intValue()+value.intValue(); //To change body of implemented methods use File | Settings | File Templates. - } - - public void onTraversalDone(Integer result) { - if ( HTML ) { - out.println("

Cycle Quality QC

\n"); - out.println("File(s) analyzed:
"); - for ( String fileName : getToolkit().getArguments().samFiles) out.println(fileName+"
"); - out.println("
"); - } - if ( HTML ) out.println("

"); - out.println("\n"+result+" reads analyzed\n"); - if ( HTML ) out.println("

"); - out.println("by platform unit:"); - if ( HTML ) out.println("
"); - report2(cyclesByLaneMap, new File(PREFIX+".byLane.txt"),true); - out.println(); - if ( HTML ) out.println("

"); - out.println("\nby library:"); - if ( HTML ) out.println("
"); - report2(cyclesByLibraryMap, new File(PREFIX+".byLibrary.txt"),true); - out.println(); - if ( HTML ) out.println("

"); - } - - - - private void report2(Map m, File f,boolean summaryReport) { - long totalReads_1 =0; - long totalReads_2 =0; - long totalReads_unpaired = 0; - SortedSet columns = new TreeSet(); - int maxLength = 0; // maximum read length across all lanes/read ends analyzed - - for( Map.Entry e : m.entrySet() ) { - if ( e.getValue()[0].getMaxReadLength() > maxLength ) maxLength = e.getValue()[0].getMaxReadLength(); - - if ( e.getValue().length == 1 || e.getValue().length == 2 && e.getValue()[1] == null ) { - totalReads_unpaired += e.getValue()[0].getReadCount(); // single end lane - } else { - totalReads_1 += e.getValue()[0].getReadCount(); - totalReads_2 += e.getValue()[1].getReadCount(); - if ( e.getValue()[1].getMaxReadLength() > maxLength ) maxLength = e.getValue()[1].getMaxReadLength(); - } - - columns.add(e.getKey()); - } - - if ( summaryReport ) { - if ( totalReads_1 == 0 && totalReads_2 != 0) { - out.println(" End 1: No reads"); - if ( HTML ) out.println("
"); - } - if ( totalReads_2 == 0 && totalReads_1 != 0 ) { - out.println(" End 2: No reads"); - if ( HTML ) out.println("
"); - } - if ( totalReads_1 == 0 && totalReads_2 == 0 && totalReads_unpaired == 0 ) { - out.println(" No reads found."); - if ( HTML ) out.println("
"); - } - } - - if ( totalReads_1 == 0 && totalReads_2 == 0 && totalReads_unpaired == 0 ) return; - - try { - BufferedWriter w = new BufferedWriter(new FileWriter(f)); - - w.write("cycle"); - - for( String col : columns ) { - CycleStats[] data = m.get(col); - if ( summaryReport ) { - out.print(" "); - out.print(col); - } - - CycleStats end1 = data[0]; - int minL = ( end1 == null ? 0 : end1.getMinReadLength() ); - int maxL = ( end1 == null ? 0 : end1.getMaxReadLength() ); - - if ( data.length == 2 && data[1] != null ) { - if ( summaryReport ) { - out.println(": paired"); - if ( HTML ) out.println("
"); - out.println(" Reads analyzed:"); - if ( HTML ) out.println("
"); - } - CycleStats end2 = data[1]; - - out.print( " End 1: "+ ( end1 == null ? 0 : end1.getReadCount()) ); - if ( minL == maxL ) out.println("; read length = "+minL); - else out.println("; WARNING: variable read length = "+minL+"-"+maxL); - if ( HTML ) out.println("
"); - - out.print( " End 2: "+ ( end2 == null ? 0 : end2.getReadCount()) ); - minL = ( end2 == null ? 0 : end2.getMinReadLength() ); - maxL = ( end2 == null ? 0 : end2.getMaxReadLength() ); - if ( minL == maxL ) out.println("; read length = "+minL); - else out.println("; WARNING: variable read length = "+minL+"-"+maxL); - if ( HTML ) out.println("
"); - } - else { - out.println(": unpaired"); - if ( HTML ) out.println("
"); - out.print( " Reads analyzed: "+ ( end1 == null ? 0 : end1.getReadCount()) ); - if ( minL == maxL ) out.println("; read length = "+minL); - else out.println("; WARNING: variable read length = "+minL+"-"+maxL); - if ( HTML ) out.println("
"); - } - - w.write('\t') ; - w.write(col); - if ( data.length == 1 || data.length == 2 && data[1] == null ) { - w.write(".unpaired"); - w.write('\t'); - w.write(col); - w.write(".unpaired.stddev"); - } else { - w.write(".end1"); - w.write('\t'); - w.write(col); - w.write(".end1.stddev"); - w.write('\t') ; - w.write(col); - w.write(".end2"); - w.write('\t'); - w.write(col); - w.write(".end2.stddev"); - } - } - - w.write('\n'); - - int cycle = 0; - - Map> problems = new HashMap>(); - - while ( cycle < maxLength ) { - w.write(Integer.toString(cycle+1)); - for ( String col : columns ) { - - CycleStats[] data = m.get(col); - CycleStats end1 = data[0]; - w.write('\t'); - if ( end1 == null || cycle >= end1.getMaxReadLength() ) w.write(".\t."); - else { - double aq = end1.getCycleQualAverage(cycle); - w.write(String.format("%.4f\t%.4f",aq,end1.getCycleQualStdDev(cycle))); - recordProblem(aq,cycle, problems,col+".End1"); - } - if ( data.length > 1 && data[1] != null ) { - w.write('\t'); - CycleStats end2 = data[1]; - if ( end2 == null || cycle >= end2.getMaxReadLength() ) w.write(".\t."); - else { - double aq = end2.getCycleQualAverage(cycle); - w.write(String.format("%.4f\t%.4f",aq,end2.getCycleQualStdDev(cycle))); - recordProblem(aq,cycle, problems,col+".End2"); - } - } - } - w.write('\n'); - cycle++; - } - w.close(); - - if ( HTML ) out.println("
"); - - if ( HTML ) out.println("
"); - out.println("\nOUTCOME (threshold at Q="+QTHRESHOLD+"):"); - if ( HTML ) out.println("
"); - for ( String col : columns ) { - List lp = problems.get(col+".End1"); - out.print(" "+col+" End1:"); - if ( lp == null ) { - out.print(" GOOD"); - } else { - for ( PrimitivePair.Int p : lp ) { - out.print(" "+(p.first+1)+"-"); - if ( p.second >= 0 ) out.print((p.second+1)); - else out.print("END"); - } - } - out.println(); - if ( HTML ) out.println("
"); - - lp = problems.get(col+".End2"); - out.print(" "+col+" End2:"); - if ( lp == null ) { - out.print(" GOOD"); - } else { - for ( PrimitivePair.Int p : lp ) { - out.print(" "+(p.first+1)+"-"); - if ( p.second >= 0 ) out.print(p.second); - else out.print("END"); - } - } - out.println(); - if ( HTML ) out.println("
"); - } - - } catch (IOException ioe) { - throw new UserException.CouldNotCreateOutputFile(f, "Failed to write report", ioe); - } - } - - - private void recordProblem(double q, int cycle, Map> problems, String name) { - - PrimitivePair.Int p = null; - List lp = null; - if ( q < QTHRESHOLD ) { // there is a problem - if ( ! problems.containsKey(name) ) { - lp = new ArrayList(); - p = new PrimitivePair.Int(cycle,-1); - lp.add(p); - problems.put(name,lp); - } else { - lp = problems.get(name); - p = lp.get(lp.size()-1); - } - if ( p.second != -1 ) { // if we are not already inside a run of bad qual bases - lp.add(new PrimitivePair.Int(cycle,-1)); // start new run - } - } else { // good base - if ( problems.containsKey(name) ) { // only if we had problem intervals at all, we need to check if the last one needs to be closed - lp = problems.get(name); - p = lp.get(lp.size()-1); - if ( p.second == -1 ) p.second = cycle - 1; - } - } - } - - - static class CycleStats { - private long readCount = 0; - private double[] cycleQualsAv = null; - private double[] cycleQualsSd = null; - private int minL = 1000000000; // read min. length - private int maxL = 0; // read max. length - - public CycleStats(int N) { - readCount = 0; - cycleQualsAv = new double[N]; - cycleQualsSd = new double[N]; - } - - public void add(byte[] quals) { - if ( quals.length > cycleQualsAv.length ) - throw new UserException("A read of length "+quals.length+" encountered, which exceeds specified maximum read length"); - if ( quals.length > maxL ) maxL = quals.length; - if ( quals.length < minL ) minL = quals.length; - readCount++; - for ( int i = 0 ; i < quals.length ; i++ ) { - // NOTE: in the update equaltions below, there is no need to check if readCount == 1 (i.e. - // we are initializing with the very first record) or not. 
Indeed, the arrays are initialized with - // 0; when the very first value arrives, readCount is 1 and cycleQuals[i] gets set to quals[i] (correct!); - // this will also make the second term in the update equation for Sd (quals[i]-cycleQualsAv[i]) equal - // to 0, so Sd will be initially set to 0. - double oldAvg = cycleQualsAv[i]; // save old mean, will need it for calculation of the variance - cycleQualsAv[i] += ( quals[i] - cycleQualsAv[i] ) / readCount; // update mean - cycleQualsSd[i] += ( quals[i] - oldAvg ) * ( quals[i] - cycleQualsAv[i] ); - } - } - - public long getReadCount() { return readCount; } - public int getMaxReadLength() { return maxL; } - public int getMinReadLength() { return minL; } -// long [] getCycleQualSums() { return cycleQuals; } -// long getCycleQualSum(int i) { return cycleQuals[i]; } - double getCycleQualAverage(int i) { return cycleQualsAv[i]; } - double getCycleQualStdDev(int i) { return Math.sqrt( cycleQualsSd[i]/(readCount-1) ); } - } -} +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.*; +import java.util.*; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Apr 9, 2010 + * Time: 12:16:41 PM + * To change this template use File | Settings | File Templates. + */ + +/** + * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. + * Can also count the number of reads matching a given criterion using read filters (see the + * --read-filter command line argument). Simplest example of a read-backed analysis. 
+ */ +@Requires({DataSource.READS}) +public class CycleQualityWalker extends ReadWalker { + @Output + protected PrintStream out; + + @Argument(fullName="mappedOnly", shortName="mo", doc="when this flag is set (default), statistics will be collected "+ + "on mapped reads only, while unmapped reads will be discarded", required=false) + protected boolean MAPPED_ONLY = true; + @Argument(fullName="maxReadLength", shortName="rl", doc="maximum read length", required=false) + protected int MAX_READ_LENGTH = 500; + @Argument(fullName="out_prefix",shortName="p",doc="prefix for output report and statistics files",required=true) + protected String PREFIX = null; +// @Argument(fullName="html",shortName="html",doc="produce html-formatted output (starting with h3-level tags) rather than plain text",required=false) + protected boolean HTML = false; + @Argument(fullName="qualThreshold", shortName="Q",doc="flag as problematic all cycles with av. qualities below the threshold (applies only to the generated report)",required=false) + protected double QTHRESHOLD = 10.0; + @Argument(fullName="useBothQualities",shortName="bothQ",required=false,doc="Generate statistics both for currently set and for "+ + "original base qualities (OQ tag, must be present in the bam); two separate data files will be generated.") + protected boolean ASSESS_BOTH_QUALS = false; + + private Map cyclesByLaneMap = null; + private Map cyclesByLibraryMap = null; + private Map cyclesByLaneMapOrig = null; + private Map cyclesByLibraryMapOrig = null; + + public void initialize() { + if ( PREFIX == null ) throw new ReviewedStingException("Prefix for output file(s) must be specified"); + cyclesByLaneMap = new HashMap(); + cyclesByLibraryMap = new HashMap(); + cyclesByLaneMapOrig = new HashMap(); + cyclesByLibraryMapOrig = new HashMap(); + } + + + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + + if ( AlignmentUtils.isReadUnmapped(read) ) return 0; + + 
SAMReadGroupRecord rg = read.getReadGroup(); + + if ( rg == null ) throw new UserException.ReadMissingReadGroup(read); + + String lane = read.getReadGroup().getPlatformUnit(); + String library = read.getReadGroup().getLibrary(); + + if ( lane == null ) throw new UserException.MalformedBAM(read, "Read "+read.getReadName()+" has no platform unit information"); + if ( library == null ) throw new UserException.MalformedBAM(read, "Read "+read.getReadName()+" has no library information"); + + int end = 0; + + if ( read.getReadPairedFlag() ) { + + if ( read.getFirstOfPairFlag() ) { + if ( read.getSecondOfPairFlag() ) + throw new UserException.MalformedBAM(read, "Read "+read.getReadName()+" has conflicting first/second in pair attributes"); + end = 1; + } else { + if ( ! read.getSecondOfPairFlag() ) + throw new UserException.MalformedBAM(read, "Read "+read.getReadName()+" has conflicting first/second in pair attributes"); + end = 2; + } + } + + CycleStats[] byLane = cyclesByLaneMap.get(lane); + CycleStats[] byLib = cyclesByLibraryMap.get(library); + + //byte [] quals = USE_ORIGINAL_QUALS ? 
AlignmentUtils.getOriginalQualsInCycleOrder(read) : AlignmentUtils.getQualsInCycleOrder(read); + + byte [] quals = AlignmentUtils.getQualsInCycleOrder(read); + + // if end == 0 (single end lane), we allocate array of length 1, otherwise we need two + // elements in the array in order to be able to collect statistics for each end in the pair independently + if ( byLane == null ) cyclesByLaneMap.put(lane,byLane = new CycleStats[(end==0?1:2)]); + if ( byLib == null ) cyclesByLibraryMap.put(library, byLib =new CycleStats[2]); + + if ( end != 0 ) end--; // we will now use 'end' as index into the array of stats + + if ( byLane[end] == null ) byLane[end] = new CycleStats(MAX_READ_LENGTH); + if ( byLib[end] == null ) byLib[end] =new CycleStats(MAX_READ_LENGTH); + byLane[end].add(quals); + byLib[end].add(quals); + + return 1; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + public Integer reduceInit() { + return 0; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public Integer reduce(Integer value, Integer sum) { + return sum.intValue()+value.intValue(); //To change body of implemented methods use File | Settings | File Templates. + } + + public void onTraversalDone(Integer result) { + if ( HTML ) { + out.println("

Cycle Quality QC

\n"); + out.println("File(s) analyzed:
"); + for ( String fileName : getToolkit().getArguments().samFiles) out.println(fileName+"
"); + out.println("
"); + } + if ( HTML ) out.println("

"); + out.println("\n"+result+" reads analyzed\n"); + if ( HTML ) out.println("

"); + out.println("by platform unit:"); + if ( HTML ) out.println("
"); + report2(cyclesByLaneMap, new File(PREFIX+".byLane.txt"),true); + out.println(); + if ( HTML ) out.println("

"); + out.println("\nby library:"); + if ( HTML ) out.println("
"); + report2(cyclesByLibraryMap, new File(PREFIX+".byLibrary.txt"),true); + out.println(); + if ( HTML ) out.println("

"); + } + + + + private void report2(Map m, File f,boolean summaryReport) { + long totalReads_1 =0; + long totalReads_2 =0; + long totalReads_unpaired = 0; + SortedSet columns = new TreeSet(); + int maxLength = 0; // maximum read length across all lanes/read ends analyzed + + for( Map.Entry e : m.entrySet() ) { + if ( e.getValue()[0].getMaxReadLength() > maxLength ) maxLength = e.getValue()[0].getMaxReadLength(); + + if ( e.getValue().length == 1 || e.getValue().length == 2 && e.getValue()[1] == null ) { + totalReads_unpaired += e.getValue()[0].getReadCount(); // single end lane + } else { + totalReads_1 += e.getValue()[0].getReadCount(); + totalReads_2 += e.getValue()[1].getReadCount(); + if ( e.getValue()[1].getMaxReadLength() > maxLength ) maxLength = e.getValue()[1].getMaxReadLength(); + } + + columns.add(e.getKey()); + } + + if ( summaryReport ) { + if ( totalReads_1 == 0 && totalReads_2 != 0) { + out.println(" End 1: No reads"); + if ( HTML ) out.println("
"); + } + if ( totalReads_2 == 0 && totalReads_1 != 0 ) { + out.println(" End 2: No reads"); + if ( HTML ) out.println("
"); + } + if ( totalReads_1 == 0 && totalReads_2 == 0 && totalReads_unpaired == 0 ) { + out.println(" No reads found."); + if ( HTML ) out.println("
"); + } + } + + if ( totalReads_1 == 0 && totalReads_2 == 0 && totalReads_unpaired == 0 ) return; + + try { + BufferedWriter w = new BufferedWriter(new FileWriter(f)); + + w.write("cycle"); + + for( String col : columns ) { + CycleStats[] data = m.get(col); + if ( summaryReport ) { + out.print(" "); + out.print(col); + } + + CycleStats end1 = data[0]; + int minL = ( end1 == null ? 0 : end1.getMinReadLength() ); + int maxL = ( end1 == null ? 0 : end1.getMaxReadLength() ); + + if ( data.length == 2 && data[1] != null ) { + if ( summaryReport ) { + out.println(": paired"); + if ( HTML ) out.println("
"); + out.println(" Reads analyzed:"); + if ( HTML ) out.println("
"); + } + CycleStats end2 = data[1]; + + out.print( " End 1: "+ ( end1 == null ? 0 : end1.getReadCount()) ); + if ( minL == maxL ) out.println("; read length = "+minL); + else out.println("; WARNING: variable read length = "+minL+"-"+maxL); + if ( HTML ) out.println("
"); + + out.print( " End 2: "+ ( end2 == null ? 0 : end2.getReadCount()) ); + minL = ( end2 == null ? 0 : end2.getMinReadLength() ); + maxL = ( end2 == null ? 0 : end2.getMaxReadLength() ); + if ( minL == maxL ) out.println("; read length = "+minL); + else out.println("; WARNING: variable read length = "+minL+"-"+maxL); + if ( HTML ) out.println("
"); + } + else { + out.println(": unpaired"); + if ( HTML ) out.println("
"); + out.print( " Reads analyzed: "+ ( end1 == null ? 0 : end1.getReadCount()) ); + if ( minL == maxL ) out.println("; read length = "+minL); + else out.println("; WARNING: variable read length = "+minL+"-"+maxL); + if ( HTML ) out.println("
"); + } + + w.write('\t') ; + w.write(col); + if ( data.length == 1 || data.length == 2 && data[1] == null ) { + w.write(".unpaired"); + w.write('\t'); + w.write(col); + w.write(".unpaired.stddev"); + } else { + w.write(".end1"); + w.write('\t'); + w.write(col); + w.write(".end1.stddev"); + w.write('\t') ; + w.write(col); + w.write(".end2"); + w.write('\t'); + w.write(col); + w.write(".end2.stddev"); + } + } + + w.write('\n'); + + int cycle = 0; + + Map> problems = new HashMap>(); + + while ( cycle < maxLength ) { + w.write(Integer.toString(cycle+1)); + for ( String col : columns ) { + + CycleStats[] data = m.get(col); + CycleStats end1 = data[0]; + w.write('\t'); + if ( end1 == null || cycle >= end1.getMaxReadLength() ) w.write(".\t."); + else { + double aq = end1.getCycleQualAverage(cycle); + w.write(String.format("%.4f\t%.4f",aq,end1.getCycleQualStdDev(cycle))); + recordProblem(aq,cycle, problems,col+".End1"); + } + if ( data.length > 1 && data[1] != null ) { + w.write('\t'); + CycleStats end2 = data[1]; + if ( end2 == null || cycle >= end2.getMaxReadLength() ) w.write(".\t."); + else { + double aq = end2.getCycleQualAverage(cycle); + w.write(String.format("%.4f\t%.4f",aq,end2.getCycleQualStdDev(cycle))); + recordProblem(aq,cycle, problems,col+".End2"); + } + } + } + w.write('\n'); + cycle++; + } + w.close(); + + if ( HTML ) out.println("
"); + + if ( HTML ) out.println("
"); + out.println("\nOUTCOME (threshold at Q="+QTHRESHOLD+"):"); + if ( HTML ) out.println("
"); + for ( String col : columns ) { + List lp = problems.get(col+".End1"); + out.print(" "+col+" End1:"); + if ( lp == null ) { + out.print(" GOOD"); + } else { + for ( PrimitivePair.Int p : lp ) { + out.print(" "+(p.first+1)+"-"); + if ( p.second >= 0 ) out.print((p.second+1)); + else out.print("END"); + } + } + out.println(); + if ( HTML ) out.println("
"); + + lp = problems.get(col+".End2"); + out.print(" "+col+" End2:"); + if ( lp == null ) { + out.print(" GOOD"); + } else { + for ( PrimitivePair.Int p : lp ) { + out.print(" "+(p.first+1)+"-"); + if ( p.second >= 0 ) out.print(p.second); + else out.print("END"); + } + } + out.println(); + if ( HTML ) out.println("
"); + } + + } catch (IOException ioe) { + throw new UserException.CouldNotCreateOutputFile(f, "Failed to write report", ioe); + } + } + + + private void recordProblem(double q, int cycle, Map> problems, String name) { + + PrimitivePair.Int p = null; + List lp = null; + if ( q < QTHRESHOLD ) { // there is a problem + if ( ! problems.containsKey(name) ) { + lp = new ArrayList(); + p = new PrimitivePair.Int(cycle,-1); + lp.add(p); + problems.put(name,lp); + } else { + lp = problems.get(name); + p = lp.get(lp.size()-1); + } + if ( p.second != -1 ) { // if we are not already inside a run of bad qual bases + lp.add(new PrimitivePair.Int(cycle,-1)); // start new run + } + } else { // good base + if ( problems.containsKey(name) ) { // only if we had problem intervals at all, we need to check if the last one needs to be closed + lp = problems.get(name); + p = lp.get(lp.size()-1); + if ( p.second == -1 ) p.second = cycle - 1; + } + } + } + + + static class CycleStats { + private long readCount = 0; + private double[] cycleQualsAv = null; + private double[] cycleQualsSd = null; + private int minL = 1000000000; // read min. length + private int maxL = 0; // read max. length + + public CycleStats(int N) { + readCount = 0; + cycleQualsAv = new double[N]; + cycleQualsSd = new double[N]; + } + + public void add(byte[] quals) { + if ( quals.length > cycleQualsAv.length ) + throw new UserException("A read of length "+quals.length+" encountered, which exceeds specified maximum read length"); + if ( quals.length > maxL ) maxL = quals.length; + if ( quals.length < minL ) minL = quals.length; + readCount++; + for ( int i = 0 ; i < quals.length ; i++ ) { + // NOTE: in the update equaltions below, there is no need to check if readCount == 1 (i.e. + // we are initializing with the very first record) or not. 
Indeed, the arrays are initialized with + // 0; when the very first value arrives, readCount is 1 and cycleQuals[i] gets set to quals[i] (correct!); + // this will also make the second term in the update equation for Sd (quals[i]-cycleQualsAv[i]) equal + // to 0, so Sd will be initially set to 0. + double oldAvg = cycleQualsAv[i]; // save old mean, will need it for calculation of the variance + cycleQualsAv[i] += ( quals[i] - cycleQualsAv[i] ) / readCount; // update mean + cycleQualsSd[i] += ( quals[i] - oldAvg ) * ( quals[i] - cycleQualsAv[i] ); + } + } + + public long getReadCount() { return readCount; } + public int getMaxReadLength() { return maxL; } + public int getMinReadLength() { return minL; } +// long [] getCycleQualSums() { return cycleQuals; } +// long getCycleQualSum(int i) { return cycleQuals[i]; } + double getCycleQualAverage(int i) { return cycleQualsAv[i]; } + double getCycleQualStdDev(int i) { return Math.sqrt( cycleQualsSd[i]/(readCount-1) ); } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java index d3b992cb5..ac0b3e7d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/PrintLocusContextWalker.java @@ -1,12 +1,12 @@ package org.broadinstitute.sting.gatk.walkers.qc; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; import java.util.Arrays; @@ -40,7 +40,7 @@ public class 
PrintLocusContextWalker extends LocusWalker reads ) { + private String[] getReadNames( List reads ) { String[] readNames = new String[ reads.size() ]; for( int i = 0; i < reads.size(); i++ ) { readNames[i] = String.format("%nname = %s, start = %d, end = %d", reads.get(i).getReadName(), reads.get(i).getAlignmentStart(), reads.get(i).getAlignmentEnd()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStatsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStatsWalker.java index 908e389a8..27f9d7b6d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStatsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStatsWalker.java @@ -1,142 +1,142 @@ -/* - * Copyright (c) 2009 The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.ReadWalker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.AlignmentUtils; - -import java.io.PrintStream; -import java.util.Arrays; - -/** - * User: depristo - * Date: May 5, 2010 - * Time: 12:16:41 PM - */ - -/** - * Walks over the input reads, printing out statistics about the read length, number of clipping events, and length - * of the clipping to the output stream. 
- */ -@Requires({DataSource.READS}) -public class ReadClippingStatsWalker extends ReadWalker { - @Output - protected PrintStream out; - - @Argument(fullName="mappedOnly", shortName="mo", doc="when this flag is set (default), statistics will be collected "+ - "on mapped reads only, while unmapped reads will be discarded", required=false) - protected boolean MAPPED_ONLY = true; - - @Argument(fullName="skip", shortName="skip", doc="When provided, only every skip reads are analyzed", required=false) - protected int SKIP = 1; - -// public void initialize() { -// -// } - - public class ReadClippingInfo { - SAMReadGroupRecord rg; - int readLength, nClippingEvents, nClippedBases; - } - - public ReadClippingInfo map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { - if ( AlignmentUtils.isReadUnmapped(read) && MAPPED_ONLY) - return null; - - ReadClippingInfo info = new ReadClippingInfo(); - info.rg = read.getReadGroup(); - - if ( info.rg == null ) throw new UserException.ReadMissingReadGroup(read); - - for ( CigarElement elt : read.getCigar().getCigarElements() ) { - if ( elt.getOperator() != CigarOperator.N ) - - switch ( elt.getOperator()) { - case H : // ignore hard clips - case S : // soft clip - info.nClippingEvents++; - info.nClippedBases += elt.getLength(); - // note the fall through here - case M : - case D : // deletion w.r.t. the reference - case P : // ignore pads - case I : // insertion w.r.t. the reference - info.readLength += elt.getLength(); // Unless we have a reference skip, the read gets longer - break; - case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning) - break; - default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + elt.getOperator()); - } - } - - return info; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Provide an initial value for reduce computations. 
- * - * @return Initial value of reduce. - */ - public Integer reduceInit() { - out.println(Utils.join(" \t", Arrays.asList("ReadGroup", "ReadLength", "NClippingEvents", "NClippedBases", "PercentClipped"))); - return 0; - } - - /** - * Reduces a single map with the accumulator provided as the ReduceType. - * - * @param info result of the map. - * @param sum accumulator for the reduce. - * @return accumulator with result of the map taken into account. - */ - public Integer reduce(ReadClippingInfo info, Integer sum) { - if ( info != null ) { - if ( sum % SKIP == 0 ) { - String id = info.rg.getReadGroupId(); - out.printf("%s\t %d\t %d\t %d\t %.2f%n", - id, info.readLength, info.nClippingEvents, info.nClippedBases, - 100.0 * MathUtils.ratio(info.nClippedBases, info.readLength)); - } - return sum + 1; //To change body of implemented methods use File | Settings | File Templates. - } else { - return sum; - } - } - - public void onTraversalDone(Integer result) { - - } +/* + * Copyright (c) 2009 The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; +import java.util.Arrays; + +/** + * User: depristo + * Date: May 5, 2010 + * Time: 12:16:41 PM + */ + +/** + * Walks over the input reads, printing out statistics about the read length, number of clipping events, and length + * of the clipping to the output stream. 
+ */ +@Requires({DataSource.READS}) +public class ReadClippingStatsWalker extends ReadWalker { + @Output + protected PrintStream out; + + @Argument(fullName="mappedOnly", shortName="mo", doc="when this flag is set (default), statistics will be collected "+ + "on mapped reads only, while unmapped reads will be discarded", required=false) + protected boolean MAPPED_ONLY = true; + + @Argument(fullName="skip", shortName="skip", doc="When provided, only every skip reads are analyzed", required=false) + protected int SKIP = 1; + +// public void initialize() { +// +// } + + public class ReadClippingInfo { + SAMReadGroupRecord rg; + int readLength, nClippingEvents, nClippedBases; + } + + public ReadClippingInfo map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + if ( AlignmentUtils.isReadUnmapped(read) && MAPPED_ONLY) + return null; + + ReadClippingInfo info = new ReadClippingInfo(); + info.rg = read.getReadGroup(); + + if ( info.rg == null ) throw new UserException.ReadMissingReadGroup(read); + + for ( CigarElement elt : read.getCigar().getCigarElements() ) { + if ( elt.getOperator() != CigarOperator.N ) + + switch ( elt.getOperator()) { + case H : // ignore hard clips + case S : // soft clip + info.nClippingEvents++; + info.nClippedBases += elt.getLength(); + // note the fall through here + case M : + case D : // deletion w.r.t. the reference + case P : // ignore pads + case I : // insertion w.r.t. the reference + info.readLength += elt.getLength(); // Unless we have a reference skip, the read gets longer + break; + case N : // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + break; + default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + elt.getOperator()); + } + } + + return info; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Provide an initial value for reduce computations. 
+ * + * @return Initial value of reduce. + */ + public Integer reduceInit() { + out.println(Utils.join(" \t", Arrays.asList("ReadGroup", "ReadLength", "NClippingEvents", "NClippedBases", "PercentClipped"))); + return 0; + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param info result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public Integer reduce(ReadClippingInfo info, Integer sum) { + if ( info != null ) { + if ( sum % SKIP == 0 ) { + String id = info.rg.getReadGroupId(); + out.printf("%s\t %d\t %d\t %d\t %.2f%n", + id, info.readLength, info.nClippingEvents, info.nClippedBases, + 100.0 * MathUtils.ratio(info.nClippedBases, info.readLength)); + } + return sum + 1; //To change body of implemented methods use File | Settings | File Templates. + } else { + return sum; + } + } + + public void onTraversalDone(Integer result) { + + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadValidationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadValidationWalker.java index fa1bb4d55..4425f92c4 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadValidationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadValidationWalker.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; @@ -64,21 +65,23 @@ public class ReadValidationWalker extends ReadWalker { /** * The reads filter function. 
+ * * @param ref the reference bases that correspond to our read, if a reference was provided * @param read the read itself, as a SAMRecord * @return true if the read passes the filter, false if it doesn't */ - public boolean filter(ReferenceContext ref, SAMRecord read) { + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { return true; } /** * The reads map function. + * * @param ref the reference bases that correspond to our read, if a reference was provided * @param read the read itself, as a SAMRecord * @return the read itself */ - public SAMRecord map( ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker ) { + public SAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { return read; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java index ca30d875b..cd17e4592 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java @@ -78,7 +78,7 @@ public class ValidatingPileupWalker extends LocusWalker DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS); + private final static EnumSet FLOW_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.LS454, NGSPlatform.ION_TORRENT); + // Initialize any member variables using the command-line arguments passed to the walkers public void initialize( final RecalibrationArgumentCollection RAC ) { if( RAC.DEFAULT_PLATFORM != null ) { @@ -58,129 +64,15 @@ public class CycleCovariate implements StandardCovariate { } } - /* - // Used to pick out the covariate's value from attributes of the read - public final Comparable getValue( final SAMRecord read, final int offset ) { - - int cycle = 1; - - //----------------------------- - // 
ILLUMINA and SOLID - //----------------------------- - - if( read.getReadGroup().getPlatform().equalsIgnoreCase( "ILLUMINA" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "SLX" ) || // Some bams have "illumina" and others have "SLX" - read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) || read.getReadGroup().getPlatform().equalsIgnoreCase( "ABI_SOLID" )) { // Some bams have "solid" and others have "ABI_SOLID" - cycle = offset + 1; - if( read.getReadNegativeStrandFlag() ) { - cycle = read.getReadLength() - offset; - } - } - - //----------------------------- - // 454 - //----------------------------- - - else if( read.getReadGroup().getPlatform().contains( "454" ) ) { // Some bams have "LS454" and others have just "454" - final byte[] bases = read.getReadBases(); - - // BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change - // For example, AAAAAAA was probably read in two flow cycles but here we count it as one - if( !read.getReadNegativeStrandFlag() ) { // Forward direction - int iii = 0; - while( iii <= offset ) - { - while( iii <= offset && bases[iii] == (byte)'T' ) { iii++; } - while( iii <= offset && bases[iii] == (byte)'A' ) { iii++; } - while( iii <= offset && bases[iii] == (byte)'C' ) { iii++; } - while( iii <= offset && bases[iii] == (byte)'G' ) { iii++; } - if( iii <= offset ) { cycle++; } - if( iii <= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii++; } - - } - } else { // Negative direction - int iii = bases.length-1; - while( iii >= offset ) - { - while( iii >= offset && bases[iii] == (byte)'T' ) { iii--; } - while( iii >= offset && bases[iii] == (byte)'A' ) { iii--; } - while( iii >= offset && bases[iii] == (byte)'C' ) { iii--; } - while( iii >= offset && bases[iii] == (byte)'G' ) { iii--; } - if( iii >= offset ) { cycle++; } - if( iii >= offset && !BaseUtils.isRegularBase(bases[iii]) ) { iii--; } - } - } - } - - 
//----------------------------- - // SOLID (unused), only to be used in conjunction with PrimerRoundCovariate - //----------------------------- - - //else if( read.getReadGroup().getPlatform().equalsIgnoreCase( "SOLID" ) ) { - // // The ligation cycle according to http://www3.appliedbiosystems.com/cms/groups/mcb_marketing/documents/generaldocuments/cms_057511.pdf - // int pos = offset + 1; - // if( read.getReadNegativeStrandFlag() ) { - // pos = read.getReadLength() - offset; - // } - // cycle = pos / 5; // integer division - //} - - //----------------------------- - // UNRECOGNIZED PLATFORM - //----------------------------- - - else { // Platform is unrecognized so revert to the default platform but warn the user first - if( defaultPlatform != null) { // The user set a default platform - if( !warnedUserBadPlatform ) { - Utils.warnUser( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + - "Defaulting to platform = " + defaultPlatform + "." ); - } - warnedUserBadPlatform = true; - - read.getReadGroup().setPlatform( defaultPlatform ); - return getValue( read, offset ); // A recursive call - } else { // The user did not set a default platform - throw new StingException( "Platform string (" + read.getReadGroup().getPlatform() + ") unrecognized in CycleCovariate. " + - "No default platform specified. Users must set the default platform using the --default_platform argument." ); - } - } - - // Differentiate between first and second of pair. - // The sequencing machine cycle keeps incrementing for the second read in a pair. So it is possible for a read group - // to have an error affecting quality at a particular cycle on the first of pair which carries over to the second of pair. - // Therefore the cycle covariate must differentiate between first and second of pair reads. 
- // This effect can not be corrected by pulling out the first of pair and second of pair flags into a separate covariate because - // the current sequential model would consider the effects independently instead of jointly. - if( read.getReadPairedFlag() && read.getSecondOfPairFlag() ) { - cycle *= -1; - } - - return cycle; - } - */ - - // todo -- this should be put into a common place in the code base - private static List PACBIO_NAMES = Arrays.asList("PACBIO"); - private static List ILLUMINA_NAMES = Arrays.asList("ILLUMINA", "SLX", "SOLEXA"); - private static List SOLID_NAMES = Arrays.asList("SOLID"); - private static List LS454_NAMES = Arrays.asList("454"); - - private static boolean isPlatform(SAMRecord read, List names) { - String pl = read.getReadGroup().getPlatform().toUpperCase(); - for ( String name : names ) - if ( pl.contains( name ) ) - return true; - return false; - } - // Used to pick out the covariate's value from attributes of the read public void getValues(SAMRecord read, Comparable[] comparable) { //----------------------------- - // ILLUMINA and SOLID + // Illumina, Solid, PacBio, and Complete Genomics //----------------------------- - - if( isPlatform(read, ILLUMINA_NAMES) || isPlatform(read, SOLID_NAMES) || isPlatform(read, PACBIO_NAMES)) { + final NGSPlatform ngsPlatform = ((GATKSAMRecord)read).getNGSPlatform(); + if( DISCRETE_CYCLE_PLATFORMS.contains(ngsPlatform) ) { final int init; final int increment; if( !read.getReadNegativeStrandFlag() ) { @@ -222,7 +114,11 @@ public class CycleCovariate implements StandardCovariate { cycle += increment; } } - else if ( isPlatform(read, LS454_NAMES) ) { // Some bams have "LS454" and others have just "454" + + //----------------------------- + // 454 and Ion Torrent + //----------------------------- + else if( FLOW_CYCLE_PLATFORMS.contains(ngsPlatform) ) { final int readLength = read.getReadLength(); final byte[] bases = read.getReadBases(); @@ -267,8 +163,6 @@ public class CycleCovariate implements 
StandardCovariate { else { throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform()); } - - } // Used to get the covariate's value from input csv file in TableRecalibrationWalker diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java index ac25d4f13..a0c928afa 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; @@ -228,8 +229,7 @@ public class RecalDataManager { * @param RAC The list of shared command line arguments */ public static void parseSAMRecord( final SAMRecord read, final RecalibrationArgumentCollection RAC ) { - - SAMReadGroupRecord readGroup = read.getReadGroup(); + GATKSAMReadGroupRecord readGroup = ((GATKSAMRecord)read).getReadGroup(); // If there are no read groups we have to default to something, and that something could be specified by the user using command line arguments if( readGroup == null ) { @@ -241,18 +241,17 @@ public class RecalDataManager { warnUserNullReadGroup = true; } // There is no readGroup so defaulting to these values - readGroup = new SAMReadGroupRecord( RAC.DEFAULT_READ_GROUP ); + readGroup = new GATKSAMReadGroupRecord( RAC.DEFAULT_READ_GROUP ); readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); ((GATKSAMRecord)read).setReadGroup( readGroup ); } else { - throw new 
UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() + - " Users must set both the default read group using the --default_read_group argument and the default platform using the --default_platform argument." ); + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no read group. First observed at read with name = " + read.getReadName() ); } } if( RAC.FORCE_READ_GROUP != null && !readGroup.getReadGroupId().equals(RAC.FORCE_READ_GROUP) ) { // Collapse all the read groups into a single common String provided by the user final String oldPlatform = readGroup.getPlatform(); - readGroup = new SAMReadGroupRecord( RAC.FORCE_READ_GROUP ); + readGroup = new GATKSAMReadGroupRecord( RAC.FORCE_READ_GROUP ); readGroup.setPlatform( oldPlatform ); ((GATKSAMRecord)read).setReadGroup( readGroup ); } @@ -271,8 +270,7 @@ public class RecalDataManager { } readGroup.setPlatform( RAC.DEFAULT_PLATFORM ); } else { - throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. First observed at read with name = " + read.getReadName() + - " Users must set the default platform using the --default_platform argument." ); + throw new UserException.MalformedBAM(read, "The input .bam file contains reads with no platform information. 
First observed at read with name = " + read.getReadName() ); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java index f31e2fc5b..75de84cb4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.recalibration; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; /** * Created by IntelliJ IDEA. @@ -41,22 +42,29 @@ public class RecalibrationArgumentCollection { ////////////////////////////////// // Shared Command Line Arguments ////////////////////////////////// + @Hidden @Argument(fullName="default_read_group", shortName="dRG", required=false, doc="If a read has no read group then default to the provided String.") public String DEFAULT_READ_GROUP = null; + @Hidden @Argument(fullName="default_platform", shortName="dP", required=false, doc="If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; + @Hidden @Argument(fullName="force_read_group", shortName="fRG", required=false, doc="If provided, the read group ID of EVERY read will be forced to be the provided String. This is useful to collapse all data into a single read group.") public String FORCE_READ_GROUP = null; + @Hidden @Argument(fullName="force_platform", shortName="fP", required=false, doc="If provided, the platform of EVERY read will be forced to be the provided String. 
Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; + @Hidden @Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false) public int WINDOW_SIZE = 5; /** * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score. */ + @Hidden @Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false) public int HOMOPOLYMER_NBACK = 7; + @Hidden @Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false) public boolean EXCEPTION_IF_NO_TILE = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java index 174e810c2..1ce02a3cf 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java @@ -170,9 +170,9 @@ public class TableRecalibrationWalker extends ReadWalker requestedCovariates = new ArrayList(); // List of covariates to be used in this calculation - private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); - private static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); - private static final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*"); + public static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + public static final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*"); + public static final Pattern COVARIATE_PATTERN = 
Pattern.compile("^ReadGroup,QualityScore,.*"); public static final String EOF_MARKER = "EOF"; private long numReadsWithMalformedColorSpace = 0; @@ -364,11 +364,12 @@ public class TableRecalibrationWalker extends ReadWalkerAdditional Details *
    *
  • - * You should always use -BTI on your VCF track, so that the GATK only looks at the sites on the VCF file. + * You should always use -L on your VCF track, so that the GATK only looks at the sites on the VCF file. * This speeds up the process a lot. *
  • *
  • @@ -165,7 +164,7 @@ import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; * -R human_g1k_v37.fasta * -I myNewTechReads.bam * -alleles handAnnotatedVCF.vcf - * -BTI alleles + * -L handAnnotatedVCF.vcf * * *
  • @@ -179,7 +178,7 @@ import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel; * -R human_g1k_v37.fasta * -I myTruthDataset.bam * -alleles callsToValidate.vcf - * -BTI alleles + * -L callsToValidate.vcf * -bt * -o gav.vcf * @@ -266,8 +265,13 @@ public class GenotypeAndValidateWalker extends RodWalker 0 && context.getBasePileup().getBases().length < minDepth)) { counter.nUncovered = 1L; + if (vcComp.getAttribute("GV").equals("T")) + counter.nAltNotCalled = 1L; + else if (vcComp.getAttribute("GV").equals("F")) + counter.nRefNotCalled = 1L; + else + counter.nNoStatusNotCalled = 1L; + return counter; } @@ -382,7 +398,7 @@ public class GenotypeAndValidateWalker extends RodWalker 0) ? 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt)) : 100; logger.info(String.format("Resulting Truth Table Output\n\n" + - "---------------------------------------------------\n" + - "\t\t|\tALT\t|\tREF\t\n" + - "---------------------------------------------------\n" + - "called alt\t|\t%d\t|\t%d\n" + - "called ref\t|\t%d\t|\t%d\n" + - "---------------------------------------------------\n" + + "------------------------------------------------------------------\n" + + "\t\t|\tALT\t|\tREF\t|\tNo Status\n" + + "------------------------------------------------------------------\n" + + "called alt\t|\t%d\t|\t%d\t|\t%d\n" + + "called ref\t|\t%d\t|\t%d\t|\t%d\n" + + "not called\t|\t%d\t|\t%d\t|\t%d\n" + + "------------------------------------------------------------------\n" + "positive predictive value: %f%%\n" + "negative predictive value: %f%%\n" + - "---------------------------------------------------\n" + + "------------------------------------------------------------------\n" + "sensitivity: %f%%\n" + "specificity: %f%%\n" + - "---------------------------------------------------\n" + + "------------------------------------------------------------------\n" + "not confident: %d\n" + "not covered: %d\n" + - 
"---------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered)); + "------------------------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nNoStatusCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, reduceSum.nNoStatusCalledRef, reduceSum.nAltNotCalled, reduceSum.nRefNotCalled, reduceSum.nNoStatusNotCalled, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered)); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 48cba6a1a..035d8d2ca 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -80,7 +80,7 @@ import java.util.List; * -jar GenomeAnalysisTK.jar * -T ValidationAmplicons * -R /humgen/1kg/reference/human_g1k_v37.fasta - * -BTI ProbeIntervals + * -L:table interval_table.table * -ProbeIntervals:table interval_table.table * -ValidateAlleles:vcf sites_to_validate.vcf * -MaskAlleles:vcf mask_sites.vcf diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java index 2913c97a6..e83434037 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java @@ -110,12 +110,12 @@ public class CountVariants extends VariantEvaluator implements StandardEval { case SNP: nVariantLoci++; nSNPs++; - if 
(vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; break; case MNP: nVariantLoci++; nMNPs++; - if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++; + if (vc1.getAttributeAsBoolean("ISSINGLETON", false)) nSingletons++; break; case INDEL: nVariantLoci++; @@ -130,6 +130,10 @@ public class CountVariants extends VariantEvaluator implements StandardEval { nVariantLoci++; nMixed++; break; + case SYMBOLIC: + // ignore symbolic alleles, but don't fail + // todo - consistent way of treating symbolic alleles thgoughout codebase? + break; default: throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType()); } @@ -137,7 +141,7 @@ public class CountVariants extends VariantEvaluator implements StandardEval { String refStr = vc1.getReference().getBaseString().toUpperCase(); - String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE").toUpperCase() : null; + String aaStr = vc1.hasAttribute("ANCESTRALALLELE") ? vc1.getAttributeAsString("ANCESTRALALLELE", null).toUpperCase() : null; // if (aaStr.equals(".")) { // aaStr = refStr; // } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java index a476a2680..e69dbfb28 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/GenotypePhasingEvaluator.java @@ -219,7 +219,8 @@ public class GenotypePhasingEvaluator extends VariantEvaluator { } public static Double getPQ(Genotype gt) { - return gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); + Double d = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + return d == -1 ? 
null : d; } public static boolean topMatchesTop(AllelePair b1, AllelePair b2) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java index 203c15a85..2d0163206 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/SimpleMetricsByAC.java @@ -120,7 +120,7 @@ public class SimpleMetricsByAC extends VariantEvaluator implements StandardEval if ( eval.hasGenotypes() ) ac = eval.getChromosomeCount(eval.getAlternateAllele(0)); else if ( eval.hasAttribute("AC") ) { - ac = Integer.valueOf(eval.getAttributeAsString("AC")); + ac = eval.getAttributeAsInt("AC", -1); } if ( ac != -1 ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java index 1feb37e01..9b6e145e6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/TiTvVariantEvaluator.java @@ -49,18 +49,14 @@ public class TiTvVariantEvaluator extends VariantEvaluator implements StandardEv else nTv++; } - String refStr = vc.getReference().getBaseString().toUpperCase(); - String aaStr = vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase(); - - if (aaStr != null && !aaStr.equalsIgnoreCase("null") && !aaStr.equals(".")) { - BaseUtils.BaseSubstitutionType aaSubType = BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0]); - - //System.out.println(refStr + " " + vc.getAttributeAsString("ANCESTRALALLELE").toUpperCase() + " " + aaSubType); - - if (aaSubType == 
BaseUtils.BaseSubstitutionType.TRANSITION) { - nTiDerived++; - } else if (aaSubType == BaseUtils.BaseSubstitutionType.TRANSVERSION) { - nTvDerived++; + if (vc.hasAttribute("ANCESTRALALLELE")) { + final String aaStr = vc.getAttributeAsString("ANCESTRALALLELE", "null").toUpperCase(); + if ( ! aaStr.equals(".") ) { + switch ( BaseUtils.SNPSubstitutionType(aaStr.getBytes()[0], vc.getAlternateAllele(0).getBases()[0] ) ) { + case TRANSITION: nTiDerived++; break; + case TRANSVERSION: nTvDerived++; break; + default: break; + } } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java index 307b4f684..3b4967cad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/ValidationReport.java @@ -10,6 +10,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.Collection; import java.util.Set; /** @@ -131,7 +132,7 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { //// System.out.printf(" ac = %d%n", ac); } else - ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY); + ac = vc.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); return ac > 0 ? SiteStatus.POLY : SiteStatus.MONO; } else { return TREAT_ALL_SITES_IN_EVAL_VCF_AS_CALLED ? 
SiteStatus.POLY : SiteStatus.NO_CALL; // we can't figure out what to do @@ -142,8 +143,8 @@ public class ValidationReport extends VariantEvaluator implements StandardEval { public boolean haveDifferentAltAlleles(VariantContext eval, VariantContext comp) { - Set evalAlts = eval.getAlternateAlleles(); - Set compAlts = comp.getAlternateAlleles(); + Collection evalAlts = eval.getAlternateAlleles(); + Collection compAlts = comp.getAlternateAlleles(); if ( evalAlts.size() != compAlts.size() ) { return true; } else { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 3cc22cc52..c7bea93b2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -44,7 +44,7 @@ public class AlleleCount extends VariantStratifier { if (eval != null) { int AC = -1; if ( eval.hasAttribute("AC") && eval.getAttribute("AC") instanceof Integer ) { - AC = eval.getAttributeAsInt("AC"); + AC = eval.getAttributeAsInt("AC", 0); } else if ( eval.isVariant() ) { for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getChromosomeCount(allele)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java index 3d2dda651..cd2b8e475 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java @@ -28,7 +28,7 @@ public class AlleleFrequency extends VariantStratifier { if (eval != null) { try { - relevantStates.add(String.format("%.3f", (5.0 * 
MathUtils.round(eval.getAttributeAsDouble("AF") / 5.0, 3)))); + relevantStates.add(String.format("%.3f", (5.0 * MathUtils.round(eval.getAttributeAsDouble("AF", 0.0) / 5.0, 3)))); } catch (Exception e) { return relevantStates; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java index 3223626c0..91c96e490 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java @@ -90,8 +90,8 @@ public class Degeneracy extends VariantStratifier { Integer frame = null; if (eval.hasAttribute("refseq.functionalClass")) { - aa = eval.getAttributeAsString("refseq.variantAA"); - frame = eval.getAttributeAsInt("refseq.frame"); + aa = eval.getAttributeAsString("refseq.variantAA", null); + frame = eval.getAttributeAsInt("refseq.frame", 0); } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; String key; @@ -99,7 +99,7 @@ public class Degeneracy extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtype = eval.getAttributeAsString(key); + String newtype = eval.getAttributeAsString(key, null); if ( newtype != null && ( type == null || @@ -109,13 +109,13 @@ public class Degeneracy extends VariantStratifier { type = newtype; String aakey = String.format("refseq.variantAA_%d", annotationId); - aa = eval.getAttributeAsString(aakey); + aa = eval.getAttributeAsString(aakey, null); if (aa != null) { String framekey = String.format("refseq.frame_%d", annotationId); if (eval.hasAttribute(framekey)) { - frame = eval.getAttributeAsInt(framekey); + frame = eval.getAttributeAsInt(framekey, 0); } } } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java index 1dc047b5d..f5dcf527a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/FunctionalClass.java @@ -28,7 +28,7 @@ public class FunctionalClass extends VariantStratifier { } - public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { +public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { ArrayList relevantStates = new ArrayList(); relevantStates.add("all"); @@ -38,7 +38,7 @@ public class FunctionalClass extends VariantStratifier { if (eval.hasAttribute("refseq.functionalClass")) { try { - type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass")); + type = FunctionalType.valueOf(eval.getAttributeAsString("refseq.functionalClass", null)); } catch ( Exception e ) {} // don't error out if the type isn't supported } else if (eval.hasAttribute("refseq.functionalClass_1")) { int annotationId = 1; @@ -47,7 +47,7 @@ public class FunctionalClass extends VariantStratifier { do { key = String.format("refseq.functionalClass_%d", annotationId); - String newtypeStr = eval.getAttributeAsString(key); + String newtypeStr = eval.getAttributeAsString(key, null); if ( newtypeStr != null && !newtypeStr.equalsIgnoreCase("null") ) { try { FunctionalType newType = FunctionalType.valueOf(newtypeStr); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java new file mode 100644 index 000000000..1b9513b9a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/IndelSize.java @@ -0,0 +1,52 @@ +package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications; + +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.util.ArrayList; +import java.util.List; + +/** + * Stratifies the eval RODs by the indel size + * + * Indel sizes are stratified from sizes -100 to +100. Sizes greater than this are lumped in the +/- 100 bin + * This stratification ignores multi-allelic indels (whose size is not defined uniquely) + */ +public class IndelSize extends VariantStratifier { + static final int MAX_INDEL_SIZE = 100; + @Override + public void initialize() { + states = new ArrayList(); + for( int a=-MAX_INDEL_SIZE; a <=MAX_INDEL_SIZE; a++ ) { + states.add(String.format("%d", a)); + } + } + + public List getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) { + ArrayList relevantStates = new ArrayList(); + + if (eval != null && eval.isIndel() && eval.isBiallelic()) { + try { + int eventLength = 0; + if ( eval.isSimpleInsertion() ) { + eventLength = eval.getAlternateAllele(0).length(); + } else if ( eval.isSimpleDeletion() ) { + eventLength = -eval.getReference().length(); + } + + if (eventLength > MAX_INDEL_SIZE) + eventLength = MAX_INDEL_SIZE; + else if (eventLength < -MAX_INDEL_SIZE) + eventLength = -MAX_INDEL_SIZE; + + relevantStates.add(String.format("%d",eventLength)); + } catch (Exception e) { + return relevantStates; + } + } + + return relevantStates; + } +} diff 
--git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java index 92e7c6554..6a057a456 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/util/VariantEvalUtils.java @@ -354,7 +354,7 @@ public class VariantEvalUtils { private void addMapping(HashMap> mappings, String sample, VariantContext vc) { if ( !mappings.containsKey(sample) ) - mappings.put(sample, new HashSet()); + mappings.put(sample, new LinkedHashSet()); mappings.get(sample).add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 16f1abf1b..1d5493daf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -32,6 +32,8 @@ import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.PartitionBy; +import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -84,6 +86,7 @@ import java.util.*; * */ +@PartitionBy(PartitionType.NONE) public class ApplyRecalibration extends RodWalker { ///////////////////////////// diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java index bc7252ec2..04ba3ff14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VQSRCalibrationCurve.java @@ -115,7 +115,7 @@ public class VQSRCalibrationCurve { if ( vc.isFiltered() ) return 0.0; else if ( vc.hasAttribute(VQSRQualKey) ) { - double qual = vc.getAttributeAsDouble(VQSRQualKey); + double qual = vc.getAttributeAsDouble(VQSRQualKey, 0.0); return probTrueVariant(qual); } else { throw new UserException.VariantContextMissingRequiredField(VQSRQualKey, vc); @@ -143,7 +143,7 @@ public class VQSRCalibrationCurve { for ( int i = 0; i < log10Likelihoods.length; i++) { double p = Math.pow(10, log10Likelihoods[i]); double q = alpha * p + (1-alpha) * noInfoPr; - if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey), p, alpha, q); + if ( DEBUG ) System.out.printf(" vqslod = %.2f, p = %.2e, alpha = %.2e, q = %.2e%n", vc.getAttributeAsDouble(VQSRQualKey, 0.0), p, alpha, q); updated[i] = Math.log10(q); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 89e702b64..f60a94a22 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -29,13 +29,17 @@ import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import 
org.broadinstitute.sting.gatk.walkers.PartitionBy; +import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.io.Resource; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -62,6 +66,10 @@ import java.util.*; * the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model. * *

    + * NOTE: In order to create the model reporting plots Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version). + * See http://www.r-project.org for more info on how to download and install R. + * + *

    * See the GATK wiki for a tutorial and example recalibration accuracy plots. * http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration * @@ -94,10 +102,12 @@ import java.util.*; * */ +@PartitionBy(PartitionType.NONE) public class VariantRecalibrator extends RodWalker, ExpandingArrayList> implements TreeReducible> { public static final String VQS_LOD_KEY = "VQSLOD"; // Log odds ratio of being a true variant versus being false under the trained gaussian mixture model public static final String CULPRIT_KEY = "culprit"; // The annotation which was the worst performing in the Gaussian mixture model, likely the reason why the variant was filtered out + private static final String PLOT_TRANCHES_RSCRIPT = "plot_Tranches.R"; @ArgumentCollection private VariantRecalibratorArgumentCollection VRAC = new VariantRecalibratorArgumentCollection(); @@ -155,12 +165,8 @@ public class VariantRecalibrator extends RodWalker(Arrays.asList(USE_ANNOTATIONS)), VRAC ); + if (RSCRIPT_FILE != null && !RScriptExecutor.RSCRIPT_EXISTS) + Utils.warnUser(logger, String.format( + "Rscript not found in environment path. 
%s will be generated but PDF plots will not.", + RSCRIPT_FILE)); + if( IGNORE_INPUT_FILTERS != null ) { ignoreInputFilterSet.addAll( Arrays.asList(IGNORE_INPUT_FILTERS) ); } @@ -324,20 +334,13 @@ public class VariantRecalibrator extends RodWalker randomData, final GaussianMixtureModel goodModel, final GaussianMixtureModel badModel, final double lodCutoff ) { @@ -345,15 +348,18 @@ public class VariantRecalibrator extends RodWalker { if (minimumN > 1 && (vcs.size() - numFilteredRecords < minimumN)) return 0; - + List mergedVCs = new ArrayList(); Map> VCsByType = VariantContextUtils.separateVariantContextsByType(vcs); // iterate over the types so that it's deterministic @@ -244,7 +244,7 @@ public class CombineVariants extends RodWalker { SET_KEY, filteredAreUncalled, MERGE_INFO_WITH_MAX_AC)); } - for ( VariantContext mergedVC : mergedVCs ) { + for ( VariantContext mergedVC : mergedVCs ) { // only operate at the start of events if ( mergedVC == null ) continue; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java index 1c76a21ea..a932d44ed 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariants.java @@ -99,7 +99,7 @@ public class LiftoverVariants extends RodWalker { final VCFHeader vcfHeader = new VCFHeader(metaData, samples); - writer = new StandardVCFWriter(file, false); + writer = new StandardVCFWriter(file, getMasterSequenceDictionary(), false); writer.writeHeader(vcfHeader); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java index 1fefd20fc..88de12f9a 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/RandomlySplitVariants.java @@ -58,15 +58,12 @@ public class RandomlySplitVariants extends RodWalker { @Argument(fullName="fractionToOut1", shortName="fraction", doc="Fraction of records to be placed in out1 (must be 0 >= fraction <= 1); all other records are placed in out2", required=false) protected double fraction = 0.5; - protected int iFraction; - /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ public void initialize() { if ( fraction < 0.0 || fraction > 1.0 ) throw new UserException.BadArgumentValue("fractionToOut1", "this value needs to be a number between 0 and 1"); - iFraction = (int)(fraction * 1000.0); // setup the header info final List inputNames = Arrays.asList(variantCollection.variants.getName()); @@ -75,7 +72,7 @@ public class RandomlySplitVariants extends RodWalker { hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); vcfWriter1.writeHeader(new VCFHeader(hInfo, samples)); - vcfWriter2 = new StandardVCFWriter(file2, true); + vcfWriter2 = new StandardVCFWriter(file2, getMasterSequenceDictionary(), true); vcfWriter2.writeHeader(new VCFHeader(hInfo, samples)); } @@ -93,8 +90,8 @@ public class RandomlySplitVariants extends RodWalker { Collection vcs = tracker.getValues(variantCollection.variants, context.getLocation()); for ( VariantContext vc : vcs ) { - int random = GenomeAnalysisEngine.getRandomGenerator().nextInt(1000); - if ( random < iFraction ) + double random = GenomeAnalysisEngine.getRandomGenerator().nextDouble(); + if ( random < fraction ) vcfWriter1.add(vc); else vcfWriter2.add(vc); @@ -107,5 +104,8 @@ public class RandomlySplitVariants extends RodWalker { public Integer reduce(Integer value, Integer sum) { return value + sum; } - public void onTraversalDone(Integer result) { logger.info(result + " records processed."); } 
+ public void onTraversalDone(Integer result) { + logger.info(result + " records processed."); + vcfWriter2.close(); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 459ffb75e..609593acc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -255,14 +255,6 @@ public class SelectVariants extends RodWalker { @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; - @Hidden - @Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false) - private boolean KEEP_AF_SPECTRUM = false; - - @Hidden - @Argument(fullName="afFile", shortName="afFile", doc="The output recal file used by ApplyRecalibration", required=false) - private File AF_FILE = new File(""); - @Hidden @Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false) private File FAMILY_STRUCTURE_FILE = null; @@ -442,7 +434,7 @@ public class SelectVariants extends RodWalker { mvSet.add(mv); } } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(AF_FILE, e); + throw new UserException.CouldNotReadInputFile(FAMILY_STRUCTURE_FILE, e); } if (outMVFile != null) try { @@ -452,7 +444,7 @@ public class SelectVariants extends RodWalker { throw new UserException.CouldNotCreateOutputFile(outMVFile, "Can't open output file", e); } } else - mvSet.add(new MendelianViolation(getToolkit(), MENDELIAN_VIOLATION_QUAL_THRESHOLD)); + mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, 
MENDELIAN_VIOLATION_QUAL_THRESHOLD)); } else if (!FAMILY_STRUCTURE.isEmpty()) { mvSet.add(new MendelianViolation(FAMILY_STRUCTURE, MENDELIAN_VIOLATION_QUAL_THRESHOLD)); @@ -469,31 +461,7 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); - if (KEEP_AF_SPECTRUM) { - try { - afBreakpoints = new ArrayList(); - afBoosts = new ArrayList(); - logger.info("Reading in AF boost table..."); - boolean firstLine = false; - for ( final String line : new XReadLines( AF_FILE ) ) { - if (!firstLine) { - firstLine = true; - continue; - } - final String[] vals = line.split(" "); - double bkp = Double.valueOf(vals[0]); - double afb = Double.valueOf(vals[1]); - afBreakpoints.add(bkp); - afBoosts.add(afb); - - } - bkDelta = afBreakpoints.get(0); - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(AF_FILE, e); - } - - } } /** @@ -566,61 +534,11 @@ public class SelectVariants extends RodWalker { if (SELECT_RANDOM_NUMBER) { randomlyAddVariant(++variantNumber, sub, ref.getBase()); } - else if (!SELECT_RANDOM_FRACTION || (!KEEP_AF_SPECTRUM && GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { + else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { vcfWriter.add(sub); } - else { - if (SELECT_RANDOM_FRACTION && KEEP_AF_SPECTRUM ) { - // ok we have a comp VC and we need to match the AF spectrum of inputAFRodName. 
- // We then pick a variant with probablity AF*desiredFraction - if ( sub.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY) ) { - String afo = sub.getAttributeAsString(VCFConstants.ALLELE_FREQUENCY_KEY); - - double af; - double afBoost = 1.0; - if (afo.contains(",")) { - String[] afs = afo.split(","); - afs[0] = afs[0].substring(1,afs[0].length()); - afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1); - - double[] afd = new double[afs.length]; - - for (int k=0; k < afd.length; k++) - afd[k] = Double.valueOf(afs[k]); - - af = MathUtils.arrayMax(afd); - //af = Double.valueOf(afs[0]); - - } - else - af = Double.valueOf(afo); - - // now boost af by table read from file if desired - //double bkpt = 0.0; - int bkidx = 0; - if (!afBreakpoints.isEmpty()) { - for ( Double bkpt : afBreakpoints) { - if (af < bkpt + bkDelta) - break; - else bkidx++; - } - if (bkidx >=afBoosts.size()) - bkidx = afBoosts.size()-1; - afBoost = afBoosts.get(bkidx); - //System.out.formatPrin("af:%f bkidx:%d afboost:%f\n",af,bkidx,afBoost); - - } - - //System.out.format("%s .. 
%4.4f\n",afo.toString(), af); - if (GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom * afBoost * afBoost) - vcfWriter.add(sub); - } - - - } - } } } @@ -638,9 +556,9 @@ public class SelectVariants extends RodWalker { if (vc == null) return false; - // if we're not looking at specific samples then the absense of a compVC means discordance - if (NO_SAMPLES_SPECIFIED && (compVCs == null || compVCs.isEmpty())) - return true; + // if we're not looking at specific samples then the absence of a compVC means discordance + if (NO_SAMPLES_SPECIFIED) + return (compVCs == null || compVCs.isEmpty()); // check if we find it in the variant rod Map genotypes = vc.getGenotypes(samples); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java index 8eaf976d0..4e6cc722d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantValidationAssessor.java @@ -237,7 +237,7 @@ public class VariantValidationAssessor extends RodWalker infoMap.put("HomVarPct", String.format("%.1f", 100.0*homVarProp)); infoMap.put("HetPct", String.format("%.1f", 100.0*hetProp)); infoMap.put("HW", String.format("%.2f", hwScore)); - Set altAlleles = vContext.getAlternateAlleles(); + Collection altAlleles = vContext.getAlternateAlleles(); int altAlleleCount = altAlleles.size() == 0 ? 
0 : vContext.getChromosomeCount(altAlleles.iterator().next()); if ( !isViolation && altAlleleCount > 0 ) numTrueVariants++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 2a877fb09..454909634 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; @@ -65,12 +66,14 @@ import java.util.*; * *

    Output

    *

    - * A table deliminated file containing the values of the requested fields in the VCF file + * A tab-delimited file containing the values of the requested fields in the VCF file *

    * *

    Examples

    *
    - *     -T $WalkerName \
    + *     java -jar GenomeAnalysisTK.jar \
    + *     -R reference.fasta
    + *     -T VariantsToTable \
      *     -V file.vcf \
      *     -F CHROM -F POS -F ID -F QUAL -F AC \
      *     -o results.table
    @@ -103,7 +106,7 @@ public class VariantsToTable extends RodWalker {
     
         /**
          * By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
    -     * Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
    +     * Throwing this flag will cause VariantsToTable to emit values regardless of the FILTER field value.
          */
         @Advanced
         @Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
    @@ -133,7 +136,7 @@ public class VariantsToTable extends RodWalker {
     
         /**
          * By default, this tool throws a UserException when it encounters a field without a value in some record.  This
    -     * is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being
    +     * is generally useful when you mistype -F CHROM, so that you get a friendly warning about CHROM not being
          * found before the tool runs through 40M 1000G records.  However, in some cases you genuinely want to allow such
          * fields (e.g., AC not being calculated for filtered records, if included).  When provided, this argument
          * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
    @@ -192,7 +195,7 @@ public class VariantsToTable extends RodWalker {
                 if ( getters.containsKey(field) ) {
                     val = getters.get(field).get(vc);
                 } else if ( vc.hasAttribute(field) ) {
    -                val = vc.getAttributeAsString(field);
    +                val = vc.getAttributeAsString(field, null);
                 } else if ( isWildCard(field) ) {
                     Set wildVals = new HashSet();
                     for ( Map.Entry elt : vc.getAttributes().entrySet()) {
    @@ -294,6 +297,14 @@ public class VariantsToTable extends RodWalker {
                     return x.toString();
                 }
             });
    +        getters.put("EVENTLENGTH", new Getter() { public String get(VariantContext vc) {
    +            int maxLength = 0;
    +            for ( final Allele a : vc.getAlternateAlleles() ) {
    +                final int length = a.length() - vc.getReference().length();
    +                if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
    +            }
    +            return Integer.toString(maxLength);
    +        }});
             getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
             getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
                 if ( vc.isSNP() && vc.isBiallelic() )
    @@ -304,11 +315,12 @@ public class VariantsToTable extends RodWalker {
             getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
                 return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
             });
    -
    +        getters.put("ID", new Getter() { public String get(VariantContext vc) { return vc.hasID() ? vc.getID() : "."; } });
             getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
             getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
             getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
             getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
    +        getters.put("TYPE", new Getter() { public String get(VariantContext vc) { return vc.getType().toString(); } });
             getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
             getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
             getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
    diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
    index b96923589..c1479bc69 100644
    --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java
    @@ -5,6 +5,10 @@ import com.google.java.contract.Requires;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
     
     import java.io.Serializable;
    +import java.util.ArrayList;
    +import java.util.Arrays;
    +import java.util.Collections;
    +import java.util.List;
     
     /**
      * Created by IntelliJ IDEA.
    @@ -174,6 +178,8 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome
             return new GenomeLoc[] { new GenomeLoc(getContig(),contigIndex,getStart(),splitPoint-1), new GenomeLoc(getContig(),contigIndex,splitPoint,getStop()) };
         }
     
    +    public GenomeLoc union( GenomeLoc that ) { return merge(that); }
    +
         @Requires("that != null")
         @Ensures("result != null")
         public GenomeLoc intersect( GenomeLoc that ) throws ReviewedStingException {
    @@ -192,6 +198,79 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome
                                  Math.min( getStop(), that.getStop()) );
         }
     
    +    @Requires("that != null")
    +    public final List subtract( final GenomeLoc that ) {
    +        if(GenomeLoc.isUnmapped(this) || GenomeLoc.isUnmapped(that)) {
    +            if(! GenomeLoc.isUnmapped(this) || !GenomeLoc.isUnmapped(that))
    +                throw new ReviewedStingException("Tried to intersect a mapped and an unmapped genome loc");
    +            return Arrays.asList(UNMAPPED);
    +        }
    +
    +        if (!(this.overlapsP(that))) {
    +            throw new ReviewedStingException("GenomeLoc::minus(): The two genome loc's need to overlap");
    +        }
    +
    +        if (equals(that)) {
    +            return Collections.emptyList();
    +        } else if (containsP(that)) {
    +            List l = new ArrayList(2);
    +
    +            /**
    +             * we have to create two new region, one for the before part, one for the after
    +             * The old region:
    +             * |----------------- old region (g) -------------|
    +             *        |----- to delete (e) ------|
    +             *
    +             * product (two new regions):
    +             * |------|  + |--------|
    +             *
    +             */
    +            int afterStop = this.getStop(), afterStart = that.getStop() + 1;
    +            int beforeStop = that.getStart() - 1, beforeStart = this.getStart();
    +            if (afterStop - afterStart >= 0) {
    +                GenomeLoc after = new GenomeLoc(this.getContig(), getContigIndex(), afterStart, afterStop);
    +                l.add(after);
    +            }
    +            if (beforeStop - beforeStart >= 0) {
    +                GenomeLoc before = new GenomeLoc(this.getContig(), getContigIndex(), beforeStart, beforeStop);
    +                l.add(before);
    +            }
    +
    +            return l;
    +        } else if (that.containsP(this)) {
    +            /**
    +             * e completely contains g, delete g, but keep looking, there may be more regions
    +             * i.e.:
    +             *   |--------------------- e --------------------|
    +             *       |--- g ---|    |---- others ----|
    +             */
    +            return Collections.emptyList();   // don't need to do anything
    +        } else {
    +            /**
    +             * otherwise e overlaps some part of g
    +             *
    +             * figure out which region occurs first on the genome.  I.e., is it:
    +             * |------------- g ----------|
    +             *       |------------- e ----------|
    +             *
    +             * or:
    +             *       |------------- g ----------|
    +             * |------------ e -----------|
    +             *
    +             */
    +
    +            GenomeLoc n;
    +            if (that.getStart() < this.getStart()) {
    +                n = new GenomeLoc(this.getContig(), getContigIndex(), that.getStop() + 1, this.getStop());
    +            } else {
    +                n = new GenomeLoc(this.getContig(), getContigIndex(), this.getStart(), that.getStart() - 1);
    +            }
    +
    +            // replace g with the new region
    +            return Arrays.asList(n);
    +        }
    +    }
    +
         @Requires("that != null")
         public final boolean containsP(GenomeLoc that) {
             return onSameContig(that) && getStart() <= that.getStart() && getStop() >= that.getStop();
    @@ -203,19 +282,14 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome
         }
     
         @Requires("that != null")
    -    public final int minus( final GenomeLoc that ) {
    +    @Ensures("result >= 0")
    +    public final int distance( final GenomeLoc that ) {
             if ( this.onSameContig(that) )
    -            return this.getStart() - that.getStart();
    +            return Math.abs(this.getStart() - that.getStart());
             else
                 return Integer.MAX_VALUE;
         }
     
    -    @Requires("that != null")
    -    @Ensures("result >= 0")
    -    public final int distance( final GenomeLoc that ) {
    -        return Math.abs(minus(that));
    -    }    
    -
         @Requires({"left != null", "right != null"})
         public final boolean isBetween( final GenomeLoc left, final GenomeLoc right ) {
             return this.compareTo(left) > -1 && this.compareTo(right) < 1;
    @@ -306,7 +380,7 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome
         
         @Override
         public int hashCode() {
    -        return (int)( start << 16 + stop << 4 + contigIndex );
    +        return start << 16 | stop << 4 | contigIndex;
         }
     
     
    diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java
    new file mode 100644
    index 000000000..7aa9fdd65
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocComparator.java
    @@ -0,0 +1,56 @@
    +package org.broadinstitute.sting.utils;
    +
    +import com.google.java.contract.Ensures;
    +import com.google.java.contract.Requires;
    +
    +import java.util.Comparator;
    +
    +/**
    + *
    + * @author Mauricio Carneiro
    + * @since 9/28/11
    + */
    +public class GenomeLocComparator implements Comparator {
    +    /**
    +     * compares genomeLoc's contigs
    +     *
    +     * @param gl1 the genome loc to compare contigs
    +     * @param gl2 the genome loc to compare contigs
    +     * @return 0 if equal, -1 if gl2.contig is greater, 1 if gl1.contig is greater
    +     */
    +    @Requires("gl2 != null")
    +    @Ensures("result == 0 || result == 1 || result == -1")
    +    public final int compareContigs( GenomeLoc gl1, GenomeLoc gl2 ) {
    +        if (gl1.contigIndex == gl2.contigIndex)
    +            return 0;
    +        else if (gl1.contigIndex > gl2.contigIndex)
    +            return 1;
    +        return -1;
    +    }
    +
    +    @Requires("gl2 != null")
    +    @Ensures("result == 0 || result == 1 || result == -1")
    +    public int compare ( GenomeLoc gl1, GenomeLoc gl2 ) {
    +        int result = 0;
    +
    +        if ( gl1 == gl2 ) {
    +            result = 0;
    +        }
    +        else if(GenomeLoc.isUnmapped(gl1))
    +            result = 1;
    +        else if(GenomeLoc.isUnmapped(gl2))
    +            result = -1;
    +        else {
    +            final int cmpContig = compareContigs(gl1, gl2);
    +
    +            if ( cmpContig != 0 ) {
    +                result = cmpContig;
    +            } else {
    +                if ( gl1.getStart() < gl2.getStart() ) result = -1;
    +                if ( gl1.getStart() > gl2.getStart() ) result = 1;
    +            }
    +        }
    +
    +        return result;
    +    }
    +}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
    index fd7a79f48..26be0e59e 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java
    @@ -215,7 +215,7 @@ public class GenomeLocSortedSet extends AbstractSet {
     
                 if ( p.overlapsP(e) ) {
                     toProcess.pop();
    -                for ( GenomeLoc newP : subtractRegion(p, e) )
    +                for ( GenomeLoc newP : p.subtract(e) )
                         toProcess.push(newP);
                 } else if ( p.compareContigs(e) < 0 ) {
                     good.add(toProcess.pop());         // p is now good
    @@ -236,69 +236,6 @@ public class GenomeLocSortedSet extends AbstractSet {
             return createSetFromList(genomeLocParser,good);
         }
     
    -    private static final List EMPTY_LIST = new ArrayList();
    -    private List subtractRegion(GenomeLoc g, GenomeLoc e) {
    -        if (g.equals(e)) {
    -            return EMPTY_LIST;
    -        } else if (g.containsP(e)) {
    -            List l = new ArrayList();
    -
    -            /**
    -             * we have to create two new region, one for the before part, one for the after
    -             * The old region:
    -             * |----------------- old region (g) -------------|
    -             *        |----- to delete (e) ------|
    -             *
    -             * product (two new regions):
    -             * |------|  + |--------|
    -             *
    -             */
    -            int afterStop = g.getStop(), afterStart = e.getStop() + 1;
    -            int beforeStop = e.getStart() - 1, beforeStart = g.getStart();
    -            if (afterStop - afterStart >= 0) {
    -                GenomeLoc after = genomeLocParser.createGenomeLoc(g.getContig(), afterStart, afterStop);
    -                l.add(after);
    -            }
    -            if (beforeStop - beforeStart >= 0) {
    -                GenomeLoc before = genomeLocParser.createGenomeLoc(g.getContig(), beforeStart, beforeStop);
    -                l.add(before);
    -            }
    -
    -            return l;
    -        } else if (e.containsP(g)) {
    -            /**
    -             * e completely contains g, delete g, but keep looking, there may be more regions
    -             * i.e.:
    -             *   |--------------------- e --------------------|
    -             *       |--- g ---|    |---- others ----|
    -             */
    -            return EMPTY_LIST;   // don't need to do anything
    -        } else {
    -            /**
    -             * otherwise e overlaps some part of g
    -             *
    -             * figure out which region occurs first on the genome.  I.e., is it:
    -             * |------------- g ----------|
    -             *       |------------- e ----------|
    -             *
    -             * or:
    -             *       |------------- g ----------|
    -             * |------------ e -----------|
    -             *
    -             */
    -
    -            GenomeLoc n;
    -            if (e.getStart() < g.getStart()) {
    -                n = genomeLocParser.createGenomeLoc(g.getContig(), e.getStop() + 1, g.getStop());
    -            } else {
    -                n = genomeLocParser.createGenomeLoc(g.getContig(), g.getStart(), e.getStart() - 1);
    -            }
    -
    -            // replace g with the new region
    -            return Arrays.asList(n);
    -        }
    -    }
    -
     
         /**
          * a simple removal of an interval contained in this list.  The interval must be identical to one in the list (no partial locations or overlapping)
    diff --git a/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
    similarity index 98%
    rename from public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java
    rename to public/java/src/org/broadinstitute/sting/utils/Haplotype.java
    index a17e81461..ce2ca2c28 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/genotype/Haplotype.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java
    @@ -22,10 +22,9 @@
      * OTHER DEALINGS IN THE SOFTWARE.
      */
     
    -package org.broadinstitute.sting.utils.genotype;
    +package org.broadinstitute.sting.utils;
     
     import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
    -import org.broadinstitute.sting.utils.GenomeLoc;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
     import org.broadinstitute.sting.utils.variantcontext.Allele;
     
    diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
    index 0d85f9606..17f458f31 100644
    --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java
    @@ -408,12 +408,12 @@ public class MathUtils {
             return Math.sqrt(rms);
         }
     
    -    public static double rms(Collection l) {
    +    public static double rms(Collection l) {
             if (l.size() == 0)
                 return 0.0;
     
             double rms = 0.0;
    -        for (Double i : l)
    +        for (int i : l)
                 rms += i*i;
             rms /= l.size();
             return Math.sqrt(rms);
    @@ -444,11 +444,25 @@ public class MathUtils {
          * @return a newly allocated array corresponding the normalized values in array, maybe log10 transformed
         */
         public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput) {
    -        double[] normalized = new double[array.length];
    +        return normalizeFromLog10(array, takeLog10OfOutput, false);
    +    }
    +
    +    public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput, boolean keepInLogSpace) {
     
             // for precision purposes, we need to add (or really subtract, since they're
             // all negative) the largest value; also, we need to convert to normal-space.
             double maxValue = Utils.findMaxEntry(array);
    +
    +        // we may decide to just normalize in log space with converting to linear space
    +        if (keepInLogSpace) {
    +            for (int i = 0; i < array.length; i++)
    +                array[i] -= maxValue;
    +            return array;
    +        }
    +
    +        // default case: go to linear space
    +        double[] normalized = new double[array.length];
    +
             for (int i = 0; i < array.length; i++)
                 normalized[i] = Math.pow(10, array[i] - maxValue);
     
    diff --git a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java
    index c6a07b5ce..cf45dab79 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java
    @@ -1,7 +1,6 @@
     package org.broadinstitute.sting.utils;
     
    -import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
    -import org.broadinstitute.sting.gatk.datasources.sample.Sample;
    +import org.broadinstitute.sting.gatk.samples.Sample;
     import org.broadinstitute.sting.utils.exceptions.UserException;
     import org.broadinstitute.sting.utils.variantcontext.Genotype;
     import org.broadinstitute.sting.utils.variantcontext.VariantContext;
    @@ -17,9 +16,6 @@ import java.util.regex.Pattern;
      * Time: 12:38 PM
      */
     public class MendelianViolation {
    -
    -
    -
         String sampleMom;
         String sampleDad;
         String sampleChild;
    @@ -30,21 +26,20 @@ public class MendelianViolation {
     
         double minGenotypeQuality;
     
    -    private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
    +    static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 };
    +    static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 };
     
    +    private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");
     
         public String getSampleMom() {
             return sampleMom;
         }
    -
         public String getSampleDad() {
             return sampleDad;
         }
    -
         public String getSampleChild() {
             return sampleChild;
         }
    -
         public double getMinGenotypeQuality() {
             return minGenotypeQuality;
         }
    @@ -85,37 +80,12 @@ public class MendelianViolation {
          * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
          */
         public MendelianViolation(Sample sample, double minGenotypeQualityP) {
    -        sampleMom = sample.getMother().getId();
    -        sampleDad = sample.getFather().getId();
    -        sampleChild = sample.getId();
    +        sampleMom = sample.getMother().getID();
    +        sampleDad = sample.getFather().getID();
    +        sampleChild = sample.getID();
             minGenotypeQuality = minGenotypeQualityP;
         }
     
    -
    -    /**
    -     * The most common constructor to be used when give a YAML file with the relationships to the engine with the -SM option.
    -     * @param engine - The GATK engine, use getToolkit(). That's where the sample information is stored.
    -     * @param minGenotypeQualityP - the minimum phred scaled genotype quality score necessary to asses mendelian violation
    -     */
    -    public MendelianViolation(GenomeAnalysisEngine engine, double minGenotypeQualityP) {
    -        boolean gotSampleInformation = false;
    -        Collection samples = engine.getSamples();
    -        // Iterate through all samples in the sample_metadata file but we really can only take one.
    -        for (Sample sample : samples) {
    -            if (sample.getMother() != null && sample.getFather() != null) {
    -                sampleMom = sample.getMother().getId();
    -                sampleDad = sample.getFather().getId();
    -                sampleChild = sample.getId();
    -                minGenotypeQuality = minGenotypeQualityP;
    -                gotSampleInformation = true;
    -                break; // we can only deal with one trio information
    -            }
    -        }
    -        if (!gotSampleInformation)
    -            throw new UserException("YAML file has no sample with relationship information (mother/father)");
    -    }
    -
    -
         /**
          * This method prepares the object to evaluate for violation. Typically you won't call it directly, a call to
          * isViolation(vc) will take care of this. But if you want to know whether your site was a valid comparison site
    @@ -153,7 +123,7 @@ public class MendelianViolation {
          * @return False if we can't determine (lack of information), or it's not a violation. True if it is a violation.
          *
          */
    -    public boolean isViolation (VariantContext vc)
    +    public boolean isViolation(VariantContext vc)
         {
             return setAlleles(vc) && isViolation();
         }
    @@ -168,4 +138,42 @@ public class MendelianViolation {
             return true;
         }
     
    +    /**
    +     * @return the likelihood ratio for a mendelian violation
    +     */
    +    public double violationLikelihoodRatio(VariantContext vc) {
    +        double[] logLikAssignments = new double[27];
    +        // the matrix to set up is
    +        // MOM   DAD    CHILD
    +        //                    |-  AA
    +        //   AA     AA    |    AB
    +        //                    |-   BB
    +        //                    |- AA
    +        //  AA     AB     |   AB
    +        //                    |- BB
    +        // etc. The leaves are counted as 0-11 for MVs and 0-14 for non-MVs
    +        double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector();
    +        double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector();
    +        double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector();
    +        int offset = 0;
    +        for ( int oMom = 0; oMom < 3; oMom++ ) {
    +            for ( int oDad = 0; oDad < 3; oDad++ ) {
    +                for ( int oChild = 0; oChild < 3; oChild ++ ) {
    +                    logLikAssignments[offset++] = momGL[oMom] + dadGL[oDad] + childGL[oChild];
    +                }
    +            }
    +        }
    +        double[] mvLiks = new double[12];
    +        double[] nonMVLiks = new double[15];
    +        for ( int i = 0; i < 12; i ++ ) {
    +            mvLiks[i] = logLikAssignments[mvOffsets[i]];
    +        }
    +
    +        for ( int i = 0; i < 15; i++) {
    +            nonMVLiks[i] = logLikAssignments[nonMVOffsets[i]];
    +        }
    +
    +        return MathUtils.log10sumLog10(mvLiks) - MathUtils.log10sumLog10(nonMVLiks);
    +    }
    +
     }
    diff --git a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java
    new file mode 100644
    index 000000000..4f01f2b7a
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java
    @@ -0,0 +1,108 @@
    +/*
    + * Copyright (c) 2011, The Broad Institute
    + *
    + * Permission is hereby granted, free of charge, to any person
    + * obtaining a copy of this software and associated documentation
    + * files (the "Software"), to deal in the Software without
    + * restriction, including without limitation the rights to use,
    + * copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the
    + * Software is furnished to do so, subject to the following
    + * conditions:
    + *
    + * The above copyright notice and this permission notice shall be
    + * included in all copies or substantial portions of the Software.
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    + * OTHER DEALINGS IN THE SOFTWARE.
    + */
    +
    +package org.broadinstitute.sting.utils;
    +
    +import net.sf.samtools.SAMReadGroupRecord;
    +import net.sf.samtools.SAMRecord;
    +
    +/**
    + * A canonical, master list of the standard NGS platforms.  These values
    + * can be obtained (efficiently) from a GATKSAMRecord object with the
    + * getNGSPlatform method.
    + *
    + * @author Mark DePristo
    + * @since 2011
    + */
    +public enum NGSPlatform {
    +    ILLUMINA("ILLUMINA", "SLX", "SOLEXA"),
    +    SOLID("SOLID"),
    +    LS454("454"),
    +    COMPLETE_GENOMICS("COMPLETE"),
    +    PACBIO("PACBIO"),
    +    ION_TORRENT("IONTORRENT"),
    +    UNKNOWN("UNKNOWN");
    +
    +    /**
    +     * Array of the prefix names in a BAM file for each of the platforms.
    +     */
    +    private final String[] BAM_PL_NAMES;
    +
    +    NGSPlatform(final String... BAM_PL_NAMES) {
    +        for ( int i = 0; i < BAM_PL_NAMES.length; i++ )
    +            BAM_PL_NAMES[i] = BAM_PL_NAMES[i].toUpperCase();
    +        this.BAM_PL_NAMES = BAM_PL_NAMES;
    +    }
    +
    +    /**
    +     * Returns a representative PL string for this platform
    +     * @return
    +     */
    +    public final String getDefaultPlatform() {
    +        return BAM_PL_NAMES[0];
    +    }
    +
    +    /**
    +     * Convenience constructor -- calculates the NGSPlatfrom from a SAMRecord.
    +     * Note you should not use this function if you have a GATKSAMRecord -- use the
    +     * accessor method instead.
    +     *
    +     * @param read
    +     * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match
    +     */
    +    public static final NGSPlatform fromRead(SAMRecord read) {
    +        return fromReadGroup(read.getReadGroup());
    +    }
    +
    +    /**
    +     * Returns the NGSPlatform corresponding to the PL tag in the read group
    +     * @param rg
    +     * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match
    +     */
    +    public static final NGSPlatform fromReadGroup(SAMReadGroupRecord rg) {
    +        return fromReadGroupPL(rg.getPlatform());
    +    }
    +
    +    /**
    +     * Returns the NGSPlatform corresponding to the PL tag in the read group
    +     * @param plFromRG -- the PL field (or equivalent) in a ReadGroup object
    +     * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match
    +     */
    +    public static final NGSPlatform fromReadGroupPL(final String plFromRG) {
    +        if ( plFromRG == null ) return UNKNOWN;
    +
    +        // todo -- algorithm could be implemented more efficiently, as the list of all
    +        // todo -- names is known upfront, so a decision tree could be used to identify
    +        // todo -- a prefix common to PL
    +        final String pl = plFromRG.toUpperCase();
    +        for ( final NGSPlatform ngsPlatform : NGSPlatform.values() ) {
    +            for ( final String bamPLName : ngsPlatform.BAM_PL_NAMES ) {
    +                if ( pl.contains(bamPLName) )
    +                    return ngsPlatform;
    +            }
    +        }
    +
    +        return UNKNOWN;
    +    }
    +}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
    index fad2320fc..19e03a19d 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java
    @@ -9,14 +9,17 @@ import net.sf.samtools.SAMUtils;
      * @author Kiran Garimella
      */
     public class QualityUtils {
    -
         public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE;
         public final static double MIN_REASONABLE_ERROR = 0.0001;
         public final static byte MAX_REASONABLE_Q_SCORE = 40;
         public final static byte MIN_USABLE_Q_SCORE = 6;
    -
         public final static int MAPPING_QUALITY_UNAVAILABLE = 255;
     
    +    private static double qualToErrorProbCache[] = new double[256];
    +    static {
    +        for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw((byte)i);
    +    }
    +
         /**
          * Private constructor.  No instantiating this class!
          */
    @@ -33,10 +36,6 @@ public class QualityUtils {
             return 1.0 - qualToErrorProb(qual);
         }
     
    -    static public double qualToProb(int qual) {
    -        return qualToProb( (double)qual );
    -    }
    -
         static public double qualToProb(double qual) {
             return 1.0 - Math.pow(10.0, qual/(-10.0));
         }
    @@ -48,10 +47,14 @@ public class QualityUtils {
          * @param qual a quality score (0-40)
          * @return a probability (0.0-1.0)
          */
    -    static public double qualToErrorProb(byte qual) {
    +    static public double qualToErrorProbRaw(byte qual) {
             return Math.pow(10.0, ((double) qual)/-10.0);
         }
     
    +    static public double qualToErrorProb(byte qual) {
    +        return qualToErrorProbCache[qual];
    +    }
    +
         /**
          * Convert a probability to a quality score.  Note, this is capped at Q40.
          *
    @@ -110,88 +113,4 @@ public class QualityUtils {
             //return (byte) Math.min(qual, maxQual);
             return (byte) Math.max(Math.min(qual, maxQual), 1);
         }
    -
    -    /**
    -     * Compress a base and a probability into a single byte so that it can be output in a SAMRecord's SQ field.
    -     * Note: the highest probability this function can encode is 64%, so this function should only never be used on the best base hypothesis.
    -     * Another note: the probability encoded here gets rounded to the nearest 1%.
    -     *
    -     * @param baseIndex the base index
    -     * @param prob      the base probability
    -     * @return a byte containing the index and the probability
    -     */
    -    static public byte baseAndProbToCompressedQuality(int baseIndex, double prob) {
    -        byte compressedQual = 0;
    -
    -        compressedQual = (byte) baseIndex;
    -
    -        byte cprob = (byte) (100.0*prob);
    -        byte qualmask = (byte) 252;
    -        compressedQual += ((cprob << 2) & qualmask);
    -        
    -        return compressedQual;
    -    }
    -
    -    /**
    -     * From a compressed base, extract the base index (0:A, 1:C, 2:G, 3:T)
    -     *
    -     * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
    -     * @return base index
    -     */
    -    static public int compressedQualityToBaseIndex(byte compressedQual) {
    -        return (int) (compressedQual & 0x3);
    -    }
    -
    -    /**
    -     * From a compressed base, extract the base probability
    -     *
    -     * @param compressedQual the compressed quality score, as returned by baseAndProbToCompressedQuality
    -     * @return the probability
    -     */
    -    static public double compressedQualityToProb(byte compressedQual) {
    -        // Because java natives are signed, extra care must be taken to avoid
    -        // shifting a 1 into the sign bit in the implicit promotion of 2 to an int.
    -        int x2 = ((int) compressedQual) & 0xff;
    -        x2 = (x2 >>> 2);
    -
    -        return ((double) x2)/100.0;
    -    }
    -
    -    /**
    -     * Return the complement of a compressed quality
    -     *
    -     * @param compressedQual  the compressed quality score (as returned by baseAndProbToCompressedQuality)
    -     * @return the complementary compressed quality
    -     */
    -    static public byte complementCompressedQuality(byte compressedQual) {
    -        int baseIndex = compressedQualityToBaseIndex(compressedQual);
    -        double prob = compressedQualityToProb(compressedQual);
    -
    -        return baseAndProbToCompressedQuality(BaseUtils.complementIndex(baseIndex), prob);
    -    }
    -
    -    /**
    -     * Return the reverse complement of a byte array of compressed qualities
    -     *
    -     * @param compressedQuals  a byte array of compressed quality scores
    -     * @return the reverse complement of the byte array
    -     */
    -    static public byte[] reverseComplementCompressedQualityArray(byte[] compressedQuals) {
    -        byte[] rcCompressedQuals = new byte[compressedQuals.length];
    -
    -        for (int pos = 0; pos < compressedQuals.length; pos++) {
    -            rcCompressedQuals[compressedQuals.length - pos - 1] = complementCompressedQuality(compressedQuals[pos]);
    -        }
    -
    -        return rcCompressedQuals;
    -    }
    -
    -    /**
    -     * Return the reverse of a byte array of qualities (compressed or otherwise)
    -     * @param quals   the array of bytes to be reversed
    -     * @return the reverse of the quality array
    -     */
    -    static public byte[] reverseQualityArray( byte[] quals ) {
    -        return Utils.reverse(quals); // no sense in duplicating functionality
    -    }
     }
    diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java
    index 58f7942fe..d8176ff4e 100644
    --- a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java
    @@ -25,104 +25,164 @@
     package org.broadinstitute.sting.utils.R;
     
     import org.apache.commons.io.FileUtils;
    +import org.apache.commons.lang.StringUtils;
     import org.apache.log4j.Logger;
    -import org.broadinstitute.sting.commandline.Advanced;
    -import org.broadinstitute.sting.commandline.Argument;
    -import org.broadinstitute.sting.commandline.ArgumentCollection;
    -import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
    -import org.broadinstitute.sting.utils.PathUtils;
     import org.broadinstitute.sting.utils.Utils;
    +import org.broadinstitute.sting.utils.exceptions.StingException;
     import org.broadinstitute.sting.utils.exceptions.UserException;
    +import org.broadinstitute.sting.utils.io.IOUtils;
    +import org.broadinstitute.sting.utils.io.Resource;
    +import org.broadinstitute.sting.utils.runtime.ProcessController;
    +import org.broadinstitute.sting.utils.runtime.ProcessSettings;
    +import org.broadinstitute.sting.utils.runtime.RuntimeUtils;
     
     import java.io.File;
    -import java.io.IOException;
    -import java.util.Arrays;
    +import java.util.ArrayList;
     import java.util.List;
     
     /**
    - * Generic service for executing RScripts in the GATK directory
    - *
    - * @author Your Name
    - * @since Date created
    + * Generic service for executing RScripts
      */
     public class RScriptExecutor {
    +    private static final String RSCRIPT_BINARY = "Rscript";
    +    private static final File RSCRIPT_PATH = RuntimeUtils.which(RSCRIPT_BINARY);
    +    public static final boolean RSCRIPT_EXISTS = (RSCRIPT_PATH != null);
    +    private static final String RSCRIPT_MISSING_MESSAGE = "Please add the Rscript directory to your environment ${PATH}";
    +
         /**
          * our log
          */
    -    protected static Logger logger = Logger.getLogger(RScriptExecutor.class);
    +    private static Logger logger = Logger.getLogger(RScriptExecutor.class);
     
    -    public static class RScriptArgumentCollection {
    -        @Advanced
    -        @Argument(fullName = "path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/software/free/Linux/redhat_5_x86_64/pkgs/r_2.12.0/bin/Rscript", required = false)
    -        public String PATH_TO_RSCRIPT = "Rscript";
    +    private boolean exceptOnError = false;
    +    private final List libraries = new ArrayList();
    +    private final List scriptResources = new ArrayList();
    +    private final List scriptFiles = new ArrayList();
    +    private final List args = new ArrayList();
     
    -        @Advanced
    -        @Argument(fullName = "path_to_Rresources", shortName = "Rresources", doc = "Path to resources folder holding the Sting R scripts.", required = false)
    -        public List PATH_TO_RESOURCES = Arrays.asList("public/R/", "private/R/");
    -
    -        public RScriptArgumentCollection() {}
    -
    -        /** For testing and convenience */
    -        public RScriptArgumentCollection(final String PATH_TO_RSCRIPT, final List PATH_TO_RESOURCES) {
    -            this.PATH_TO_RSCRIPT = PATH_TO_RSCRIPT;
    -            this.PATH_TO_RESOURCES = PATH_TO_RESOURCES;
    -        }
    -    }
    -
    -    final RScriptArgumentCollection myArgs;
    -    final boolean exceptOnError;
    -
    -    public RScriptExecutor(final RScriptArgumentCollection myArgs, final boolean exceptOnError) {
    -        this.myArgs = myArgs;
    +    public void setExceptOnError(boolean exceptOnError) {
             this.exceptOnError = exceptOnError;
         }
     
    -    public void callRScripts(String scriptName, Object... scriptArgs) {
    -        callRScripts(scriptName, Arrays.asList(scriptArgs));
    +    public void addLibrary(RScriptLibrary library) {
    +        this.libraries.add(library);
         }
     
    -    public void callRScripts(String scriptName, List scriptArgs) {
    -        try {
    -            final File pathToScript = findScript(scriptName);
    -            if ( pathToScript == null ) return; // we failed but shouldn't exception out
    -            final String argString = Utils.join(" ", scriptArgs);
    -            final String cmdLine = Utils.join(" ", Arrays.asList(myArgs.PATH_TO_RSCRIPT, pathToScript, argString));
    -            logger.info("Executing RScript: " + cmdLine);
    -            Runtime.getRuntime().exec(cmdLine).waitFor();
    -        } catch (InterruptedException e) {
    -            generateException(e);
    -        } catch (IOException e) {
    -            generateException("Fatal Exception: Perhaps RScript jobs are being spawned too quickly?", e);
    -        }
    +    public void addScript(Resource script) {
    +        this.scriptResources.add(script);
         }
     
    -    public File findScript(final String scriptName) {
    -        for ( String pathToResource : myArgs.PATH_TO_RESOURCES ) {
    -            final File f = new File(pathToResource + "/" + scriptName);
    -            if ( f.exists() ) {
    -                if ( f.canRead() )
    -                    return f;
    -                else
    -                    generateException("Script exists but couldn't be read: " + scriptName);
    +    public void addScript(File script) {
    +        this.scriptFiles.add(script);
    +    }
    +
    +    /**
    +     * Adds args to the end of the Rscript command line.
    +     * @param args the args.
    +     * @throws NullPointerException if any of the args are null.
    +     */
    +    public void addArgs(Object... args) {
    +        for (Object arg: args)
    +            this.args.add(arg.toString());
    +    }
    +
    +    public String getApproximateCommandLine() {
    +        StringBuilder command = new StringBuilder("Rscript");
    +        for (Resource script: this.scriptResources)
    +            command.append(" (resource)").append(script.getFullPath());
    +        for (File script: this.scriptFiles)
    +            command.append(" ").append(script.getAbsolutePath());
    +        for (String arg: this.args)
    +            command.append(" ").append(arg);
    +        return command.toString();
    +    }
    +
    +    public boolean exec() {
    +        if (!RSCRIPT_EXISTS) {
    +            if (exceptOnError) {
    +                throw new UserException.CannotExecuteRScript(RSCRIPT_MISSING_MESSAGE);
    +            } else {
    +                logger.warn("Skipping: " + getApproximateCommandLine());
    +                return false;
                 }
             }
     
    -        generateException("Couldn't find script: " + scriptName + " in " + myArgs.PATH_TO_RESOURCES);
    -        return null;
    -    }
    +        List tempFiles = new ArrayList();
    +        try {
    +            File tempLibDir = IOUtils.tempDir("R.", ".lib");
    +            tempFiles.add(tempLibDir);
     
    -    private void generateException(String msg) {
    -        generateException(msg, null);
    -    }
    +            StringBuilder expression = new StringBuilder("tempLibDir = '").append(tempLibDir).append("';");
     
    -    private void generateException(Throwable e) {
    -        generateException("", e);
    -    }
    +            if (this.libraries.size() > 0) {
    +                List tempLibraryPaths = new ArrayList();
    +                for (RScriptLibrary library: this.libraries) {
    +                    File tempLibrary = library.writeTemp();
    +                    tempFiles.add(tempLibrary);
    +                    tempLibraryPaths.add(tempLibrary.getAbsolutePath());
    +                }
     
    -    private void generateException(String msg, Throwable e) {
    -        if ( exceptOnError )
    -            throw new UserException(msg, e);
    -        else
    -            logger.warn(msg + (e == null ? "" : ":" + e.getMessage()));
    +                expression.append("install.packages(");
    +                expression.append("pkgs=c('").append(StringUtils.join(tempLibraryPaths, "', '")).append("'), lib=tempLibDir, repos=NULL, type='source', ");
    +                // Install faster by eliminating cruft.
    +                expression.append("INSTALL_opts=c('--no-libs', '--no-data', '--no-help', '--no-demo', '--no-exec')");
    +                expression.append(");");
    +
    +                for (RScriptLibrary library: this.libraries) {
    +                    expression.append("library('").append(library.getLibraryName()).append("', lib.loc=tempLibDir);");
    +                }
    +            }
    +
    +            for (Resource script: this.scriptResources) {
    +                File tempScript = IOUtils.writeTempResource(script);
    +                tempFiles.add(tempScript);
    +                expression.append("source('").append(tempScript.getAbsolutePath()).append("');");
    +            }
    +
    +            for (File script: this.scriptFiles) {
    +                expression.append("source('").append(script.getAbsolutePath()).append("');");
    +            }
    +
    +            String[] cmd = new String[this.args.size() + 3];
    +            int i = 0;
    +            cmd[i++] = RSCRIPT_BINARY;
    +            cmd[i++] = "-e";
    +            cmd[i++] = expression.toString();
    +            for (String arg: this.args)
    +                cmd[i++] = arg;
    +
    +            ProcessSettings processSettings = new ProcessSettings(cmd);
    +            if (logger.isDebugEnabled()) {
    +                processSettings.getStdoutSettings().printStandard(true);
    +                processSettings.getStderrSettings().printStandard(true);
    +            }
    +
    +            ProcessController controller = ProcessController.getThreadLocal();
    +
    +            if (logger.isDebugEnabled()) {
    +                logger.debug("Executing:");
    +                for (String arg: cmd)
    +                    logger.debug("  " + arg);
    +            }
    +            int exitValue = controller.exec(processSettings).getExitValue();
    +            logger.debug("Result: " + exitValue);
    +
    +            if (exitValue != 0)
    +                throw new RScriptExecutorException(
    +                        "RScript exited with " + exitValue +
    +                                (logger.isDebugEnabled() ? "" : ". Run with -l DEBUG for more info."));
    +
    +            return true;
    +        } catch (StingException e) {
    +            if (exceptOnError) {
    +                throw e;
    +            } else {
    +                logger.warn(e.getMessage());
    +                return false;
    +            }
    +        } finally {
    +            for (File temp: tempFiles)
    +                FileUtils.deleteQuietly(temp);
    +        }
         }
     }
    diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutorException.java b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutorException.java
    new file mode 100644
    index 000000000..794c3ade4
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutorException.java
    @@ -0,0 +1,33 @@
    +/*
    + * Copyright (c) 2011, The Broad Institute
    + *
    + * Permission is hereby granted, free of charge, to any person
    + * obtaining a copy of this software and associated documentation
    + * files (the "Software"), to deal in the Software without
    + * restriction, including without limitation the rights to use,
    + * copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the
    + * Software is furnished to do so, subject to the following
    + * conditions:
    + *
    + * The above copyright notice and this permission notice shall be
    + * included in all copies or substantial portions of the Software.
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    + * OTHER DEALINGS IN THE SOFTWARE.
    + */
    +
    +package org.broadinstitute.sting.utils.R;
    +
    +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
    +
    +public class RScriptExecutorException extends ReviewedStingException {
    +    public RScriptExecutorException(String msg) {
    +        super(msg);
    +    }
    +}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptLibrary.java b/public/java/src/org/broadinstitute/sting/utils/R/RScriptLibrary.java
    new file mode 100644
    index 000000000..60cd7504b
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/R/RScriptLibrary.java
    @@ -0,0 +1,59 @@
    +/*
    + * Copyright (c) 2011, The Broad Institute
    + *
    + * Permission is hereby granted, free of charge, to any person
    + * obtaining a copy of this software and associated documentation
    + * files (the "Software"), to deal in the Software without
    + * restriction, including without limitation the rights to use,
    + * copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the
    + * Software is furnished to do so, subject to the following
    + * conditions:
    + *
    + * The above copyright notice and this permission notice shall be
    + * included in all copies or substantial portions of the Software.
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    + * OTHER DEALINGS IN THE SOFTWARE.
    + */
    +
    +package org.broadinstitute.sting.utils.R;
    +
    +import org.broadinstitute.sting.utils.io.IOUtils;
    +import org.broadinstitute.sting.utils.io.Resource;
    +
    +import java.io.File;
    +
    +/**
    + * Libraries embedded in the StingUtils package.
    + */
    +public enum RScriptLibrary {
    +    GSALIB("gsalib");
    +
    +    private final String name;
    +
    +    private RScriptLibrary(String name) {
    +        this.name = name;
    +    }
    +
    +    public String getLibraryName() {
    +        return this.name;
    +    }
    +
    +    public String getResourcePath() {
    +        return name + ".tar.gz";
    +    }
    +
    +    /**
    +     * Writes the library source code to a temporary tar.gz file and returns the path.
    +     * @return The path to the library source code. The caller must delete the code when done.
    +     */
    +    public File writeTemp() {
    +        return IOUtils.writeTempResource(new Resource(getResourcePath(), RScriptLibrary.class));
    +    }
    +}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
    index f9997bfd8..edc1413ba 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/SampleUtils.java
    @@ -69,6 +69,18 @@ public class SampleUtils {
             return samples;
         }
     
    +
    +    /**
     +     * Same as {@link #getSAMFileSamples} but gets all of the samples
    +     * in the SAM files loaded by the engine
    +     *
     +     * @param engine the GATK engine whose loaded SAM file header is queried
     +     * @return the set of unique sample names found in the engine's SAM file header
    +     */
    +    public final static Set getSAMFileSamples(GenomeAnalysisEngine engine) {
    +        return SampleUtils.getSAMFileSamples(engine.getSAMFileHeader());
    +    }
    +
         /**
          * Gets all of the unique sample names from all VCF rods input by the user
          *
    @@ -190,11 +202,21 @@ public class SampleUtils {
     
         }
     
    -    public static List getSamplesFromCommandLineInput(Collection sampleArgs) {
    +    /**
    +     * Returns a new set of samples, containing a final list of samples expanded from sampleArgs
    +     *
    +     * Each element E of sampleArgs can either be a literal sample name or a file.  For each E,
    +     * we try to read a file named E from disk, and if possible all lines from that file are expanded
    +     * into unique sample names.
    +     *
     +     * @param sampleArgs collection of literal sample names and/or names of files listing samples, one per line
     +     * @return set of unique sample names expanded from sampleArgs; an empty set if sampleArgs is null
    +     */
    +    public static Set getSamplesFromCommandLineInput(Collection sampleArgs) {
             if (sampleArgs != null) {
                 // Let's first go through the list and see if we were given any files.  We'll add every entry in the file to our
                 // sample list set, and treat the entries as if they had been specified on the command line.
    -            List samplesFromFiles = new ArrayList();
    +            Set samplesFromFiles = new HashSet();
                 for (String SAMPLE_EXPRESSION : sampleArgs) {
                     File sampleFile = new File(SAMPLE_EXPRESSION);
     
    @@ -203,7 +225,7 @@ public class SampleUtils {
     
                         List lines = reader.readLines();
                         for (String line : lines) {
    -                        samplesFromFiles.add(line);
    +                        samplesFromFiles.add(line.trim());
                         }
                     } catch (FileNotFoundException e) {
                         samplesFromFiles.add(SAMPLE_EXPRESSION); // not a file, so must be a sample
    @@ -212,7 +234,8 @@ public class SampleUtils {
     
                 return samplesFromFiles;
             }
    -        return new ArrayList();
    +
    +        return new HashSet();
         }
     
         public static Set getSamplesFromCommandLineInput(Collection vcfSamples, Collection sampleExpressions) {
    diff --git a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java
    index a5ac10250..15d34a348 100644
    --- a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java
    @@ -1,10 +1,5 @@
     package org.broadinstitute.sting.utils;
     
    -import com.google.java.contract.Ensures;
    -import com.google.java.contract.Invariant;
    -import com.google.java.contract.Requires;
    -
    -import java.io.PrintStream;
     
     /**
      * A useful simple system for timing code.  This code is not thread safe!
    @@ -13,11 +8,6 @@ import java.io.PrintStream;
      * Date: Dec 10, 2010
      * Time: 9:07:44 AM
      */
    -@Invariant({
    -        "elapsed >= 0",
    -        "startTime >= 0",
    -        "name != null",
    -        "! running || startTime > 0"})
     public class SimpleTimer {
         final private String name;
         private long elapsed = 0l;
    @@ -27,7 +17,6 @@ public class SimpleTimer {
         /**
          * Creates an anonymous simple timer
          */
    -    @Ensures("name != null && name.equals(\"Anonymous\")")
         public SimpleTimer() {
             this("Anonymous");
         }
    @@ -36,8 +25,6 @@ public class SimpleTimer {
          * Creates a simple timer named name
          * @param name of the timer, must not be null
          */
    -    @Requires("name != null")
    -    @Ensures("this.name != null && this.name.equals(name)")
         public SimpleTimer(String name) {
             this.name = name;
         }
    @@ -45,7 +32,6 @@ public class SimpleTimer {
         /**
          * @return the name associated with this timer
          */
    -    @Ensures("result != null")
         public synchronized String getName() {
             return name;
         }
    @@ -56,8 +42,6 @@ public class SimpleTimer {
          *
          * @return this object, for programming convenience
          */
    -    @Requires("running == false")
    -    @Ensures({"result != null", "elapsed == 0l"})
         public synchronized SimpleTimer start() {
             elapsed = 0l;
             restart();
    @@ -71,8 +55,6 @@ public class SimpleTimer {
          *
          * @return this object, for programming convenience
          */
    -    @Requires("running == false")
    -    @Ensures("result != null")
         public synchronized SimpleTimer restart() {
             running = true;
             startTime = currentTime();
    @@ -99,8 +81,6 @@ public class SimpleTimer {
          *
          * @return this object, for programming convenience
          */
    -    @Requires("running == true")
    -    @Ensures({"result != null", "elapsed >= old(elapsed)", "running == false"})
         public synchronized SimpleTimer stop() {
             running = false;
             elapsed += currentTime() - startTime;
    @@ -113,9 +93,6 @@ public class SimpleTimer {
          *
          * @return this time, in seconds
          */
    -    @Ensures({
    -            "result >= (elapsed/1000.0)",
    -            "result >= 0"})
         public synchronized double getElapsedTime() {
             return (running ? (currentTime() - startTime + elapsed) : elapsed) / 1000.0;
         }
    diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java
    index 6ce492c63..f0eb5d399 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java
    @@ -58,33 +58,6 @@ public class Utils {
             return (int)(maxElements / JAVA_DEFAULT_HASH_LOAD_FACTOR) + 2;
         }
     
    -    public static String getClassName(Class c) {
    -        String FQClassName = c.getName();
    -        int firstChar;
    -        firstChar = FQClassName.lastIndexOf ('.') + 1;
    -        if ( firstChar > 0 ) {
    -            FQClassName = FQClassName.substring ( firstChar );
    -        }
    -        return FQClassName;
    -    }
    -
    -
    -    // returns package and class name
    -    public static String getFullClassName(Class c) {
    -        return  c.getName();
    -    }
    -
    -    // returns the package without the classname, empty string if
    -    // there is no package
    -    public static String getPackageName(Class c) {
    -        String fullyQualifiedName = c.getName();
    -        int lastDot = fullyQualifiedName.lastIndexOf ('.');
    -        if (lastDot==-1){ return ""; }
    -        return fullyQualifiedName.substring (0, lastDot);
    -    }
    -
    -
    -
         /**
          * Compares two objects, either of which might be null.
          *
    @@ -107,20 +80,24 @@ public class Utils {
         }
     
         public static void warnUser(final String msg) {
    +        warnUser(logger, msg);
    +    }
    +    
    +    public static void warnUser(final Logger logger, final String msg) {
             logger.warn(String.format("********************************************************************************"));
             logger.warn(String.format("* WARNING:"));
             logger.warn(String.format("*"));
    -        prettyPrintWarningMessage(msg);
    +        prettyPrintWarningMessage(logger, msg);
             logger.warn(String.format("********************************************************************************"));
         }
    -    
     
         /**
          * pretty print the warning message supplied
          *
    +     * @param logger logger for the message
          * @param message the message
          */
    -    private static void prettyPrintWarningMessage(String message) {
    +    private static void prettyPrintWarningMessage(Logger logger, String message) {
             StringBuilder builder = new StringBuilder(message);
             while (builder.length() > 70) {
                 int space = builder.lastIndexOf(" ", 70);
    @@ -609,6 +586,12 @@ public class Utils {
             return rcbases;
         }
     
    +    static public final  List reverse(final List l) {
    +        final List newL = new ArrayList(l);
    +        Collections.reverse(newL);
    +        return newL;
    +    }
    +
         /**
          * Reverse an int array of bases
          *
    diff --git a/public/java/src/org/broadinstitute/sting/utils/bed/BedParser.java b/public/java/src/org/broadinstitute/sting/utils/bed/BedParser.java
    deleted file mode 100644
    index b95165841..000000000
    --- a/public/java/src/org/broadinstitute/sting/utils/bed/BedParser.java
    +++ /dev/null
    @@ -1,104 +0,0 @@
    -package org.broadinstitute.sting.utils.bed;
    -
    -import org.broadinstitute.sting.utils.GenomeLoc;
    -import org.broadinstitute.sting.utils.GenomeLocParser;
    -import org.broadinstitute.sting.utils.exceptions.UserException;
    -
    -import java.io.*;
    -import java.util.ArrayList;
    -import java.util.List;
    -
    -/**
    - * Created by IntelliJ IDEA.
    - * User: aaron
    - * Date: Oct 5, 2009
    - * Time: 5:46:45 PM
    - */
    -public class BedParser {
    -    // the GATk operates as a one based location, bed files are 0 based
    -    static final int TO_ONE_BASED_ADDITION = 1;
    -
    -    // the buffered reader input
    -    private final BufferedReader mIn;
    -
    -    private GenomeLocParser genomeLocParser;
    -
    -    // our array of locations
    -    private List mLocations;
    -
    -    /**
    -     * parse a bed file, given it's location
    -     *
    -     * @param fl
    -     */
    -    public BedParser(GenomeLocParser genomeLocParser,File fl) {
    -        this.genomeLocParser = genomeLocParser;
    -        try {
    -            mIn = new BufferedReader(new FileReader(fl));
    -        } catch (FileNotFoundException e) {
    -            throw new UserException.CouldNotReadInputFile(fl, e);
    -        }
    -        mLocations = parseLocations();
    -    }
    -
    -    /**
    -     * parse a bed file, given an input reader
    -     *
    -     * @param fl the bed file
    -     */
    -    public BedParser(BufferedReader fl) {
    -        mIn = fl;
    -        mLocations = parseLocations();
    -    }
    -
    -    /**
    -     * parse out the locations
    -     *
    -     * @return a list of GenomeLocs, sorted and merged
    -     */
    -    private List parseLocations() {
    -        String line = null;
    -        List locArray = new ArrayList();
    -        try {
    -            while ((line = mIn.readLine()) != null) {
    -                locArray.add(parseLocation(genomeLocParser,line));
    -            }
    -        } catch (IOException e) {
    -            throw new UserException.MalformedFile("Unable to parse line in BED file.");
    -        }
    -        return locArray;
    -    }
    -
    -    /**
    -     * parse a single location
    -     *
    -     * @param line the line, as a string
    -     * @return a parsed genome loc
    -     */
    -    public static GenomeLoc parseLocation(GenomeLocParser genomeLocParser,String line) {
    -        String contig;
    -        int start;
    -        int stop;
    -        try {
    -            String parts[] = line.split("\\s+");
    -            contig = parts[0];
    -            start = Integer.valueOf(parts[1]) + TO_ONE_BASED_ADDITION;
    -            stop = Integer.valueOf(parts[2]); // the ending point is an open interval
    -        } catch (Exception e) {
    -            throw new UserException.MalformedFile("Unable to process bed file line = " + line, e);
    -        }
    -
    -        // we currently drop the rest of the bed record, which can contain names, scores, etc
    -        return genomeLocParser.createGenomeLoc(contig, start, stop, true);
    -
    -    }
    -
    -    /**
    -     * return the sorted, and merged (for overlapping regions)
    -     *
    -     * @return an arraylist
    -     */
    -    public List getLocations() {
    -        return mLocations;
    -    }
    -}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java
    index bc200372f..4a253b217 100644
    --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java
    @@ -4,9 +4,9 @@ import com.google.java.contract.Requires;
     import net.sf.samtools.Cigar;
     import net.sf.samtools.CigarElement;
     import net.sf.samtools.CigarOperator;
    -import net.sf.samtools.SAMRecord;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
     import org.broadinstitute.sting.utils.exceptions.UserException;
    +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
     
     import java.util.Iterator;
     import java.util.Stack;
    @@ -39,14 +39,14 @@ public class ClippingOp {
          * @param algorithm
          * @param read
          */
    -    public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord read) {
    +    public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord read) {
             byte[] quals = read.getBaseQualities();
             byte[] bases = read.getReadBases();
     
             switch (algorithm) {
                 // important note:
                 //   it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0
    -            //   because you're not guaranteed to get a pointer to the actual array of bytes in the SAMRecord
    +            //   because you're not guaranteed to get a pointer to the actual array of bytes in the GATKSAMRecord
                 case WRITE_NS:
                     for (int i = start; i <= stop; i++)
                         bases[i] = 'N';
    @@ -248,9 +248,9 @@ public class ClippingOp {
         }
     
         @Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1", "!read.getReadUnmappedFlag()"})
    -    private SAMRecord hardClip (SAMRecord read, int start, int stop) {
    -        if (start == 0 && stop == read.getReadLength() -1)
    -            return new SAMRecord(read.getHeader());
    +    private GATKSAMRecord hardClip (GATKSAMRecord read, int start, int stop) {
    +        if (start == 0 && stop == read.getReadLength() - 1)
    +            return new GATKSAMRecord(read.getHeader());
     
             // If the read is unmapped there is no Cigar string and neither should we create a new cigar string
             CigarShift cigarShift = (read.getReadUnmappedFlag()) ? new CigarShift(new Cigar(), 0, 0) : hardClipCigar(read.getCigar(), start, stop);
    @@ -265,9 +265,9 @@ public class ClippingOp {
             System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength);
             System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength);
     
    -        SAMRecord hardClippedRead;
    +        GATKSAMRecord hardClippedRead;
             try {
    -            hardClippedRead = (SAMRecord) read.clone();
    +            hardClippedRead = (GATKSAMRecord) read.clone();
             } catch (CloneNotSupportedException e) {
                 throw new ReviewedStingException("Where did the clone go?");
             }
    @@ -324,6 +324,8 @@ public class ClippingOp {
     
                     if (index <= stop && cigarElementIterator.hasNext())
                         cigarElement = cigarElementIterator.next();
    +                else
    +                    break;
                 }
     
                 // add the remaining cigar elements
    @@ -363,6 +365,8 @@ public class ClippingOp {
                     index += shift;
                     if (index < start && cigarElementIterator.hasNext())
                         cigarElement = cigarElementIterator.next();
    +                else
    +                    break;
                 }
     
                 // check if we are hard clipping indels
    @@ -394,7 +398,9 @@ public class ClippingOp {
     
             for (int i = 1; i <= 2; i++) {
                 int shift = 0;
    +            int totalHardClip = 0;
                 boolean readHasStarted = false;
    +            boolean addedHardClips = false;
     
                 while(!cigarStack.empty()) {
                     CigarElement cigarElement = cigarStack.pop();
    @@ -404,14 +410,33 @@ public class ClippingOp {
                             cigarElement.getOperator() != CigarOperator.DELETION &&
                             cigarElement.getOperator() != CigarOperator.HARD_CLIP)
                         readHasStarted = true;
    +
    +                else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.HARD_CLIP)
    +                    totalHardClip += cigarElement.getLength();
    +
                     else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.INSERTION)
                         shift += cigarElement.getLength();
     
    -                if (readHasStarted || cigarElement.getOperator() == CigarOperator.HARD_CLIP) {
    -                    if (i==1)
    +                else if ( !readHasStarted && cigarElement.getOperator() == CigarOperator.DELETION)
    +                    totalHardClip += cigarElement.getLength();
    +
    +                if (readHasStarted) {
    +                    if (i==1) {
    +                        if (!addedHardClips) {
    +                            if (totalHardClip > 0)
    +                                inverseCigarStack.push(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP));
    +                            addedHardClips = true;
    +                        }
                             inverseCigarStack.push(cigarElement);
    -                    else
    +                    }
    +                    else {
    +                        if (!addedHardClips) {
    +                            if (totalHardClip > 0)
    +                                cleanCigar.add(new CigarElement(totalHardClip, CigarOperator.HARD_CLIP));
    +                            addedHardClips = true;
    +                        }
                             cleanCigar.add(cigarElement);
    +                    }
                     }
                 }
                 // first pass  (i=1) is from end to start of the cigar elements
    @@ -428,39 +453,35 @@ public class ClippingOp {
         }
     
         private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) {
    -        int shift = 0;
    +        int newShift = 0;
    +        int oldShift = 0;
     
    -        // Rewind to previous start (by counting everything that was already clipped in this read)
    -        for (CigarElement cigarElement : oldCigar.getCigarElements()) {
    -            if (!cigarElement.getOperator().consumesReferenceBases())
    -                shift -= cigarElement.getLength();
    -            else
    -                break;
    -        }
    -
    -        // Advance to new start (by counting everything new that has been clipped )
             for (CigarElement cigarElement : newCigar.getCigarElements()) {
    -            if (!cigarElement.getOperator().consumesReferenceBases())
    -                shift += cigarElement.getLength();
    +            if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP)
    +                newShift += cigarElement.getLength();
                 else
                     break;
             }
     
    -        return shift;
    +        for (CigarElement cigarElement : oldCigar.getCigarElements()) {
    +            if (cigarElement.getOperator() == CigarOperator.HARD_CLIP || cigarElement.getOperator() == CigarOperator.SOFT_CLIP )
    +                oldShift += Math.min(cigarElement.getLength(), newShift - oldShift);
    +            else
    +                break;
    +        }
    +        return newShift - oldShift;
         }
     
         private int calculateHardClippingAlignmentShift(CigarElement cigarElement, int clippedLength) {
    -        if (cigarElement.getOperator() == CigarOperator.INSERTION) {
    -            int cigarElementLength = cigarElement.getLength();
    -            if (clippedLength >= cigarElementLength)
    -                return -cigarElement.getLength();
    -            else
    -                return -clippedLength;
    -        }
    +        // Insertions should be discounted from the total hard clip count
    +        if (cigarElement.getOperator() == CigarOperator.INSERTION)
    +            return -clippedLength;
     
    -        if (cigarElement.getOperator() == CigarOperator.DELETION)
    +        // Deletions should be added to the total hard clip count
    +        else if (cigarElement.getOperator() == CigarOperator.DELETION)
                 return cigarElement.getLength();
     
    +        // There is no shift if we are not clipping an indel
             return 0;
         }
     
    diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java
    index 26c25850a..a6df986ba 100644
    --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java
    @@ -1,8 +1,10 @@
     package org.broadinstitute.sting.utils.clipreads;
     
     import com.google.java.contract.Requires;
    -import net.sf.samtools.SAMRecord;
    +import net.sf.samtools.CigarElement;
    +import net.sf.samtools.CigarOperator;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
    +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
     import org.broadinstitute.sting.utils.sam.ReadUtils;
     
     import java.util.ArrayList;
    @@ -12,7 +14,7 @@ import java.util.List;
      * A simple collection of the clipping operations to apply to a read along with its read
      */
     public class ReadClipper {
    -    SAMRecord read;
    +    GATKSAMRecord read;
         boolean wasClipped;
         List ops = null;
     
    @@ -21,7 +23,7 @@ public class ReadClipper {
          *
          * @param read
          */
    -    public ReadClipper(final SAMRecord read) {
    +    public ReadClipper(final GATKSAMRecord read) {
             this.read = read;
             this.wasClipped = false;
         }
    @@ -44,50 +46,64 @@ public class ReadClipper {
             return wasClipped;
         }
     
    -    public SAMRecord getRead() {
    +    public GATKSAMRecord getRead() {
             return read;
         }
     
    -    public SAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) {
    +    public GATKSAMRecord hardClipByReferenceCoordinatesLeftTail(int refStop) {
             return hardClipByReferenceCoordinates(-1, refStop);
         }
     
    -    public SAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) {
    +    public GATKSAMRecord hardClipByReferenceCoordinatesRightTail(int refStart) {
             return hardClipByReferenceCoordinates(refStart, -1);
         }
     
    -    private SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
    -        int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart);
    -        int stop =  (refStop  < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop);
    +    private int numDeletions(GATKSAMRecord read) {
    +        int result = 0;
    +        for (CigarElement e: read.getCigar().getCigarElements()) {
    +            if ( e.getOperator() == CigarOperator.DELETION || e.getOperator() == CigarOperator.D )
     +                result += e.getLength();
    +        }
    +        return result;
    +    }
    +
    +    protected GATKSAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
    +        int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart, ReadUtils.ClippingTail.RIGHT_TAIL);
    +        int stop =  (refStop  < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop, ReadUtils.ClippingTail.LEFT_TAIL);
     
             if (start < 0 || stop > read.getReadLength() - 1)
                 throw new ReviewedStingException("Trying to clip before the start or after the end of a read");
     
    -        // TODO add requires statement/check in the Hardclip function
             if ( start > stop )
    -            stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, ReadUtils.getRefCoordSoftUnclippedEnd(read));
    +            throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!");
     
    -        //System.out.println("Clipping start/stop: " + start + "/" + stop);
             this.addOp(new ClippingOp(start, stop));
    -        SAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
    +        GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES);
             this.ops = null;
             return clippedRead;
         }
     
    -    public SAMRecord hardClipByReadCoordinates(int start, int stop) {
    +    public GATKSAMRecord hardClipByReadCoordinates(int start, int stop) {
             this.addOp(new ClippingOp(start, stop));
             return clipRead(ClippingRepresentation.HARDCLIP_BASES);
         }
     
         @Requires("left <= right")
    -    public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
    +    public GATKSAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
             if (left == right)
    -            return new SAMRecord(read.getHeader());
    -        this.read = hardClipByReferenceCoordinates(right, -1);
    -        return hardClipByReferenceCoordinates(-1, left);
    +            return new GATKSAMRecord(read.getHeader());
    +        GATKSAMRecord leftTailRead = hardClipByReferenceCoordinates(right, -1);
    +
    +        // after clipping one tail, it is possible that the consequent hard clipping of adjacent deletions
    +        // make the left cut index no longer part of the read. In that case, clip the read entirely.
    +        if (left > leftTailRead.getAlignmentEnd())
    +            return new GATKSAMRecord(read.getHeader());
    +
    +        ReadClipper clipper = new ReadClipper(leftTailRead);
    +        return clipper.hardClipByReferenceCoordinatesLeftTail(left);
         }
     
    -    public SAMRecord hardClipLowQualEnds(byte lowQual) {
    +    public GATKSAMRecord hardClipLowQualEnds(byte lowQual) {
             byte [] quals = read.getBaseQualities();
             int leftClipIndex = 0;
             int rightClipIndex = read.getReadLength() - 1;
    @@ -98,7 +114,7 @@ public class ReadClipper {
     
             // if the entire read should be clipped, then return an empty read. (--todo: maybe null is better? testing this for now)
             if (leftClipIndex > rightClipIndex)
    -            return (new SAMRecord(read.getHeader()));
    +            return (new GATKSAMRecord(read.getHeader()));
     
             if (rightClipIndex < read.getReadLength() - 1) {
                 this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1));
    @@ -109,18 +125,51 @@ public class ReadClipper {
             return this.clipRead(ClippingRepresentation.HARDCLIP_BASES);
         }
     
    +    public GATKSAMRecord hardClipSoftClippedBases () {
    +        int readIndex = 0;
     +        int cutLeft = -1;            // last position to hard clip on the left tail (inclusive)
    +        int cutRight = -1;           // first position to hard clip (inclusive)
    +        boolean rightTail = false;   // trigger to stop clipping the left tail and start cutting the right tail
    +
    +        for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
    +            if (cigarElement.getOperator() == CigarOperator.SOFT_CLIP) {
    +                if (rightTail) {
    +                    cutRight = readIndex;
    +                }
    +                else {
    +                    cutLeft = readIndex + cigarElement.getLength() - 1;
    +                }
    +            }
    +            else if (cigarElement.getOperator() != CigarOperator.HARD_CLIP)
    +                rightTail = true;
    +
    +            if (cigarElement.getOperator().consumesReadBases())
    +                readIndex += cigarElement.getLength();
    +        }
    +
    +        // It is extremely important that we cut the end first otherwise the read coordinates change.
    +        if (cutRight >= 0)
    +            this.addOp(new ClippingOp(cutRight, read.getReadLength() - 1));
    +        if (cutLeft >= 0)
    +            this.addOp(new ClippingOp(0, cutLeft));
    +
    +        return clipRead(ClippingRepresentation.HARDCLIP_BASES);
    +    }
    +
    +
    +
         /**
          * Return a new read corresponding to this.read that's been clipped according to ops, if any are present.
          *
          * @param algorithm
          * @return
          */
    -    public SAMRecord clipRead(ClippingRepresentation algorithm) {
    +    public GATKSAMRecord clipRead(ClippingRepresentation algorithm) {
             if (ops == null)
                 return getRead();
             else {
                 try {
    -                SAMRecord clippedRead = (SAMRecord) read.clone();
    +                GATKSAMRecord clippedRead = (GATKSAMRecord) read.clone();
                     for (ClippingOp op : getOps()) {
                         clippedRead = op.apply(algorithm, clippedRead);
                     }
    @@ -131,4 +180,21 @@ public class ReadClipper {
                 }
             }
         }
    +
    +    public GATKSAMRecord hardClipLeadingInsertions() {
    +        for(CigarElement cigarElement : read.getCigar().getCigarElements()) {
    +            if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP &&
    +                cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.DELETION)
    +                break;
    +
    +            else if (cigarElement.getOperator() == CigarOperator.INSERTION) {
    +                this.addOp(new ClippingOp(0, cigarElement.getLength() - 1));
    +            }
    +
    +            else if (cigarElement.getOperator() == CigarOperator.DELETION) {
    +                throw new ReviewedStingException("No read should start with a deletion. Aligner bug?");
    +            }
    +        }
    +        return clipRead(ClippingRepresentation.HARDCLIP_BASES);
    +    }
     }
    diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
    index 1919ccbf0..4082a5597 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableCodec.java
    @@ -86,7 +86,13 @@ public class TableCodec implements ReferenceDependentFeatureCodec {
         public Object readHeader(LineReader reader) {
             String line = "";
             try {
    +            boolean isFirst = true;
                 while ((line = reader.readLine()) != null) {
    +                System.out.println(line);
    +                if ( isFirst && ! line.startsWith(headerDelimiter) && ! line.startsWith(commentDelimiter)) {
    +                    throw new UserException.MalformedFile("TableCodec file does not have a header");
    +                }
    +		isFirst &= line.startsWith(commentDelimiter);
                     if (line.startsWith(headerDelimiter)) {
                         if (header.size() > 0) throw new IllegalStateException("Input table file seems to have two header lines.  The second is = " + line);
                         String spl[] = line.split(delimiterRegex);
    diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java
    index a85849f0b..4b5c51bd4 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/table/TableFeature.java
    @@ -1,7 +1,9 @@
     package org.broadinstitute.sting.utils.codecs.table;
     
    +
     import org.broad.tribble.Feature;
     import org.broadinstitute.sting.utils.GenomeLoc;
    +import org.broadinstitute.sting.utils.Utils;
     
     import java.util.List;
     
    @@ -44,6 +46,10 @@ public class TableFeature implements Feature {
             return values.get(columnPosition);
         }
     
    +    public String toString() {
    +        return String.format("%s\t%s",position.toString(), Utils.join("\t",values));
    +    }
    +
         public String get(String columnName) {
             int position = keys.indexOf(columnName);
             if (position < 0) throw new IllegalArgumentException("We don't have a column named " + columnName);
    diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
    index bb212e128..0e0cb14bf 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
    @@ -6,6 +6,7 @@ import org.broad.tribble.FeatureCodec;
     import org.broad.tribble.NameAwareCodec;
     import org.broad.tribble.TribbleException;
     import org.broad.tribble.readers.LineReader;
    +import org.broad.tribble.util.BlockCompressedInputStream;
     import org.broad.tribble.util.ParsingUtils;
     import org.broadinstitute.sting.gatk.refdata.SelfScopingFeatureCodec;
     import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
    @@ -35,6 +36,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
         // for ParsingUtils.split
         protected String[] GTValueArray = new String[100];
         protected String[] genotypeKeyArray = new String[100];
    +    protected String[] infoFieldArray = new String[1000];
         protected String[] infoValueArray = new String[1000];
     
         // for performance testing purposes
    @@ -114,15 +116,21 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
                         }
                         arrayIndex++;
                     }
    +
    +                boolean sawFormatTag = false;
                     if ( arrayIndex < strings.length ) {
                         if ( !strings[arrayIndex].equals("FORMAT") )
                             throw new TribbleException.InvalidHeader("we were expecting column name 'FORMAT' but we saw '" + strings[arrayIndex] + "'");
    +                    sawFormatTag = true;
                         arrayIndex++;
                     }
     
    -                while (arrayIndex < strings.length)
    +                while ( arrayIndex < strings.length )
                         auxTags.add(strings[arrayIndex++]);
     
    +                if ( sawFormatTag && auxTags.size() == 0 )
    +                    throw new UserException.MalformedVCFHeader("The FORMAT field was provided but there is no genotype/sample data");
    +
                 } else {
                     if ( str.startsWith("##INFO=") ) {
                         VCFInfoHeaderLine info = new VCFInfoHeaderLine(str.substring(7),version);
    @@ -154,16 +162,27 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
          * @return a feature, (not guaranteed complete) that has the correct start and stop
          */
         public Feature decodeLoc(String line) {
    -        String[] locParts = new String[6];
    -        ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
    +        lineNo++;
    +
    +        // the same line reader is not used for parsing the header and parsing lines; if we see a '#', we have hit a header line
    +        if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
    +
    +        // our header cannot be null, we need the genotype sample names and counts
    +        if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
    +
    +        final String[] locParts = new String[6];
    +        int nParts = ParsingUtils.split(line, locParts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
    +
    +        if ( nParts != 6 )
    +            throw new UserException.MalformedVCF("there aren't enough columns for line " + line, lineNo);
     
             // get our alleles (because the end position depends on them)
    -        String ref = getCachedString(locParts[3].toUpperCase());
    -        String alts = getCachedString(locParts[4].toUpperCase());
    -        List alleles = parseAlleles(ref, alts, lineNo);
    +        final String ref = getCachedString(locParts[3].toUpperCase());
    +        final String alts = getCachedString(locParts[4].toUpperCase());
    +        final List alleles = parseAlleles(ref, alts, lineNo);
     
             // find out our location
    -        int start = Integer.valueOf(locParts[1]);
    +        final int start = Integer.valueOf(locParts[1]);
             int stop = start;
     
             // ref alleles don't need to be single bases for monomorphic sites
    @@ -199,35 +218,31 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
          * @return a VariantContext
          */
         public Feature decode(String line) {
    -        return reallyDecode(line);
    -    }
    +        // the same line reader is not used for parsing the header and parsing lines; if we see a '#', we have hit a header line
    +        if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
     
    -    private Feature reallyDecode(String line) {
    -            // the same line reader is not used for parsing the header and parsing lines, if we see a #, we've seen a header line
    -            if (line.startsWith(VCFHeader.HEADER_INDICATOR)) return null;
    +        // our header cannot be null, we need the genotype sample names and counts
    +        if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
     
    -            // our header cannot be null, we need the genotype sample names and counts
    -            if (header == null) throw new ReviewedStingException("VCF Header cannot be null when decoding a record");
    +        if (parts == null)
    +            parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
     
    -            if (parts == null)
    -                parts = new String[Math.min(header.getColumnCount(), NUM_STANDARD_FIELDS+1)];
    +        int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
     
    -            int nParts = ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR, true);
    +        // if we don't have a header, or we have a header with no genotyping data, check that we have eight columns.  Otherwise check that we have nine (normal columns + genotyping data)
    +        if (( (header == null || !header.hasGenotypingData()) && nParts != NUM_STANDARD_FIELDS) ||
    +             (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
    +            throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
    +                    " tokens, and saw " + nParts + " )", lineNo);
     
    -            // if we have don't have a header, or we have a header with no genotyping data check that we have eight columns.  Otherwise check that we have nine (normal colummns + genotyping data)
    -            if (( (header == null || (header != null && !header.hasGenotypingData())) && nParts != NUM_STANDARD_FIELDS) ||
    -                 (header != null && header.hasGenotypingData() && nParts != (NUM_STANDARD_FIELDS + 1)) )
    -                throw new UserException.MalformedVCF("there aren't enough columns for line " + line + " (we expected " + (header == null ? NUM_STANDARD_FIELDS : NUM_STANDARD_FIELDS + 1) +
    -                        " tokens, and saw " + nParts + " )", lineNo);
    -
    -            return parseVCFLine(parts);
    +        return parseVCFLine(parts);
         }
     
         protected void generateException(String message) {
             throw new UserException.MalformedVCF(message, lineNo);
         }
     
    -    private static void generateException(String message, int lineNo) {
    +    protected static void generateException(String message, int lineNo) {
             throw new UserException.MalformedVCF(message, lineNo);
         }
     
    @@ -345,23 +360,31 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
                 generateException("The VCF specification requires a valid info field");
     
             if ( !infoField.equals(VCFConstants.EMPTY_INFO_FIELD) ) {
    -            int infoValueSplitSize = ParsingUtils.split(infoField, infoValueArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR);
    -            for (int i = 0; i < infoValueSplitSize; i++) {
    +            if ( infoField.indexOf("\t") != -1 || infoField.indexOf(" ") != -1 )
    +                generateException("The VCF specification does not allow for whitespace in the INFO field");
    +
    +            int infoFieldSplitSize = ParsingUtils.split(infoField, infoFieldArray, VCFConstants.INFO_FIELD_SEPARATOR_CHAR, false);
    +            for (int i = 0; i < infoFieldSplitSize; i++) {
                     String key;
                     Object value;
     
    -                int eqI = infoValueArray[i].indexOf("=");
    +                int eqI = infoFieldArray[i].indexOf("=");
                     if ( eqI != -1 ) {
    -                    key = infoValueArray[i].substring(0, eqI);
    -                    String str = infoValueArray[i].substring(eqI+1, infoValueArray[i].length());
    +                    key = infoFieldArray[i].substring(0, eqI);
    +                    String str = infoFieldArray[i].substring(eqI+1);
     
    -                    // lets see if the string contains a , separator
    -                    if ( str.contains(",") )
    -                        value = Arrays.asList(str.split(","));
    -                    else
    -                        value = str;
    +                    // split on the INFO field separator
    +                    int infoValueSplitSize = ParsingUtils.split(str, infoValueArray, VCFConstants.INFO_FIELD_ARRAY_SEPARATOR_CHAR, false);
    +                    if ( infoValueSplitSize == 1 ) {
    +                        value = infoValueArray[0];
    +                    } else {
    +                        ArrayList valueList = new ArrayList(infoValueSplitSize);
    +                        for ( int j = 0; j < infoValueSplitSize; j++ )
    +                            valueList.add(infoValueArray[j]);
    +                        value = valueList;
    +                    }
                     } else {
    -                    key = infoValueArray[i];
    +                    key = infoFieldArray[i];
                         value = true;
                     }
     
    @@ -369,7 +392,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
                 }
             }
     
    -        attributes.put(VariantContext.ID_KEY, id);
    +        if ( ! id.equals(VCFConstants.EMPTY_ID_FIELD) )
    +            attributes.put(VariantContext.ID_KEY, id);
             return attributes;
         }
     
    @@ -587,7 +611,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
         public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) {
             try {
                 return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||
    -                    isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE);
    +                    isVCFStream(new GZIPInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE) ||
    +                    isVCFStream(new BlockCompressedInputStream(new FileInputStream(potentialInput)), MAGIC_HEADER_LINE);
             } catch ( FileNotFoundException e ) {
                 return false;
             } catch ( IOException e ) {
    @@ -598,12 +623,17 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
         private final static boolean isVCFStream(final InputStream stream, final String MAGIC_HEADER_LINE) {
             try {
                 byte[] buff = new byte[MAGIC_HEADER_LINE.length()];
    -            stream.read(buff, 0, MAGIC_HEADER_LINE.length());
    -            String firstLine = new String(buff);
    -            stream.close();
    -            return firstLine.startsWith(MAGIC_HEADER_LINE);
    +            int nread = stream.read(buff, 0, MAGIC_HEADER_LINE.length());
    +            boolean eq = Arrays.equals(buff, MAGIC_HEADER_LINE.getBytes());
    +            return eq;
    +//            String firstLine = new String(buff);
    +//            return firstLine.startsWith(MAGIC_HEADER_LINE);
             } catch ( IOException e ) {
                 return false;
    +        } catch ( RuntimeException e ) {
    +            return false;
    +        } finally {
    +            try { stream.close(); } catch ( IOException e ) {}
             }
         }
     }
    diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java
    new file mode 100644
    index 000000000..71ec4ce1b
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/IndexingVCFWriter.java
    @@ -0,0 +1,144 @@
    +/*
    + * Copyright (c) 2011, The Broad Institute
    + *
    + * Permission is hereby granted, free of charge, to any person
    + * obtaining a copy of this software and associated documentation
    + * files (the "Software"), to deal in the Software without
    + * restriction, including without limitation the rights to use,
    + * copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the
    + * Software is furnished to do so, subject to the following
    + * conditions:
    + *
    + * The above copyright notice and this permission notice shall be
    + * included in all copies or substantial portions of the Software.
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    + * OTHER DEALINGS IN THE SOFTWARE.
    + */
    +
    +package org.broadinstitute.sting.utils.codecs.vcf;
    +
    +import com.google.java.contract.Ensures;
    +import com.google.java.contract.Requires;
    +import net.sf.samtools.SAMSequenceDictionary;
    +import org.broad.tribble.Tribble;
    +import org.broad.tribble.TribbleException;
    +import org.broad.tribble.index.DynamicIndexCreator;
    +import org.broad.tribble.index.Index;
    +import org.broad.tribble.index.IndexFactory;
    +import org.broad.tribble.util.LittleEndianOutputStream;
    +import org.broad.tribble.util.PositionalStream;
    +import org.broadinstitute.sting.gatk.refdata.tracks.IndexDictionaryUtils;
    +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
    +import org.broadinstitute.sting.utils.exceptions.UserException;
    +import org.broadinstitute.sting.utils.variantcontext.VariantContext;
    +
    +import java.io.*;
    +
    +/**
    + * this class writes VCF files
    + */
    +public abstract class IndexingVCFWriter implements VCFWriter {
    +    final private String name;
    +    private final SAMSequenceDictionary refDict;
    +
    +    private OutputStream outputStream;
    +    private PositionalStream positionalStream = null;
    +    private DynamicIndexCreator indexer = null;
    +    private LittleEndianOutputStream idxStream = null;
    +
    +    @Requires({"name != null",
    +            "! ( location == null && output == null )",
    +            "! ( enableOnTheFlyIndexing && location == null )"})
    +    protected IndexingVCFWriter(final String name, final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing) {
    +        outputStream = output;
    +        this.name = name;
    +        this.refDict = refDict;
    +
    +        if ( enableOnTheFlyIndexing ) {
    +            try {
    +                idxStream = new LittleEndianOutputStream(new FileOutputStream(Tribble.indexFile(location)));
    +                //System.out.println("Creating index on the fly for " + location);
    +                indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
    +                indexer.initialize(location, indexer.defaultBinSize());
    +                positionalStream = new PositionalStream(output);
    +                outputStream = positionalStream;
    +            } catch ( IOException ex ) {
    +                // No matter what we keep going, since we don't care if we can't create the index file
    +                idxStream = null;
    +                indexer = null;
    +                positionalStream = null;
    +            }
    +        }
    +    }
    +
    +    @Ensures("result != null")
    +    public OutputStream getOutputStream() {
    +        return outputStream;
    +    }
    +
    +    @Ensures("result != null")
    +    public String getStreamName() {
    +        return name;
    +    }
    +
    +    public abstract void writeHeader(VCFHeader header);
    +
    +    /**
    +     * attempt to close the VCF file
    +     */
    +    public void close() {
    +        // try to close the index stream (keep it separate to help debugging efforts)
    +        if ( indexer != null ) {
    +            try {
    +                Index index = indexer.finalizeIndex(positionalStream.getPosition());
    +                IndexDictionaryUtils.setIndexSequenceDictionary(index, refDict);
    +                index.write(idxStream);
    +                idxStream.close();
    +            } catch (IOException e) {
    +                throw new ReviewedStingException("Unable to close index for " + getStreamName(), e);
    +            }
    +        }
    +    }
    +
    +    /**
    +     * add a record to the file
    +     *
    +     * @param vc      the Variant Context object
    +     */
    +    public void add(VariantContext vc) {
    +        // if we are doing on the fly indexing, add the record ***before*** we write any bytes
    +        if ( indexer != null )
    +            indexer.addFeature(vc, positionalStream.getPosition());
    +    }
    +
    +    /**
    +     * Returns a reasonable "name" for this writer, to display to the user if something goes wrong
    +     *
    +     * @param location
    +     * @param stream
    +     * @return
    +     */
    +    protected static final String writerName(final File location, final OutputStream stream) {
    +        return location == null ? stream.toString() : location.getAbsolutePath();
    +    }
    +
    +    /**
    +     * Returns a output stream writing to location, or throws a UserException if this fails
    +     * @param location
    +     * @return
    +     */
    +    protected static OutputStream openOutputStream(final File location) {
    +        try {
    +            return new FileOutputStream(location);
    +        } catch (FileNotFoundException e) {
    +            throw new UserException.CouldNotCreateOutputFile(location, "Unable to create VCF writer", e);
    +        }
    +    }
    +}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java
    index d3705813c..0da7a100f 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/StandardVCFWriter.java
    @@ -24,6 +24,7 @@
     
     package org.broadinstitute.sting.utils.codecs.vcf;
     
    +import net.sf.samtools.SAMSequenceDictionary;
     import org.broad.tribble.Tribble;
     import org.broad.tribble.TribbleException;
     import org.broad.tribble.index.DynamicIndexCreator;
    @@ -44,46 +45,30 @@ import java.util.*;
     /**
      * this class writes VCF files
      */
    -public class StandardVCFWriter implements VCFWriter {
    +public class StandardVCFWriter extends IndexingVCFWriter {
    +    // the print stream we're writing to
    +    final protected BufferedWriter mWriter;
    +
    +    // should we write genotypes or just sites?
    +    final protected boolean doNotWriteGenotypes;
     
         // the VCF header we're storing
         protected VCFHeader mHeader = null;
     
    -    // the print stream we're writing to
    -    protected BufferedWriter mWriter;
    -    protected PositionalStream positionalStream = null;
    -
         // were filters applied?
         protected boolean filtersWereAppliedToContext = false;
     
    -    // should we write genotypes or just sites?
    -    protected boolean doNotWriteGenotypes = false;
    -
    -    protected DynamicIndexCreator indexer = null;
    -    protected File indexFile = null;
    -    LittleEndianOutputStream idxStream = null;
    -    File location = null;
    -
         /**         
          * create a VCF writer, given a file to write to
          *
          * @param location the file location to write to
          */
    -    public StandardVCFWriter(File location) {
    -        this(location, openOutputStream(location), true, false);
    +    public StandardVCFWriter(final File location, final SAMSequenceDictionary refDict) {
    +        this(location, openOutputStream(location), refDict, true, false);
         }
     
    -    public StandardVCFWriter(File location, boolean enableOnTheFlyIndexing) {
    -        this(location, openOutputStream(location), enableOnTheFlyIndexing, false);
    -    }
    -
    -    /**
    -     * create a VCF writer, given a stream to write to
    -     *
    -     * @param output   the file location to write to
    -     */
    -    public StandardVCFWriter(OutputStream output) {
    -        this(output, false);
    +    public StandardVCFWriter(File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing) {
    +        this(location, openOutputStream(location), refDict, enableOnTheFlyIndexing, false);
         }
     
         /**
    @@ -92,33 +77,23 @@ public class StandardVCFWriter implements VCFWriter {
          * @param output   the file location to write to
          * @param doNotWriteGenotypes   do not write genotypes
          */
    -    public StandardVCFWriter(OutputStream output, boolean doNotWriteGenotypes) {
    -        mWriter = new BufferedWriter(new OutputStreamWriter(output));
    +    public StandardVCFWriter(final OutputStream output, final SAMSequenceDictionary refDict, final boolean doNotWriteGenotypes) {
    +        this(null, output, refDict, false, doNotWriteGenotypes);
    +    }
    +
    +    public StandardVCFWriter(final File location, final OutputStream output, final SAMSequenceDictionary refDict, final boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
    +        super(writerName(location, output), location, output, refDict, enableOnTheFlyIndexing);
    +        mWriter = new BufferedWriter(new OutputStreamWriter(getOutputStream())); // todo -- fix buffer size
             this.doNotWriteGenotypes = doNotWriteGenotypes;
         }
     
    -    public StandardVCFWriter(File location, OutputStream output, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) {
    -        this.location = location;
    -
    -        if ( enableOnTheFlyIndexing ) {
    -            indexFile = Tribble.indexFile(location);
    -            try {
    -                idxStream = new LittleEndianOutputStream(new FileOutputStream(indexFile));
    -                //System.out.println("Creating index on the fly for " + location);
    -                indexer = new DynamicIndexCreator(IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME);
    -                indexer.initialize(location, indexer.defaultBinSize());
    -                positionalStream = new PositionalStream(output);
    -                output = positionalStream;
    -            } catch ( IOException ex ) {
    -                // No matter what we keep going, since we don't care if we can't create the index file
    -            }
    -        }
    -
    -        //mWriter = new BufferedWriter(new OutputStreamWriter(new PositionalStream(output)));
    -        mWriter = new BufferedWriter(new OutputStreamWriter(output));
    -        this.doNotWriteGenotypes = doNotWriteGenotypes;
    -    }
    +    // --------------------------------------------------------------------------------
    +    //
    +    // VCFWriter interface functions
    +    //
    +    // --------------------------------------------------------------------------------
     
    +    @Override
         public void writeHeader(VCFHeader header) {
             mHeader = doNotWriteGenotypes ? new VCFHeader(header.getMetaData()) : header;
             
    @@ -158,44 +133,24 @@ public class StandardVCFWriter implements VCFWriter {
                 mWriter.flush();  // necessary so that writing to an output stream will work
             }
             catch (IOException e) {
    -            throw new TribbleException("IOException writing the VCF header to " + locationString(), e);
    +            throw new ReviewedStingException("IOException writing the VCF header to " + getStreamName(), e);
             }
         }
     
    -    private String locationString() {
    -        return location == null ? mWriter.toString() : location.getAbsolutePath();
    -    }
    -
         /**
          * attempt to close the VCF file
          */
    +    @Override
         public void close() {
             // try to close the vcf stream
             try {
                 mWriter.flush();
                 mWriter.close();
             } catch (IOException e) {
    -            throw new TribbleException("Unable to close " + locationString() + " because of " + e.getMessage());
    +            throw new ReviewedStingException("Unable to close " + getStreamName(), e);
             }
     
    -        // try to close the index stream (keep it separate to help debugging efforts)
    -        if ( indexer != null ) {
    -            try {
    -                Index index = indexer.finalizeIndex(positionalStream.getPosition());
    -                index.write(idxStream);
    -                idxStream.close();
    -            } catch (IOException e) {
    -                throw new TribbleException("Unable to close index for " + locationString() + " because of " + e.getMessage());
    -            }
    -        }
    -    }
    -
    -    protected static OutputStream openOutputStream(File location) {
    -        try {
    -            return new FileOutputStream(location);
    -        } catch (FileNotFoundException e) {
    -            throw new TribbleException("Unable to create VCF file at location: " + location);
    -        }
    +        super.close();
         }
     
         /**
    @@ -203,28 +158,17 @@ public class StandardVCFWriter implements VCFWriter {
          *
          * @param vc      the Variant Context object
          */
    +    @Override
         public void add(VariantContext vc) {
    -        add(vc, false);
    -    }
    -
    -    /**
    -     * add a record to the file
    -     *
    -     * @param vc      the Variant Context object
    -     * @param refBaseShouldBeAppliedToEndOfAlleles *** THIS SHOULD BE FALSE EXCEPT FOR AN INDEL AT THE EXTREME BEGINNING OF A CONTIG (WHERE THERE IS NO PREVIOUS BASE, SO WE USE THE BASE AFTER THE EVENT INSTEAD)
    -     */
    -    public void add(VariantContext vc, boolean refBaseShouldBeAppliedToEndOfAlleles) {
             if ( mHeader == null )
    -            throw new IllegalStateException("The VCF Header must be written before records can be added: " + locationString());
    +            throw new IllegalStateException("The VCF Header must be written before records can be added: " + getStreamName());
     
             if ( doNotWriteGenotypes )
                 vc = VariantContext.modifyGenotypes(vc, null);
     
             try {
    -            vc = VariantContext.createVariantContextWithPaddedAlleles(vc, refBaseShouldBeAppliedToEndOfAlleles);
    -
    -            // if we are doing on the fly indexing, add the record ***before*** we write any bytes 
    -            if ( indexer != null ) indexer.addFeature(vc, positionalStream.getPosition());
    +            vc = VariantContext.createVariantContextWithPaddedAlleles(vc, false);
    +            super.add(vc);
     
                 Map alleleMap = new HashMap(vc.getAlleles().size());
                 alleleMap.put(Allele.NO_CALL, VCFConstants.EMPTY_ALLELE); // convenience for lookup
    @@ -275,7 +219,7 @@ public class StandardVCFWriter implements VCFWriter {
                 mWriter.write(VCFConstants.FIELD_SEPARATOR);
     
                 // FILTER
    -            String filters = vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (filtersWereAppliedToContext || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
    +            String filters = getFilterString(vc, filtersWereAppliedToContext);
                 mWriter.write(filters);
                 mWriter.write(VCFConstants.FIELD_SEPARATOR);
     
    @@ -317,9 +261,22 @@ public class StandardVCFWriter implements VCFWriter {
                 mWriter.write("\n");
                 mWriter.flush();  // necessary so that writing to an output stream will work
             } catch (IOException e) {
    -            throw new RuntimeException("Unable to write the VCF object to " + locationString());
    +            throw new RuntimeException("Unable to write the VCF object to " + getStreamName());
             }
    +    }
     
    +    // --------------------------------------------------------------------------------
    +    //
    +    // implementation functions
    +    //
    +    // --------------------------------------------------------------------------------
    +
    +    public static final String getFilterString(final VariantContext vc) {
    +        return getFilterString(vc, false);
    +    }
    +
    +    public static final String getFilterString(final VariantContext vc, boolean forcePASS) {
    +        return vc.isFiltered() ? ParsingUtils.join(";", ParsingUtils.sortList(vc.getFilters())) : (forcePASS || vc.filtersWereApplied() ? VCFConstants.PASSES_FILTERS_v4 : VCFConstants.UNFILTERED);
         }
     
         private String getQualValue(double qual) {
    @@ -462,7 +419,7 @@ public class StandardVCFWriter implements VCFWriter {
             mWriter.write(encoding);
         }
     
    -    private static String formatVCFField(Object val) {
    +    public static String formatVCFField(Object val) {
             String result;
             if ( val == null )
                 result = VCFConstants.MISSING_VALUE_v4;
    @@ -524,12 +481,11 @@ public class StandardVCFWriter implements VCFWriter {
         }
     
     
    -    public static int countOccurrences(char c, String s) {
    +    private static int countOccurrences(char c, String s) {
                int count = 0;
                for (int i = 0; i < s.length(); i++) {
                    count += s.charAt(i) == c ? 1 : 0;
                }
                return count;
         }
    -
     }
    diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
    index fa030ef5f..42ea05355 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java
    @@ -105,34 +105,37 @@ public class VCFCodec extends AbstractVCFCodec {
          * @return a set of the filters applied or null if filters were not applied to the record (e.g. as per the missing value in a VCF)
          */
         protected Set parseFilters(String filterString) {
    +        return parseFilters(filterHash, lineNo, filterString);
    +    }
     
    +    public static Set parseFilters(final Map> cache, final int lineNo, final String filterString) {
             // null for unfiltered
             if ( filterString.equals(VCFConstants.UNFILTERED) )
                 return null;
     
    -        // empty set for passes filters
    -        LinkedHashSet fFields = new LinkedHashSet();
    -
             if ( filterString.equals(VCFConstants.PASSES_FILTERS_v4) )
    -            return fFields;
    +            return Collections.emptySet();
             if ( filterString.equals(VCFConstants.PASSES_FILTERS_v3) )
    -            generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4");
    +            generateException(VCFConstants.PASSES_FILTERS_v3 + " is an invalid filter name in vcf4", lineNo);
             if ( filterString.length() == 0 )
    -            generateException("The VCF specification requires a valid filter status");
    +            generateException("The VCF specification requires a valid filter status: filter was " + filterString, lineNo);
     
             // do we have the filter string cached?
    -        if ( filterHash.containsKey(filterString) )
    -            return filterHash.get(filterString);
    +        if ( cache != null && cache.containsKey(filterString) )
    +            return Collections.unmodifiableSet(cache.get(filterString));
     
    +        // empty set for passes filters
    +        LinkedHashSet fFields = new LinkedHashSet();
             // otherwise we have to parse and cache the value
             if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 )
                 fFields.add(filterString);
             else
                 fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR)));
     
    -        filterHash.put(filterString, fFields);
    +        fFields = fFields;
    +        if ( cache != null ) cache.put(filterString, fFields);
     
    -        return fFields;
    +        return Collections.unmodifiableSet(fFields);
         }
     
     
    diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java
    index 91cf86c70..8e9d989cc 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFConstants.java
    @@ -71,6 +71,7 @@ public final class VCFConstants {
         public static final char   FIELD_SEPARATOR_CHAR = '\t';
         public static final String FILTER_CODE_SEPARATOR = ";";
         public static final String INFO_FIELD_ARRAY_SEPARATOR = ",";
    +    public static final char INFO_FIELD_ARRAY_SEPARATOR_CHAR = ',';
         public static final String ID_FIELD_SEPARATOR = ";";
         public static final String INFO_FIELD_SEPARATOR = ";";
         public static final char INFO_FIELD_SEPARATOR_CHAR = ';';
    diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
    index fd1c74993..66e11bc1e 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java
    @@ -35,9 +35,6 @@ public class VCFHeader {
         // the header string indicator
         public static final String HEADER_INDICATOR = "#";
     
    -    /** do we have genotying data? */
    -    private boolean hasGenotypingData = false;
    -
         // were the input samples sorted originally (or are we sorting them)?
         private boolean samplesWereAlreadySorted = true;
     
    @@ -57,17 +54,15 @@ public class VCFHeader {
          * create a VCF header, given a list of meta data and auxillary tags
          *
          * @param metaData            the meta data associated with this header
    -     * @param genotypeSampleNames the genotype format field, and the sample names
    +     * @param genotypeSampleNames the sample names
          */
         public VCFHeader(Set metaData, Set genotypeSampleNames) {
             mMetaData = new TreeSet();
             if ( metaData != null )
                 mMetaData.addAll(metaData);
    -        for (String col : genotypeSampleNames) {
    -            if (!col.equals("FORMAT"))
    -                mGenotypeSampleNames.add(col);
    -        }
    -        if (genotypeSampleNames.size() > 0) hasGenotypingData = true;
    +
    +        mGenotypeSampleNames.addAll(genotypeSampleNames);
    +
             loadVCFVersion();
             loadMetaDataMaps();
     
    @@ -157,7 +152,7 @@ public class VCFHeader {
          * @return true if we have genotyping columns, false otherwise
          */
         public boolean hasGenotypingData() {
    -        return hasGenotypingData;
    +        return mGenotypeSampleNames.size() > 0;
         }
     
         /**
    @@ -171,7 +166,7 @@ public class VCFHeader {
     
         /** @return the column count */
         public int getColumnCount() {
    -        return HEADER_FIELDS.values().length + ((hasGenotypingData) ? mGenotypeSampleNames.size() + 1 : 0);
    +        return HEADER_FIELDS.values().length + (hasGenotypingData() ? mGenotypeSampleNames.size() + 1 : 0);
         }
     
         /**
    diff --git a/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java b/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java
    index bba47c76c..7ae575534 100644
    --- a/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java
    @@ -25,7 +25,6 @@
     
     package org.broadinstitute.sting.utils.duplicates;
     
    -import net.sf.samtools.SAMRecord;
     import org.broadinstitute.sting.utils.BaseUtils;
     import org.broadinstitute.sting.utils.GenomeLoc;
     import org.broadinstitute.sting.utils.GenomeLocParser;
    @@ -35,27 +34,28 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
     import org.broadinstitute.sting.utils.pileup.PileupElement;
     import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
     import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
    +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
     
     import java.util.Arrays;
     import java.util.List;
     
     public class DupUtils {
    -    private static SAMRecord tmpCopyRead(SAMRecord read) {
    +    private static GATKSAMRecord tmpCopyRead(GATKSAMRecord read) {
             try {
    -            return (SAMRecord)read.clone();
    +            return (GATKSAMRecord)read.clone();
             } catch ( CloneNotSupportedException e ) {
                 throw new ReviewedStingException("Unexpected Clone failure!");
             }
         }
     
    -    public static SAMRecord combineDuplicates(GenomeLocParser genomeLocParser,List duplicates, int maxQScore) {
    +    public static GATKSAMRecord combineDuplicates(GenomeLocParser genomeLocParser,List duplicates, int maxQScore) {
             if ( duplicates.size() == 0 )
                 return null;
     
             // make the combined read by copying the first read and setting the
             // bases and quals to new arrays
    -        SAMRecord comb = tmpCopyRead(duplicates.get(0));
    -        //SAMRecord comb = tmpCopyRead(duplicates.get(0));
    +        GATKSAMRecord comb = tmpCopyRead(duplicates.get(0));
    +        //GATKSAMRecord comb = tmpCopyRead(duplicates.get(0));
             comb.setDuplicateReadFlag(false);
             int readLen = comb.getReadBases().length;
             byte[] bases = new byte[readLen];
    @@ -63,7 +63,7 @@ public class DupUtils {
     
             for ( int i = 0; i < readLen; i++ ) {
                 //System.out.printf("I is %d%n", i);
    -            //for ( SAMRecord read : duplicates ) {
    +            //for ( GATKSAMRecord read : duplicates ) {
                 //    System.out.printf("dup base %c %d%n", (char)read.getReadBases()[i], read.getBaseQualities()[i]);
                 //}
                 Pair baseAndQual = combineBaseProbs(genomeLocParser,duplicates, i, maxQScore);
    @@ -117,7 +117,7 @@ public class DupUtils {
             System.out.printf("%n");
         }
     
    -    private static Pair combineBaseProbs(GenomeLocParser genomeLocParser,List duplicates, int readOffset, int maxQScore) {
    +    private static Pair combineBaseProbs(GenomeLocParser genomeLocParser,List duplicates, int readOffset, int maxQScore) {
             GenomeLoc loc = genomeLocParser.createGenomeLoc(duplicates.get(0));
             ReadBackedPileup pileup = new ReadBackedPileupImpl(loc, duplicates, readOffset);
     
    diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
    index 9d131ae0c..a208d2dc0 100755
    --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
    +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
    @@ -164,7 +164,15 @@ public class UserException extends ReviewedStingException {
     
         public static class MalformedBAM extends UserException {
             public MalformedBAM(SAMRecord read, String message) {
    -            super(String.format("SAM/BAM file %s is malformed: %s", read.getFileSource() != null ? read.getFileSource().getReader() : "(none)", message));
    +            this(read.getFileSource() != null ? read.getFileSource().getReader().toString() : "(none)", message);
    +        }
    +
    +        public MalformedBAM(File file, String message) {
    +            this(file.toString(), message);
    +        }
    +
    +        public MalformedBAM(String source, String message) {
    +            super(String.format("SAM/BAM file %s is malformed: %s", source, message));
             }
         }
     
    @@ -178,6 +186,12 @@ public class UserException extends ReviewedStingException {
             }
         }
     
    +    public static class MalformedVCFHeader extends UserException {
    +        public MalformedVCFHeader(String message) {
    +            super(String.format("The provided VCF file has a malformed header: %s", message));
    +        }
    +    }
    +
         public static class ReadMissingReadGroup extends MalformedBAM {
             public ReadMissingReadGroup(SAMRecord read) {
                 super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK.  Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName()));
    @@ -213,12 +227,19 @@ public class UserException extends ReviewedStingException {
                 super(String.format("File %s is malformed: %s caused by %s", f.getAbsolutePath(), message, e.getMessage()));
             }
     
    +        public MalformedFile(String name, String message) {
    +            super(String.format("File associated with name %s is malformed: %s", name, message));
    +        }
    +
             public MalformedFile(String name, String message, Exception e) {
                 super(String.format("File associated with name %s is malformed: %s caused by %s", name, message, e.getMessage()));
             }
          }
     
         public static class CannotExecuteRScript extends UserException {
    +        public CannotExecuteRScript(String message) {
    +            super(String.format("Unable to execute RScript command: " + message));
    +        }
             public CannotExecuteRScript(String message, Exception e) {
                 super(String.format("Unable to execute RScript command: " + message), e);
             }
    diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentCollection.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentCollection.java
    new file mode 100644
    index 000000000..3261e8d2e
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentCollection.java
    @@ -0,0 +1,66 @@
    +/*
    + * Copyright (c) 2011, The Broad Institute
    + *
    + * Permission is hereby granted, free of charge, to any person
    + * obtaining a copy of this software and associated documentation
    + * files (the "Software"), to deal in the Software without
    + * restriction, including without limitation the rights to use,
    + * copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the
    + * Software is furnished to do so, subject to the following
    + * conditions:
    + *
    + * The above copyright notice and this permission notice shall be
    + * included in all copies or substantial portions of the Software.
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    + * OTHER DEALINGS IN THE SOFTWARE.
    + */
    +
    +package org.broadinstitute.sting.utils.fragments;
    +
    +import java.util.Collection;
    +import java.util.Collections;
    +import java.util.List;
    +
    +/**
    + * Useful helper class to represent the results of the reads -> fragment calculation.
    + *
    + * Contains singleton -- objects whose underlying reads do not overlap their mate pair
    + * Contains overlappingPairs -- objects whose underlying reads do overlap their mate pair
    + *
    + * User: ebanks, depristo
    + * Date: Jan 10, 2011
    + */
    +public class FragmentCollection {
    +    Collection singletons;
    +    Collection> overlappingPairs;
    +
    +    public FragmentCollection(final Collection singletons, final Collection> overlappingPairs) {
    +        this.singletons = singletons == null ? Collections.emptyList() : singletons;
    +        this.overlappingPairs = overlappingPairs == null ? Collections.>emptyList() : overlappingPairs;
    +    }
    +
    +    /**
    +     * Gets the T elements not containing overlapping elements, in no particular order
    +     *
    +     * @return
    +     */
    +    public Collection getSingletonReads() {
    +        return singletons;
    +    }
    +
    +    /**
    +     * Gets the T elements containing overlapping elements, in no particular order
    +     *
    +     * @return
    +     */
    +    public Collection> getOverlappingPairs() {
    +        return overlappingPairs;
    +    }
    +}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java
    new file mode 100644
    index 000000000..e5500ca21
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java
    @@ -0,0 +1,124 @@
    +package org.broadinstitute.sting.utils.fragments;
    +
    +import net.sf.samtools.SAMRecord;
    +import org.broadinstitute.sting.utils.pileup.PileupElement;
    +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
    +import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
    +
    +import java.util.*;
    +
    +/**
    + * An easy to access fragment-based pileup, which contains two separate pileups.  The first
    + * is a regular collection of PileupElements containing all of the reads in the original RBP
    + * that uniquely info about a fragment.  The second are TwoReadPileupElements that, as the
    + * name suggests, contain two reads that are sequenced from the same underlying fragment.
    + *
    + * Based on the original code by E. Banks
    + *
    + * Oct 21: note that the order of the oneReadPileup and twoReadPileups are not
    + * defined.  The algorithms that produce these lists are in fact producing
    + * lists of Pileup elements *NOT* sorted by alignment start position of the underlying
    + * reads.
    + *
    + * User: depristo
    + * Date: 3/26/11
    + * Time: 10:09 PM
    + */
    +public class FragmentUtils {
    +    private FragmentUtils() {} // private constructor
    +
    +    /**
    +     * A getter function that takes an Object of type T and returns its associated SAMRecord.
    +     *
    +     * Allows us to write a generic T -> Fragment algorithm that works with any object containing
    +     * a read.
    +     *
    +     * @param 
    +     */
    +    public interface ReadGetter {
    +        public GATKSAMRecord get(T object);
    +    }
    +
    +    /** Identify getter for SAMRecords themselves */
    +    private final static ReadGetter SamRecordGetter = new ReadGetter() {
    +        @Override public GATKSAMRecord get(final GATKSAMRecord object) { return object; }
    +    };
    +
    +    /** Gets the SAMRecord in a PileupElement */
    +    private final static ReadGetter PileupElementGetter = new ReadGetter() {
    +        @Override public GATKSAMRecord get(final PileupElement object) { return object.getRead(); }
    +    };
    +
    +
    +    /**
    +     * Generic algorithm that takes an iterable over T objects, a getter routine to extract the reads in T,
    +     * and returns a FragmentCollection that contains the T objects whose underlying reads either overlap (or
    +     * not) with their mate pairs.
    +     *
    +     * @param readContainingObjects
    +     * @param nElements
    +     * @param getter
    +     * @param 
    +     * @return
    +     */
    +    private final static  FragmentCollection create(Iterable readContainingObjects, int nElements, ReadGetter getter) {
    +        Collection singletons = null;
    +        Collection> overlapping = null;
    +        Map nameMap = null;
    +
    +        int lastStart = -1;
    +
    +        // build an initial map, grabbing all of the multi-read fragments
    +        for ( final T p : readContainingObjects ) {
    +            final SAMRecord read = getter.get(p);
    +
    +            if ( read.getAlignmentStart() < lastStart ) {
    +                throw new IllegalArgumentException(String.format(
    +                        "FragmentUtils.create assumes that the incoming objects are ordered by " +
    +                                "SAMRecord alignment start, but saw a read %s with alignment start " +
    +                                "%d before the previous start %d", read.getSAMString(), read.getAlignmentStart(), lastStart));
    +            }
    +            lastStart = read.getAlignmentStart();
    +
    +            final int mateStart = read.getMateAlignmentStart();
    +            if ( mateStart == 0 || mateStart > read.getAlignmentEnd() ) {
    +                // if we know that this read won't overlap its mate, or doesn't have one, jump out early
    +                if ( singletons == null ) singletons = new ArrayList(nElements); // lazy init
    +                singletons.add(p);
    +            } else {
    +                // the read might overlap it's mate, or is the rightmost read of a pair
    +                final String readName = read.getReadName();
    +                final T pe1 = nameMap == null ? null : nameMap.get(readName);
    +                if ( pe1 != null ) {
    +                    // assumes we have at most 2 reads per fragment
    +                    if ( overlapping == null ) overlapping = new ArrayList>(); // lazy init
    +                    overlapping.add(Arrays.asList(pe1, p));
    +                    nameMap.remove(readName);
    +                } else {
    +                    if ( nameMap == null ) nameMap = new HashMap(nElements); // lazy init
    +                    nameMap.put(readName, p);
    +                }
    +            }
    +        }
    +
    +        // add all of the reads that are potentially overlapping but whose mate never showed
    +        // up to the oneReadPile
    +        if ( nameMap != null && ! nameMap.isEmpty() ) {
    +            if ( singletons == null )
    +                singletons = nameMap.values();
    +            else
    +                singletons.addAll(nameMap.values());
    +        }
    +
    +        return new FragmentCollection(singletons, overlapping);
    +    }
    +
    +    public final static FragmentCollection create(ReadBackedPileup rbp) {
    +        return create(rbp, rbp.getNumberOfElements(), PileupElementGetter);
    +    }
    +
    +    public final static FragmentCollection create(List reads) {
    +        return create(reads, reads.size(), SamRecordGetter);
    +    }
    +
    +}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java
    new file mode 100644
    index 000000000..ef0d9ca42
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCF.java
    @@ -0,0 +1,256 @@
    +/*
    + * Copyright (c) 2011, The Broad Institute
    + *
    + * Permission is hereby granted, free of charge, to any person
    + * obtaining a copy of this software and associated documentation
    + * files (the "Software"), to deal in the Software without
    + * restriction, including without limitation the rights to use,
    + * copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the
    + * Software is furnished to do so, subject to the following
    + * conditions:
    + *
    + * The above copyright notice and this permission notice shall be
    + * included in all copies or substantial portions of the Software.
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    + * OTHER DEALINGS IN THE SOFTWARE.
    + */
    +
    +package org.broadinstitute.sting.utils.gcf;
    +
    +import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter;
    +import org.broadinstitute.sting.utils.exceptions.UserException;
    +import org.broadinstitute.sting.utils.variantcontext.Allele;
    +import org.broadinstitute.sting.utils.variantcontext.Genotype;
    +import org.broadinstitute.sting.utils.variantcontext.VariantContext;
    +
    +import java.io.*;
    +import java.util.*;
    +
    +/**
    + * GATK binary VCF record
    + *
    + * @author Your Name
    + * @since Date created
    + */
    +public class GCF {
    +    private final static int RECORD_TERMINATOR = 123456789;
    +    private int chromOffset;
    +    private int start, stop;
    +    private String id;
    +    private List alleleMap;
    +    private int alleleOffsets[];
    +    private float qual;
    +    private byte refPad;
    +    private String info;
    +    private int filterOffset;
    +
    +    private List genotypes = Collections.emptyList();
    +
    +    public GCF(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc, boolean skipGenotypes) {
    +        chromOffset = GCFHeaderBuilder.encodeString(vc.getChr());
    +        start = vc.getStart();
    +        stop = vc.getEnd();
    +        refPad = vc.hasReferenceBaseForIndel() ? vc.getReferenceBaseForIndel() : 0;
    +        id = vc.getID();
    +
    +        // encode alleles
    +        alleleMap = new ArrayList(vc.getNAlleles());
    +        alleleOffsets = new int[vc.getNAlleles()];
    +        alleleMap.add(vc.getReference());
    +        alleleOffsets[0] = GCFHeaderBuilder.encodeAllele(vc.getReference());
    +        for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) {
    +            alleleMap.add(vc.getAlternateAllele(i));
    +            alleleOffsets[i+1] = GCFHeaderBuilder.encodeAllele(vc.getAlternateAllele(i));
    +        }
    +
    +        qual = (float)vc.getNegLog10PError(); //qualToByte(vc.getPhredScaledQual());
    +        info = infoFieldString(vc, GCFHeaderBuilder);
    +        filterOffset = GCFHeaderBuilder.encodeString(StandardVCFWriter.getFilterString(vc));
    +
    +        if ( ! skipGenotypes ) {
    +            genotypes = encodeGenotypes(GCFHeaderBuilder, vc);
    +        }
    +    }
    +
    +    public GCF(DataInputStream inputStream, boolean skipGenotypes) throws IOException, EOFException {
    +        chromOffset = inputStream.readInt();
    +
    +        // have we reached the footer?
    +        if ( chromOffset == GCFHeader.FOOTER_START_MARKER )
    +            throw new EOFException();
    +
    +        start = inputStream.readInt();
    +        stop = inputStream.readInt();
    +        id = inputStream.readUTF();
    +        refPad = inputStream.readByte();
    +        alleleOffsets = readIntArray(inputStream);
    +        qual = inputStream.readFloat();
    +        info = inputStream.readUTF();
    +        filterOffset = inputStream.readInt();
    +
    +        int nGenotypes = inputStream.readInt();
    +        int sizeOfGenotypes = inputStream.readInt();
    +        if ( skipGenotypes ) {
    +            genotypes = Collections.emptyList();
    +            inputStream.skipBytes(sizeOfGenotypes);
    +        } else {
    +            genotypes = new ArrayList(nGenotypes);
    +            for ( int i = 0; i < nGenotypes; i++ )
    +                genotypes.add(new GCFGenotype(this, inputStream));
    +        }
    +
    +        int recordDone = inputStream.readInt();
    +        if ( recordDone != RECORD_TERMINATOR )
    +            throw new UserException.MalformedFile("Record not terminated by RECORD_TERMINATOR key");
    +    }
    +
    +    public int write(DataOutputStream outputStream) throws IOException {
    +        int startSize = outputStream.size();
    +        outputStream.writeInt(chromOffset);
    +        outputStream.writeInt(start);
    +        outputStream.writeInt(stop);
    +        outputStream.writeUTF(id);
    +        outputStream.writeByte(refPad);
    +        writeIntArray(alleleOffsets, outputStream, true);
    +        outputStream.writeFloat(qual);
    +        outputStream.writeUTF(info);
    +        outputStream.writeInt(filterOffset);
    +
    +        int nGenotypes = genotypes.size();
    +        int expectedSizeOfGenotypes = nGenotypes == 0 ? 0 : genotypes.get(0).sizeInBytes() * nGenotypes;
    +        outputStream.writeInt(nGenotypes);
    +        outputStream.writeInt(expectedSizeOfGenotypes);
    +        int obsSizeOfGenotypes = 0;
    +        for ( GCFGenotype g : genotypes )
    +            obsSizeOfGenotypes += g.write(outputStream);
    +        if ( obsSizeOfGenotypes != expectedSizeOfGenotypes )
    +            throw new RuntimeException("Expect and observed genotype sizes disagree! expect = " + expectedSizeOfGenotypes + " obs =" + obsSizeOfGenotypes);
    +
    +        outputStream.writeInt(RECORD_TERMINATOR);
    +        return outputStream.size() - startSize;
    +    }
    +
    +    public VariantContext decode(final String source, final GCFHeader header) {
    +        final String contig = header.getString(chromOffset);
    +        alleleMap = header.getAlleles(alleleOffsets);
    +        double negLog10PError = qual; // QualityUtils.qualToErrorProb(qual);
    +        Set filters = header.getFilters(filterOffset);
    +        Map attributes = new HashMap();
    +        attributes.put("INFO", info);
    +        Byte refPadByte = refPad == 0 ? null : refPad;
    +        Map genotypes = decodeGenotypes(header);
    +
    +        return new VariantContext(source, contig, start, stop, alleleMap, genotypes, negLog10PError, filters, attributes, refPadByte);
    +    }
    +
    +    private Map decodeGenotypes(final GCFHeader header) {
    +        if ( genotypes.isEmpty() )
    +            return VariantContext.NO_GENOTYPES;
    +        else {
    +            Map map = new TreeMap();
    +
    +            for ( int i = 0; i < genotypes.size(); i++ ) {
    +                final String sampleName = header.getSample(i);
    +                final Genotype g = genotypes.get(i).decode(sampleName, header, this, alleleMap);
    +                map.put(sampleName, g);
    +            }
    +
    +            return map;
    +        }
    +    }
    +
    +    private List encodeGenotypes(final GCFHeaderBuilder GCFHeaderBuilder, final VariantContext vc) {
    +        int nGenotypes = vc.getNSamples();
    +        if ( nGenotypes > 0 ) {
    +            List genotypes = new ArrayList(nGenotypes);
    +            for ( int i = 0; i < nGenotypes; i++ ) genotypes.add(null);
    +
    +            for ( Genotype g : vc.getGenotypes().values() ) {
    +                int i = GCFHeaderBuilder.encodeSample(g.getSampleName());
    +                genotypes.set(i, new GCFGenotype(GCFHeaderBuilder, alleleMap, g));
    +            }
    +
    +            return genotypes;
    +        } else {
    +            return Collections.emptyList();
    +        }
    +    }
    +
    +    public int getNAlleles() { return alleleOffsets.length; }
    +
    +
    +    private final String infoFieldString(VariantContext vc, final GCFHeaderBuilder GCFHeaderBuilder) {
    +        StringBuilder s = new StringBuilder();
    +
    +        boolean first = true;
    +        for ( Map.Entry field : vc.getAttributes().entrySet() ) {
    +            String key = field.getKey();
    +            if ( key.equals(VariantContext.ID_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_MAP_KEY) || key.equals(VariantContext.UNPARSED_GENOTYPE_PARSER_KEY) )
    +                continue;
    +            int stringIndex = GCFHeaderBuilder.encodeString(key);
    +            String outputValue = StandardVCFWriter.formatVCFField(field.getValue());
    +            if ( outputValue != null ) {
    +                if ( ! first ) s.append(";");
    +                s.append(stringIndex).append("=").append(outputValue);
    +                first = false;
    +            }
    +        }
    +
    +        return s.toString();
    +    }
    +
    +    protected final static int BUFFER_SIZE = 1048576; // 2**20
    +
    +    public static DataInputStream createDataInputStream(final InputStream stream) {
    +        return new DataInputStream(new BufferedInputStream(stream, BUFFER_SIZE));
    +    }
    +
    +    public static FileInputStream createFileInputStream(final File file) throws FileNotFoundException {
    +        return new FileInputStream(file);
    +    }
    +
    +    protected final static int[] readIntArray(final DataInputStream inputStream) throws IOException {
    +        return readIntArray(inputStream, inputStream.readInt());
    +    }
    +
    +    protected final static int[] readIntArray(final DataInputStream inputStream, int size) throws IOException {
    +        int[] array = new int[size];
    +        for ( int i = 0; i < array.length; i++ )
    +            array[i] = inputStream.readInt();
    +        return array;
    +    }
    +
    +    protected final static void writeIntArray(int[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
    +        if ( writeSize ) outputStream.writeInt(array.length);
    +        for ( int i : array )
    +            outputStream.writeInt(i);
    +    }
    +
    +    protected final static byte[] readByteArray(final DataInputStream inputStream) throws IOException {
    +        return readByteArray(inputStream, inputStream.readInt());
    +    }
    +
    +    protected final static byte[] readByteArray(final DataInputStream inputStream, int size) throws IOException {
    +        byte[] array = new byte[size];
    +        for ( int i = 0; i < array.length; i++ )
    +            array[i] = inputStream.readByte();
    +        return array;
    +    }
    +
    +    protected final static void writeByteArray(byte[] array, final DataOutputStream outputStream, boolean writeSize) throws IOException {
    +        if ( writeSize ) outputStream.writeInt(array.length);
    +        for ( byte i : array )
    +            outputStream.writeByte(i);
    +    }
    +
    +    protected final static byte qualToByte(double phredScaledQual) {
    +        return (byte)Math.round(Math.min(phredScaledQual, 255));
    +    }
    +}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java
    new file mode 100644
    index 000000000..dd1fb091c
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFGenotype.java
    @@ -0,0 +1,147 @@
    +/*
    + * Copyright (c) 2011, The Broad Institute
    + *
    + * Permission is hereby granted, free of charge, to any person
    + * obtaining a copy of this software and associated documentation
    + * files (the "Software"), to deal in the Software without
    + * restriction, including without limitation the rights to use,
    + * copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the
    + * Software is furnished to do so, subject to the following
    + * conditions:
    + *
    + * The above copyright notice and this permission notice shall be
    + * included in all copies or substantial portions of the Software.
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    + * OTHER DEALINGS IN THE SOFTWARE.
    + */
    +
    +package org.broadinstitute.sting.utils.gcf;
    +
    +import org.broadinstitute.sting.utils.variantcontext.Allele;
    +import org.broadinstitute.sting.utils.variantcontext.Genotype;
    +
    +import java.io.DataInputStream;
    +import java.io.DataOutputStream;
    +import java.io.IOException;
    +import java.util.*;
    +
    +/**
    + * GATK binary VCF record
    + *
    + * @author Your Name
    + * @since Date created
    + */
    +public class GCFGenotype {
    +    private byte gq;
    +    private int gt;
    +    private int dp;
    +    private int ad[];
    +    private byte[] pl;
    +
    +    // todo -- what to do about phasing?  Perhaps we shouldn't support it
    +    // todo -- is the FL field generic or just a flag?  Should we even support per sample filtering?
    +
    +    public GCFGenotype(final GCFHeaderBuilder GCFHeaderBuilder, final List allAlleles, Genotype genotype) {
    +        gq = GCF.qualToByte(genotype.getPhredScaledQual());
    +        gt = encodeAlleles(genotype.getAlleles(), allAlleles);
    +
    +        dp = genotype.getAttributeAsInt("DP", 0);
    +
    +        int nAlleles = allAlleles.size();
    +        ad = new int[nAlleles];
    +
    +        int npls = nAllelesToNPls(nAlleles);
    +        pl = new byte[npls];
    +    }
    +
    +    private int nAllelesToNPls( int nAlleles ) {
    +        return nAlleles*(nAlleles+1) / 2;
    +    }
    +
    +    public GCFGenotype(GCF GCF, DataInputStream inputStream) throws IOException {
    +        int gqInt = inputStream.readUnsignedByte();
    +        gq = (byte)gqInt;
    +        gt = inputStream.readInt();
    +        dp = inputStream.readInt();
    +        ad = GCF.readIntArray(inputStream, GCF.getNAlleles());
    +        pl = GCF.readByteArray(inputStream, nAllelesToNPls(GCF.getNAlleles()));
    +    }
    +
    +    // 2 alleles => 1 + 8 + 8 + 3 => 20
    +    protected int sizeInBytes() {
    +        return 1 // gq
    +                + 4 * 2 // gt + dp
    +                + 4 * ad.length // ad
    +                + 1 * pl.length; // pl
    +    }
    +
    +    public Genotype decode(final String sampleName, final GCFHeader header, GCF GCF, List alleleIndex) {
    +        final List alleles = decodeAlleles(gt, alleleIndex);
    +        final double negLog10PError = gq / 10.0;
    +        final Set filters = Collections.emptySet();
    +        final Map attributes = new HashMap();
    +        attributes.put("DP", dp);
    +        attributes.put("AD", ad);
    +        attributes.put("PL", pl);
    +
    +        return new Genotype(sampleName, alleles, negLog10PError, filters, attributes, false);
    +    }
    +
    +    private static int encodeAlleles(List gtList, List allAlleles) {
    +        final int nAlleles = gtList.size();
    +        if ( nAlleles  > 4 )
    +            throw new IllegalArgumentException("encodeAlleles doesn't support more than 4 alt alleles, but I saw " + gtList);
    +
    +        int gtInt = 0;
    +        for ( int i = 0; i < nAlleles ; i++ ) {
    +            final int bitOffset = i * 8;
    +            final int allelei = getAlleleIndex(gtList.get(i), allAlleles);
    +            final int gti = (allelei + 1) << bitOffset;
    +            gtInt = gtInt | gti;
    +        }
    +
    +        return gtInt;
    +    }
    +
    +    private static int getAlleleIndex(Allele q, List allAlleles) {
    +        if ( q.isNoCall() )
    +            return 254;
    +        for ( int i = 0; i < allAlleles.size(); i++ )
    +            if ( q.equals(allAlleles.get(i)) )
    +                return i;
    +        throw new IllegalStateException("getAlleleIndex passed allele not in map! allele " + q + " allAlleles " + allAlleles);
    +    }
    +
    +    private static List decodeAlleles(int gtInt, List alleleIndex) {
    +        List alleles = new ArrayList(4);
    +
    +        for ( int i = 0; i < 32; i += 8 ) {
    +            final int gi = (gtInt & (0x000000FF << i)) >> i;
    +            if ( gi != 0 ) {
    +                final int allelei = gi - 1;
    +                alleles.add( allelei == 254 ? Allele.NO_CALL : alleleIndex.get(allelei) );
    +            } else {
    +                break;
    +            }
    +        }
    +
    +        return alleles;
    +    }
    +
    +    public int write(DataOutputStream outputStream) throws IOException {
    +        int startSize = outputStream.size();
    +        outputStream.writeByte(gq);
    +        outputStream.writeInt(gt);
    +        outputStream.writeInt(dp);
    +        GCF.writeIntArray(ad, outputStream, false);
    +        GCF.writeByteArray(pl, outputStream, false);
    +        return outputStream.size() - startSize;
    +    }
    +}
    diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java
    new file mode 100644
    index 000000000..6d96eda56
    --- /dev/null
    +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeader.java
    @@ -0,0 +1,205 @@
    +/*
    + * Copyright (c) 2011, The Broad Institute
    + *
    + * Permission is hereby granted, free of charge, to any person
    + * obtaining a copy of this software and associated documentation
    + * files (the "Software"), to deal in the Software without
    + * restriction, including without limitation the rights to use,
    + * copy, modify, merge, publish, distribute, sublicense, and/or sell
    + * copies of the Software, and to permit persons to whom the
    + * Software is furnished to do so, subject to the following
    + * conditions:
    + *
    + * The above copyright notice and this permission notice shall be
    + * included in all copies or substantial portions of the Software.
    + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
    + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
    + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
    + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
    + * OTHER DEALINGS IN THE SOFTWARE.
    + */
    +
    +package org.broadinstitute.sting.utils.gcf;
    +
    +import org.apache.log4j.Logger;
    +import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
    +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
    +import org.broadinstitute.sting.utils.exceptions.UserException;
    +import org.broadinstitute.sting.utils.variantcontext.Allele;
    +
    +import java.io.*;
    +import java.util.*;
    +
    +/**
    + * [Short one sentence description of this walker]
    + * 

    + *

    + * [Functionality of this walker] + *

    + *

    + *

    Input

    + *

    + * [Input description] + *

    + *

    + *

    Output

    + *

    + * [Output description] + *

    + *

    + *

    Examples

    + *
    + *    java
    + *      -jar GenomeAnalysisTK.jar
    + *      -T $WalkerName
    + *  
    + * + * @author Your Name + * @since Date created + */ +public class GCFHeader { + final protected static Logger logger = Logger.getLogger(GCFHeader.class); + + public final static int GCF_VERSION = 1; + public final static byte[] GCF_FILE_START_MARKER = "GCF\1".getBytes(); + public final static int FOOTER_START_MARKER = -1; + public final static long HEADER_FORWARD_REFERENCE_OFFSET = GCF_FILE_START_MARKER.length + 4; // for the version + + final int version; + long footerPosition; + final List alleles; + final List strings; + final List samples; + final List> filters; + + public GCFHeader(final Map allelesIn, final Map stringIn, final Map samplesIn) { + version = GCF_VERSION; + footerPosition = 0; + this.alleles = linearize(allelesIn); + this.strings = linearize(stringIn); + this.samples = linearize(samplesIn); + this.filters = null; // not used with this constructor + } + + public GCFHeader(FileInputStream fileInputStream) throws IOException { + DataInputStream inputStream = new DataInputStream(fileInputStream); + byte[] headerTest = new byte[GCF_FILE_START_MARKER.length]; + inputStream.read(headerTest); + if ( ! Arrays.equals(headerTest, GCF_FILE_START_MARKER) ) { + throw new UserException("Could not read GVCF file. GCF_FILE_START_MARKER missing. 
Saw " + new String(headerTest)); + } else { + version = inputStream.readInt(); + logger.info("Read GCF version " + version); + footerPosition = inputStream.readLong(); + logger.info("Read footer position of " + footerPosition); + long lastPos = fileInputStream.getChannel().position(); + logger.info(" Last position is " + lastPos); + + // seek to the footer + fileInputStream.getChannel().position(footerPosition); + if ( inputStream.readInt() != FOOTER_START_MARKER ) + throw new UserException.MalformedFile("Malformed GCF file: couldn't find the footer marker"); + alleles = stringsToAlleles(readStrings(inputStream)); + strings = readStrings(inputStream); + samples = readStrings(inputStream); + logger.info(String.format("Allele map of %d elements", alleles.size())); + logger.info(String.format("String map of %d elements", strings.size())); + logger.info(String.format("Sample map of %d elements", samples.size())); + filters = initializeFilterCache(); + fileInputStream.getChannel().position(lastPos); + } + } + + public static int writeHeader(final DataOutputStream outputStream) throws IOException { + int startBytes = outputStream.size(); + outputStream.write(GCF_FILE_START_MARKER); + outputStream.writeInt(GCF_VERSION); + outputStream.writeLong(0); + return outputStream.size() - startBytes; + } + + public int writeFooter(final DataOutputStream outputStream) throws IOException { + int startBytes = outputStream.size(); + outputStream.writeInt(FOOTER_START_MARKER); // has to be the same as chrom encoding + write(outputStream, allelesToStrings(alleles)); + write(outputStream, strings); + write(outputStream, samples); + return outputStream.size() - startBytes; + } + + private void write(DataOutputStream outputStream, List l) throws IOException { + outputStream.writeInt(l.size()); + for ( String elt : l ) outputStream.writeUTF(elt); + } + + private List allelesToStrings(List alleles) { + List strings = new ArrayList(alleles.size()); + for ( Allele allele : alleles ) 
strings.add(allele.toString()); + return strings; + } + + private List> initializeFilterCache() { + // required to allow offset -> set lookup + List> l = new ArrayList>(strings.size()); + for ( int i = 0; i < strings.size(); i++ ) l.add(null); + return l; + } + + private static List stringsToAlleles(final List strings) { + final List alleles = new ArrayList(strings.size()); + for ( String string : strings ) { + boolean isRef = string.endsWith("*"); + if ( isRef ) string = string.substring(0, string.length() - 1); + alleles.add(Allele.create(string, isRef)); + } + return alleles; + } + + private static List readStrings(final DataInputStream inputStream) throws IOException { + final int nStrings = inputStream.readInt(); + + final List strings = new ArrayList(nStrings); + for ( int i = 0; i < nStrings; i++ ) { + strings.add(inputStream.readUTF()); + } + + return strings; + } + + private static List linearize(final Map map) { + final ArrayList l = new ArrayList(map.size()); + for ( int i = 0; i < map.size(); i++ ) l.add(null); + for ( final Map.Entry elt : map.entrySet() ) + l.set(elt.getValue(), elt.getKey()); + return l; + } + + public String getSample(final int offset) { return samples.get(offset); } + public String getString(final int offset) { return strings.get(offset); } + public Allele getAllele(final int offset) { return alleles.get(offset); } + public List getAlleles(final int[] offsets) { + final List alleles = new ArrayList(offsets.length); + for ( int i : offsets ) alleles.add(getAllele(i)); + return alleles; + } + + public Set getFilters(final int offset) { + Set cached = filters.get(offset); + + if ( cached != null ) + return cached; + else { + final String filterString = getString(offset); + if ( filterString.equals(VCFConstants.UNFILTERED) ) + return null; // UNFILTERED records are represented by null + else { + Set set = VCFCodec.parseFilters(null, -1, filterString); + filters.set(offset, set); // remember the result + return set; + } + } + } +} diff 
--git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java new file mode 100644 index 000000000..40e01ec72 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFHeaderBuilder.java @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.HashMap; +import java.util.Map; + +/** + * [Short one sentence description of this walker] + *

    + *

    + * [Functionality of this walker] + *

    + *

    + *

    Input

    + *

    + * [Input description] + *

    + *

    + *

    Output

    + *

    + * [Output description] + *

    + *

    + *

    Examples

    + *
    + *    java
    + *      -jar GenomeAnalysisTK.jar
    + *      -T $WalkerName
    + *  
    + * + * @author Your Name + * @since Date created + */ +public class GCFHeaderBuilder { + Map alleles = new HashMap(); + Map strings = new HashMap(); + Map samples = new HashMap(); + + public GCFHeader createHeader() { + return new GCFHeader(alleles, strings, samples); + } + + public int encodeString(final String chr) { return encode(strings, chr); } + public int encodeAllele(final Allele allele) { return encode(alleles, allele); } + public int encodeSample(final String sampleName) { return encode(samples, sampleName); } + + private int encode(Map map, T key) { + Integer v = map.get(key); + if ( v == null ) { + v = map.size(); + map.put(key, v); + } + return v; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java new file mode 100644 index 000000000..18fae18c4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/gcf/GCFWriter.java @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.gcf; + +import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.codecs.vcf.IndexingVCFWriter; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.*; + +/** + * GCFWriter implementing the VCFWriter interface + * @author Your Name + * @since Date created + */ +public class GCFWriter extends IndexingVCFWriter { + final boolean skipGenotypes; + final FileOutputStream fileOutputStream; + final DataOutputStream dataOutputStream; + final GCFHeaderBuilder gcfHeaderBuilder; + int nbytes = 0; + VCFHeader header = null; + File location; + + // -------------------------------------------------------------------------------- + // + // Constructors + // + // -------------------------------------------------------------------------------- + + public GCFWriter(final File location, final SAMSequenceDictionary refDict, boolean enableOnTheFlyIndexing, boolean doNotWriteGenotypes) { + super(writerName(location, null), location, null, refDict, enableOnTheFlyIndexing); + this.location = location; + this.skipGenotypes = doNotWriteGenotypes; + + // write the output + try { + fileOutputStream = new FileOutputStream(location); + dataOutputStream = createDataOutputStream(fileOutputStream); + gcfHeaderBuilder = new GCFHeaderBuilder(); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(location, e); + } + } + + // 
-------------------------------------------------------------------------------- + // + // VCFWriter interface functions + // + // -------------------------------------------------------------------------------- + + @Override + public void writeHeader(VCFHeader header) { + this.header = header; + try { + nbytes += GCFHeader.writeHeader(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Couldn't write header", e); + } + } + + @Override + public void add(VariantContext vc) { + super.add(vc); + GCF gcf = new GCF(gcfHeaderBuilder, vc, skipGenotypes); + try { + nbytes += gcf.write(dataOutputStream); + } catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(getStreamName(), "Failed to add gcf record " + gcf + " to stream " + getStreamName(), e); + } + } + + @Override + public void close() { + // todo -- write out VCF header lines + GCFHeader gcfHeader = gcfHeaderBuilder.createHeader(); + try { + long headerPosition = nbytes; + nbytes += gcfHeader.writeFooter(dataOutputStream); + dataOutputStream.close(); + //System.out.println("Writing forward reference to " + headerPosition); + + RandomAccessFile raFile = new RandomAccessFile(location, "rw"); + raFile.seek(GCFHeader.HEADER_FORWARD_REFERENCE_OFFSET); + raFile.writeLong(headerPosition); + raFile.close(); + } catch ( IOException e ) { + throw new ReviewedStingException("Failed to close GCFWriter " + getStreamName(), e); + } + + super.close(); + } + + private static final DataOutputStream createDataOutputStream(final OutputStream stream) { + return new DataOutputStream(new BufferedOutputStream(stream, GCF.BUFFER_SIZE)); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java index a9d71ef98..25ef8ccd2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java @@ -29,6 +29,7 @@ import org.apache.log4j.Logger; import org.broadinstitute.sting.commandline.ArgumentDefinition; import org.broadinstitute.sting.commandline.ArgumentDefinitionGroup; import org.broadinstitute.sting.commandline.ArgumentDefinitions; +import org.broadinstitute.sting.commandline.ArgumentMatchSource; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.text.TextFormattingUtils; @@ -47,6 +48,7 @@ public class HelpFormatter { /** * Prints the help, given a collection of argument definitions. + * @param applicationDetails Application details * @param argumentDefinitions Argument definitions for which help should be printed. */ public void printHelp( ApplicationDetails applicationDetails, ArgumentDefinitions argumentDefinitions ) { @@ -233,7 +235,7 @@ public class HelpFormatter { private List prepareArgumentGroups( ArgumentDefinitions argumentDefinitions ) { // Sort the list of argument definitions according to how they should be shown. // Put the sorted results into a new cloned data structure. 
- Comparator definitionComparator = new Comparator() { + Comparator definitionComparator = new Comparator() { public int compare( ArgumentDefinition lhs, ArgumentDefinition rhs ) { if( lhs.required && rhs.required ) return 0; if( lhs.required ) return -1; @@ -242,15 +244,15 @@ public class HelpFormatter { } }; - List argumentGroups = new ArrayList(); + List argumentGroups = new ArrayList(); for( ArgumentDefinitionGroup argumentGroup: argumentDefinitions.getArgumentDefinitionGroups() ) { - List sortedDefinitions = new ArrayList( argumentGroup.argumentDefinitions ); + List sortedDefinitions = new ArrayList( argumentGroup.argumentDefinitions ); Collections.sort( sortedDefinitions, definitionComparator ); argumentGroups.add( new ArgumentDefinitionGroup(argumentGroup.groupName,sortedDefinitions) ); } // Sort the argument groups themselves with main arguments first, followed by plugins sorted in name order. - Comparator groupComparator = new Comparator() { + Comparator groupComparator = new Comparator() { public int compare( ArgumentDefinitionGroup lhs, ArgumentDefinitionGroup rhs ) { if( lhs.groupName == null && rhs.groupName == null ) return 0; if( lhs.groupName == null ) return -1; @@ -271,9 +273,9 @@ public class HelpFormatter { * Generate a standard header for the logger * * @param applicationDetails details of the application to run. 
- * @param args the command line arguments passed in + * @param parsedArgs the command line arguments passed in */ - public static void generateHeaderInformation(ApplicationDetails applicationDetails, String[] args) { + public static void generateHeaderInformation(ApplicationDetails applicationDetails, Map> parsedArgs) { DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); java.util.Date date = new java.util.Date(); @@ -283,11 +285,22 @@ public class HelpFormatter { logger.info(barrier); for (String headerLine : applicationDetails.applicationHeader) logger.info(headerLine); - String output = ""; - for (String str : args) { - output = output + str + " "; + logger.debug("Current directory: " + System.getProperty("user.dir")); + for (Map.Entry> entry: parsedArgs.entrySet()) { + ArgumentMatchSource matchSource = entry.getKey(); + final String sourceName; + switch (matchSource.getType()) { + case CommandLine: sourceName = "Program"; break; + case File: sourceName = matchSource.getFile().getPath(); break; + default: throw new RuntimeException("Unexpected argument match source type: " + matchSource.getType()); + } + + String output = sourceName + " Args:"; + for (String str : entry.getValue()) { + output = output + " " + str; + } + logger.info(output); } - logger.info("Program Args: " + output); logger.info("Date/Time: " + dateFormat.format(date)); logger.info(barrier); diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java deleted file mode 100644 index 2bc3fa284..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalFileMergingIterator.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the 
Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.interval; - -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.text.XReadLines; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Jun 11, 2010 - * Time: 2:56:29 PM - * To change this template use File | Settings | File Templates. - */ - -/** This iterator reads intervals from interval file (can be gatk-style - * interval list or a bed file) and merges them on the fly. Very much alike - * IntervalUtils.sortAndMergeIntervals() but the list is read sequentially - * from a file upon request instead of loading the whole list into memory. - * Intervals in the underlying file MUST be - * pre-sorted into the reference order (they can overlap though, as this - * iterator is a merging one). 
- */ -public class IntervalFileMergingIterator implements Iterator { - private PushbackIterator it ; - private IntervalMergingRule myRule; - private File myFile; - - public IntervalFileMergingIterator(GenomeLocParser genomeLocParser,File f, IntervalMergingRule rule) { - myFile = f; - - try { - XReadLines reader = new XReadLines(f); - - if (f.getName().toUpperCase().endsWith(".BED")) { - it = new PushbackIterator( new StringToGenomeLocIteratorAdapter( genomeLocParser,reader.iterator(), - StringToGenomeLocIteratorAdapter.FORMAT.BED ) ) ; - } else { - it = new PushbackIterator( new StringToGenomeLocIteratorAdapter( genomeLocParser,reader.iterator(), - StringToGenomeLocIteratorAdapter.FORMAT.GATK ) ) ; - } - } catch ( FileNotFoundException e ) { - throw new UserException.CouldNotReadInputFile(f, e); - } - myRule = rule; - } - - public boolean hasNext() { - return it.hasNext(); - } - - /** Returns next merged interval from the underlying interval file. In other words, keeps reading intervals - * for as long as they overlap and returns a single merged interval encompassing the set of overlapping - * intervals read from the file. Non-overlapping intervals are returned as is. This method will throw an - * exception if it runs into an interval that is out of order. 
- * @return - */ - public GenomeLoc next() { - - GenomeLoc current = it.next(); - - while ( it.hasNext() ) { - GenomeLoc next = it.next(); - - if ( next.isBefore(current)) { - throw new UserException.MalformedFile(myFile, "Interval "+next+" in the interval file is out of order."); - } - - if (current.overlapsP(next)) { - current = current.merge(next); - } else if (current.contiguousP(next) && myRule == IntervalMergingRule.ALL) { - current = current.merge(next); - } else { - it.pushback(next); - break; - } - } - - return current; - } - - public void remove() { - throw new UnsupportedOperationException("method 'remove' is not supported by this iterator"); - } - -} diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java index eae4f8db5..f31c0a5a0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalSetRule.java @@ -4,6 +4,8 @@ package org.broadinstitute.sting.utils.interval; * set operators for combining lists of intervals */ public enum IntervalSetRule { + /** Take the union of all intervals */ UNION, + /** Take the intersection of intervals (the subset that overlaps all intervals specified) */ INTERSECTION; } diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java index f551e1368..f0e164c87 100644 --- a/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/interval/IntervalUtils.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.interval; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; @@ -8,8 +10,9 @@ import 
org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.bed.BedParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; @@ -35,70 +38,68 @@ public class IntervalUtils { * * @param parser Genome loc parser. * @param argList A list of strings containing interval data. - * @param allowEmptyIntervalList If false instead of an empty interval list will return null. - * @return an unsorted, unmerged representation of the given intervals. Null is used to indicate that all intervals should be used. + * @return an unsorted, unmerged representation of the given intervals. Null is used to indicate that all intervals should be used. */ - public static List parseIntervalArguments(GenomeLocParser parser, List argList, boolean allowEmptyIntervalList) { + public static List parseIntervalArguments(GenomeLocParser parser, List argList) { List rawIntervals = new ArrayList(); // running list of raw GenomeLocs if (argList != null) { // now that we can be in this function if only the ROD-to-Intervals was provided, we need to // ensure that the arg list isn't null before looping. 
for (String argument : argList) { - - // separate argument on semicolon first - for (String fileOrInterval : argument.split(";")) { - // if any interval argument is '-L all', consider all loci by returning no intervals - if (fileOrInterval.trim().toLowerCase().equals("all")) { - if (argList.size() != 1) { - // throw error if '-L all' is not only interval - potentially conflicting commands - throw new UserException.CommandLineException(String.format("Conflicting arguments: Intervals given along with \"-L all\"")); - } - return null; - } - // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. - else if (isUnmapped(fileOrInterval)) - rawIntervals.add(GenomeLoc.UNMAPPED); - // if it's a file, add items to raw interval list - else if (isIntervalFile(fileOrInterval)) { - try { - rawIntervals.addAll(intervalFileToList(parser, fileOrInterval, allowEmptyIntervalList)); - } - catch ( UserException.MalformedGenomeLoc e ) { - throw e; - } - catch ( Exception e ) { - throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in any supported format.", e); - } - } - - // otherwise treat as an interval -> parse and add to raw interval list - else { - rawIntervals.add(parser.parseGenomeLoc(fileOrInterval)); - } - } + rawIntervals.addAll(parseIntervalArguments(parser, argument)); } } return rawIntervals; } - /** + public static List parseIntervalArguments(GenomeLocParser parser, String arg) { + List rawIntervals = new ArrayList(); // running list of raw GenomeLocs + + // separate argument on semicolon first + for (String fileOrInterval : arg.split(";")) { + // if any argument is 'unmapped', "parse" it to a null entry. A null in this case means 'all the intervals with no alignment data'. 
+ if (isUnmapped(fileOrInterval)) + rawIntervals.add(GenomeLoc.UNMAPPED); + // if it's a file, add items to raw interval list + else if (isIntervalFile(fileOrInterval)) { + try { + rawIntervals.addAll(intervalFileToList(parser, fileOrInterval)); + } + catch ( UserException.MalformedGenomeLoc e ) { + throw e; + } + catch ( Exception e ) { + throw new UserException.MalformedFile(fileOrInterval, "Interval file could not be parsed in any supported format.", e); + } + } + + // otherwise treat as an interval -> parse and add to raw interval list + else { + rawIntervals.add(parser.parseGenomeLoc(fileOrInterval)); + } + } + + return rawIntervals; + } + + /** * Read a file of genome locations to process. The file may be in BED, Picard, * or GATK interval format. * - * @param file_name interval file - * @param allowEmptyIntervalList if false an exception will be thrown for files that contain no intervals + * @param glParser GenomeLocParser + * @param file_name interval file * @return List List of Genome Locs that have been parsed from file */ - public static List intervalFileToList(final GenomeLocParser glParser, final String file_name, boolean allowEmptyIntervalList) { + public static List intervalFileToList(final GenomeLocParser glParser, final String file_name) { // try to open file File inputFile = new File(file_name); List ret = new ArrayList(); // case: BED file - if (file_name.toUpperCase().endsWith(".BED")) { - BedParser parser = new BedParser(glParser,inputFile); - ret.addAll(parser.getLocations()); + if ( file_name.toUpperCase().endsWith(".BED") ) { + // this is now supported in Tribble + throw new ReviewedStingException("BED files must be parsed through Tribble; parsing them as intervals through the GATK engine is no longer supported"); } else { /** @@ -145,12 +146,6 @@ public class IntervalUtils { } } - if ( ret.isEmpty() && ! 
allowEmptyIntervalList ) { - throw new UserException("The interval file " + inputFile.getAbsolutePath() + " contains no intervals " + - "that could be parsed, and the unsafe operation ALLOW_EMPTY_INTERVAL_LIST has " + - "not been enabled"); - } - return ret; } @@ -204,7 +199,7 @@ public class IntervalUtils { //if we have an empty list, throw an exception. If they specified intersection and there are no items, this is bad. if (retList.size() == 0) - throw new UserException.BadInput("The INTERSECTION of your -BTI and -L options produced no intervals."); + throw new UserException.BadInput("The INTERSECTION of your -L options produced no intervals."); // we don't need to add the rest of remaining locations, since we know they don't overlap. return what we have return retList; @@ -229,6 +224,44 @@ public class IntervalUtils { return GenomeLocSortedSet.createSetFromList(parser,intervals); } + /** + * computes whether the test interval list is equivalent to master. To be equivalent, test must + * contain GenomeLocs covering every base in master, exactly once. Note that this algorithm + * assumes that master genomelocs are all discontiguous (i.e., we don't have locs like 1-3 and 4-6 but + * rather just 1-6). In order to use this algorithm with contiguous genomelocs first merge them. The algorithm + * doesn't assume that test has discontinuous genomelocs. + * + * Returns a null string if there are no differences, otherwise returns a string describing the difference + * (useful for UnitTests). Assumes both lists are sorted + */ + public static final String equateIntervals(List masterArg, List testArg) { + LinkedList master = new LinkedList(masterArg); + LinkedList test = new LinkedList(testArg); + + while ( ! 
master.isEmpty() ) { // there's still unchecked bases in master + final GenomeLoc masterHead = master.pop(); + final GenomeLoc testHead = test.pop(); + + if ( testHead.overlapsP(masterHead) ) { + // remove the parts of test that overlap master, and push the remaining + // parts onto master for further comparison. + for ( final GenomeLoc masterPart : Utils.reverse(masterHead.subtract(testHead)) ) { + master.push(masterPart); + } + } else { + // testHead is incompatible with masterHead, so we must have extra bases in testHead + // that aren't in master + return "Incompatible locs detected masterHead=" + masterHead + ", testHead=" + testHead; + } + } + + if ( test.isEmpty() ) // everything is equal + return null; // no differences + else + return "Remaining elements found in test: first=" + test.peek(); + } + + /** * Check if string argument was intented as a file * Accepted file extensions: .bed .list, .picard, .interval_list, .intervals. @@ -334,24 +367,44 @@ public class IntervalUtils { } /** - * Splits an interval list into multiple files. - * @param fileHeader The sam file header. + * Splits an interval list into multiple sublists. * @param locs The genome locs to split. * @param splits The stop points for the genome locs returned by splitFixedIntervals. - * @param scatterParts The output interval lists to write to. 
+ * @return A list of lists of genome locs, split according to splits */ - public static void scatterFixedIntervals(SAMFileHeader fileHeader, List locs, List splits, List scatterParts) { - if (splits.size() != scatterParts.size()) - throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); - int fileIndex = 0; + public static List> splitIntervalsToSubLists(List locs, List splits) { int locIndex = 1; int start = 0; + List> sublists = new ArrayList>(splits.size()); for (Integer stop: splits) { - IntervalList intervalList = new IntervalList(fileHeader); + List curList = new ArrayList(); for (int i = start; i < stop; i++) - intervalList.add(toInterval(locs.get(i), locIndex++)); - intervalList.write(scatterParts.get(fileIndex++)); + curList.add(locs.get(i)); start = stop; + sublists.add(curList); + } + + return sublists; + } + + + /** + * Splits an interval list into multiple files. + * @param fileHeader The sam file header. + * @param splits Pre-divided genome locs returned by splitFixedIntervals. + * @param scatterParts The output interval lists to write to. + */ + public static void scatterFixedIntervals(SAMFileHeader fileHeader, List> splits, List scatterParts) { + if (splits.size() != scatterParts.size()) + throw new UserException.BadArgumentValue("splits", String.format("Split points %d does not equal the number of scatter parts %d.", splits.size(), scatterParts.size())); + + int fileIndex = 0; + int locIndex = 1; + for (final List split : splits) { + IntervalList intervalList = new IntervalList(fileHeader); + for (final GenomeLoc loc : split) + intervalList.add(toInterval(loc, locIndex++)); + intervalList.write(scatterParts.get(fileIndex++)); } } @@ -361,17 +414,101 @@ public class IntervalUtils { * @param numParts Number of parts to split the locs into. * @return The stop points to split the genome locs. 
*/ - public static List splitFixedIntervals(List locs, int numParts) { + public static List> splitFixedIntervals(List locs, int numParts) { if (locs.size() < numParts) throw new UserException.BadArgumentValue("scatterParts", String.format("Cannot scatter %d locs into %d parts.", locs.size(), numParts)); - long locsSize = 0; - for (GenomeLoc loc: locs) - locsSize += loc.size(); - List splitPoints = new ArrayList(); + final long locsSize = intervalSize(locs); + final List splitPoints = new ArrayList(); addFixedSplit(splitPoints, locs, locsSize, 0, locs.size(), numParts); Collections.sort(splitPoints); splitPoints.add(locs.size()); - return splitPoints; + return splitIntervalsToSubLists(locs, splitPoints); + } + + @Requires({"locs != null", "numParts > 0"}) + @Ensures("result != null") + public static List> splitLocusIntervals(List locs, int numParts) { + // the ideal size of each split + final long bp = IntervalUtils.intervalSize(locs); + final long idealSplitSize = Math.max((long)Math.floor(bp / (1.0*numParts)), 1); + + // algorithm: + // split = () + // set size = 0 + // pop the head H off locs. + // If size + size(H) < splitSize: + // add H to split, continue + // If size + size(H) == splitSize: + // done with split, put in splits, restart + // if size + size(H) > splitSize: + // cut H into two pieces, first of which has splitSize - size bp + // push both pieces onto locs, continue + // The last split is special -- when you have only one split left, it gets all of the remaining locs + // to deal with rounding issues + final List> splits = new ArrayList>(numParts); + + LinkedList locsLinkedList = new LinkedList(locs); + while ( ! 
locsLinkedList.isEmpty() ) { + if ( splits.size() + 1 == numParts ) { + // the last one gets all of the remaining parts + splits.add(new ArrayList(locsLinkedList)); + locsLinkedList.clear(); + } else { + final SplitLocusRecursive one = splitLocusIntervals1(locsLinkedList, idealSplitSize); + splits.add(one.split); + locsLinkedList = one.remaining; + } + } + + return splits; + } + + @Requires({"remaining != null", "!remaining.isEmpty()", "idealSplitSize > 0"}) + @Ensures({"result != null"}) + final static SplitLocusRecursive splitLocusIntervals1(LinkedList remaining, long idealSplitSize) { + final List split = new ArrayList(); + long size = 0; + + while ( ! remaining.isEmpty() ) { + GenomeLoc head = remaining.pop(); + final long newSize = size + head.size(); + + if ( newSize == idealSplitSize ) { + split.add(head); + break; // we are done + } else if ( newSize > idealSplitSize ) { + final long remainingBp = idealSplitSize - size; + final long cutPoint = head.getStart() + remainingBp; + GenomeLoc[] parts = head.split((int)cutPoint); + remaining.push(parts[1]); + remaining.push(parts[0]); + // when we go around, head.size' = idealSplitSize - size + // so newSize' = splitSize + head.size' = size + (idealSplitSize - size) = idealSplitSize + } else { + split.add(head); + size = newSize; + } + } + + return new SplitLocusRecursive(split, remaining); + } + + private final static class SplitLocusRecursive { + final List split; + final LinkedList remaining; + + @Requires({"split != null", "remaining != null"}) + private SplitLocusRecursive(final List split, final LinkedList remaining) { + this.split = split; + this.remaining = remaining; + } + } + + public static List flattenSplitIntervals(List> splits) { + final List locs = new ArrayList(); + for ( final List split : splits ) + locs.addAll(split); + return locs; } private static void addFixedSplit(List splitPoints, List locs, long locsSize, int startIndex, int stopIndex, int numParts) { @@ -441,4 +578,11 @@ public class 
IntervalUtils { return merged; } } + + public static final long intervalSize(final List locs) { + long size = 0; + for ( final GenomeLoc loc : locs ) + size += loc.size(); + return size; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIterator.java b/public/java/src/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIterator.java deleted file mode 100644 index 7e87ce8b5..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIterator.java +++ /dev/null @@ -1,201 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.interval; - -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.util.Iterator; -import java.util.PriorityQueue; - -/** - * Created by IntelliJ IDEA. 
- * User: asivache - * Date: Oct 28, 2010 - * Time: 12:06:23 PM - * To change this template use File | Settings | File Templates. - */ - -/** - * An adapter over a collection of underlying Iterator objects (a single underlying iterator is allowed). Each - * individual underlying iterator must serve its intervals in coordinate-sorted order or an exception will be thrown. - * Intervals from individual underlying streams (iterators) are 1) merged into a single ordered stream; 2) each group of - * overlapping intervals from that merged stream are merged into a single interval; each call to next() returns such - * merged interval guaranteed to have no overlaps with the previous or next interval. - * - */ -public class NwayIntervalMergingIterator implements Iterator, Iterable { - - private PriorityQueue queue = null; - private IntervalMergingRule myRule; - - public NwayIntervalMergingIterator(IntervalMergingRule rule) { - myRule = rule; - queue = new PriorityQueue(); - } - - public void add(Iterator it) { - Element e = new Element(it); - if ( ! e.isEmpty() ) queue.add(e); - } - - public Iterator iterator() { - return this; - } - - /** - * Returns true if the iteration has more elements. (In other - * words, returns true if next would return an element - * rather than throwing an exception.) - * - * @return true if the iterator has more elements. - */ - public boolean hasNext() { - return ! queue.isEmpty(); //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Returns the next element in the iteration. - * - * @return the next element in the iteration. - * @throws java.util.NoSuchElementException - * iteration has no more elements. - */ - public GenomeLoc next() { - Element e = queue.poll(); - GenomeLoc result = e.current; - - // advance element (i.e. its underlying iterator) and reinsert into the queue - e.advance(); - if ( ! e.isEmpty() ) queue.add(e); - - while ( ! 
queue.isEmpty () ) { - e = queue.peek(); - - if (result.overlapsP(e.current) || myRule == IntervalMergingRule.ALL && result.contiguousP(e.current)) { - // we need to merge: - result = result.merge(e.current); - - // remove current head of the queue that we just merged into the result: - e = queue.poll(); - // advance element we just merged into the result and reinsert it into the queue (if it has any data left): - e.advance(); - if ( ! e.isEmpty() ) queue.add(e); - - } else { - // next element does not overlap with current result; we are done: return the result and that - // next element will wait for next call to next() - break; - } - - } - return result; //To change body of implemented methods use File | Settings | File Templates. - } - - /** - * Removes from the underlying collection the last element returned by the - * iterator (optional operation). This method can be called only once per - * call to next. The behavior of an iterator is unspecified if - * the underlying collection is modified while the iteration is in - * progress in any way other than by calling this method. - * - * @throws UnsupportedOperationException if the remove - * operation is not supported by this Iterator. - * @throws IllegalStateException if the next method has not - * yet been called, or the remove method has already - * been called after the last call to the next - * method. 
- */ - public void remove() { - throw new UnsupportedOperationException("remove() method not supported by this iterator"); - } - - private class Element implements Comparable { - private Iterator it; - private GenomeLoc current = null; - - private void advance() { - if ( it.hasNext() ) { - GenomeLoc next = it.next(); - if ( next.isBefore(current) ) { - throw new UserException("Interval list provided by underlying iterator "+it.getClass().getName() +" is out of order"); - } - current = next; - } - else current = null; - } - - public boolean isEmpty() { return current == null; } - - public Element(Iterator it) { - this.it = it; - if ( this.it.hasNext() ) current = this.it.next(); - } - - /** - * Compares this object with the specified object for order. Returns a - * negative integer, zero, or a positive integer as this object is less - * than, equal to, or greater than the specified object. - *

    - *

    The implementor must ensure sgn(x.compareTo(y)) == - * -sgn(y.compareTo(x)) for all x and y. (This - * implies that x.compareTo(y) must throw an exception iff - * y.compareTo(x) throws an exception.) - *

    - *

    The implementor must also ensure that the relation is transitive: - * (x.compareTo(y)>0 && y.compareTo(z)>0) implies - * x.compareTo(z)>0. - *

    - *

    Finally, the implementor must ensure that x.compareTo(y)==0 - * implies that sgn(x.compareTo(z)) == sgn(y.compareTo(z)), for - * all z. - *

    - *

    It is strongly recommended, but not strictly required that - * (x.compareTo(y)==0) == (x.equals(y)). Generally speaking, any - * class that implements the Comparable interface and violates - * this condition should clearly indicate this fact. The recommended - * language is "Note: this class has a natural ordering that is - * inconsistent with equals." - *

    - *

    In the foregoing description, the notation - * sgn(expression) designates the mathematical - * signum function, which is defined to return one of -1, - * 0, or 1 according to whether the value of - * expression is negative, zero or positive. - * - * @param o the object to be compared. - * @return a negative integer, zero, or a positive integer as this object - * is less than, equal to, or greater than the specified object. - * @throws ClassCastException if the specified object's type prevents it - * from being compared to this object. - */ - public int compareTo(Element o) { - if ( current == null ) return 1; - if ( o.current == null ) return -1; - return current.compareTo(o.current); //To change body of implemented methods use File | Settings | File Templates. - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/interval/StringToGenomeLocIteratorAdapter.java b/public/java/src/org/broadinstitute/sting/utils/interval/StringToGenomeLocIteratorAdapter.java deleted file mode 100644 index 659260345..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/interval/StringToGenomeLocIteratorAdapter.java +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.interval; - -import org.broadinstitute.sting.gatk.iterators.PushbackIterator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.bed.BedParser; - -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. -* User: asivache -* Date: Jun 11, 2010 -* Time: 2:25:42 PM -* To change this template use File | Settings | File Templates. -*/ - -/** - * Wrap this adapter around Iterator to get Iterator. Each string coming from the underlying - * iterator is parsed and converted to GenomeLoc on the fly and the latter is returned on each call to next(). - * This adaptor silently skips empty lines received from the underlying string iterator. - * Two string formats are currently supported: BED and GATK. This iterator will throw an exception if it fails - * to parse a string. 
- */ -public class StringToGenomeLocIteratorAdapter implements Iterator { - private GenomeLocParser genomeLocParser; - - private PushbackIterator it = null; - - public enum FORMAT { BED, GATK }; - - FORMAT myFormat = FORMAT.GATK; - - public StringToGenomeLocIteratorAdapter(GenomeLocParser genomeLocParser,Iterator it, FORMAT format) { - this.genomeLocParser = genomeLocParser; - this.it = new PushbackIterator(it); - myFormat = format; - } - - public StringToGenomeLocIteratorAdapter(GenomeLocParser genomeLocParser,Iterator it ) { - this(genomeLocParser,it,FORMAT.GATK); - } - - public boolean hasNext() { - String s = null; - boolean success = false; - - // skip empty lines: - while ( it.hasNext() ) { - s = it.next(); - if ( s.length() != 0 && ! s.matches("^\\s+$")) { - success = true; - it.pushback(s); - break; - } - } - return success; - } - - public GenomeLoc next() { - - if ( myFormat == FORMAT.GATK ) return genomeLocParser.parseGenomeLoc(it.next()); - return BedParser.parseLocation( genomeLocParser,it.next() ); - } - - public void remove() { - throw new UnsupportedOperationException("method 'remove' is not supported by this iterator"); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/io/FileExtension.java b/public/java/src/org/broadinstitute/sting/utils/io/FileExtension.java new file mode 100644 index 000000000..cd69ee126 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/io/FileExtension.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The 
above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.io; + +import java.io.File; + +public interface FileExtension { + /** + * Returns a clone of the FileExtension with a new path. + * @param path New path. + * @return New FileExtension + */ + public File withPath(String path); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java b/public/java/src/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java new file mode 100755 index 000000000..26b5ae6fd --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/io/HardThresholdingOutputStream.java @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.utils.io; + +import org.apache.commons.io.output.ThresholdingOutputStream; + +import java.io.IOException; + +/** + * An output stream which stops at the threshold + * instead of potentially triggering early. + */ +public abstract class HardThresholdingOutputStream extends ThresholdingOutputStream { + protected HardThresholdingOutputStream(int threshold) { + super(threshold); + } + + @Override + public void write(byte[] b) throws IOException { + write(b, 0, b.length); + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + int remaining = this.getThreshold() - (int)this.getByteCount(); + if (!isThresholdExceeded() && len > remaining) { + super.write(b, off, remaining); + super.write(b, off + remaining, len - remaining); + } else { + super.write(b, off, len); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java new file mode 100644 index 000000000..94c2d4c0b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, 
and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.io; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.LineIterator; +import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.*; +import java.util.*; + +public class IOUtils { + private static Logger logger = Logger.getLogger(IOUtils.class); + + /** + * Checks if the temp directory has been setup and throws an exception if they user hasn't set it correctly. + * + * @param tempDir Temporary directory. + */ + public static void checkTempDir(File tempDir) { + String tempDirPath = tempDir.getAbsolutePath(); + // Keeps the user from leaving the temp directory as the default, and on Macs from having pluses + // in the path which can cause problems with the Google Reflections library. 
+ // see also: http://benjchristensen.com/2009/09/22/mac-osx-10-6-java-java-io-tmpdir/ + if (tempDirPath.startsWith("/var/folders/") || (tempDirPath.equals("/tmp")) || (tempDirPath.equals("/tmp/"))) + throw new UserException.BadTmpDir("java.io.tmpdir must be explicitly set"); + if (!tempDir.exists() && !tempDir.mkdirs()) + throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath()); + } + + /** + * Creates a temp directory with the prefix and optional suffix. + * + * @param prefix Prefix for the directory name. + * @param suffix Optional suffix for the directory name. + * @return The created temporary directory. + */ + public static File tempDir(String prefix, String suffix) { + return tempDir(prefix, suffix, null); + } + + /** + * Creates a temp directory with the prefix and optional suffix. + * + * @param prefix Prefix for the directory name. + * @param suffix Optional suffix for the directory name. + * @param tempDirParent Parent directory for the temp directory. + * @return The created temporary directory. + */ + public static File tempDir(String prefix, String suffix, File tempDirParent) { + try { + if (tempDirParent == null) + tempDirParent = FileUtils.getTempDirectory(); + if (!tempDirParent.exists() && !tempDirParent.mkdirs()) + throw new UserException.BadTmpDir("Could not create temp directory: " + tempDirParent); + File temp = File.createTempFile(prefix + "-", suffix, tempDirParent); + if (!temp.delete()) + throw new UserException.BadTmpDir("Could not delete sub file: " + temp.getAbsolutePath()); + if (!temp.mkdir()) + throw new UserException.BadTmpDir("Could not create sub directory: " + temp.getAbsolutePath()); + return absolute(temp); + } catch (IOException e) { + throw new UserException.BadTmpDir(e.getMessage()); + } + } + + /** + * Writes content to a temp file and returns the path to the temporary file. + * + * @param content to write. + * @param prefix Prefix for the temp file. 
+ * @param suffix Suffix for the temp file. + * @return the path to the temp file. + */ + public static File writeTempFile(String content, String prefix, String suffix) { + return writeTempFile(content, prefix, suffix, null); + } + + /** + * Writes content to a temp file and returns the path to the temporary file. + * + * @param content to write. + * @param prefix Prefix for the temp file. + * @param suffix Suffix for the temp file. + * @param directory Directory for the temp file. + * @return the path to the temp file. + */ + public static File writeTempFile(String content, String prefix, String suffix, File directory) { + try { + File tempFile = absolute(File.createTempFile(prefix, suffix, directory)); + FileUtils.writeStringToFile(tempFile, content); + return tempFile; + } catch (IOException e) { + throw new UserException.BadTmpDir(e.getMessage()); + } + } + + /** + * Waits for NFS to propagate a file creation, imposing a timeout. + * + * Based on Apache Commons IO FileUtils.waitFor() + * + * @param file The file to wait for. + * @param seconds The maximum time in seconds to wait. + * @return true if the file exists + */ + public static boolean waitFor(File file, int seconds) { + return waitFor(Collections.singletonList(file), seconds).isEmpty(); + } + + /** + * Waits for NFS to propagate a file creation, imposing a timeout. + * + * Based on Apache Commons IO FileUtils.waitFor() + * + * @param files The list of files to wait for. + * @param seconds The maximum time in seconds to wait. + * @return Files that still do not exists at the end of the timeout, or a empty list if all files exists. 
+ */ + public static List waitFor(Collection files, int seconds) { + long timeout = 0; + long tick = 0; + List missingFiles = new ArrayList(); + for (File file : files) + if (!file.exists()) + missingFiles.add(file); + + while (!missingFiles.isEmpty() && timeout <= seconds) { + if (tick >= 10) { + tick = 0; + timeout++; + } + tick++; + try { + Thread.sleep(100); + } catch (InterruptedException ignore) { + } + List newMissingFiles = new ArrayList(); + for (File file : missingFiles) + if (!file.exists()) + newMissingFiles.add(file); + missingFiles = newMissingFiles; + } + return missingFiles; + } + + /** + * Returns the directory at the number of levels deep. + * For example 2 levels of /path/to/dir will return /path/to + * + * @param dir Directory path. + * @param level how many levels deep from the root. + * @return The path to the parent directory that is level-levels deep. + */ + public static File dirLevel(File dir, int level) { + List directories = new ArrayList(); + File parentDir = absolute(dir); + while (parentDir != null) { + directories.add(0, parentDir); + parentDir = parentDir.getParentFile(); + } + if (directories.size() <= level) + return directories.get(directories.size() - 1); + else + return directories.get(level); + } + + /** + * Returns the sub path rooted at the parent. + * + * @param parent The parent directory. + * @param path The sub path to append to the parent, if the path is not absolute. + * @return The absolute path to the file in the parent dir if the path was not absolute, otherwise the original path. + */ + public static File absolute(File parent, String path) { + return absolute(parent, new File(path)); + } + + /** + * Returns the sub path rooted at the parent. + * + * @param parent The parent directory. + * @param file The sub path to append to the parent, if the path is not absolute. + * @return The absolute path to the file in the parent dir if the path was not absolute, otherwise the original path. 
+ */ + public static File absolute(File parent, File file) { + String newPath; + if (file.isAbsolute()) + newPath = absolutePath(file); + else + newPath = absolutePath(new File(parent, file.getPath())); + return replacePath(file, newPath); + } + + /** + * A mix of getCanonicalFile and getAbsoluteFile that returns the + * absolute path to the file without deferencing symbolic links. + * + * @param file the file. + * @return the absolute path to the file. + */ + public static File absolute(File file) { + return replacePath(file, absolutePath(file)); + } + + private static String absolutePath(File file) { + File fileAbs = file.getAbsoluteFile(); + LinkedList names = new LinkedList(); + while (fileAbs != null) { + String name = fileAbs.getName(); + fileAbs = fileAbs.getParentFile(); + + if (".".equals(name)) { + /* skip */ + + /* TODO: What do we do for ".."? + } else if (name == "..") { + + CentOS tcsh says use getCanonicalFile: + ~ $ mkdir -p test1/test2 + ~ $ ln -s test1/test2 test3 + ~ $ cd test3/.. + ~/test1 $ + + Mac bash says keep going with getAbsoluteFile: + ~ $ mkdir -p test1/test2 + ~ $ ln -s test1/test2 test3 + ~ $ cd test3/.. + ~ $ + + For now, leave it and let the shell figure it out. + */ + } else { + names.add(0, name); + } + } + + return ("/" + StringUtils.join(names, "/")); + } + + private static File replacePath(File file, String path) { + if (file instanceof FileExtension) + return ((FileExtension)file).withPath(path); + if (!File.class.equals(file.getClass())) + throw new StingException("Sub classes of java.io.File must also implement FileExtension"); + return new File(path); + } + + /** + * Returns the last lines of the file. + * NOTE: This is only safe to run on smaller files! + * + * @param file File to read. + * @param count Maximum number of lines to return. + * @return The last count lines from file. + * @throws IOException When unable to read the file. 
+ */ + public static List tail(File file, int count) throws IOException { + LinkedList tailLines = new LinkedList(); + FileReader reader = new FileReader(file); + try { + LineIterator iterator = org.apache.commons.io.IOUtils.lineIterator(reader); + int lineCount = 0; + while (iterator.hasNext()) { + String line = iterator.nextLine(); + lineCount++; + if (lineCount > count) + tailLines.removeFirst(); + tailLines.offer(line); + } + } finally { + org.apache.commons.io.IOUtils.closeQuietly(reader); + } + return tailLines; + } + + /** + * Tries to delete a file. Emits a warning if the file was unable to be deleted. + * + * @param file File to delete. + * @return true if the file was deleted. + */ + public static boolean tryDelete(File file) { + boolean deleted = FileUtils.deleteQuietly(file); + if (deleted) + logger.debug("Deleted " + file); + else if (file.exists()) + logger.warn("Unable to delete " + file); + return deleted; + } + + /** + * Writes the an embedded resource to a temp file. + * File is not scheduled for deletion and must be cleaned up by the caller. + * @param resource Embedded resource. + * @return Path to the temp file with the contents of the resource. + */ + public static File writeTempResource(Resource resource) { + File temp; + try { + temp = File.createTempFile(FilenameUtils.getBaseName(resource.getPath()) + ".", "." + FilenameUtils.getExtension(resource.getPath())); + } catch (IOException e) { + throw new UserException.BadTmpDir(e.getMessage()); + } + writeResource(resource, temp); + return temp; + } + + /** + * Writes the an embedded resource to a file. + * File is not scheduled for deletion and must be cleaned up by the caller. + * @param resource Embedded resource. + * @param file File path to write. 
+ */ + public static void writeResource(Resource resource, File file) { + String path = resource.getPath(); + Class clazz = resource.getRelativeClass(); + InputStream inputStream = null; + OutputStream outputStream = null; + try { + if (clazz == null) { + inputStream = ClassLoader.getSystemResourceAsStream(path); + if (inputStream == null) + throw new IllegalArgumentException("Resource not found: " + path); + } else { + inputStream = clazz.getResourceAsStream(path); + if (inputStream == null) + throw new IllegalArgumentException("Resource not found relative to " + clazz + ": " + path); + } + outputStream = FileUtils.openOutputStream(file); + org.apache.commons.io.IOUtils.copy(inputStream, outputStream); + } catch (IOException e) { + throw new StingException(String.format("Unable to copy resource '%s' to '%s'", path, file), e); + } finally { + org.apache.commons.io.IOUtils.closeQuietly(inputStream); + org.apache.commons.io.IOUtils.closeQuietly(outputStream); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/io/Resource.java b/public/java/src/org/broadinstitute/sting/utils/io/Resource.java new file mode 100644 index 000000000..895fb9731 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/io/Resource.java @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.io; + +import java.io.File; + +/** + * Stores a resource by path and a relative class. + */ +public class Resource { + private final String path; + private final Class relativeClass; + + /** + * Create a resource with a path and a relative class. + * @param path Relative or absolute path to the class. + * @param relativeClass Relative class to use as a class loader and for a relative package. + * + * If the relative class is null then the system classloader will be used and the path must be absolute. 
+ */ + public Resource(String path, Class relativeClass) { + this.path = path; + this.relativeClass = relativeClass; + } + + public Class getRelativeClass() { + return relativeClass; + } + + public String getPath() { + return path; + } + + public String getFullPath() { + if (relativeClass == null) + return path; + if (new File(path).isAbsolute()) + return path; + return String.format("%s%s%s", + relativeClass.getPackage().getName().replace('.', File.separatorChar), + File.separator, + path); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 3821c9c8a..18051ce92 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -24,13 +24,13 @@ package org.broadinstitute.sting.utils.pileup; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.fragments.FragmentUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; @@ -45,6 +45,7 @@ public abstract class AbstractReadBackedPileup pileupElementTracker; protected int size = 0; // cached value of the size of the pileup + protected int abstractSize = -1; // cached value of the abstract size of the pileup protected int nDeletions = 0; // cached value of the number of deletions protected int nMQ0Reads = 0; // cached value of the number of MQ0 reads @@ -58,12 +59,12 @@ public abstract 
class AbstractReadBackedPileup reads, List offsets ) { + public AbstractReadBackedPileup(GenomeLoc loc, List reads, List offsets ) { this.loc = loc; this.pileupElementTracker = readsOffsets2Pileup(reads,offsets); } - public AbstractReadBackedPileup(GenomeLoc loc, List reads, int offset ) { + public AbstractReadBackedPileup(GenomeLoc loc, List reads, int offset ) { this.loc = loc; this.pileupElementTracker = readsOffsets2Pileup(reads,offset); } @@ -114,10 +115,10 @@ public abstract class AbstractReadBackedPileup> pileupsBySample) { + protected AbstractReadBackedPileup(GenomeLoc loc, Map> pileupsBySample) { this.loc = loc; PerSamplePileupElementTracker tracker = new PerSamplePileupElementTracker(); - for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) { + for(Map.Entry> pileupEntry: pileupsBySample.entrySet()) { tracker.addElements(pileupEntry.getKey(),pileupEntry.getValue().pileupElementTracker); addPileupToCumulativeStats(pileupEntry.getValue()); } @@ -145,8 +146,16 @@ public abstract class AbstractReadBackedPileup pileup) { - size += pileup.size(); + size += pileup.getNumberOfElements(); + abstractSize += pileup.depthOfCoverage(); nDeletions += pileup.getNumberOfDeletions(); nMQ0Reads += pileup.getNumberOfMappingQualityZeroReads(); } @@ -158,7 +167,7 @@ public abstract class AbstractReadBackedPileup readsOffsets2Pileup(List reads, List offsets ) { + private PileupElementTracker readsOffsets2Pileup(List reads, List offsets ) { if ( reads == null ) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); if ( offsets == null ) throw new ReviewedStingException("Illegal null offsets list in UnifiedReadBackedPileup"); if ( reads.size() != offsets.size() ) throw new ReviewedStingException("Reads and offset lists have different sizes!"); @@ -178,7 +187,7 @@ public abstract class AbstractReadBackedPileup readsOffsets2Pileup(List reads, int offset ) { + private PileupElementTracker readsOffsets2Pileup(List reads, int offset ) { if ( reads 
== null ) throw new ReviewedStingException("Illegal null read list in UnifiedReadBackedPileup"); if ( offset < 0 ) throw new ReviewedStingException("Illegal offset < 0 UnifiedReadBackedPileup"); @@ -191,7 +200,7 @@ public abstract class AbstractReadBackedPileup createNewPileup(GenomeLoc loc, PileupElementTracker pileupElementTracker); - protected abstract PE createNewPileupElement(SAMRecord read, int offset); + protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset); // -------------------------------------------------------- // @@ -213,7 +222,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutDeletions(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -251,7 +260,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getOverlappingFragmentFilteredPileup(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -305,7 +314,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker 
perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupWithoutMappingQualityZeroReads(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -334,7 +343,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPositiveStrandPileup(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -363,7 +372,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getNegativeStrandPileup(); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -393,7 +402,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getFilteredPileup(filter); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -425,7 +434,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; 
PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getBaseAndMappingFilteredPileup(minBaseQ,minMapQ); filteredTracker.addElements(sample,pileup.pileupElementTracker); @@ -492,7 +501,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForReadGroup(targetReadGroupId); if(pileup != null) @@ -503,7 +512,7 @@ public abstract class AbstractReadBackedPileup filteredTracker = new UnifiedPileupElementTracker(); for(PE p: pileupElementTracker) { - SAMRecord read = p.getRead(); + GATKSAMRecord read = p.getRead(); if(targetReadGroupId != null) { if(read.getReadGroup() != null && targetReadGroupId.equals(read.getReadGroup().getReadGroupId())) filteredTracker.add(p); @@ -523,7 +532,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); AbstractReadBackedPileup pileup = createNewPileup(loc,perSampleElements).getPileupForLane(laneID); if(pileup != null) @@ -534,7 +543,7 @@ public abstract class AbstractReadBackedPileup filteredTracker = new UnifiedPileupElementTracker(); for(PE p: pileupElementTracker) { - SAMRecord 
read = p.getRead(); + GATKSAMRecord read = p.getRead(); if(laneID != null) { if(read.getReadGroup() != null && (read.getReadGroup().getReadGroupId().startsWith(laneID + ".")) || // lane is the same, but sample identifier is different @@ -550,19 +559,15 @@ public abstract class AbstractReadBackedPileup getSampleNames() { + public Collection getSamples() { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; - Collection sampleNames = new HashSet(); - for (Sample sample : tracker.getSamples()) { - sampleNames.add(sample.getId()); - } - return sampleNames; + return new HashSet(tracker.getSamples()); } else { Collection sampleNames = new HashSet(); for(PileupElement p: this) { - SAMRecord read = p.getRead(); + GATKSAMRecord read = p.getRead(); String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null; sampleNames.add(sampleName); } @@ -570,16 +575,6 @@ public abstract class AbstractReadBackedPileup getSamples() { - if(!(pileupElementTracker instanceof PerSamplePileupElementTracker)) { - throw new StingException("Must be an instance of PerSampleElementTracker"); - } - PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; - return tracker.getSamples(); - } - - /** * Returns a pileup randomly downsampled to the desiredCoverage. 
* @@ -588,7 +583,7 @@ public abstract class AbstractReadBackedPileup perSampleElements = tracker.getElements(sample); List filteredPileup = new ArrayList(); @@ -639,7 +634,7 @@ public abstract class AbstractReadBackedPileup sampleNames) { + public RBP getPileupForSamples(Collection sampleNames) { if(pileupElementTracker instanceof PerSamplePileupElementTracker) { PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker)pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleNames); @@ -649,7 +644,7 @@ public abstract class AbstractReadBackedPileup hashSampleNames = new HashSet(sampleNames); // to speed up the "contains" access in the for loop UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); for(PE p: pileupElementTracker) { - SAMRecord read = p.getRead(); + GATKSAMRecord read = p.getRead(); if(sampleNames != null) { // still checking on sampleNames because hashSampleNames will never be null. And empty means something else. if(read.getReadGroup() != null && hashSampleNames.contains(read.getReadGroup().getSample())) filteredTracker.add(p); @@ -665,7 +660,7 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; PileupElementTracker filteredElements = tracker.getElements(sampleName); @@ -674,7 +669,7 @@ public abstract class AbstractReadBackedPileup filteredTracker = new UnifiedPileupElementTracker(); for(PE p: pileupElementTracker) { - SAMRecord read = p.getRead(); + GATKSAMRecord read = p.getRead(); if(sampleName != null) { if(read.getReadGroup() != null && sampleName.equals(read.getReadGroup().getSample())) filteredTracker.add(p); @@ -688,30 +683,6 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; - PileupElementTracker filteredElements = tracker.getElements(sample); - return filteredElements != null ? 
(RBP)createNewPileup(loc,filteredElements) : null; - } - else { - UnifiedPileupElementTracker filteredTracker = new UnifiedPileupElementTracker(); - for(PE p: pileupElementTracker) { - SAMRecord read = p.getRead(); - if(sample != null) { - if(read.getReadGroup() != null && sample.getId().equals(read.getReadGroup().getSample())) - filteredTracker.add(p); - } - else { - if(read.getReadGroup() == null || read.getReadGroup().getSample() == null) - filteredTracker.add(p); - } - } - return filteredTracker.size()>0 ? (RBP)createNewPileup(loc,filteredTracker) : null; - } - } - // -------------------------------------------------------- // // iterators @@ -765,13 +736,23 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker)pileupElementTracker; - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { int[] countsBySample = createNewPileup(loc,tracker.getElements(sample)).getBaseCounts(); for(int i = 0; i < counts.length; i++) counts[i] += countsBySample[i]; @@ -843,8 +824,8 @@ public abstract class AbstractReadBackedPileup getReads() { - List reads = new ArrayList(size()); + public List getReads() { + List reads = new ArrayList(getNumberOfElements()); for ( PileupElement pile : this ) { reads.add(pile.getRead()); } return reads; } @@ -855,7 +836,7 @@ public abstract class AbstractReadBackedPileup getOffsets() { - List offsets = new ArrayList(size()); + List offsets = new ArrayList(getNumberOfElements()); for ( PileupElement pile : this ) { offsets.add(pile.getOffset()); } return offsets; } @@ -866,7 +847,7 @@ public abstract class AbstractReadBackedPileup toFragments() { + return FragmentUtils.create(this); + } } + diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java index 26e66014c..1e5e4d4e5 100644 --- 
a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java @@ -1,131 +1,132 @@ -package org.broadinstitute.sting.utils.pileup; - -import net.sf.samtools.SAMRecord; - -import java.util.Arrays; - -/** - * In the "standard" locus traversal mode, - * the traversal is performed striclty over the reference bases. Thus, only pileups of bases (and hence local events - * such as point mutations) are "seen" at every invocation of the walker's map() function at every (genomic) locus. Deletions - * are seen on the base-by-base basis (i.e. the pileup does keep the information about the current reference base being deleted - * in some reads), but the information about the extended event (deletion length, string of all deleted bases) is not kept. - * The insertions that may be present in some reads are not seen at all in such strict reference traversal mode. - * - * By convention, any extended event (indel) is mapped onto the reference at the last base prior to the event (i.e. - * last base before the insertion or deletion). If the special "extended" traversal mode is turned on and there is - * an indel in at least one read that maps onto the reference position Z, the walker's map function will be called twice: - * first call will be performed in a "standard" mode, with a pileup of bases over the position Z, and then the additional - * call will be made at the same position with a pileup of event/noevent calls, where events are extended and contain - * full information about insertions/deletions. Then the next, "standard", call to map() will be performed at the next - * (covered) reference position. Note that if the extended event at Z was a deletion, the "standard" base pileup at - * Z+1 and following bases may still contain deleted bases. However the fully extended event call will be performed - * only once, at the position where the indel maps (starts). 
- * - * This class wraps an "extended" event (indel) so that in can be added to a pileup of events at a given location. - * - * Created by IntelliJ IDEA. - * User: asivache - * Date: Dec 21, 2009 - * Time: 2:57:55 PM - * To change this template use File | Settings | File Templates. - */ -public class ExtendedEventPileupElement extends PileupElement { - public enum Type { - NOEVENT, DELETION, INSERTION - } - - private Type type = null; - private int eventLength = -1; - private String eventBases = null; // if it is a deletion, we do not have information about the actual deleted bases - // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases - private SAMRecord read; - private int offset; // position in the read immediately BEFORE the event - // This is broken! offset is always zero because these member variables are shadowed by base class - - /** Constructor for extended pileup element (indel). - * - * @param read the read, in which the indel is observed - * @param offset position in the read immediately before the indel (can be -1 if read starts with an insertion) - * @param length length of the indel (number of inserted or deleted bases); length <=0 indicates that the read has no indel (NOEVENT) - * @param eventBases inserted bases. 
null indicates that the event is a deletion; ignored if length<=0 (noevent) - */ - public ExtendedEventPileupElement( SAMRecord read, int offset, int length, byte[] eventBases ) { - super(read, offset); - this.eventLength = length; - if ( length <= 0 ) type = Type.NOEVENT; - else { - if ( eventBases != null ) { - this.eventBases = new String(eventBases).toUpperCase(); - type = Type.INSERTION; - } else { - type = Type.DELETION; - } - } - } - - /** Constructor for deletion or noevent calls - does not take event bases as an argument (as those should - * be null or are ignored in these cases anyway) - * @param read - * @param offset - * @param length - */ - public ExtendedEventPileupElement( SAMRecord read, int offset, int length ) { - this(read,offset, length, null); - } - - public boolean isDeletion() { - return type == Type.DELETION; - } - - public boolean isInsertion() { - return type == Type.INSERTION; - } - - public boolean isIndel() { - return isDeletion() || isInsertion(); - } - - public Type getType() { return type; } - - // The offset can be negative with insertions at the start of the read, but a valid base does exist at this position with - // a valid base quality. The following code attempts to compensate for that.' - - @Override - public byte getBase() { - return getBase(offset >= 0 ? offset : offset+eventLength); - } - - @Override - public int getBaseIndex() { - return getBaseIndex(offset >= 0 ? offset : offset+eventLength); - } - - @Override - public byte getQual() { - return getQual(offset >= 0 ? offset : offset+eventLength); - } - - /** Returns length of the event (number of inserted or deleted bases */ - public int getEventLength() { return eventLength; } - - /** Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. 
- * */ - public String getEventBases() { return eventBases; } - - @Override - public String toString() { - char c = '.'; - String fillStr = null ; - if ( isDeletion() ) { - c = '-'; - char [] filler = new char[eventLength]; - Arrays.fill(filler, 'D'); - fillStr = new String(filler); - } - else if ( isInsertion() ) c = '+'; - return String.format("%s @ %d = %c%s MQ%d", getRead().getReadName(), getOffset(), c, isIndel()? - (isInsertion() ? eventBases : fillStr ): "", getMappingQual()); - } - -} +package org.broadinstitute.sting.utils.pileup; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Arrays; + +/** + * In the "standard" locus traversal mode, + * the traversal is performed striclty over the reference bases. Thus, only pileups of bases (and hence local events + * such as point mutations) are "seen" at every invocation of the walker's map() function at every (genomic) locus. Deletions + * are seen on the base-by-base basis (i.e. the pileup does keep the information about the current reference base being deleted + * in some reads), but the information about the extended event (deletion length, string of all deleted bases) is not kept. + * The insertions that may be present in some reads are not seen at all in such strict reference traversal mode. + * + * By convention, any extended event (indel) is mapped onto the reference at the last base prior to the event (i.e. + * last base before the insertion or deletion). If the special "extended" traversal mode is turned on and there is + * an indel in at least one read that maps onto the reference position Z, the walker's map function will be called twice: + * first call will be performed in a "standard" mode, with a pileup of bases over the position Z, and then the additional + * call will be made at the same position with a pileup of event/noevent calls, where events are extended and contain + * full information about insertions/deletions. 
Then the next, "standard", call to map() will be performed at the next + * (covered) reference position. Note that if the extended event at Z was a deletion, the "standard" base pileup at + * Z+1 and following bases may still contain deleted bases. However the fully extended event call will be performed + * only once, at the position where the indel maps (starts). + * + * This class wraps an "extended" event (indel) so that in can be added to a pileup of events at a given location. + * + * Created by IntelliJ IDEA. + * User: asivache + * Date: Dec 21, 2009 + * Time: 2:57:55 PM + * To change this template use File | Settings | File Templates. + */ +public class ExtendedEventPileupElement extends PileupElement { + public enum Type { + NOEVENT, DELETION, INSERTION + } + + private Type type = null; + private int eventLength = -1; + private String eventBases = null; // if it is a deletion, we do not have information about the actual deleted bases + // in the read itself, so we fill the string with D's; for insertions we keep actual inserted bases + private SAMRecord read; + private int offset; // position in the read immediately BEFORE the event + // This is broken! offset is always zero because these member variables are shadowed by base class + + /** Constructor for extended pileup element (indel). + * + * @param read the read, in which the indel is observed + * @param offset position in the read immediately before the indel (can be -1 if read starts with an insertion) + * @param length length of the indel (number of inserted or deleted bases); length <=0 indicates that the read has no indel (NOEVENT) + * @param eventBases inserted bases. 
null indicates that the event is a deletion; ignored if length<=0 (noevent) + */ + public ExtendedEventPileupElement( GATKSAMRecord read, int offset, int length, byte[] eventBases ) { + super(read, offset); + this.eventLength = length; + if ( length <= 0 ) type = Type.NOEVENT; + else { + if ( eventBases != null ) { + this.eventBases = new String(eventBases).toUpperCase(); + type = Type.INSERTION; + } else { + type = Type.DELETION; + } + } + } + + /** Constructor for deletion or noevent calls - does not take event bases as an argument (as those should + * be null or are ignored in these cases anyway) + * @param read + * @param offset + * @param length + */ + public ExtendedEventPileupElement( GATKSAMRecord read, int offset, int length ) { + this(read,offset, length, null); + } + + public boolean isDeletion() { + return type == Type.DELETION; + } + + public boolean isInsertion() { + return type == Type.INSERTION; + } + + public boolean isIndel() { + return isDeletion() || isInsertion(); + } + + public Type getType() { return type; } + + // The offset can be negative with insertions at the start of the read, but a valid base does exist at this position with + // a valid base quality. The following code attempts to compensate for that.' + + @Override + public byte getBase() { + return getBase(offset >= 0 ? offset : offset+eventLength); + } + + @Override + public int getBaseIndex() { + return getBaseIndex(offset >= 0 ? offset : offset+eventLength); + } + + @Override + public byte getQual() { + return getQual(offset >= 0 ? offset : offset+eventLength); + } + + /** Returns length of the event (number of inserted or deleted bases */ + public int getEventLength() { return eventLength; } + + /** Returns actual sequence of inserted bases, or a null if the event is a deletion or if there is no event in the associated read. 
+ * */ + public String getEventBases() { return eventBases; } + + @Override + public String toString() { + char c = '.'; + String fillStr = null ; + if ( isDeletion() ) { + c = '-'; + char [] filler = new char[eventLength]; + Arrays.fill(filler, 'D'); + fillStr = new String(filler); + } + else if ( isInsertion() ) c = '+'; + return String.format("%s @ %d = %c%s MQ%d", getRead().getReadName(), getOffset(), c, isIndel()? + (isInsertion() ? eventBases : fillStr ): "", getMappingQual()); + } + +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java deleted file mode 100644 index f7d237401..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/FragmentPileup.java +++ /dev/null @@ -1,95 +0,0 @@ -package org.broadinstitute.sting.utils.pileup; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; - -/** - * An easy to access fragment-based pileup, which contains two separate pileups. The first - * is a regular collection of PileupElements containing all of the reads in the original RBP - * that uniquely info about a fragment. The second are TwoReadPileupElements that, as the - * name suggests, contain two reads that are sequenced from the same underlying fragment. - * - * Based on the original code by E. 
Banks - * - * TODO -- technically we could generalize this code to support a pseudo-duplicate marking - * TODO -- algorithm that could collect all duplicates into a single super pileup element - * - * User: depristo - * Date: 3/26/11 - * Time: 10:09 PM - */ -public class FragmentPileup { - final Collection oneReadPile; - final Collection twoReadPile = new ArrayList(); - - /** - * Create a new Fragment-based pileup from the standard read-based pileup - * @param pileup - */ - public FragmentPileup(ReadBackedPileup pileup) { - Map nameMap = new HashMap(); - - // build an initial map, grabbing all of the multi-read fragments - for ( PileupElement p : pileup ) { - String readName = p.getRead().getReadName(); - - PileupElement pe1 = nameMap.get(readName); - if ( pe1 != null ) { - // assumes we have at most 2 reads per fragment - twoReadPile.add(new TwoReadPileupElement(pe1, p)); - nameMap.remove(readName); - } else { - nameMap.put(readName, p); - } - } - - // now set the one Read pile to the values in the nameMap with only a single read - oneReadPile = nameMap.values(); - } - - /** - * Gets the pileup elements containing two reads, in no particular order - * - * @return - */ - public Collection getTwoReadPileup() { - return twoReadPile; - } - - /** - * Gets the pileup elements containing one read, in no particular order - * - * @return - */ - public Collection getOneReadPileup() { - return oneReadPile; - } - - /** - * Useful helper class to represent a full read pair at a position - * - * User: ebanks, depristo - * Date: Jan 10, 2011 - */ - public static class TwoReadPileupElement { - final protected PileupElement PE1, PE2; - - /** - * Creates a fragment element that contains both ends of a paired end read - * @param PE1 - * @param PE2 - */ - public TwoReadPileupElement(PileupElement PE1, PileupElement PE2) { - this.PE1 = PE1; - this.PE2 = PE2; - } - - /** Returns the first pileup element -- never null */ - public PileupElement getFirst() { return PE1; } - - /** Returns 
the second read in this fragment element. May be null */ - public PileupElement getSecond() { return PE2; } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java b/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java index 7005cf869..c00ed24f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/MergingPileupElementIterator.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.picard.util.PeekableIterator; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import java.util.Comparator; import java.util.Iterator; @@ -42,7 +41,7 @@ class MergingPileupElementIterator implements Iterator public MergingPileupElementIterator(PerSamplePileupElementTracker tracker) { perSampleIterators = new PriorityQueue>(Math.max(1,tracker.getSamples().size()),new PileupElementIteratorComparator()); - for(Sample sample: tracker.getSamples()) { + for(final String sample: tracker.getSamples()) { PileupElementTracker trackerPerSample = tracker.getElements(sample); if(trackerPerSample.size() != 0) perSampleIterators.add(new PeekableIterator(trackerPerSample.iterator())); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 12899e898..daf6606ef 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -2,9 +2,8 @@ package org.broadinstitute.sting.utils.pileup; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.utils.BaseUtils; -import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** 
* Created by IntelliJ IDEA. @@ -12,7 +11,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; * Date: Apr 14, 2009 * Time: 8:54:05 AM */ -public class PileupElement { +public class PileupElement implements Comparable { public static final byte DELETION_BASE = BaseUtils.D; public static final byte DELETION_QUAL = (byte) 16; public static final byte A_FOLLOWED_BY_INSERTION_BASE = (byte) 87; @@ -20,14 +19,14 @@ public class PileupElement { public static final byte T_FOLLOWED_BY_INSERTION_BASE = (byte) 89; public static final byte G_FOLLOWED_BY_INSERTION_BASE = (byte) 90; - protected final SAMRecord read; + protected final GATKSAMRecord read; protected final int offset; @Requires({ "read != null", "offset >= -1", "offset <= read.getReadLength()"}) - public PileupElement( SAMRecord read, int offset ) { + public PileupElement( GATKSAMRecord read, int offset ) { this.read = read; this.offset = offset; } @@ -37,7 +36,7 @@ public class PileupElement { } @Ensures("result != null") - public SAMRecord getRead() { return read; } + public GATKSAMRecord getRead() { return read; } @Ensures("result == offset") public int getOffset() { return offset; } @@ -75,26 +74,32 @@ public class PileupElement { return isDeletion() ? 
DELETION_QUAL : read.getBaseQualities()[offset]; } + @Override + public int compareTo(final PileupElement pileupElement) { + if ( offset < pileupElement.offset ) + return -1; + else if ( offset > pileupElement.offset ) + return 1; + else if ( read.getAlignmentStart() < pileupElement.read.getAlignmentStart() ) + return -1; + else if ( read.getAlignmentStart() > pileupElement.read.getAlignmentStart() ) + return 1; + else + return 0; + } + // -------------------------------------------------------------------------- // // Reduced read accessors // // -------------------------------------------------------------------------- - private Integer getReducedReadQualityTagValue() { - return getRead().getIntegerAttribute(ReadUtils.REDUCED_READ_QUALITY_TAG); - } - public boolean isReducedRead() { - return getReducedReadQualityTagValue() != null; + return ((GATKSAMRecord)read).isReducedRead(); } - public int getReducedCount() { - return (int)getQual(); - } - - public byte getReducedQual() { - return (byte)(int)getReducedReadQualityTagValue(); + public int getRepresentativeCount() { + return isReducedRead() ? 
((GATKSAMRecord)read).getReducedCount(offset) : 1; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java index 29e431695..09b907e00 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElementTracker.java @@ -24,8 +24,6 @@ package org.broadinstitute.sting.utils.pileup; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; - import java.util.*; /** @@ -60,52 +58,35 @@ class UnifiedPileupElementTracker extends PileupElemen } class PerSamplePileupElementTracker extends PileupElementTracker { - private final Map> pileup; - private final Map sampleNames = new HashMap(); + private final Map> pileup; private int size = 0; public PerSamplePileupElementTracker() { - pileup = new HashMap>(); - } - - public PerSamplePileupElementTracker(Map> pileupsBySample) { - pileup = new HashMap>(); - for(Map.Entry> entry: pileupsBySample.entrySet()) { - Sample sample = entry.getKey(); - AbstractReadBackedPileup pileupBySample = entry.getValue(); - pileup.put(sample,pileupBySample.pileupElementTracker); - sampleNames.put(sample.getId(), sample); - } + pileup = new HashMap>(); } /** * Gets a list of all the samples stored in this pileup. * @return List of samples in this pileup. 
*/ - public Collection getSamples() { + public Collection getSamples() { return pileup.keySet(); } - public PileupElementTracker getElements(final Sample sample) { + public PileupElementTracker getElements(final String sample) { return pileup.get(sample); } - public PileupElementTracker getElements(final String sampleName) { - return pileup.get(sampleNames.get(sampleName)); - } - public PileupElementTracker getElements(final Collection selectSampleNames) { PerSamplePileupElementTracker result = new PerSamplePileupElementTracker(); - for (String sample : selectSampleNames) { - Sample sampleObject = sampleNames.get(sample); - result.addElements(sampleObject, pileup.get(sampleObject)); + for (final String sample : selectSampleNames) { + result.addElements(sample, pileup.get(sample)); } return result; } - public void addElements(final Sample sample, PileupElementTracker elements) { + public void addElements(final String sample, PileupElementTracker elements) { pileup.put(sample,elements); - sampleNames.put(sample.getId(), sample); size += elements.size(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java index 8d43a368a..3d872f9fb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileup.java @@ -25,9 +25,9 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collection; import java.util.List; @@ -121,20 +121,7 @@ public interface ReadBackedExtendedEventPileup extends ReadBackedPileup { * Gets a list of all the samples 
stored in this pileup. * @return List of samples in this pileup. */ - public Collection getSampleNames(); - - /** - * Gets a list of all the samples stored in this pileup. - * @return List of samples in this pileup. - */ - public Collection getSamples(); - - /** - * Gets the particular subset of this pileup with the given sample name. - * @param sample Name of the sample to use. - * @return A subset of this pileup containing only reads with the given sample. - */ - public ReadBackedExtendedEventPileup getPileupForSample(Sample sample); + public Collection getSamples(); public Iterable toExtendedIterable(); @@ -169,7 +156,7 @@ public interface ReadBackedExtendedEventPileup extends ReadBackedPileup { /** * @return the number of elements in this pileup */ - public int size(); + public int getNumberOfElements(); /** * @return the location of this pileup @@ -180,7 +167,7 @@ public interface ReadBackedExtendedEventPileup extends ReadBackedPileup { * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time * @return */ - public List getReads(); + public List getReads(); /** * Returns a list of the offsets in this pileup. 
Note this call costs O(n) and allocates fresh lists each time diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index 31d29430a..43ad06352 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -23,11 +23,10 @@ */ package org.broadinstitute.sting.utils.pileup; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; @@ -56,7 +55,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< } // this is the good new one - public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { + public ReadBackedExtendedEventPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { super(loc,pileupElementsBySample); } @@ -96,7 +95,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< } @Override - protected ExtendedEventPileupElement createNewPileupElement(SAMRecord read, int offset) { + protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset) { throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); } @@ -134,7 +133,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< */ @Override public byte[] getEvents() { - byte[] v = new byte[size()]; + byte[] v = new byte[getNumberOfElements()]; int i = 0; for ( ExtendedEventPileupElement e : this.toExtendedIterable() ) { switch ( e.getType() ) { diff 
--git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index 36b8a8c65..02767df7c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -24,10 +24,10 @@ package org.broadinstitute.sting.utils.pileup; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.HasGenomeLocation; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Collection; import java.util.List; @@ -137,18 +137,11 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getPileupForLane(String laneID); - - /** - * Gets a collection of all the samples stored in this pileup. - * @return Collection of samples in this pileup. - */ - public Collection getSamples(); - /** * Gets a collection of *names* of all the samples stored in this pileup. * @return Collection of names */ - public Collection getSampleNames(); + public Collection getSamples(); /** @@ -156,7 +149,7 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * @param sampleNames Name of the sample to use. * @return A subset of this pileup containing only reads with the given sample. */ - public ReadBackedPileup getPileupForSampleNames(Collection sampleNames); + public ReadBackedPileup getPileupForSamples(Collection sampleNames); /** @@ -164,14 +157,7 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * @param sampleName Name of the sample to use. * @return A subset of this pileup containing only reads with the given sample. 
*/ - public ReadBackedPileup getPileupForSampleName(String sampleName); - - /** - * Gets the particular subset of this pileup with the given sample. - * @param sample Sample to use. - * @return A subset of this pileup containing only reads with the given sample. - */ - public ReadBackedPileup getPileupForSample(Sample sample); + public ReadBackedPileup getPileupForSample(String sampleName); /** * Simple useful routine to count the number of deletion bases in this pileup @@ -183,9 +169,14 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca public int getNumberOfMappingQualityZeroReads(); /** - * @return the number of elements in this pileup + * @return the number of physical elements in this pileup (a reduced read is counted just once) */ - public int size(); + public int getNumberOfElements(); + + /** + * @return the number of abstract elements in this pileup (reduced reads are expanded to count all reads that they represent) + */ + public int depthOfCoverage(); /** * @return true if there are 0 elements in the pileup, false otherwise @@ -211,7 +202,7 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca * Returns a list of the reads in this pileup. Note this call costs O(n) and allocates fresh lists each time * @return */ - public List getReads(); + public List getReads(); /** * Returns a list of the offsets in this pileup. 
Note this call costs O(n) and allocates fresh lists each time @@ -237,4 +228,9 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public byte[] getMappingQuals(); + /** + * Converts this pileup into a FragmentCollection (see FragmentUtils for documentation) + * @return + */ + public FragmentCollection toFragments(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index e5b054961..b7445be8d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -23,9 +23,8 @@ */ package org.broadinstitute.sting.utils.pileup; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.List; import java.util.Map; @@ -36,11 +35,11 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup reads, List offsets ) { + public ReadBackedPileupImpl(GenomeLoc loc, List reads, List offsets ) { super(loc,reads,offsets); } - public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset ) { + public ReadBackedPileupImpl(GenomeLoc loc, List reads, int offset ) { super(loc,reads,offset); } @@ -48,7 +47,7 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup pileupElementsBySample) { + public ReadBackedPileupImpl(GenomeLoc loc, Map pileupElementsBySample) { super(loc,pileupElementsBySample); } @@ -71,7 +70,7 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup outputStreams = new EnumMap(StreamLocation.class); + + /** + * The byte stream to capture content or null if no output string content was requested. + */ + private final ByteArrayOutputStream bufferStream; + + /** + * True if the buffer is truncated. 
+ */ + private boolean bufferTruncated = false; + + /** + * @param settings Settings that define what to capture. + * @param processStream Stream to capture output. + * @param standardStream Stream to write debug output. + */ + public CapturedStreamOutput(OutputStreamSettings settings, InputStream processStream, PrintStream standardStream) { + this.processStream = processStream; + int bufferSize = settings.getBufferSize(); + this.bufferStream = (bufferSize < 0) ? new ByteArrayOutputStream() : new ByteArrayOutputStream(bufferSize); + + for (StreamLocation location : settings.getStreamLocations()) { + OutputStream outputStream; + switch (location) { + case Buffer: + if (bufferSize < 0) { + outputStream = this.bufferStream; + } else { + outputStream = new HardThresholdingOutputStream(bufferSize) { + @Override + protected OutputStream getStream() throws IOException { + return bufferTruncated ? NullOutputStream.NULL_OUTPUT_STREAM : bufferStream; + } + + @Override + protected void thresholdReached() throws IOException { + bufferTruncated = true; + } + }; + } + break; + case File: + try { + outputStream = new FileOutputStream(settings.getOutputFile(), settings.isAppendFile()); + } catch (IOException e) { + throw new UserException.BadInput(e.getMessage()); + } + break; + case Standard: + outputStream = standardStream; + break; + default: + throw new ReviewedStingException("Unexpected stream location: " + location); + } + this.outputStreams.put(location, outputStream); + } + } + + @Override + public byte[] getBufferBytes() { + return bufferStream.toByteArray(); + } + + @Override + public boolean isBufferTruncated() { + return bufferTruncated; + } + + /** + * Drain the input stream to keep the process from backing up until it's empty. + * File streams will be closed automatically when this method returns. + * + * @throws java.io.IOException When unable to read or write. 
+ */ + public void readAndClose() throws IOException { + try { + byte[] buf = new byte[4096]; + int readCount; + while ((readCount = processStream.read(buf)) >= 0) + for (OutputStream outputStream : this.outputStreams.values()) { + outputStream.write(buf, 0, readCount); + } + } finally { + for (StreamLocation location : this.outputStreams.keySet()) { + OutputStream outputStream = this.outputStreams.get(location); + outputStream.flush(); + if (location != StreamLocation.Standard) + IOUtils.closeQuietly(outputStream); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java b/public/java/src/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java new file mode 100755 index 000000000..dfa380a68 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/runtime/InputStreamSettings.java @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.runtime; + +import java.io.File; +import java.util.Collections; +import java.util.EnumSet; +import java.util.Set; + +/** + * Settings that define text to write to the process stdin. + */ +public class InputStreamSettings { + private final EnumSet streamLocations = EnumSet.noneOf(StreamLocation.class); + private byte[] inputBuffer; + private File inputFile; + + public InputStreamSettings() { + } + + /** + * @param inputBuffer String to write to stdin. + */ + public InputStreamSettings(String inputBuffer) { + setInputBuffer(inputBuffer); + } + + /** + * @param inputFile File to write to stdin. + */ + public InputStreamSettings(File inputFile) { + setInputFile(inputFile); + } + + /** + * @param inputBuffer String to write to stdin. + * @param inputFile File to write to stdin. 
+ */ + public InputStreamSettings(byte[] inputBuffer, File inputFile) { + setInputBuffer(inputBuffer); + setInputFile(inputFile); + } + + public Set getStreamLocations() { + return Collections.unmodifiableSet(streamLocations); + } + + public byte[] getInputBuffer() { + return inputBuffer; + } + + public void setInputBuffer(String inputBuffer) { + if (inputBuffer == null) + throw new IllegalArgumentException("inputBuffer cannot be null"); + this.streamLocations.add(StreamLocation.Buffer); + this.inputBuffer = inputBuffer.getBytes(); + } + + public void setInputBuffer(byte[] inputBuffer) { + if (inputBuffer == null) + throw new IllegalArgumentException("inputBuffer cannot be null"); + this.streamLocations.add(StreamLocation.Buffer); + this.inputBuffer = inputBuffer; + } + + public void clearInputBuffer() { + this.streamLocations.remove(StreamLocation.Buffer); + this.inputBuffer = null; + } + + public File getInputFile() { + return inputFile; + } + + public void setInputFile(File inputFile) { + if (inputFile == null) + throw new IllegalArgumentException("inputFile cannot be null"); + this.streamLocations.add(StreamLocation.File); + this.inputFile = inputFile; + } + + public void clearInputFile() { + this.streamLocations.remove(StreamLocation.File); + this.inputFile = null; + } + + public void setInputStandard(boolean inputStandard) { + if (inputStandard) + this.streamLocations.add(StreamLocation.Standard); + else + this.streamLocations.remove(StreamLocation.Standard); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java b/public/java/src/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java new file mode 100755 index 000000000..468ece178 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/runtime/OutputStreamSettings.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and 
associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.runtime; + +import java.io.File; +import java.util.Collections; +import java.util.EnumSet; +import java.util.Set; + +/** + * Settings that define text to capture from a process stream. + */ +public class OutputStreamSettings { + private final EnumSet streamLocations = EnumSet.noneOf(StreamLocation.class); + private int bufferSize; + private File outputFile; + private boolean appendFile; + + public OutputStreamSettings() { + } + + /** + * @param bufferSize The number of bytes to capture, or -1 for unlimited. + */ + public OutputStreamSettings(int bufferSize) { + setBufferSize(bufferSize); + } + + /** + * @param outputFile The file to write output to. + */ + public OutputStreamSettings(File outputFile) { + setOutputFile(outputFile); + } + + /** + * @param outputFile The file to write output to. + * @param append true if the output file should be appended to. 
+ */ + public OutputStreamSettings(File outputFile, boolean append) { + setOutputFile(outputFile, append); + } + + public OutputStreamSettings(int bufferSize, File outputFile, boolean appendFile) { + setBufferSize(bufferSize); + setOutputFile(outputFile, appendFile); + } + + public Set getStreamLocations() { + return Collections.unmodifiableSet(streamLocations); + } + + public int getBufferSize() { + return bufferSize; + } + + public void setBufferSize(int bufferSize) { + this.streamLocations.add(StreamLocation.Buffer); + this.bufferSize = bufferSize; + } + + public void clearBufferSize() { + this.streamLocations.remove(StreamLocation.Buffer); + this.bufferSize = 0; + } + + public File getOutputFile() { + return outputFile; + } + + public boolean isAppendFile() { + return appendFile; + } + + /** + * Overwrites the outputFile with the process output. + * + * @param outputFile File to overwrite. + */ + public void setOutputFile(File outputFile) { + setOutputFile(outputFile, false); + } + + public void setOutputFile(File outputFile, boolean append) { + if (outputFile == null) + throw new IllegalArgumentException("outputFile cannot be null"); + streamLocations.add(StreamLocation.File); + this.outputFile = outputFile; + this.appendFile = append; + } + + public void clearOutputFile() { + streamLocations.remove(StreamLocation.File); + this.outputFile = null; + this.appendFile = false; + } + + public void printStandard(boolean print) { + if (print) + this.streamLocations.add(StreamLocation.Standard); + else + this.streamLocations.remove(StreamLocation.Standard); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessController.java b/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessController.java new file mode 100755 index 000000000..6a3f9c753 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessController.java @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby 
granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.runtime; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.*; + +/** + * Facade to Runtime.exec() and java.lang.Process. Handles + * running a process to completion and returns stdout and stderr + * as strings. Creates separate threads for reading stdout and stderr, + * then reuses those threads for each process most efficient use is + * to create one of these and use it repeatedly. Instances are not + * thread-safe, however. 
+ * + * TODO: java.io sometimes zombies the backround threads locking up on read(). + * Supposedly NIO has better ways of interrupting a blocked stream but will + * require a little bit of refactoring. + * + * @author Michael Koehrsen + * @author Khalid Shakir + */ +public class ProcessController { + private static Logger logger = Logger.getLogger(ProcessController.class); + + private static enum ProcessStream {Stdout, Stderr} + + // Tracks running processes. + private static final Set running = Collections.synchronizedSet(new HashSet()); + + // Tracks this running process. + private Process process; + + // Threads that capture stdout and stderr + private final OutputCapture stdoutCapture; + private final OutputCapture stderrCapture; + + // When a caller destroyes a controller a new thread local version will be created + private boolean destroyed = false; + + // Communication channels with output capture threads + + // Holds the stdout and stderr sent to the background capture threads + private final Map toCapture = + new EnumMap(ProcessStream.class); + + // Holds the results of the capture from the background capture threads. + // May be the content via toCapture or an StreamOutput.EMPTY if the capture was interrupted. + private final Map fromCapture = + new EnumMap(ProcessStream.class); + + // Useful for debugging if background threads have shut down correctly + private static int nextControllerId = 0; + private final int controllerId; + + public ProcessController() { + // Start the background threads for this controller. + synchronized (running) { + controllerId = nextControllerId++; + } + stdoutCapture = new OutputCapture(ProcessStream.Stdout, controllerId); + stderrCapture = new OutputCapture(ProcessStream.Stderr, controllerId); + stdoutCapture.start(); + stderrCapture.start(); + } + + /** + * Returns a thread local ProcessController. + * Should NOT be closed when finished so it can be reused by the thread. + * + * @return a thread local ProcessController. 
+ */ + public static ProcessController getThreadLocal() { + // If the local controller was destroyed get a fresh instance. + if (threadProcessController.get().destroyed) + threadProcessController.remove(); + return threadProcessController.get(); + } + + /** + * Thread local process controller container. + */ + private static final ThreadLocal threadProcessController = + new ThreadLocal() { + @Override + protected ProcessController initialValue() { + return new ProcessController(); + } + }; + + /** + * Similar to Runtime.exec() but drains the output and error streams. + * + * @param command Command to run. + * @return The result code. + */ + public static int exec(String[] command) { + ProcessController controller = ProcessController.getThreadLocal(); + return controller.exec(new ProcessSettings(command)).getExitValue(); + } + + /** + * Executes a command line program with the settings and waits for it to return, + * processing the output on a background thread. + * + * @param settings Settings to be run. + * @return The output of the command. + */ + public ProcessOutput exec(ProcessSettings settings) { + if (destroyed) + throw new IllegalStateException("This controller was destroyed"); + + ProcessBuilder builder = new ProcessBuilder(settings.getCommand()); + builder.directory(settings.getDirectory()); + + Map settingsEnvironment = settings.getEnvironment(); + if (settingsEnvironment != null) { + Map builderEnvironment = builder.environment(); + builderEnvironment.clear(); + builderEnvironment.putAll(settingsEnvironment); + } + + builder.redirectErrorStream(settings.isRedirectErrorStream()); + + StreamOutput stdout = null; + StreamOutput stderr = null; + + // Start the process running. 
+ + try { + synchronized (toCapture) { + process = builder.start(); + } + running.add(this); + } catch (IOException e) { + throw new ReviewedStingException("Unable to start command: " + StringUtils.join(builder.command(), " ")); + } + + int exitCode; + + try { + // Notify the background threads to start capturing. + synchronized (toCapture) { + toCapture.put(ProcessStream.Stdout, + new CapturedStreamOutput(settings.getStdoutSettings(), process.getInputStream(), System.out)); + toCapture.put(ProcessStream.Stderr, + new CapturedStreamOutput(settings.getStderrSettings(), process.getErrorStream(), System.err)); + toCapture.notifyAll(); + } + + // Write stdin content + InputStreamSettings stdinSettings = settings.getStdinSettings(); + Set streamLocations = stdinSettings.getStreamLocations(); + if (!streamLocations.isEmpty()) { + try { + OutputStream stdinStream = process.getOutputStream(); + for (StreamLocation location : streamLocations) { + InputStream inputStream; + switch (location) { + case Buffer: + inputStream = new ByteArrayInputStream(stdinSettings.getInputBuffer()); + break; + case File: + try { + inputStream = FileUtils.openInputStream(stdinSettings.getInputFile()); + } catch (IOException e) { + throw new UserException.BadInput(e.getMessage()); + } + break; + case Standard: + inputStream = System.in; + break; + default: + throw new ReviewedStingException("Unexpected stream location: " + location); + } + try { + IOUtils.copy(inputStream, stdinStream); + } finally { + if (location != StreamLocation.Standard) + IOUtils.closeQuietly(inputStream); + } + } + stdinStream.flush(); + } catch (IOException e) { + throw new ReviewedStingException("Error writing to stdin on command: " + StringUtils.join(builder.command(), " "), e); + } + } + + // Wait for the process to complete. 
+ try { + process.getOutputStream().close(); + process.waitFor(); + } catch (IOException e) { + throw new ReviewedStingException("Unable to close stdin on command: " + StringUtils.join(builder.command(), " "), e); + } catch (InterruptedException e) { + throw new ReviewedStingException("Process interrupted", e); + } finally { + while (!destroyed && stdout == null || stderr == null) { + synchronized (fromCapture) { + if (fromCapture.containsKey(ProcessStream.Stdout)) + stdout = fromCapture.remove(ProcessStream.Stdout); + if (fromCapture.containsKey(ProcessStream.Stderr)) + stderr = fromCapture.remove(ProcessStream.Stderr); + try { + if (stdout == null || stderr == null) + fromCapture.wait(); + } catch (InterruptedException e) { + // Log the error, ignore the interrupt and wait patiently + // for the OutputCaptures to (via finally) return their + // stdout and stderr. + logger.error(e); + } + } + } + + if (destroyed) { + if (stdout == null) + stdout = StreamOutput.EMPTY; + if (stderr == null) + stderr = StreamOutput.EMPTY; + } + } + } finally { + synchronized (toCapture) { + exitCode = process.exitValue(); + process = null; + } + running.remove(this); + } + + return new ProcessOutput(exitCode, stdout, stderr); + } + + /** + * @return The set of still running processes. + */ + public static Set getRunning() { + synchronized (running) { + return new HashSet(running); + } + } + + /** + * Stops the process from running and tries to ensure process is cleaned up properly. + * NOTE: sub-processes started by process may be zombied with their parents set to pid 1. + * NOTE: capture threads may block on read. + * TODO: Try to use NIO to interrupt streams. 
+ */ + public void tryDestroy() { + destroyed = true; + synchronized (toCapture) { + if (process != null) { + process.destroy(); + IOUtils.closeQuietly(process.getInputStream()); + IOUtils.closeQuietly(process.getErrorStream()); + } + stdoutCapture.interrupt(); + stderrCapture.interrupt(); + toCapture.notifyAll(); + } + } + + @Override + protected void finalize() throws Throwable { + try { + tryDestroy(); + } catch (Exception e) { + logger.error(e); + } + super.finalize(); + } + + private class OutputCapture extends Thread { + private final int controllerId; + private final ProcessStream key; + + /** + * Reads in the output of a stream on a background thread to keep the output pipe from backing up and freezing the called process. + * + * @param key The stdout or stderr key for this output capture. + * @param controllerId Unique id of the controller. + */ + public OutputCapture(ProcessStream key, int controllerId) { + super(String.format("OutputCapture-%d-%s-%s-%d", controllerId, key.name().toLowerCase(), + Thread.currentThread().getName(), Thread.currentThread().getId())); + this.controllerId = controllerId; + this.key = key; + setDaemon(true); + } + + /** + * Runs the capture. + */ + @Override + public void run() { + while (!destroyed) { + StreamOutput processStream = StreamOutput.EMPTY; + try { + // Wait for a new input stream to be passed from this process controller. 
+ CapturedStreamOutput capturedProcessStream = null; + while (!destroyed && capturedProcessStream == null) { + synchronized (toCapture) { + if (toCapture.containsKey(key)) { + capturedProcessStream = toCapture.remove(key); + } else { + toCapture.wait(); + } + } + } + + if (!destroyed) { + // Read in the input stream + processStream = capturedProcessStream; + capturedProcessStream.readAndClose(); + } + } catch (InterruptedException e) { + logger.info("OutputCapture interrupted, exiting"); + break; + } catch (IOException e) { + logger.error("Error reading process output", e); + } finally { + // Send the string back to the process controller. + synchronized (fromCapture) { + fromCapture.put(key, processStream); + fromCapture.notify(); + } + } + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessOutput.java b/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessOutput.java new file mode 100755 index 000000000..211008950 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessOutput.java @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.runtime; + +public class ProcessOutput { + private final int exitValue; + private final StreamOutput stdout; + private final StreamOutput stderr; + + /** + * The output of a process. + * + * @param exitValue The exit value. + * @param stdout The capture of stdout as defined by the stdout OutputStreamSettings. + * @param stderr The capture of stderr as defined by the stderr OutputStreamSettings. + */ + public ProcessOutput(int exitValue, StreamOutput stdout, StreamOutput stderr) { + this.exitValue = exitValue; + this.stdout = stdout; + this.stderr = stderr; + } + + public int getExitValue() { + return exitValue; + } + + public StreamOutput getStdout() { + return stdout; + } + + public StreamOutput getStderr() { + return stderr; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessSettings.java b/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessSettings.java new file mode 100755 index 000000000..b9f67f3a4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/runtime/ProcessSettings.java @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in 
all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.runtime; + +import com.sun.corba.se.spi.orbutil.fsm.Input; + +import java.io.File; +import java.util.Map; + +public class ProcessSettings { + private String[] command; + private Map environment; + private File directory; + private boolean redirectErrorStream; + private InputStreamSettings stdinSettings; + private OutputStreamSettings stdoutSettings; + private OutputStreamSettings stderrSettings; + + /** + * @param command Command line to run. + */ + public ProcessSettings(String[] command) { + this(command, false, null, null, null, null, null); + } + + /** + * @param command Command line to run. + * @param redirectErrorStream true if stderr should be sent to stdout. + * @param environment Environment settings to override System.getEnv, or null to use System.getEnv. + * @param directory The directory to run the command in, or null to run in the current directory. + * @param stdinSettings Settings for writing to the process stdin. + * @param stdoutSettings Settings for capturing the process stdout. + * @param stderrSettings Setting for capturing the process stderr. 
+ */ + public ProcessSettings(String[] command, boolean redirectErrorStream, File directory, Map environment, + InputStreamSettings stdinSettings, OutputStreamSettings stdoutSettings, OutputStreamSettings stderrSettings) { + this.command = checkCommand(command); + this.redirectErrorStream = redirectErrorStream; + this.directory = directory; + this.environment = environment; + this.stdinSettings = checkSettings(stdinSettings); + this.stdoutSettings = checkSettings(stdoutSettings); + this.stderrSettings = checkSettings(stderrSettings); + } + + public String[] getCommand() { + return command; + } + + public void setCommand(String[] command) { + this.command = checkCommand(command); + } + + public boolean isRedirectErrorStream() { + return redirectErrorStream; + } + + public void setRedirectErrorStream(boolean redirectErrorStream) { + this.redirectErrorStream = redirectErrorStream; + } + + public File getDirectory() { + return directory; + } + + public void setDirectory(File directory) { + this.directory = directory; + } + + public Map getEnvironment() { + return environment; + } + + public void setEnvironment(Map environment) { + this.environment = environment; + } + + public InputStreamSettings getStdinSettings() { + return stdinSettings; + } + + public void setStdinSettings(InputStreamSettings stdinSettings) { + this.stdinSettings = checkSettings(stdinSettings); + } + + public OutputStreamSettings getStdoutSettings() { + return stdoutSettings; + } + + public void setStdoutSettings(OutputStreamSettings stdoutSettings) { + this.stdoutSettings = checkSettings(stdoutSettings); + } + + public OutputStreamSettings getStderrSettings() { + return stderrSettings; + } + + public void setStderrSettings(OutputStreamSettings stderrSettings) { + this.stderrSettings = checkSettings(stderrSettings); + } + + protected String[] checkCommand(String[] command) { + if (command == null) + throw new IllegalArgumentException("Command is not allowed to be null"); + for (String s: command) + 
if (s == null) + throw new IllegalArgumentException("Command is not allowed to contain nulls"); + return command; + } + + protected InputStreamSettings checkSettings(InputStreamSettings settings) { + return settings == null ? new InputStreamSettings() : settings; + } + + protected OutputStreamSettings checkSettings(OutputStreamSettings settings) { + return settings == null ? new OutputStreamSettings() : settings; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java b/public/java/src/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java new file mode 100644 index 000000000..b5b2cbee5 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/runtime/RuntimeUtils.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.runtime; + +import org.apache.commons.lang.StringUtils; + +import java.io.File; + +public class RuntimeUtils { + public static final String[] PATHS; + + static { + String path = System.getenv("PATH"); + if (path == null) + path = System.getenv("path"); + if (path == null) { + PATHS = new String[0]; + } else { + PATHS = StringUtils.split(path, File.pathSeparatorChar); + } + } + + /** + * Returns the path to an executable or null if it doesn't exist. + * @param executable Relative path + * @return The absolute file path. + */ + public static File which(String executable) { + for (String path: PATHS) { + File file = new File(path, executable); + if (file.exists()) + return file.getAbsoluteFile(); + } + return null; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/StreamLocation.java b/public/java/src/org/broadinstitute/sting/utils/runtime/StreamLocation.java new file mode 100755 index 000000000..df72180f1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/runtime/StreamLocation.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.runtime; + +/** + * Where to read/write a stream + */ +public enum StreamLocation { + Buffer, File, Standard +} diff --git a/public/java/src/org/broadinstitute/sting/utils/runtime/StreamOutput.java b/public/java/src/org/broadinstitute/sting/utils/runtime/StreamOutput.java new file mode 100755 index 000000000..5dc94815f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/runtime/StreamOutput.java @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.runtime; + +/** + * The content of stdout or stderr. + */ +public abstract class StreamOutput { + /** + * Empty stream output when no output is captured due to an error. + */ + public static final StreamOutput EMPTY = new StreamOutput() { + @Override + public byte[] getBufferBytes() { + return new byte[0]; + } + + @Override + public boolean isBufferTruncated() { + return false; + } + }; + + /** + * Returns the content as a string. + * + * @return The content as a string. + */ + public String getBufferString() { + return new String(getBufferBytes()); + } + + /** + * Returns the content as a string. + * + * @return The content as a string. + */ + public abstract byte[] getBufferBytes(); + + /** + * Returns true if the buffer was truncated. + * + * @return true if the buffer was truncated. + */ + public abstract boolean isBufferTruncated(); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java new file mode 100644 index 000000000..02512c8dc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentStartWithNoTiesComparator.java @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.utils.sam; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMRecord; + +import java.util.Comparator; + +public class AlignmentStartWithNoTiesComparator implements Comparator { + @Requires("c1 >= 0 && c2 >= 0") + @Ensures("result == 0 || result == 1 || result == -1") + private int compareContigs(int c1, int c2) { + if (c1 == c2) + return 0; + else if (c1 > c2) + return 1; + return -1; + } + + @Requires("r1 != null && r2 != null") + @Ensures("result == 0 || result == 1 || result == -1") + public int compare(SAMRecord r1, SAMRecord r2) { + int result; + + if (r1 == r2) + result = 0; + + else if (r1.getReadUnmappedFlag()) + 
result = 1; + else if (r2.getReadUnmappedFlag()) + result = -1; + else { + final int cmpContig = compareContigs(r1.getReferenceIndex(), r2.getReferenceIndex()); + + if (cmpContig != 0) + result = cmpContig; + + else { + if (r1.getAlignmentStart() < r2.getAlignmentStart()) result = -1; + else result = 1; + } + } + + return result; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java index 5f7db458a..475f7de21 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java @@ -104,9 +104,9 @@ public class ArtificialReadsTraversal extends TraversalEngine createPair(SAMFileHeader header, String name, int readLen, int leftStart, int rightStart, boolean leftIsFirst, boolean leftIsNegative) { + GATKSAMRecord left = ArtificialSAMUtils.createArtificialRead(header, name, 0, leftStart, readLen); + GATKSAMRecord right = ArtificialSAMUtils.createArtificialRead(header, name, 0, rightStart, readLen); + + left.setReadPairedFlag(true); + right.setReadPairedFlag(true); + + left.setProperPairFlag(true); + right.setProperPairFlag(true); + + left.setFirstOfPairFlag(leftIsFirst); + right.setFirstOfPairFlag(! 
leftIsFirst); + + left.setReadNegativeStrandFlag(leftIsNegative); + left.setMateNegativeStrandFlag(!leftIsNegative); + right.setReadNegativeStrandFlag(!leftIsNegative); + right.setMateNegativeStrandFlag(leftIsNegative); + + left.setMateAlignmentStart(right.getAlignmentStart()); + right.setMateAlignmentStart(left.getAlignmentStart()); + + left.setMateReferenceIndex(0); + right.setMateReferenceIndex(0); + + int isize = rightStart + readLen - leftStart; + left.setInferredInsertSize(isize); + right.setInferredInsertSize(-isize); + + return Arrays.asList(left, right); + } + /** * create an iterator containing the specified read piles * @@ -255,4 +292,52 @@ public class ArtificialSAMUtils { return new ArtificialSAMQueryIterator(startingChr, endingChr, readCount, unmappedReadCount, header); } + + private final static int ranIntInclusive(Random ran, int start, int stop) { + final int range = stop - start; + return ran.nextInt(range) + start; + } + + /** + * Creates a read backed pileup containing up to pileupSize reads at refID 0 from header at loc with + * reads created that have readLen bases. Pairs are sampled from a gaussian distribution with mean insert + * size of insertSize and variation of insertSize / 10. The first read will be in the pileup, and the second + * may be, depending on where this sampled insertSize puts it. 
+ * @param header + * @param loc + * @param readLen + * @param insertSize + * @param pileupSize + * @return + */ + public static ReadBackedPileup createReadBackedPileup(final SAMFileHeader header, final GenomeLoc loc, final int readLen, final int insertSize, final int pileupSize) { + final Random ran = new Random(); + final boolean leftIsFirst = true; + final boolean leftIsNegative = false; + final int insertSizeVariation = insertSize / 10; + final int pos = loc.getStart(); + + final List pileupElements = new ArrayList(); + for ( int i = 0; i < pileupSize / 2; i++ ) { + final String readName = "read" + i; + final int leftStart = ranIntInclusive(ran, 1, pos); + final int fragmentSize = (int)(ran.nextGaussian() * insertSizeVariation + insertSize); + final int rightStart = leftStart + fragmentSize - readLen; + + if ( rightStart <= 0 ) continue; + + List pair = createPair(header, readName, readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + final GATKSAMRecord left = pair.get(0); + final GATKSAMRecord right = pair.get(1); + + pileupElements.add(new PileupElement(left, pos - leftStart)); + + if ( pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd() ) { + pileupElements.add(new PileupElement(right, pos - rightStart)); + } + } + + Collections.sort(pileupElements); + return new ReadBackedPileupImpl(loc, pileupElements); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java index c7ffcab0c..ff7d12f09 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.utils.NGSPlatform; /** * @author ebanks @@ -15,16 +16,28 @@ public class GATKSAMReadGroupRecord extends 
SAMReadGroupRecord { // the SAMReadGroupRecord data we're caching private String mSample = null; private String mPlatform = null; + private NGSPlatform mNGSPlatform = null; // because some values can be null, we don't want to duplicate effort private boolean retrievedSample = false; private boolean retrievedPlatform = false; + private boolean retrievedNGSPlatform = false; + public GATKSAMReadGroupRecord(final String id) { + super(id); + } public GATKSAMReadGroupRecord(SAMReadGroupRecord record) { super(record.getReadGroupId(), record); } + public GATKSAMReadGroupRecord(SAMReadGroupRecord record, NGSPlatform pl) { + super(record.getReadGroupId(), record); + setPlatform(pl.getDefaultPlatform()); + mNGSPlatform = pl; + retrievedPlatform = retrievedNGSPlatform = true; + } + /////////////////////////////////////////////////////////////////////////////// // *** The following methods are overloaded to cache the appropriate data ***// /////////////////////////////////////////////////////////////////////////////// @@ -55,5 +68,15 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { super.setPlatform(s); mPlatform = s; retrievedPlatform = true; + retrievedNGSPlatform = false; // recalculate the NGSPlatform + } + + public NGSPlatform getNGSPlatform() { + if ( ! 
retrievedNGSPlatform ) { + mNGSPlatform = NGSPlatform.fromReadGroupPL(getPlatform()); + retrievedNGSPlatform = true; + } + + return mNGSPlatform; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c55a462f1..ede75817a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -1,49 +1,57 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.NGSPlatform; -import java.lang.reflect.Method; import java.util.HashMap; -import java.util.List; import java.util.Map; /** - * @author ebanks + * @author ebanks, depristo * GATKSAMRecord * - * this class extends the samtools SAMRecord class and caches important + * this class extends the samtools BAMRecord class (and SAMRecord) and caches important * (and oft-accessed) data that's not already cached by the SAMRecord class * * IMPORTANT NOTE: Because ReadGroups are not set through the SAMRecord, * if they are ever modified externally then one must also invoke the * setReadGroup() method here to ensure that the cache is kept up-to-date. * - * 13 Oct 2010 - mhanna - this class is fundamentally flawed: it uses a decorator - * pattern to wrap a heavyweight object, which can lead - * to heinous side effects if the wrapping is not carefully - * done. Hopefully SAMRecord will become an interface and - * this will eventually be fixed. */ -public class GATKSAMRecord extends SAMRecord { - - // the underlying SAMRecord which we are wrapping - private final SAMRecord mRecord; - +public class GATKSAMRecord extends BAMRecord { + public static final String REDUCED_READ_QUALITY_TAG = "RR"; // the SAMRecord data we're caching private String mReadString = null; - private SAMReadGroupRecord mReadGroup = null; - private boolean mNegativeStrandFlag; - private boolean mUnmappedFlag; - private Boolean mSecondOfPairFlag = null; + private GATKSAMReadGroupRecord mReadGroup = null; + private byte[] reducedReadCounts = null; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; - - /** A private cache for the reduced read quality. 
Null indicates the value hasn't be fetched yet or isn't available */ - private boolean lookedUpReducedReadQuality = false; - private Integer reducedReadQuality; + private boolean retrievedReduceReadCounts = false; // These temporary attributes were added here to make life easier for // certain algorithms by providing a way to label or attach arbitrary data to @@ -51,105 +59,112 @@ public class GATKSAMRecord extends SAMRecord { // These attributes exist in memory only, and are never written to disk. private Map temporaryAttributes; - public GATKSAMRecord(SAMRecord record, boolean useOriginalBaseQualities, byte defaultBaseQualities) { - super(null); // it doesn't matter - this isn't used - if ( record == null ) - throw new IllegalArgumentException("The SAMRecord argument cannot be null"); - mRecord = record; + /** + * HACK TO CREATE GATKSAMRECORD WITH ONLY A HEADER FOR TESTING PURPOSES ONLY + * @param header + */ + public GATKSAMRecord(final SAMFileHeader header) { + this(new SAMRecord(header)); + } - mNegativeStrandFlag = mRecord.getReadNegativeStrandFlag(); - mUnmappedFlag = mRecord.getReadUnmappedFlag(); + /** + * HACK TO CREATE GATKSAMRECORD BASED ONLY A SAMRECORD FOR TESTING PURPOSES ONLY + * @param read + */ + public GATKSAMRecord(final SAMRecord read) { + super(read.getHeader(), read.getMateReferenceIndex(), + read.getAlignmentStart(), + read.getReadName() != null ? 
(short)read.getReadNameLength() : 0, + (short)read.getMappingQuality(), + 0, + read.getCigarLength(), + read.getFlags(), + read.getReadLength(), + read.getMateReferenceIndex(), + read.getMateAlignmentStart(), + read.getInferredInsertSize(), + new byte[]{}); + super.clearAttributes(); + } - // because attribute methods are declared to be final (and we can't overload them), - // we need to actually set all of the attributes here - List attributes = record.getAttributes(); - for ( SAMTagAndValue attribute : attributes ) - setAttribute(attribute.tag, attribute.value); - - // if we are using default quals, check if we need them, and add if necessary. - // 1. we need if reads are lacking or have incomplete quality scores - // 2. we add if defaultBaseQualities has a positive value - if (defaultBaseQualities >= 0) { - byte reads [] = record.getReadBases(); - byte quals [] = record.getBaseQualities(); - if (quals == null || quals.length < reads.length) { - byte new_quals [] = new byte [reads.length]; - for (int i=0; i getAttributes() { return mRecord.getAttributes(); } - - public SAMFileHeader getHeader() { return mRecord.getHeader(); } - - public void setHeader(SAMFileHeader samFileHeader) { mRecord.setHeader(samFileHeader); } - - public byte[] getVariableBinaryRepresentation() { return mRecord.getVariableBinaryRepresentation(); } - - public int getAttributesBinarySize() { return mRecord.getAttributesBinarySize(); } - - public String format() { return mRecord.format(); } - - public List getAlignmentBlocks() { return mRecord.getAlignmentBlocks(); } - - public List validateCigar(long l) { return mRecord.validateCigar(l); } - @Override public boolean equals(Object o) { if (this == o) return true; - // note -- this forbids a GATKSAMRecord being equal to its underlying SAMRecord if (!(o instanceof GATKSAMRecord)) return false; // note that we do not consider the GATKSAMRecord internal state at all - return mRecord.equals(((GATKSAMRecord)o).mRecord); - } - - public int 
hashCode() { return mRecord.hashCode(); } - - public List isValid() { return mRecord.isValid(); } - - public Object clone() throws CloneNotSupportedException { return mRecord.clone(); } - - public String toString() { return mRecord.toString(); } - - public SAMFileSource getFileSource() { return mRecord.getFileSource(); } - - /** - * Sets a marker providing the source reader for this file and the position in the file from which the read originated. - * @param fileSource source of the given file. - */ - @Override - protected void setFileSource(final SAMFileSource fileSource) { - try { - Method method = SAMRecord.class.getDeclaredMethod("setFileSource",SAMFileSource.class); - method.setAccessible(true); - method.invoke(mRecord,fileSource); - } - catch(Exception ex) { - throw new ReviewedStingException("Unable to invoke setFileSource method",ex); - } + return super.equals(o); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java new file mode 100644 index 000000000..d96c874ea --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSamRecordFactory.java @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMRecordFactory; +import net.sf.samtools.BAMRecord; +import org.broadinstitute.sting.utils.exceptions.UserException; + +/** + * Factory interface implementation used to create GATKSamRecords + * from SAMFileReaders with SAM-JDK + * + * @author Mark DePristo + */ +public class GATKSamRecordFactory implements SAMRecordFactory { + + /** Create a new SAMRecord to be filled in */ + public SAMRecord createSAMRecord(SAMFileHeader header) { + throw new UserException.BadInput("The GATK now longer supports input SAM files"); + } + + /** Create a new BAM Record. 
*/ + public BAMRecord createBAMRecord(final SAMFileHeader header, + final int referenceSequenceIndex, + final int alignmentStart, + final short readNameLength, + final short mappingQuality, + final int indexingBin, + final int cigarLen, + final int flags, + final int readLen, + final int mateReferenceSequenceIndex, + final int mateAlignmentStart, + final int insertSize, + final byte[] variableLengthBlock) { + return new GATKSAMRecord(header, + referenceSequenceIndex, + alignmentStart, + readNameLength, + mappingQuality, + indexingBin, + cigarLen, + flags, + readLen, + mateReferenceSequenceIndex, + mateAlignmentStart, + insertSize, + variableLengthBlock); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java index 07bfc52c7..fa07523f3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/NWaySAMFileWriter.java @@ -25,18 +25,16 @@ package org.broadinstitute.sting.utils.sam; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileWriter; -import net.sf.samtools.SAMFileWriterFactory; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.io.File; -import java.util.HashMap; -import java.util.Map; +import java.util.*; /** * Created by IntelliJ IDEA. 
@@ -50,21 +48,35 @@ public class NWaySAMFileWriter implements SAMFileWriter { private Map writerMap = null; private boolean presorted ; GenomeAnalysisEngine toolkit; + boolean KEEP_ALL_PG_RECORDS = false; - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { this.presorted = presorted; this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; writerMap = new HashMap(); - setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5); + setupByReader(toolkit,in2out,order, presorted, indexOnTheFly, generateMD5, pRecord); } - public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly , boolean generateMD5) { + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5, SAMProgramRecord pRecord, boolean keep_records) { this.presorted = presorted; this.toolkit = toolkit; + this.KEEP_ALL_PG_RECORDS = keep_records; writerMap = new HashMap(); - setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5); + setupByReader(toolkit,ext,order, presorted, indexOnTheFly, generateMD5, pRecord); } + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + this(toolkit, in2out, order, presorted, indexOnTheFly, generateMD5, null,false); + } + + public NWaySAMFileWriter(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly , boolean generateMD5) { + this(toolkit, ext, order, 
presorted, indexOnTheFly, generateMD5, null,false); + } /** * Instantiates multiple underlying SAM writes, one per input SAM reader registered with GATK engine (those will be retrieved @@ -73,7 +85,8 @@ public class NWaySAMFileWriter implements SAMFileWriter { * @param toolkit * @param in2out */ - public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + public void setupByReader(GenomeAnalysisEngine toolkit, Map in2out, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { if ( in2out==null ) throw new StingException("input-output bam filename map for n-way-out writing is NULL"); for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { @@ -85,9 +98,10 @@ public class NWaySAMFileWriter implements SAMFileWriter { outName = in2out.get(fName); if ( writerMap.containsKey( rid ) ) - throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); + throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered; "+ + "map file likely contains multiple entries for this input file"); - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); } } @@ -100,7 +114,8 @@ public class NWaySAMFileWriter implements SAMFileWriter { * @param toolkit * @param ext */ - public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + public void setupByReader(GenomeAnalysisEngine toolkit, String ext, SAMFileHeader.SortOrder order, + boolean presorted, boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord pRecord) { for ( SAMReaderID rid : toolkit.getReadsDataSource().getReaderIDs() ) { String fName = 
toolkit.getReadsDataSource().getSAMFile(rid).getName(); @@ -117,16 +132,29 @@ public class NWaySAMFileWriter implements SAMFileWriter { if ( writerMap.containsKey( rid ) ) throw new StingException("nWayOut mode: Reader id for input sam file "+fName+" is already registered"); - - addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5); + addWriter(rid,outName, order, presorted, indexOnTheFly, generateMD5, pRecord); } } - private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, boolean indexOnTheFly, boolean generateMD5) { + private void addWriter(SAMReaderID id , String outName, SAMFileHeader.SortOrder order, boolean presorted, + boolean indexOnTheFly, boolean generateMD5, SAMProgramRecord programRecord) { File f = new File(outName); SAMFileHeader header = toolkit.getSAMFileHeader(id).clone(); header.setSortOrder(order); + + if ( programRecord != null ) { + // --->> add program record + List oldRecords = header.getProgramRecords(); + List newRecords = new ArrayList(oldRecords.size()+1); + for ( SAMProgramRecord record : oldRecords ) { + if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + newRecords.add(record); + } + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + // <-- add program record ends here + } SAMFileWriterFactory factory = new SAMFileWriterFactory(); factory.setCreateIndex(indexOnTheFly); factory.setCreateMd5File(generateMD5); @@ -134,7 +162,10 @@ public class NWaySAMFileWriter implements SAMFileWriter { writerMap.put(id,sw); } - + public Collection getWriters() { + return writerMap.values(); + } + public void addAlignment(SAMRecord samRecord) { final SAMReaderID id = toolkit.getReaderIDForRead(samRecord); String rg = samRecord.getStringAttribute("RG"); @@ -146,7 +177,7 @@ public class NWaySAMFileWriter implements SAMFileWriter { } public SAMFileHeader getFileHeader() { - return toolkit.getSAMFileHeader(); + return 
toolkit.getSAMFileHeader(); } public void close() { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java old mode 100644 new mode 100755 index 62bbb0307..e125b8c80 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -43,10 +43,19 @@ import java.util.*; * @version 0.1 */ public class ReadUtils { - public static final String REDUCED_READ_QUALITY_TAG = "RQ"; - private ReadUtils() { } + // ---------------------------------------------------------------------------------------------------- + // + // Reduced read utilities + // + // ---------------------------------------------------------------------------------------------------- + + // ---------------------------------------------------------------------------------------------------- + // + // General utilities + // + // ---------------------------------------------------------------------------------------------------- public static SAMFileHeader copySAMFileHeader(SAMFileHeader toCopy) { SAMFileHeader copy = new SAMFileHeader(); @@ -118,37 +127,46 @@ public class ReadUtils { /** * This enum represents all the different ways in which a read can overlap an interval. * - * NO_OVERLAP: + * NO_OVERLAP_CONTIG: + * read and interval are in different contigs. + * + * NO_OVERLAP_LEFT: + * the read does not overlap the interval. + * + * |----------------| (interval) + * <----------------> (read) + * + * NO_OVERLAP_RIGHT: * the read does not overlap the interval. 
* * |----------------| (interval) * <----------------> (read) * - * LEFT_OVERLAP: + * OVERLAP_LEFT: * the read starts before the beginning of the interval but ends inside of it * * |----------------| (interval) * <----------------> (read) * - * RIGHT_OVERLAP: + * OVERLAP_RIGHT: * the read starts inside the interval but ends outside of it * * |----------------| (interval) * <----------------> (read) * - * FULL_OVERLAP: + * OVERLAP_LEFT_AND_RIGHT: * the read starts before the interval and ends after the interval * * |-----------| (interval) * <-------------------> (read) * - * CONTAINED: + * OVERLAP_CONTAINED: * the read starts and ends inside the interval * * |----------------| (interval) * <--------> (read) */ - public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} + public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, NO_OVERLAP_HARDCLIPPED_LEFT, NO_OVERLAP_HARDCLIPPED_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED} /** * God, there's a huge information asymmetry in SAM format: @@ -170,15 +188,15 @@ public class ReadUtils { * This makes the following code a little nasty, since we can only detect if a base is in the adaptor, but not * if it overlaps the read. 
* - * @param rec + * @param read * @param basePos * @param adaptorLength * @return */ - public static OverlapType readPairBaseOverlapType(final SAMRecord rec, long basePos, final int adaptorLength) { + public static OverlapType readPairBaseOverlapType(final SAMRecord read, long basePos, final int adaptorLength) { OverlapType state = OverlapType.NOT_OVERLAPPING; - Pair adaptorBoundaries = getAdaptorBoundaries(rec, adaptorLength); + Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); if ( adaptorBoundaries != null ) { // we're not an unmapped pair -- cannot filter out @@ -187,28 +205,28 @@ public class ReadUtils { if ( inAdapator ) { state = OverlapType.IN_ADAPTOR; //System.out.printf("baseOverlapState: %50s negStrand=%b base=%d start=%d stop=%d, adaptorStart=%d adaptorEnd=%d isize=%d => %s%n", - // rec.getReadName(), rec.getReadNegativeStrandFlag(), basePos, rec.getAlignmentStart(), rec.getAlignmentEnd(), adaptorBoundaries.first, adaptorBoundaries.second, rec.getInferredInsertSize(), state); + // read.getReadName(), read.getReadNegativeStrandFlag(), basePos, read.getAlignmentStart(), read.getAlignmentEnd(), adaptorBoundaries.first, adaptorBoundaries.second, read.getInferredInsertSize(), state); } } return state; } - private static Pair getAdaptorBoundaries(SAMRecord rec, int adaptorLength) { - int isize = rec.getInferredInsertSize(); + private static Pair getAdaptorBoundaries(SAMRecord read, int adaptorLength) { + int isize = read.getInferredInsertSize(); if ( isize == 0 ) return null; // don't worry about unmapped pairs int adaptorStart, adaptorEnd; - if ( rec.getReadNegativeStrandFlag() ) { + if ( read.getReadNegativeStrandFlag() ) { // we are on the negative strand, so our mate is on the positive strand - int mateStart = rec.getMateAlignmentStart(); + int mateStart = read.getMateAlignmentStart(); adaptorStart = mateStart - adaptorLength - 1; adaptorEnd = mateStart - 1; } else { // we are on the positive strand, so our mate is on the negative 
strand - int mateEnd = rec.getAlignmentStart() + isize - 1; + int mateEnd = read.getAlignmentStart() + isize - 1; adaptorStart = mateEnd + 1; adaptorEnd = mateEnd + adaptorLength; } @@ -218,47 +236,47 @@ public class ReadUtils { /** * - * @param rec original SAM record + * @param read original SAM record * @param adaptorLength length of adaptor sequence * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped */ - public static GATKSAMRecord hardClipAdaptorSequence(final SAMRecord rec, int adaptorLength) { + public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read, int adaptorLength) { - Pair adaptorBoundaries = getAdaptorBoundaries(rec, adaptorLength); - GATKSAMRecord result = (GATKSAMRecord)rec; + Pair adaptorBoundaries = getAdaptorBoundaries(read, adaptorLength); + GATKSAMRecord result = (GATKSAMRecord)read; if ( adaptorBoundaries != null ) { - if ( rec.getReadNegativeStrandFlag() && adaptorBoundaries.second >= rec.getAlignmentStart() && adaptorBoundaries.first < rec.getAlignmentEnd() ) - result = hardClipStartOfRead(rec, adaptorBoundaries.second); - else if ( !rec.getReadNegativeStrandFlag() && adaptorBoundaries.first <= rec.getAlignmentEnd() ) - result = hardClipEndOfRead(rec, adaptorBoundaries.first); + if ( read.getReadNegativeStrandFlag() && adaptorBoundaries.second >= read.getAlignmentStart() && adaptorBoundaries.first < read.getAlignmentEnd() ) + result = hardClipStartOfRead(read, adaptorBoundaries.second); + else if ( !read.getReadNegativeStrandFlag() && adaptorBoundaries.first <= read.getAlignmentEnd() ) + result = hardClipEndOfRead(read, adaptorBoundaries.first); } return result; } // return true if the read needs to be completely clipped - private static GATKSAMRecord hardClipStartOfRead(SAMRecord oldRec, int stopPosition) { + private static GATKSAMRecord hardClipStartOfRead(GATKSAMRecord oldRec, int stopPosition) { if ( stopPosition >= oldRec.getAlignmentEnd() ) { // BAM representation issue -- 
we can't clip away all bases in a read, just leave it alone and let the filter deal with it - //System.out.printf("Entire read needs to be clipped: %50s %n", rec.getReadName()); + //System.out.printf("Entire read needs to be clipped: %50s %n", read.getReadName()); return null; } - GATKSAMRecord rec; + GATKSAMRecord read; try { - rec = (GATKSAMRecord)oldRec.clone(); + read = (GATKSAMRecord)oldRec.clone(); } catch (Exception e) { return null; } //System.out.printf("Clipping start of read: %50s start=%d adaptorEnd=%d isize=%d %n", - // rec.getReadName(), rec.getAlignmentStart(), stopPosition, rec.getInferredInsertSize()); + // read.getReadName(), read.getAlignmentStart(), stopPosition, read.getInferredInsertSize()); - Cigar oldCigar = rec.getCigar(); + Cigar oldCigar = read.getCigar(); LinkedList newCigarElements = new LinkedList(); - int currentPos = rec.getAlignmentStart(); + int currentPos = read.getAlignmentStart(); int basesToClip = 0; int basesAlreadyClipped = 0; @@ -297,48 +315,48 @@ public class ReadUtils { } // copy over the unclipped bases - final byte[] bases = rec.getReadBases(); - final byte[] quals = rec.getBaseQualities(); + final byte[] bases = read.getReadBases(); + final byte[] quals = read.getBaseQualities(); int newLength = bases.length - basesToClip; byte[] newBases = new byte[newLength]; byte[] newQuals = new byte[newLength]; System.arraycopy(bases, basesToClip, newBases, 0, newLength); System.arraycopy(quals, basesToClip, newQuals, 0, newLength); - rec.setReadBases(newBases); - rec.setBaseQualities(newQuals); + read.setReadBases(newBases); + read.setBaseQualities(newQuals); // now add a CIGAR element for the clipped bases newCigarElements.addFirst(new CigarElement(basesToClip + basesAlreadyClipped, CigarOperator.H)); Cigar newCigar = new Cigar(newCigarElements); - rec.setCigar(newCigar); + read.setCigar(newCigar); // adjust the start accordingly - rec.setAlignmentStart(stopPosition + 1); + read.setAlignmentStart(stopPosition + 1); - return rec; 
+ return read; } - private static GATKSAMRecord hardClipEndOfRead(SAMRecord oldRec, int startPosition) { + private static GATKSAMRecord hardClipEndOfRead(GATKSAMRecord oldRec, int startPosition) { if ( startPosition <= oldRec.getAlignmentStart() ) { // BAM representation issue -- we can't clip away all bases in a read, just leave it alone and let the filter deal with it - //System.out.printf("Entire read needs to be clipped: %50s %n", rec.getReadName()); + //System.out.printf("Entire read needs to be clipped: %50s %n", read.getReadName()); return null; } - GATKSAMRecord rec; + GATKSAMRecord read; try { - rec = (GATKSAMRecord)oldRec.clone(); + read = (GATKSAMRecord)oldRec.clone(); } catch (Exception e) { return null; } //System.out.printf("Clipping end of read: %50s adaptorStart=%d end=%d isize=%d %n", - // rec.getReadName(), startPosition, rec.getAlignmentEnd(), rec.getInferredInsertSize()); + // read.getReadName(), startPosition, read.getAlignmentEnd(), read.getInferredInsertSize()); - Cigar oldCigar = rec.getCigar(); + Cigar oldCigar = read.getCigar(); LinkedList newCigarElements = new LinkedList(); - int currentPos = rec.getAlignmentStart(); + int currentPos = read.getAlignmentStart(); int basesToKeep = 0; int basesAlreadyClipped = 0; @@ -384,41 +402,41 @@ public class ReadUtils { } // copy over the unclipped bases - final byte[] bases = rec.getReadBases(); - final byte[] quals = rec.getBaseQualities(); + final byte[] bases = read.getReadBases(); + final byte[] quals = read.getBaseQualities(); byte[] newBases = new byte[basesToKeep]; byte[] newQuals = new byte[basesToKeep]; System.arraycopy(bases, 0, newBases, 0, basesToKeep); System.arraycopy(quals, 0, newQuals, 0, basesToKeep); - rec.setReadBases(newBases); - rec.setBaseQualities(newQuals); + read.setReadBases(newBases); + read.setBaseQualities(newQuals); // now add a CIGAR element for the clipped bases newCigarElements.add(new CigarElement((bases.length - basesToKeep) + basesAlreadyClipped, CigarOperator.H)); 
Cigar newCigar = new Cigar(newCigarElements); - rec.setCigar(newCigar); + read.setCigar(newCigar); // adjust the stop accordingly - // rec.setAlignmentEnd(startPosition - 1); + // read.setAlignmentEnd(startPosition - 1); - return rec; + return read; } /** * Hard clips away (i.e.g, removes from the read) bases that were previously soft clipped. * - * @param rec + * @param read * @return */ - @Requires("rec != null") + @Requires("read != null") @Ensures("result != null") - public static SAMRecord hardClipSoftClippedBases(SAMRecord rec) { - List cigarElts = rec.getCigar().getCigarElements(); + public static GATKSAMRecord hardClipSoftClippedBases(GATKSAMRecord read) { + List cigarElts = read.getCigar().getCigarElements(); if ( cigarElts.size() == 1 ) // can't be soft clipped, just return - return rec; + return read; - int keepStart = 0, keepEnd = rec.getReadLength() - 1; + int keepStart = 0, keepEnd = read.getReadLength() - 1; List newCigarElements = new LinkedList(); for ( int i = 0; i < cigarElts.size(); i++ ) { @@ -429,7 +447,7 @@ public class ReadUtils { if ( i == 0 ) keepStart = l; else - keepEnd = rec.getReadLength() - l - 1; + keepEnd = read.getReadLength() - l - 1; newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP)); break; @@ -459,54 +477,54 @@ public class ReadUtils { } mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); - return hardClipBases(rec, keepStart, keepEnd, mergedCigarElements); + return hardClipBases(read, keepStart, keepEnd, mergedCigarElements); } /** - * Hard clips out the bases in rec, keeping the bases from keepStart to keepEnd, inclusive. Note these + * Hard clips out the bases in read, keeping the bases from keepStart to keepEnd, inclusive. 
Note these * are offsets, so they are 0 based * - * @param rec + * @param read * @param keepStart * @param keepEnd * @param newCigarElements * @return */ @Requires({ - "rec != null", + "read != null", "keepStart >= 0", - "keepEnd < rec.getReadLength()", - "rec.getReadUnmappedFlag() || newCigarElements != null"}) + "keepEnd < read.getReadLength()", + "read.getReadUnmappedFlag() || newCigarElements != null"}) @Ensures("result != null") - public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, List newCigarElements) { + public static GATKSAMRecord hardClipBases(GATKSAMRecord read, int keepStart, int keepEnd, List newCigarElements) { int newLength = keepEnd - keepStart + 1; - if ( newLength != rec.getReadLength() ) { + if ( newLength != read.getReadLength() ) { try { - rec = SimplifyingSAMFileWriter.simplifyRead((SAMRecord)rec.clone()); + read = (GATKSAMRecord)read.clone(); // copy over the unclipped bases - final byte[] bases = rec.getReadBases(); - final byte[] quals = rec.getBaseQualities(); + final byte[] bases = read.getReadBases(); + final byte[] quals = read.getBaseQualities(); byte[] newBases = new byte[newLength]; byte[] newQuals = new byte[newLength]; System.arraycopy(bases, keepStart, newBases, 0, newLength); System.arraycopy(quals, keepStart, newQuals, 0, newLength); - rec.setReadBases(newBases); - rec.setBaseQualities(newQuals); + read.setReadBases(newBases); + read.setBaseQualities(newQuals); // now add a CIGAR element for the clipped bases, if the read isn't unmapped - if ( ! rec.getReadUnmappedFlag() ) { + if ( ! 
read.getReadUnmappedFlag() ) { Cigar newCigar = new Cigar(newCigarElements); - rec.setCigar(newCigar); + read.setCigar(newCigar); } } catch ( CloneNotSupportedException e ) { throw new ReviewedStingException("WTF, where did clone go?", e); } } - return rec; + return read; } - public static SAMRecord replaceSoftClipsWithMatches(SAMRecord read) { + public static GATKSAMRecord replaceSoftClipsWithMatches(GATKSAMRecord read) { List newCigarElements = new ArrayList(); for ( CigarElement ce : read.getCigar().getCigarElements() ) { @@ -543,15 +561,15 @@ public class ReadUtils { /** * - * @param rec original SAM record + * @param read original SAM record * @return a new read with adaptor sequence hard-clipped out or null if read is fully clipped */ - public static GATKSAMRecord hardClipAdaptorSequence(final SAMRecord rec) { - return hardClipAdaptorSequence(rec, DEFAULT_ADAPTOR_SIZE); + public static GATKSAMRecord hardClipAdaptorSequence(final GATKSAMRecord read) { + return hardClipAdaptorSequence(read, DEFAULT_ADAPTOR_SIZE); } - public static OverlapType readPairBaseOverlapType(final SAMRecord rec, long basePos) { - return readPairBaseOverlapType(rec, basePos, DEFAULT_ADAPTOR_SIZE); + public static OverlapType readPairBaseOverlapType(final SAMRecord read, long basePos) { + return readPairBaseOverlapType(read, basePos, DEFAULT_ADAPTOR_SIZE); } public static boolean is454Read(SAMRecord read) { @@ -583,10 +601,10 @@ public class ReadUtils { readFlagNames.put(0x400, "Duplicate"); } - public static String readFlagsAsString(SAMRecord rec) { + public static String readFlagsAsString(GATKSAMRecord read) { String flags = ""; for (int flag : readFlagNames.keySet()) { - if ((rec.getFlags() & flag) != 0) { + if ((read.getFlags() & flag) != 0) { flags += readFlagNames.get(flag) + " "; } } @@ -600,7 +618,7 @@ public class ReadUtils { * @param reads * @return */ - public final static List coordinateSortReads(List reads) { + public final static List coordinateSortReads(List reads) { final 
SAMRecordComparator comparer = new SAMRecordCoordinateComparator(); Collections.sort(reads, comparer); return reads; @@ -629,37 +647,45 @@ public class ReadUtils { * @param interval the interval * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) */ - public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) { + public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(GATKSAMRecord read, GenomeLoc interval) { - int start = getRefCoordSoftUnclippedStart(read); - int stop = getRefCoordSoftUnclippedEnd(read); + int sStart = getRefCoordSoftUnclippedStart(read); + int sStop = getRefCoordSoftUnclippedEnd(read); + int uStart = read.getUnclippedStart(); + int uStop = read.getUnclippedEnd(); if ( !read.getReferenceName().equals(interval.getContig()) ) return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG; - else if ( stop < interval.getStart() ) + else if ( uStop < interval.getStart() ) return ReadAndIntervalOverlap.NO_OVERLAP_LEFT; - else if ( start > interval.getStop() ) + else if ( uStart > interval.getStop() ) return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT; - else if ( (start >= interval.getStart()) && - (stop <= interval.getStop()) ) + else if ( sStop < interval.getStart() ) + return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_LEFT; + + else if ( sStart > interval.getStop() ) + return ReadAndIntervalOverlap.NO_OVERLAP_HARDCLIPPED_RIGHT; + + else if ( (sStart >= interval.getStart()) && + (sStop <= interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_CONTAINED; - else if ( (start < interval.getStart()) && - (stop > interval.getStop()) ) + else if ( (sStart < interval.getStart()) && + (sStop > interval.getStop()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT; - else if ( (start < interval.getStart()) ) + else if ( (sStart < interval.getStart()) ) return ReadAndIntervalOverlap.OVERLAP_LEFT; else return ReadAndIntervalOverlap.OVERLAP_RIGHT; } - @Ensures({"result >= 
read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) - public static int getRefCoordSoftUnclippedStart(SAMRecord read) { + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) + public static int getRefCoordSoftUnclippedStart(GATKSAMRecord read) { int start = read.getUnclippedStart(); for (CigarElement cigarElement : read.getCigar().getCigarElements()) { if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) @@ -670,9 +696,13 @@ public class ReadUtils { return start; } - @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd()"}) - public static int getRefCoordSoftUnclippedEnd(SAMRecord read) { + @Ensures({"result >= read.getUnclippedStart()", "result <= read.getUnclippedEnd() || readIsEntirelyInsertion(read)"}) + public static int getRefCoordSoftUnclippedEnd(GATKSAMRecord read) { int stop = read.getUnclippedStart(); + + if (readIsEntirelyInsertion(read)) + return stop; + int shift = 0; CigarOperator lastOperator = null; for (CigarElement cigarElement : read.getCigar().getCigarElements()) { @@ -686,85 +716,198 @@ public class ReadUtils { return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + private static boolean readIsEntirelyInsertion(GATKSAMRecord read) { + for (CigarElement cigarElement : read.getCigar().getCigarElements()) { + if (cigarElement.getOperator() != CigarOperator.INSERTION) + return false; + } + return true; + } + + public enum ClippingTail { + LEFT_TAIL, + RIGHT_TAIL + } + /** - * Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region before - * the alignment start of the read. + * Pre-processes the results of getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int) in case it falls in + * a deletion following the typical clipping needs. If clipping the left tail (beginning of the read) returns + * the base prior to the deletion. 
If clipping the right tail (end of the read) returns the base after the + * deletion. * * @param read * @param refCoord - * @return the corresponding read coordinate or -1 if it failed to find it (it has been hard clipped before) + * @param tail + * @return the read coordinate corresponding to the requested reference coordinate for clipping. */ - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord < read.getAlignmentStart()"}) - private static int getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(SAMRecord read, int refCoord) { - if (getRefCoordSoftUnclippedStart(read) <= refCoord) - return refCoord - getRefCoordSoftUnclippedStart(read) + 1; - return -1; - } - - - /** - * Looks for a read coordinate that corresponds to the reference coordinate in the soft clipped region after - * the alignment end of the read. - * - * @param read - * @param refCoord - * @return the corresponding read coordinate or -1 if it failed to find it (it has been hard clipped before) - */ - @Requires({"refCoord <= read.getUnclippedEnd()", "refCoord > read.getAlignmentEnd()"}) - private static int getReadCoordinateForReferenceCoordinateBeforeAlignmentEnd(SAMRecord read, int refCoord) { - if (getRefCoordSoftUnclippedEnd(read) >= refCoord) - return refCoord - getRefCoordSoftUnclippedStart(read) + 1; - return -1; - } - - @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) @Ensures({"result >= 0", "result < read.getReadLength()"}) - public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { + public static int getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord, ClippingTail tail) { + Pair result = getReadCoordinateForReferenceCoordinate(read, refCoord); + int readCoord = result.getFirst(); + + if (result.getSecond() && tail == ClippingTail.RIGHT_TAIL) + readCoord++; + + return readCoord; + } + + /** + * Returns the read coordinate corresponding to the requested reference coordinate. 
+ * + * WARNING: if the requested reference coordinate happens to fall inside a deletion in the read, this function + * will return the last read base before the deletion. This function returns a + * Pair(int readCoord, boolean fallsInsideDeletion) so you can choose which readCoordinate to use when faced with + * a deletion. + * + * SUGGESTION: Use getReadCoordinateForReferenceCoordinate(GATKSAMRecord, int, ClippingTail) instead to get a + * pre-processed result according to normal clipping needs. Or you can use this function and tailor the + * behavior to your needs. + * + * @param read + * @param refCoord + * @return the read coordinate corresponding to the requested reference coordinate. (see warning!) + */ + @Requires({"refCoord >= getRefCoordSoftUnclippedStart(read)", "refCoord <= getRefCoordSoftUnclippedEnd(read)"}) + @Ensures({"result.getFirst() >= 0", "result.getFirst() < read.getReadLength()"}) + public static Pair getReadCoordinateForReferenceCoordinate(GATKSAMRecord read, int refCoord) { int readBases = 0; int refBases = 0; + boolean fallsInsideDeletion = false; - if (refCoord < read.getAlignmentStart()) { - readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentStart(read, refCoord); - if (readBases < 0) - throw new ReviewedStingException("Requested a coordinate in a hard clipped area of the read. No equivalent read coordinate."); - } - else if (refCoord > read.getAlignmentEnd()) { - readBases = getReadCoordinateForReferenceCoordinateBeforeAlignmentEnd(read, refCoord); - if (readBases < 0) - throw new ReviewedStingException("Requested a coordinate in a hard clipped area of the read. 
No equivalent read coordinate."); - } - else { - int goal = refCoord - read.getAlignmentStart(); // The goal is to move this many reference bases - boolean goalReached = refBases == goal; + int goal = refCoord - getRefCoordSoftUnclippedStart(read); // The goal is to move this many reference bases + boolean goalReached = refBases == goal; - Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); - while (!goalReached && cigarElementIterator.hasNext()) { - CigarElement cigarElement = cigarElementIterator.next(); - int shift = 0; + Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; - if (cigarElement.getOperator().consumesReferenceBases()) { - if (refBases + cigarElement.getLength() < goal) { - shift = cigarElement.getLength(); + if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP) { + if (refBases + cigarElement.getLength() < goal) + shift = cigarElement.getLength(); + else + shift = goal - refBases; + + refBases += shift; + } + goalReached = refBases == goal; + + if (!goalReached && cigarElement.getOperator().consumesReadBases()) + readBases += cigarElement.getLength(); + + if (goalReached) { + // Is this base's reference position within this cigar element? Or did we use it all? + boolean endsWithinCigar = shift < cigarElement.getLength(); + + // If it isn't, we need to check the next one. There should *ALWAYS* be a next one + // since we checked if the goal coordinate is within the read length, so this is just a sanity check. + if (!endsWithinCigar && !cigarElementIterator.hasNext()) + throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. 
This should never happen -- call Mauricio"); + + CigarElement nextCigarElement; + + // if we end inside the current cigar element, we just have to check if it is a deletion + if (endsWithinCigar) + fallsInsideDeletion = cigarElement.getOperator() == CigarOperator.DELETION; + + // if we end outside the current cigar element, we need to check if the next element is an insertion or deletion. + else { + nextCigarElement = cigarElementIterator.next(); + + // if it's an insertion, we need to clip the whole insertion before looking at the next element + if (nextCigarElement.getOperator() == CigarOperator.INSERTION) { + readBases += nextCigarElement.getLength(); + if (!cigarElementIterator.hasNext()) + throw new ReviewedStingException("Reference coordinate corresponds to a non-existent base in the read. This should never happen -- call Mauricio"); + + nextCigarElement = cigarElementIterator.next(); } - else { - shift = goal - refBases; - } - refBases += shift; + + // if it's a deletion, we will pass the information on to be handled downstream. + fallsInsideDeletion = nextCigarElement.getOperator() == CigarOperator.DELETION; } - goalReached = refBases == goal; - if (cigarElement.getOperator().consumesReadBases()) { - readBases += goalReached ? 
shift : cigarElement.getLength(); + // If we reached our goal outside a deletion, add the shift + if (!fallsInsideDeletion && cigarElement.getOperator().consumesReadBases()) + readBases += shift; + + // If we reached our goal inside a deletion, but the deletion is the next cigar element then we need + // to add the shift of the current cigar element but go back to it's last element to return the last + // base before the deletion (see warning in function contracts) + else if (fallsInsideDeletion && !endsWithinCigar) + readBases += shift - 1; + + // If we reached our goal inside a deletion then we must backtrack to the last base before the deletion + else if (fallsInsideDeletion && endsWithinCigar) + readBases--; } } if (!goalReached) throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); - } + - return readBases; + return new Pair(readBases, fallsInsideDeletion); } + public static GATKSAMRecord unclipSoftClippedBases(GATKSAMRecord read) { + int newReadStart = read.getAlignmentStart(); + int newReadEnd = read.getAlignmentEnd(); + List newCigarElements = new ArrayList(read.getCigar().getCigarElements().size()); + int heldOver = -1; + boolean sSeen = false; + for ( CigarElement e : read.getCigar().getCigarElements() ) { + if ( e.getOperator().equals(CigarOperator.S) ) { + newCigarElements.add(new CigarElement(e.getLength(),CigarOperator.M)); + if ( sSeen ) { + newReadEnd += e.getLength(); + sSeen = true; + } else { + newReadStart -= e.getLength(); + } + } else { + newCigarElements.add(e); + } + } + // merge duplicate operators together + int idx = 0; + List finalCigarElements = new ArrayList(read.getCigar().getCigarElements().size()); + while ( idx < newCigarElements.size() -1 ) { + if ( newCigarElements.get(idx).getOperator().equals(newCigarElements.get(idx+1).getOperator()) ) { + int combSize = newCigarElements.get(idx).getLength(); + int offset = 0; + while ( idx + offset < newCigarElements.size()-1 && 
newCigarElements.get(idx+offset).getOperator().equals(newCigarElements.get(idx+1+offset).getOperator()) ) { + combSize += newCigarElements.get(idx+offset+1).getLength(); + offset++; + } + finalCigarElements.add(new CigarElement(combSize,newCigarElements.get(idx).getOperator())); + idx = idx + offset -1; + } else { + finalCigarElements.add(newCigarElements.get(idx)); + } + idx++; + } + + read.setCigar(new Cigar(finalCigarElements)); + read.setAlignmentStart(newReadStart); + + return read; + } + + /** + * Compares two SAMRecords only the basis on alignment start. Note that + * comparisons are performed ONLY on the basis of alignment start; any + * two SAM records with the same alignment start will be considered equal. + * + * Unmapped alignments will all be considered equal. + */ + + @Requires({"read1 != null", "read2 != null"}) + @Ensures("result == 0 || result == 1 || result == -1") + public static int compareSAMRecords(GATKSAMRecord read1, GATKSAMRecord read2) { + AlignmentStartComparator comp = new AlignmentStartComparator(); + return comp.compare(read1, read2); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java index 9d4b23a8b..c146bf4d4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/ListFileUtils.java @@ -80,7 +80,7 @@ public class ListFileUtils { unpackedReads.add(new SAMReaderID(inputFileName,inputFileNameTags)); } else { - throw new UserException.CommandLineException(String.format("The GATK reads argument (-I) supports only BAM files with the .bam extension and lists of BAM files " + + throw new UserException.CommandLineException(String.format("The GATK reads argument (-I, --input_file) supports only BAM files with the .bam extension and lists of BAM files " + "with the .list extension, but the file %s has neither extension. 
Please ensure that your BAM file or list " + "of BAM files is in the correct format, update the extension, and try again.",inputFileName)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java index 52b6f3b01..49e9ddf52 100644 --- a/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java +++ b/public/java/src/org/broadinstitute/sting/utils/text/XReadLines.java @@ -99,9 +99,9 @@ public class XReadLines implements Iterator, Iterable { * * @param reader */ - public XReadLines(final BufferedReader reader, final boolean trimWhitespace) { + public XReadLines(final Reader reader, final boolean trimWhitespace) { try { - this.in = reader; + this.in = new BufferedReader(reader); nextline = readNextLine(); this.trimWhitespace = trimWhitespace; } catch(IOException e) { @@ -109,7 +109,7 @@ public class XReadLines implements Iterator, Iterable { } } - public XReadLines(final BufferedReader reader) throws FileNotFoundException { + public XReadLines(final Reader reader) { this(reader, true); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index fdf3d97db..e2e44e2b9 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -26,14 +26,31 @@ public class Genotype { protected boolean filtersWereAppliedToContext; public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased) { + this(sampleName, alleles, negLog10PError, filters, attributes, isPhased, null); + } + + public Genotype(String sampleName, List alleles, double negLog10PError, Set filters, Map attributes, boolean isPhased, double[] log10Likelihoods) { if ( alleles != null ) this.alleles = Collections.unmodifiableList(alleles); 
commonInfo = new InferredGeneticContext(sampleName, negLog10PError, filters, attributes); + if ( log10Likelihoods != null ) + commonInfo.putAttribute(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY, GenotypeLikelihoods.fromLog10Likelihoods(log10Likelihoods)); filtersWereAppliedToContext = filters != null; this.isPhased = isPhased; validate(); } + /** + * Creates a new Genotype for sampleName with genotype according to alleles. + * @param sampleName + * @param alleles + * @param negLog10PError the confidence in these alleles + * @param log10Likelihoods a log10 likelihoods for each of the genotype combinations possible for alleles, in the standard VCF ordering, or null if not known + */ + public Genotype(String sampleName, List alleles, double negLog10PError, double[] log10Likelihoods) { + this(sampleName, alleles, negLog10PError, null, null, false, log10Likelihoods); + } + public Genotype(String sampleName, List alleles, double negLog10PError) { this(sampleName, alleles, negLog10PError, null, null, false); } @@ -57,13 +74,6 @@ public class Genotype { return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attributes, g.isPhased()); } - public static Genotype removePLs(Genotype g) { - Map attrs = new HashMap(g.getAttributes()); - attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); - attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); - return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased()); - } - public static Genotype modifyAlleles(Genotype g, List alleles) { return new Genotype(g.getSampleName(), alleles, g.getNegLog10PError(), g.filtersWereApplied() ? 
g.getFilters() : null, g.getAttributes(), g.isPhased()); } @@ -98,14 +108,19 @@ public class Genotype { /** * @return the ploidy of this genotype */ - public int getPloidy() { return alleles.size(); } + public int getPloidy() { + if ( alleles == null ) + throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype"); + return alleles.size(); + } public enum Type { NO_CALL, HOM_REF, HET, HOM_VAR, - UNAVAILABLE + UNAVAILABLE, + MIXED // no-call and call in the same genotype } public Type getType() { @@ -119,36 +134,68 @@ public class Genotype { if ( alleles == null ) return Type.UNAVAILABLE; - Allele firstAllele = alleles.get(0); + boolean sawNoCall = false, sawMultipleAlleles = false; + Allele observedAllele = null; - if ( firstAllele.isNoCall() ) { - return Type.NO_CALL; + for ( Allele allele : alleles ) { + if ( allele.isNoCall() ) + sawNoCall = true; + else if ( observedAllele == null ) + observedAllele = allele; + else if ( !allele.equals(observedAllele) ) + sawMultipleAlleles = true; } - for (Allele a : alleles) { - if ( ! firstAllele.equals(a) ) - return Type.HET; + if ( sawNoCall ) { + if ( observedAllele == null ) + return Type.NO_CALL; + return Type.MIXED; } - return firstAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR; + + if ( observedAllele == null ) + throw new ReviewedStingException("BUG: there are no alleles present in this genotype but the alleles list is not null"); + + return sawMultipleAlleles ? Type.HET : observedAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR; } /** - * @return true if all observed alleles are the same (regardless of whether they are ref or alt) + * @return true if all observed alleles are the same (regardless of whether they are ref or alt); if any alleles are no-calls, this method will return false. */ public boolean isHom() { return isHomRef() || isHomVar(); } + + /** + * @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false. 
+ */ public boolean isHomRef() { return getType() == Type.HOM_REF; } + + /** + * @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false. + */ public boolean isHomVar() { return getType() == Type.HOM_VAR; } /** - * @return true if we're het (observed alleles differ) + * @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false. */ public boolean isHet() { return getType() == Type.HET; } /** - * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF) + * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false. */ public boolean isNoCall() { return getType() == Type.NO_CALL; } + + /** + * @return true if this genotype is comprised of any alleles that are not no-calls (even if some are). + */ public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; } + + /** + * @return true if this genotype is comprised of both calls and no-calls. + */ + public boolean isMixed() { return getType() == Type.MIXED; } + + /** + * @return true if the type of this genotype is set. + */ public boolean isAvailable() { return getType() != Type.UNAVAILABLE; } // @@ -187,14 +234,16 @@ public class Genotype { if ( alleles == null ) return; if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0"); - int nNoCalls = 0; + // int nNoCalls = 0; for ( Allele allele : alleles ) { if ( allele == null ) throw new IllegalArgumentException("BUG: allele cannot be null in Genotype"); - nNoCalls += allele.isNoCall() ? 1 : 0; + // nNoCalls += allele.isNoCall() ? 
1 : 0; } - if ( nNoCalls > 0 && nNoCalls != alleles.size() ) - throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); + + // Technically, the spec does allow for the below case so this is not an illegal state + //if ( nNoCalls > 0 && nNoCalls != alleles.size() ) + // throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); } public String getGenotypeString() { @@ -258,7 +307,8 @@ public class Genotype { * @param the value type * @return a sting, enclosed in {}, with comma seperated key value pairs in order of the keys */ - public static , V> String sortedString(Map c) { + private static , V> String sortedString(Map c) { + // NOTE -- THIS IS COPIED FROM GATK UTILS TO ALLOW US TO KEEP A SEPARATION BETWEEN THE GATK AND VCF CODECS List t = new ArrayList(c.keySet()); Collections.sort(t); @@ -293,17 +343,8 @@ public class Genotype { return commonInfo.getAttribute(key, defaultValue); } - public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); } public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); } public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - - public Integer getAttributeAsIntegerNoException(String key) { return 
commonInfo.getAttributeAsIntegerNoException(key); } - public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); } - public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); } - public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java index 3d162adb0..bf16cd1cf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/InferredGeneticContext.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.utils.variantcontext; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; + import java.util.*; @@ -204,27 +206,40 @@ public final class InferredGeneticContext { return defaultValue; } -// public AttributedObject getAttributes(Collection keys) { -// AttributedObject selected = new AttributedObject(); -// -// for ( Object key : keys ) -// selected.putAttribute(key, this.getAttribute(key)); -// -// return selected; -// } + public String getAttributeAsString(String key, String defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof String ) return (String)x; + return String.valueOf(x); // throws an exception if this isn't a string + } - public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" - public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? 
(Integer)x : Integer.valueOf((String)x); } - public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } - public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } + public int getAttributeAsInt(String key, int defaultValue) { + Object x = getAttribute(key); + if ( x == null || x == VCFConstants.MISSING_VALUE_v4 ) return defaultValue; + if ( x instanceof Integer ) return (Integer)x; + return Integer.valueOf((String)x); // throws an exception if this isn't a string + } - public String getAttributeAsString(String key, String defaultValue) { return (String)getAttribute(key, defaultValue); } - public int getAttributeAsInt(String key, int defaultValue) { return (Integer)getAttribute(key, defaultValue); } - public double getAttributeAsDouble(String key, double defaultValue) { return (Double)getAttribute(key, defaultValue); } - public boolean getAttributeAsBoolean(String key, boolean defaultValue){ return (Boolean)getAttribute(key, defaultValue); } + public double getAttributeAsDouble(String key, double defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Double ) return (Double)x; + return Double.valueOf((String)x); // throws an exception if this isn't a string + } - public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } - public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } - public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } - public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } + public boolean 
getAttributeAsBoolean(String key, boolean defaultValue) { + Object x = getAttribute(key); + if ( x == null ) return defaultValue; + if ( x instanceof Boolean ) return (Boolean)x; + return Boolean.valueOf((String)x); // throws an exception if this isn't a string + } + +// public String getAttributeAsString(String key) { return (String.valueOf(getAttribute(key))); } // **NOTE**: will turn a null Object into the String "null" +// public int getAttributeAsInt(String key) { Object x = getAttribute(key); return x instanceof Integer ? (Integer)x : Integer.valueOf((String)x); } +// public double getAttributeAsDouble(String key) { Object x = getAttribute(key); return x instanceof Double ? (Double)x : Double.valueOf((String)x); } +// public boolean getAttributeAsBoolean(String key) { Object x = getAttribute(key); return x instanceof Boolean ? (Boolean)x : Boolean.valueOf((String)x); } +// public Integer getAttributeAsIntegerNoException(String key) { try {return getAttributeAsInt(key);} catch (Exception e) {return null;} } +// public Double getAttributeAsDoubleNoException(String key) { try {return getAttributeAsDouble(key);} catch (Exception e) {return null;} } +// public String getAttributeAsStringNoException(String key) { if (getAttribute(key) == null) return null; return getAttributeAsString(key); } +// public Boolean getAttributeAsBooleanNoException(String key) { try {return getAttributeAsBoolean(key);} catch (Exception e) {return null;} } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java index 0cd684cb6..14419a2a0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java @@ -40,19 +40,7 @@ public class MutableGenotype extends Genotype { */ public void setAlleles(List alleles) { 
this.alleles = new ArrayList(alleles); - - // todo -- add validation checking here - - if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles"); - if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles"); - - int nNoCalls = 0; - for ( Allele allele : alleles ) { nNoCalls += allele.isNoCall() ? 1 : 0; } - if ( nNoCalls > 0 && nNoCalls != alleles.size() ) - throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this); - - for ( Allele allele : alleles ) - if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype"); + validate(); } public void setPhase(boolean isPhased) { diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index cfd59b504..f52a7087b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -132,7 +132,7 @@ import java.util.*; * vc.hasGenotypes() * vc.isMonomorphic() * vc.isPolymorphic() - * vc.getSampleNames().size() + * vc.getSamples().size() * * vc.getGenotypes() * vc.getGenotypes().get("g1") @@ -181,7 +181,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati protected Type type = null; /** A set of the alleles segregating in this context */ - protected LinkedHashSet alleles = null; + final protected List alleles; /** A mapping from sampleName -> genotype objects for all genotypes associated with this context */ protected Map genotypes = null; @@ -355,7 +355,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } // we need to make this a LinkedHashSet in 
case the user prefers a given ordering of alleles - this.alleles = alleleCollectionToSet(new LinkedHashSet(), alleles); + this.alleles = makeAlleles(alleles); if ( genotypes == null ) { genotypes = NO_GENOTYPES; } @@ -445,7 +445,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @param alleles the set of allele segregating alleles at this site. Must include those in genotypes, but may be more * @return vc subcontext */ - public VariantContext subContextFromGenotypes(Collection genotypes, Set alleles) { + public VariantContext subContextFromGenotypes(Collection genotypes, Collection alleles) { return new VariantContext(getSource(), contig, start, stop, alleles, genotypes != null ? genotypeCollectionToMap(new TreeMap(), genotypes) : null, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes(), getReferenceBaseForIndel()); } @@ -666,21 +666,11 @@ public class VariantContext implements Feature { // to enable tribble intergrati return commonInfo.getAttribute(key, defaultValue); } - public String getAttributeAsString(String key) { return commonInfo.getAttributeAsString(key); } public String getAttributeAsString(String key, String defaultValue) { return commonInfo.getAttributeAsString(key, defaultValue); } - public int getAttributeAsInt(String key) { return commonInfo.getAttributeAsInt(key); } public int getAttributeAsInt(String key, int defaultValue) { return commonInfo.getAttributeAsInt(key, defaultValue); } - public double getAttributeAsDouble(String key) { return commonInfo.getAttributeAsDouble(key); } public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } - public boolean getAttributeAsBoolean(String key) { return commonInfo.getAttributeAsBoolean(key); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } - public Integer 
getAttributeAsIntegerNoException(String key) { return commonInfo.getAttributeAsIntegerNoException(key); } - public Double getAttributeAsDoubleNoException(String key) { return commonInfo.getAttributeAsDoubleNoException(key); } - public String getAttributeAsStringNoException(String key) { return commonInfo.getAttributeAsStringNoException(key); } - public Boolean getAttributeAsBooleanNoException(String key) { return commonInfo.getAttributeAsBooleanNoException(key); } - - // --------------------------------------------------------------------------------------------------------- // // Working with alleles @@ -697,17 +687,6 @@ public class VariantContext implements Feature { // to enable tribble intergrati return ref; } - /** Private helper routine that grabs the reference allele but doesn't throw an error if there's no such allele */ - -// private Allele getReferenceWithoutError() { -// for ( Allele allele : getAlleles() ) { -// if ( allele.isReference() ) { -// return allele; -// } -// } -// -// return null; -// } /** * @return true if the context is strictly bi-allelic @@ -764,7 +743,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * * @return the set of alleles */ - public Set getAlleles() { return alleles; } + public List getAlleles() { return alleles; } /** * Gets the alternate alleles. 
This method should return all the alleles present at the location, @@ -773,14 +752,8 @@ public class VariantContext implements Feature { // to enable tribble intergrati * * @return the set of alternate alleles */ - public Set getAlternateAlleles() { - LinkedHashSet altAlleles = new LinkedHashSet(); - for ( Allele allele : alleles ) { - if ( allele.isNonReference() ) - altAlleles.add(allele); - } - - return Collections.unmodifiableSet(altAlleles); + public List getAlternateAlleles() { + return alleles.subList(1, alleles.size()); } /** @@ -807,14 +780,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @throws IllegalArgumentException if i is invalid */ public Allele getAlternateAllele(int i) { - int n = 0; - - for ( Allele allele : alleles ) { - if ( allele.isNonReference() && n++ == i ) - return allele; - } - - throw new IllegalArgumentException("Requested " + i + " alternative allele but there are only " + n + " alternative alleles " + this); + return alleles.get(i+1); } /** @@ -823,8 +789,8 @@ public class VariantContext implements Feature { // to enable tribble intergrati * regardless of ordering. Otherwise returns false. 
*/ public boolean hasSameAlternateAllelesAs ( VariantContext other ) { - Set thisAlternateAlleles = getAlternateAlleles(); - Set otherAlternateAlleles = other.getAlternateAlleles(); + List thisAlternateAlleles = getAlternateAlleles(); + List otherAlternateAlleles = other.getAlternateAlleles(); if ( thisAlternateAlleles.size() != otherAlternateAlleles.size() ) { return false; @@ -1032,7 +998,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati else if ( g.isHomVar() ) genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]++; else - throw new IllegalStateException("Genotype of unknown type: " + g); + genotypeCounts[Genotype.Type.MIXED.ordinal()]++; } } } @@ -1076,6 +1042,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati return genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]; } + /** + * Genotype-specific functions -- how many mixed calls are there in the genotypes? + * + * @return number of mixed calls + */ + public int getMixedCount() { + return genotypeCounts[Genotype.Type.MIXED.ordinal()]; + } + // --------------------------------------------------------------------------------------------------------- // // validation: extra-strict validation routines for paranoid users @@ -1131,7 +1106,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati if ( !hasGenotypes() ) return; - Set reportedAlleles = getAlleles(); + List reportedAlleles = getAlleles(); Set observedAlleles = new HashSet(); observedAlleles.add(getReference()); for ( Genotype g : getGenotypes().values() ) { @@ -1381,17 +1356,34 @@ public class VariantContext implements Feature { // to enable tribble intergrati } // protected basic manipulation routines - private static LinkedHashSet alleleCollectionToSet(LinkedHashSet dest, Collection alleles) { - for ( Allele a : alleles ) { - for ( Allele b : dest ) { + private static List makeAlleles(Collection alleles) { + final List alleleList = new 
ArrayList(alleles.size()); + + boolean sawRef = false; + for ( final Allele a : alleles ) { + for ( final Allele b : alleleList ) { if ( a.equals(b, true) ) throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a); } - dest.add(a); + // deal with the case where the first allele isn't the reference + if ( a.isReference() ) { + if ( sawRef ) + throw new IllegalArgumentException("Alleles for a VariantContext must contain at most one reference allele: " + alleles); + alleleList.add(0, a); + sawRef = true; + } + else + alleleList.add(a); } - return dest; + if ( alleleList.isEmpty() ) + throw new IllegalArgumentException("Cannot create a VariantContext with an empty allele list"); + + if ( alleleList.get(0).isNonReference() ) + throw new IllegalArgumentException("Alleles for a VariantContext must contain at least one reference allele: " + alleles); + + return alleleList; } public static Map genotypeCollectionToMap(Map dest, Collection genotypes) { @@ -1505,7 +1497,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati // Do not change the filter state if filters were not applied to this context Set inputVCFilters = inputVC.filtersWereAppliedToContext ? 
inputVC.getFilters() : null; - return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVCFilters, inputVC.getAttributes()); + return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVCFilters, inputVC.getAttributes(),refByte); } else return inputVC; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index e0e27b4f7..43f91041f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -29,6 +29,7 @@ import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.samtools.util.StringUtil; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; +import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.gatk.walkers.phasing.ReadBackedPhasingWalker; import org.broadinstitute.sting.utils.BaseUtils; @@ -44,6 +45,12 @@ import java.io.Serializable; import java.util.*; public class VariantContextUtils { + private static Logger logger = Logger.getLogger(VariantContextUtils.class); + public final static String MERGE_INTERSECTION = "Intersection"; + public final static String MERGE_FILTER_IN_ALL = "FilteredInAll"; + public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; + public final static String MERGE_FILTER_PREFIX = "filterIn"; + final public static JexlEngine engine = new JexlEngine(); static { engine.setSilent(false); // will throw errors now for selects that don't evaluate properly @@ -154,6 +161,13 @@ public class VariantContextUtils { return "%." 
+ precision + "f"; } + public static Genotype removePLs(Genotype g) { + Map attrs = new HashMap(g.getAttributes()); + attrs.remove(VCFConstants.PHRED_GENOTYPE_LIKELIHOODS_KEY); + attrs.remove(VCFConstants.GENOTYPE_LIKELIHOODS_KEY); + return new Genotype(g.getSampleName(), g.getAlleles(), g.getNegLog10PError(), g.filtersWereApplied() ? g.getFilters() : null, attrs, g.isPhased()); + } + /** * A simple but common wrapper for matching VariantContext objects using JEXL expressions */ @@ -499,7 +513,7 @@ public class VariantContextUtils { final String name = first.getSource(); final Allele refAllele = determineReferenceAllele(VCs); - final Set alleles = new TreeSet(); + final Set alleles = new LinkedHashSet(); final Set filters = new TreeSet(); final Map attributes = new TreeMap(); final Set inconsistentAttributes = new HashSet(); @@ -548,12 +562,10 @@ public class VariantContextUtils { // special case DP (add it up) and ID (just preserve it) // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) - depth += Integer.valueOf(vc.getAttributeAsString(VCFConstants.DEPTH_KEY)); - + depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); if ( vc.hasID() && ! 
vc.getID().equals(VCFConstants.EMPTY_ID_FIELD) ) rsIDs.add(vc.getID()); - if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { - String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY); + String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator if (rawAlleleCounts.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)) { List alleleCountArray = Arrays.asList(rawAlleleCounts.substring(1, rawAlleleCounts.length() - 1).split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR)); @@ -594,10 +606,17 @@ public class VariantContextUtils { } } - // if we have more alternate alleles in the merged VC than in one or more of the original VCs, we need to strip out the GL/PLs (because they are no longer accurate) + // if we have more alternate alleles in the merged VC than in one or more of the + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF for ( VariantContext vc : VCs ) { - if ( vc.alleles.size() != alleles.size() ) { + if (vc.alleles.size() == 1) + continue; + if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { + logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. 
single=%s", + genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); + // this will remove stale AC,AF attributed from vc + calculateChromosomeCounts(vc, attributes, true); break; } } @@ -611,19 +630,20 @@ public class VariantContextUtils { if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) filters.clear(); + if ( annotateOrigin ) { // we care about where the call came from String setValue; if ( nFiltered == 0 && variantSources.size() == priorityListOfVCs.size() ) // nothing was unfiltered - setValue = "Intersection"; + setValue = MERGE_INTERSECTION; else if ( nFiltered == VCs.size() ) // everything was filtered out - setValue = "FilteredInAll"; + setValue = MERGE_FILTER_IN_ALL; else if ( variantSources.isEmpty() ) // everyone was reference - setValue = "ReferenceInAll"; + setValue = MERGE_REF_IN_ALL; else { LinkedHashSet s = new LinkedHashSet(); for ( VariantContext vc : VCs ) if ( vc.isVariant() ) - s.add( vc.isFiltered() ? "filterIn" + vc.getSource() : vc.getSource() ); + s.add( vc.isFiltered() ? MERGE_FILTER_PREFIX + vc.getSource() : vc.getSource() ); setValue = Utils.join("-", s); } @@ -648,6 +668,36 @@ public class VariantContextUtils { return merged; } + private static final boolean hasPLIncompatibleAlleles(final Collection alleleSet1, final Collection alleleSet2) { + final Iterator it1 = alleleSet1.iterator(); + final Iterator it2 = alleleSet2.iterator(); + + while ( it1.hasNext() && it2.hasNext() ) { + final Allele a1 = it1.next(); + final Allele a2 = it2.next(); + if ( ! a1.equals(a2) ) + return true; + } + + // by this point, at least one of the iterators is empty. All of the elements + // we've compared are equal up until this point. But it's possible that the + // sets aren't the same size, which is indicated by the test below. 
If they + // are of the same size, though, the sets are compatible + return it1.hasNext() || it2.hasNext(); + } + + public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { + // if all alleles of vc1 are a contained in alleles of vc2, return true + if (!vc1.getReference().equals(vc2.getReference())) + return false; + + for (Allele a :vc1.getAlternateAlleles()) { + if (!vc2.getAlternateAlleles().contains(a)) + return false; + } + + return true; + } public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) { // see if we need to trim common reference base from all alleles boolean trimVC; @@ -724,7 +774,7 @@ public class VariantContextUtils { Map newGs = new HashMap(genotypes.size()); for ( Map.Entry g : genotypes.entrySet() ) { - newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? Genotype.removePLs(g.getValue()) : g.getValue()); + newGs.put(g.getKey(), g.getValue().hasLikelihoods() ? removePLs(g.getValue()) : g.getValue()); } return newGs; @@ -733,9 +783,46 @@ public class VariantContextUtils { public static Map> separateVariantContextsByType(Collection VCs) { HashMap> mappedVCs = new HashMap>(); for ( VariantContext vc : VCs ) { - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(vc); + + // look at previous variant contexts of different type. If: + // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list + // b) vc has alleles which are subset of otherVC. 
Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) + // c) neither: do nothing, just add vc to its own list + boolean addtoOwnList = true; + for (VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vc.getType())) + continue; + + if (!mappedVCs.containsKey(type)) + continue; + + List vcList = mappedVCs.get(type); + for (int k=0; k < vcList.size(); k++) { + VariantContext otherVC = vcList.get(k); + if (allelesAreSubset(otherVC,vc)) { + // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list + vcList.remove(k); + // avoid having empty lists + if (vcList.size() == 0) + mappedVCs.remove(vcList); + if ( !mappedVCs.containsKey(vc.getType()) ) + mappedVCs.put(vc.getType(), new ArrayList()); + mappedVCs.get(vc.getType()).add(otherVC); + break; + } + else if (allelesAreSubset(vc,otherVC)) { + // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own + mappedVCs.get(type).add(vc); + addtoOwnList = false; + break; + } + } + } + if (addtoOwnList) { + if ( !mappedVCs.containsKey(vc.getType()) ) + mappedVCs.put(vc.getType(), new ArrayList()); + mappedVCs.get(vc.getType()).add(vc); + } } return mappedVCs; @@ -1132,9 +1219,7 @@ public class VariantContextUtils { for (String orAttrib : MERGE_OR_ATTRIBS) { boolean attribVal = false; for (VariantContext vc : vcList) { - Boolean val = vc.getAttributeAsBooleanNoException(orAttrib); - if (val != null) - attribVal = (attribVal || val); + attribVal = vc.getAttributeAsBoolean(orAttrib, false); if (attribVal) // already true, so no reason to continue: break; } @@ -1144,7 +1229,7 @@ public class VariantContextUtils { // Merge ID fields: String iDVal = null; for (VariantContext vc : vcList) { - String val = vc.getAttributeAsStringNoException(VariantContext.ID_KEY); + String val = 
vc.getAttributeAsString(VariantContext.ID_KEY, null); if (val != null && !val.equals(VCFConstants.EMPTY_ID_FIELD)) { if (iDVal == null) iDVal = val; @@ -1224,8 +1309,10 @@ public class VariantContextUtils { public PhaseAndQuality(Genotype gt) { this.isPhased = gt.isPhased(); - if (this.isPhased) - this.PQ = gt.getAttributeAsDoubleNoException(ReadBackedPhasingWalker.PQ_KEY); + if (this.isPhased) { + this.PQ = gt.getAttributeAsDouble(ReadBackedPhasingWalker.PQ_KEY, -1); + if ( this.PQ == -1 ) this.PQ = null; + } } } diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 63faf1ab9..f99a105ae 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -50,6 +50,7 @@ public abstract class BaseTest { public static final String hg18Reference = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"; public static final String hg19Reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; public static final String b36KGReference = "/humgen/1kg/reference/human_b36_both.fasta"; + //public static final String b37KGReference = "/Users/depristo/Desktop/broadLocal/localData/human_g1k_v37.fasta"; public static final String b37KGReference = "/humgen/1kg/reference/human_g1k_v37.fasta"; public static final String GATKDataLocation = "/humgen/gsa-hpprojects/GATK/data/"; public static final String validationDataLocation = GATKDataLocation + "Validation_Data/"; @@ -80,7 +81,8 @@ public abstract class BaseTest { public static final String networkTempDir = "/broad/shptmp/"; public static final File networkTempDirFile = new File(networkTempDir); - public static final String testDir = "public/testdata/"; + public static final File testDirFile = new File("public/testdata/"); + public static final String testDir = testDirFile.getAbsolutePath() + "/"; /** before the class starts up */ static { @@ 
-98,10 +100,10 @@ public abstract class BaseTest { logger.setLevel(Level.WARN); // find our file sources - if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { - logger.fatal("We can't locate the reference directories. Aborting!"); - throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); - } +// if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { +// logger.fatal("We can't locate the reference directories. Aborting!"); +// throw new RuntimeException("BaseTest setup failed: unable to locate the reference directories"); +// } } /** @@ -132,15 +134,21 @@ public abstract class BaseTest { */ public static class TestDataProvider { private static final Map> tests = new HashMap>(); + private final String name; /** * Create a new TestDataProvider instance bound to the class variable C * @param c */ - public TestDataProvider(Class c) { + public TestDataProvider(Class c, String name) { if ( ! 
tests.containsKey(c) ) tests.put(c, new ArrayList()); tests.get(c).add(this); + this.name = name; + } + + public TestDataProvider(Class c) { + this(c, ""); } /** @@ -153,6 +161,11 @@ public abstract class BaseTest { for ( Object x : tests.get(c) ) params2.add(new Object[]{x}); return params2.toArray(new Object[][]{}); } + + @Override + public String toString() { + return "TestDataProvider("+name+")"; + } } /** diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index 2fd8f8b6d..c9f53c581 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -129,7 +129,7 @@ public class MD5DB { System.out.printf("##### Skipping update, cannot write file %s%n", dbFile); } } else { - System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); + //System.out.printf("##### MD5 file is up to date: %s%n", dbFile.getPath()); } } @@ -170,6 +170,18 @@ public class MD5DB { return bytes; } + public static class MD5Match { + final String md5; + final String failMessage; + boolean failed; + + public MD5Match(final String md5, final String failMessage, final boolean failed) { + this.md5 = md5; + this.failMessage = failMessage; + this.failed = failed; + } + } + /** * Tests a file MD5 against an expected value, returning the MD5. NOTE: This function WILL throw an exception if the MD5s are different. * @param name Name of the test. @@ -178,18 +190,21 @@ public class MD5DB { * @param parameterize If true or if expectedMD5 is an empty string, will print out the calculated MD5 instead of error text. * @return The calculated MD5. 
*/ - public static String assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) { - String filemd5sum = testFileMD5(name, resultsFile, expectedMD5, parameterize); + public static MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) { + final String filemd5sum = testFileMD5(name, resultsFile, expectedMD5, parameterize); + String failMessage = null; + boolean failed = false; if (parameterize || expectedMD5.equals("")) { // Don't assert } else if ( filemd5sum.equals(expectedMD5) ) { - System.out.println(String.format(" => %s PASSED", name)); + System.out.println(String.format(" => %s PASSED (expected=%s)", name, expectedMD5)); } else { - Assert.fail(String.format("%s has mismatching MD5s: expected=%s observed=%s", name, expectedMD5, filemd5sum)); + failed = true; + failMessage = String.format("%s has mismatching MD5s: expected=%s observed=%s", name, expectedMD5, filemd5sum); } - return filemd5sum; + return new MD5Match(filemd5sum, failMessage, failed); } @@ -218,8 +233,8 @@ public class MD5DB { System.out.println(String.format("PARAMETERIZATION[%s]: file %s has md5 = %s, stated expectation is %s, equal? = %b", name, resultsFile, filemd5sum, expectedMD5, filemd5sum.equals(expectedMD5))); } else { - System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5)); - System.out.flush(); + //System.out.println(String.format("Checking MD5 for %s [calculated=%s, expected=%s]", resultsFile, filemd5sum, expectedMD5)); + //System.out.flush(); if ( ! expectedMD5.equals(filemd5sum) ) { // we are going to fail for real in assertEquals (so we are counted by the testing framework). 
diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 386c17659..ca7653b58 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -52,7 +52,7 @@ public class WalkerTest extends BaseTest { GenomeAnalysisEngine.resetRandomGenerator(); } - public String assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5) { + public MD5DB.MD5Match assertMatchingMD5(final String name, final File resultsFile, final String expectedMD5) { return MD5DB.assertMatchingMD5(name, resultsFile, expectedMD5, parameterize()); } @@ -75,7 +75,7 @@ public class WalkerTest extends BaseTest { Index indexFromOutputFile = IndexFactory.createIndex(resultFile, new VCFCodec()); Index dynamicIndex = IndexFactory.loadIndex(indexFile.getAbsolutePath()); - if ( ! indexFromOutputFile.equalsIgnoreTimestamp(dynamicIndex) ) { + if ( ! indexFromOutputFile.equalsIgnoreProperties(dynamicIndex) ) { Assert.fail(String.format("Index on disk from indexing on the fly not equal to the index created after the run completed. FileIndex %s vs. on-the-fly %s%n", indexFromOutputFile.getProperties(), dynamicIndex.getProperties())); @@ -84,10 +84,23 @@ public class WalkerTest extends BaseTest { public List assertMatchingMD5s(final String name, List resultFiles, List expectedMD5s) { List md5s = new ArrayList(); + List fails = new ArrayList(); + for (int i = 0; i < resultFiles.size(); i++) { - String md5 = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i)); - maybeValidateSupplementaryFile(name, resultFiles.get(i)); - md5s.add(i, md5); + MD5DB.MD5Match result = assertMatchingMD5(name, resultFiles.get(i), expectedMD5s.get(i)); + if ( ! result.failed ) { + maybeValidateSupplementaryFile(name, resultFiles.get(i)); + md5s.add(result.md5); + } else { + fails.add(result); + } + } + + if ( ! 
fails.isEmpty() ) { + for ( final MD5DB.MD5Match fail : fails ) { + logger.warn("Fail: " + fail.failMessage); + } + Assert.fail("Test failed: " + name); } return md5s; diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java new file mode 100644 index 000000000..99d6b88f3 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.commandline; + +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; + +public class ArgumentMatchSiteUnitTest { + @Test + public void testCommandLine() { + ArgumentMatchSite site = new ArgumentMatchSite(ArgumentMatchSource.COMMAND_LINE, 1); + Assert.assertEquals(site.getSource(), ArgumentMatchSource.COMMAND_LINE); + Assert.assertEquals(site.getIndex(), 1); + } + + @Test + public void testFile() { + ArgumentMatchSource source = new ArgumentMatchSource(new File("test")); + ArgumentMatchSite site = new ArgumentMatchSite(source, 1); + Assert.assertEquals(site.getSource(), source); + Assert.assertEquals(site.getIndex(), 1); + } + + @Test + public void testEquals() { + ArgumentMatchSource cmdLine = ArgumentMatchSource.COMMAND_LINE; + ArgumentMatchSite site1 = new ArgumentMatchSite(cmdLine, 1); + ArgumentMatchSite site2 = new ArgumentMatchSite(cmdLine, 2); + + Assert.assertFalse(site1.equals(null)); + + Assert.assertTrue(site1.equals(site1)); + Assert.assertFalse(site1.equals(site2)); + + Assert.assertFalse(site2.equals(site1)); + Assert.assertTrue(site2.equals(site2)); + } + + @Test + public void testCompareTo() { + ArgumentMatchSource cmdLine = ArgumentMatchSource.COMMAND_LINE; + ArgumentMatchSite site1 = new ArgumentMatchSite(cmdLine, 1); + ArgumentMatchSite site2 = new ArgumentMatchSite(cmdLine, 2); + + Assert.assertTrue(site1.compareTo(site1) == 0); + Assert.assertTrue(site1.compareTo(site2) < 0); + Assert.assertTrue(site2.compareTo(site1) > 0); + Assert.assertTrue(site2.compareTo(site2) == 0); + } + + @Test(expectedExceptions = NullPointerException.class) + public void testCompareToNull() { + new ArgumentMatchSite(ArgumentMatchSource.COMMAND_LINE, 0).compareTo(null); + } +} diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java new file mode 100644 index 
000000000..4bc7eb822 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; + +public class ArgumentMatchSourceUnitTest extends BaseTest { + @Test + public void testCommandLine() { + ArgumentMatchSource source = ArgumentMatchSource.COMMAND_LINE; + Assert.assertEquals(source.getType(), ArgumentMatchSourceType.CommandLine); + Assert.assertNull(source.getFile()); + } + + @Test + public void testFile() { + File f = new File("test"); + ArgumentMatchSource source = new ArgumentMatchSource(f); + Assert.assertEquals(source.getType(), ArgumentMatchSourceType.File); + Assert.assertEquals(source.getFile(), f); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testNullFile() { + new ArgumentMatchSource(null); + } + + @Test + public void testEquals() { + ArgumentMatchSource cmdLine = ArgumentMatchSource.COMMAND_LINE; + ArgumentMatchSource fileA = new ArgumentMatchSource(new File("a")); + ArgumentMatchSource fileB = new ArgumentMatchSource(new File("b")); + + Assert.assertFalse(cmdLine.equals(null)); + + Assert.assertTrue(cmdLine.equals(cmdLine)); + Assert.assertFalse(cmdLine.equals(fileA)); + Assert.assertFalse(cmdLine.equals(fileB)); + + Assert.assertFalse(fileA.equals(cmdLine)); + Assert.assertTrue(fileA.equals(fileA)); + Assert.assertFalse(fileA.equals(fileB)); + + Assert.assertFalse(fileB.equals(cmdLine)); + Assert.assertFalse(fileB.equals(fileA)); + Assert.assertTrue(fileB.equals(fileB)); + } + + @Test + public void testCompareTo() { + ArgumentMatchSource cmdLine = ArgumentMatchSource.COMMAND_LINE; + ArgumentMatchSource fileA = new ArgumentMatchSource(new File("a")); + ArgumentMatchSource fileB = new ArgumentMatchSource(new File("b")); + + Assert.assertTrue(cmdLine.compareTo(cmdLine) == 0); + Assert.assertTrue(cmdLine.compareTo(fileA) < 0); + Assert.assertTrue(cmdLine.compareTo(fileB) < 0); + + 
Assert.assertTrue(fileA.compareTo(cmdLine) > 0); + Assert.assertTrue(fileA.compareTo(fileA) == 0); + Assert.assertTrue(fileA.compareTo(fileB) < 0); + + Assert.assertTrue(fileB.compareTo(cmdLine) > 0); + Assert.assertTrue(fileB.compareTo(fileA) > 0); + Assert.assertTrue(fileB.compareTo(fileB) == 0); + } + + @Test(expectedExceptions = NullPointerException.class) + public void testCompareToNull() { + ArgumentMatchSource.COMMAND_LINE.compareTo(null); + } +} diff --git a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java index f04731214..87f0e6ff0 100755 --- a/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/commandline/ParsingEngineUnitTest.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.commandline; +import org.apache.commons.io.FileUtils; import org.broad.tribble.Feature; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -34,6 +35,8 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; +import java.io.File; +import java.io.IOException; import java.util.List; import java.util.EnumSet; /** @@ -493,6 +496,7 @@ public class ParsingEngineUnitTest extends BaseTest { Assert.assertNotNull(definition, "Invalid default argument name assigned"); } + @SuppressWarnings("unused") private class CamelCaseArgProvider { @Argument(doc="my arg") Integer myArg; @@ -507,6 +511,7 @@ public class ParsingEngineUnitTest extends BaseTest { parsingEngine.validate(); } + @SuppressWarnings("unused") private class BooleanArgProvider { @Argument(doc="my bool") boolean myBool; @@ -561,6 +566,7 @@ public class ParsingEngineUnitTest extends BaseTest { parsingEngine.validate(); } + @SuppressWarnings("unused") private 
class MutuallyExclusiveArgProvider { @Argument(doc="foo",exclusiveOf="bar") Integer foo; @@ -618,6 +624,7 @@ public class ParsingEngineUnitTest extends BaseTest { parsingEngine.addArgumentSource( MultipleArgumentCollectionProvider.class ); } + @SuppressWarnings("unused") private class MultipleArgumentCollectionProvider { @ArgumentCollection RequiredArgProvider rap1 = new RequiredArgProvider(); @@ -937,4 +944,23 @@ public class ParsingEngineUnitTest extends BaseTest { VariantContextRodBindingArgProvider argProvider = new VariantContextRodBindingArgProvider(); parsingEngine.loadArgumentsIntoObject( argProvider ); } + + @Test + public void argumentListTest() throws IOException { + File argsFile = BaseTest.createTempFile("args.", ".list"); + try { + FileUtils.write(argsFile, "-I na12878.bam"); + final String[] commandLine = new String[] {"-args", argsFile.getPath()}; + parsingEngine.addArgumentSource(InputFileArgProvider.class); + parsingEngine.parse(commandLine); + parsingEngine.validate(); + + InputFileArgProvider argProvider = new InputFileArgProvider(); + parsingEngine.loadArgumentsIntoObject(argProvider); + + Assert.assertEquals(argProvider.inputFile, "na12878.bam", "Argument is not correctly initialized"); + } finally { + FileUtils.deleteQuietly(argsFile); + } + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index 1e4625bf0..3ce62b697 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -28,24 +28,25 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.util.Interval; import net.sf.picard.util.IntervalList; import net.sf.samtools.SAMFileHeader; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; import 
org.broadinstitute.sting.commandline.ArgumentException; +import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.walkers.PrintReadsWalker; -import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; import java.io.PrintWriter; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.List; @@ -72,7 +73,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); @@ -81,7 +82,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { testEngine.checkForDuplicateSamFiles(); } - @Test(expectedExceptions=ArgumentException.class) + @Test public void testEmptyIntervalSetHandling() throws Exception { GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); @@ -108,31 +109,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { }; } - @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") - 
public void testInvalidRODIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, - String contig, int intervalStart, int intervalEnd ) throws Exception { - - List intervalArgs = new ArrayList(); - List rodIntervals = Arrays.asList(genomeLocParser.createGenomeLoc(contig, intervalStart, intervalEnd, true)); - - testEngine.loadIntervals(intervalArgs, rodIntervals); - } - - @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") - public void testInvalidBedIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, - String contig, int intervalStart, int intervalEnd ) throws Exception { - // We need to adjust intervalStart, since BED intervals are 0-based. We don't need to adjust intervalEnd, - // since the ending point is an open interval. - File bedFile = createTempFile("testInvalidBedIntervalHandling", ".bed", - String.format("%s %d %d", contig, intervalStart -1, intervalEnd)); - - List intervalArgs = Arrays.asList(bedFile.getAbsolutePath()); - List rodIntervals = new ArrayList(); - - testEngine.loadIntervals(intervalArgs, rodIntervals); - } - - @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") + @Test(dataProvider="invalidIntervalTestData") public void testInvalidPicardIntervalHandling(GenomeAnalysisEngine testEngine, GenomeLocParser genomeLocParser, String contig, int intervalStart, int intervalEnd ) throws Exception { @@ -144,10 +121,10 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { File picardIntervalFile = createTempFile("testInvalidPicardIntervalHandling", ".intervals"); picardIntervals.write(picardIntervalFile); - List intervalArgs = Arrays.asList(picardIntervalFile.getAbsolutePath()); - List rodIntervals = new ArrayList(); + List> intervalArgs = new ArrayList>(1); + intervalArgs.add(new IntervalBinding(picardIntervalFile.getAbsolutePath())); - testEngine.loadIntervals(intervalArgs, rodIntervals); + 
testEngine.loadIntervals(intervalArgs, IntervalSetRule.UNION); } @Test(expectedExceptions=UserException.class, dataProvider="invalidIntervalTestData") @@ -157,10 +134,10 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { File gatkIntervalFile = createTempFile("testInvalidGATKFileIntervalHandling", ".intervals", String.format("%s:%d-%d", contig, intervalStart, intervalEnd)); - List intervalArgs = Arrays.asList(gatkIntervalFile.getAbsolutePath()); - List rodIntervals = new ArrayList(); + List> intervalArgs = new ArrayList>(1); + intervalArgs.add(new IntervalBinding(gatkIntervalFile.getAbsolutePath())); - testEngine.loadIntervals(intervalArgs, rodIntervals); + testEngine.loadIntervals(intervalArgs, IntervalSetRule.UNION); } private File createTempFile( String tempFilePrefix, String tempFileExtension, String... lines ) throws Exception { diff --git a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java deleted file mode 100755 index 3a242cb13..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollectionUnitTest.java +++ /dev/null @@ -1,113 +0,0 @@ -package org.broadinstitute.sting.gatk.arguments; - -import org.broadinstitute.sting.BaseTest; -import org.testng.annotations.AfterMethod; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; -import static org.testng.Assert.fail; - -import java.io.File; -import java.util.*; - -import net.sf.samtools.SAMFileReader; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and 
to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/** - * @author aaron - * @version 1.0 - * @date May 7, 2009 - *

    - * Class GATKArgumentCollection - *

    - * Test out the argument collection class - */ -public class GATKArgumentCollectionUnitTest extends BaseTest { - - // our collection of arguments - private GATKArgumentCollection collect; - - // where to write our xml file - private String xmlFileLoc = "testfile.xml"; - - /** setup our test */ - @BeforeMethod - public void setup() { - collect = new GATKArgumentCollection(); - } - - /** destroy the temp file */ - @AfterMethod - public void takedown() { - File f = new File(xmlFileLoc); - if (f.exists()) { - f.delete(); - } - } - - private void setupCollection() { - // parameters and their defaults - Map wArgs = new HashMap(); - wArgs.put("wArgType1", "Arg1"); - wArgs.put("wArgType2", "Arg2"); - wArgs.put("wArgType3", "Arg3"); - collect.walkerArgs = wArgs; - - List input = new ArrayList(); - input.add("test.file"); - collect.samFiles = input; - collect.strictnessLevel = SAMFileReader.ValidationStringency.STRICT; - collect.referenceFile = new File("referenceFile".toLowerCase()); - collect.unsafe = ValidationExclusion.TYPE.ALL; - collect.downsampleFraction = null; - collect.downsampleCoverage = null; - collect.intervals = new ArrayList(); - collect.intervals.add("intervals".toLowerCase()); - collect.excludeIntervals = new ArrayList(); - collect.numberOfThreads = 1; - } - - - /** test the output of an XML file in the arg collection */ - @Test - public void testOutput() { - setupCollection(); - - GATKArgumentCollection.marshal(collect, xmlFileLoc); - GATKArgumentCollection collection = GATKArgumentCollection.unmarshal(xmlFileLoc); - if (!collect.equals(collection)) { - fail("Collections not equal"); - } - } - - - /** test the output of an XML file in the arg collection */ - @Test - public void testInput() { - setupCollection(); - GATKArgumentCollection.marshal(collect, xmlFileLoc); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java index 9807cede4..ecb865f0c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/AllLocusViewUnitTest.java @@ -1,11 +1,10 @@ package org.broadinstitute.sting.gatk.datasources.providers; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import net.sf.samtools.SAMRecord; import java.util.List; /** @@ -38,7 +37,7 @@ public class AllLocusViewUnitTest extends LocusViewTemplate { * @param reads */ @Override - protected void testReadsInContext( LocusView view, List range, List reads ) { + protected void testReadsInContext( LocusView view, List range, List reads ) { AllLocusView allLocusView = (AllLocusView)view; // TODO: Should skip over loci not in the given range. 
@@ -52,7 +51,7 @@ public class AllLocusViewUnitTest extends LocusViewTemplate { Assert.assertEquals(locusContext.getLocation(), site, "Locus context location is incorrect"); int expectedReadsAtSite = 0; - for( SAMRecord read: reads ) { + for( GATKSAMRecord read: reads ) { if(genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) { Assert.assertTrue(locusContext.getReads().contains(read),"Target locus context does not contain reads"); expectedReadsAtSite++; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java index 75716eae6..3a0caef51 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/CoveredLocusViewUnitTest.java @@ -1,11 +1,11 @@ package org.broadinstitute.sting.gatk.datasources.providers; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import net.sf.samtools.SAMRecord; import java.util.List; /** @@ -41,7 +41,7 @@ public class CoveredLocusViewUnitTest extends LocusViewTemplate { * @param reads */ @Override - protected void testReadsInContext( LocusView view, List range, List reads ) { + protected void testReadsInContext( LocusView view, List range, List reads ) { CoveredLocusView coveredLocusView = (CoveredLocusView)view; // TODO: Should skip over loci not in the given range. 
@@ -53,7 +53,7 @@ public class CoveredLocusViewUnitTest extends LocusViewTemplate { GenomeLoc site = genomeLocParser.createGenomeLoc("chr1",i); int expectedReadsAtSite = 0; - for( SAMRecord read: reads ) { + for( GATKSAMRecord read: reads ) { if( genomeLocParser.createGenomeLoc(read).containsP(site) ) expectedReadsAtSite++; } @@ -67,7 +67,7 @@ public class CoveredLocusViewUnitTest extends LocusViewTemplate { Assert.assertEquals(locusContext.getLocation(), site, "Target locus context location is incorrect"); Assert.assertEquals(locusContext.getReads().size(), expectedReadsAtSite, "Found wrong number of reads at site"); - for( SAMRecord read: reads ) { + for( GATKSAMRecord read: reads ) { if(genomeLocParser.createGenomeLoc(read).containsP(locusContext.getLocation())) Assert.assertTrue(locusContext.getReads().contains(read),"Target locus context does not contain reads"); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index e5cf80826..8d7dd82ac 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -8,13 +8,12 @@ import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.reads.LocusShard; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.utils.GenomeLoc; import 
org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -51,23 +50,23 @@ public abstract class LocusViewTemplate extends BaseTest { GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); - testReadsInContext(view, shard.getGenomeLocs(), Collections.emptyList()); + testReadsInContext(view, shard.getGenomeLocs(), Collections.emptyList()); } @Test public void singleReadTest() { - SAMRecord read = buildSAMRecord("chr1", 1, 5); + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); SAMRecordIterator iterator = new SAMRecordIterator(read); GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(shardBounds)); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); @@ -78,11 +77,11 @@ public abstract class LocusViewTemplate extends BaseTest { 
@Test public void readCoveringFirstPartTest() { - SAMRecord read = buildSAMRecord("chr1", 1, 5); + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -92,11 +91,11 @@ public abstract class LocusViewTemplate extends BaseTest { @Test public void readCoveringLastPartTest() { - SAMRecord read = buildSAMRecord("chr1", 6, 10); + GATKSAMRecord read = buildSAMRecord("read1","chr1", 6, 10); SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -106,11 +105,11 @@ public abstract class LocusViewTemplate extends BaseTest { @Test public void readCoveringMiddleTest() { - SAMRecord read = buildSAMRecord("chr1", 3, 7); + GATKSAMRecord read = buildSAMRecord("read1","chr1", 3, 7); SAMRecordIterator iterator = new 
SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -120,11 +119,11 @@ public abstract class LocusViewTemplate extends BaseTest { @Test public void readAndLocusOverlapAtLastBase() { - SAMRecord read = buildSAMRecord("chr1", 1, 5); + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 5); SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 5, 5))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -134,11 +133,11 @@ public abstract class LocusViewTemplate extends BaseTest { @Test public void readOverlappingStartTest() { - SAMRecord read = buildSAMRecord("chr1", 1, 10); + GATKSAMRecord read = buildSAMRecord("read1","chr1", 1, 10); SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); - WindowMaker windowMaker = new 
WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -148,11 +147,11 @@ public abstract class LocusViewTemplate extends BaseTest { @Test public void readOverlappingEndTest() { - SAMRecord read = buildSAMRecord("chr1", 6, 15); + GATKSAMRecord read = buildSAMRecord("read1","chr1", 6, 15); SAMRecordIterator iterator = new SAMRecordIterator(read); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); @@ -162,104 +161,104 @@ public abstract class LocusViewTemplate extends BaseTest { @Test public void readsSpanningTest() { - SAMRecord read1 = buildSAMRecord("chr1", 1, 5); - SAMRecord read2 = buildSAMRecord("chr1", 6, 10); + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 6, 10); SAMRecordIterator iterator = new SAMRecordIterator(read1, read2); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new 
SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); - List expectedReads = new ArrayList(); + List expectedReads = new ArrayList(); Collections.addAll(expectedReads, read1, read2); testReadsInContext(view, shard.getGenomeLocs(), expectedReads); } @Test public void duplicateReadsTest() { - SAMRecord read1 = buildSAMRecord("chr1", 1, 5); - SAMRecord read2 = buildSAMRecord("chr1", 1, 5); - SAMRecord read3 = buildSAMRecord("chr1", 6, 10); - SAMRecord read4 = buildSAMRecord("chr1", 6, 10); + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 1, 5); + GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 6, 10); + GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 6, 10); SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); - List expectedReads = new ArrayList(); + List expectedReads = new ArrayList(); Collections.addAll(expectedReads, read1, read2, read3, read4); testReadsInContext(view, shard.getGenomeLocs(), expectedReads); } @Test public void 
cascadingReadsWithinBoundsTest() { - SAMRecord read1 = buildSAMRecord("chr1", 2, 6); - SAMRecord read2 = buildSAMRecord("chr1", 3, 7); - SAMRecord read3 = buildSAMRecord("chr1", 4, 8); - SAMRecord read4 = buildSAMRecord("chr1", 5, 9); + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 2, 6); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 3, 7); + GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 4, 8); + GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 5, 9); SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); - List expectedReads = new ArrayList(); + List expectedReads = new ArrayList(); Collections.addAll(expectedReads, read1, read2, read3, read4); testReadsInContext(view, shard.getGenomeLocs(), expectedReads); } @Test public void cascadingReadsAtBoundsTest() { - SAMRecord read1 = buildSAMRecord("chr1", 1, 5); - SAMRecord read2 = buildSAMRecord("chr1", 2, 6); - SAMRecord read3 = buildSAMRecord("chr1", 3, 7); - SAMRecord read4 = buildSAMRecord("chr1", 4, 8); - SAMRecord read5 = buildSAMRecord("chr1", 5, 9); - SAMRecord read6 = buildSAMRecord("chr1", 6, 10); + GATKSAMRecord read1 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read2 = buildSAMRecord("read2","chr1", 2, 6); + GATKSAMRecord read3 = buildSAMRecord("read3","chr1", 3, 7); + GATKSAMRecord read4 = buildSAMRecord("read4","chr1", 4, 8); + GATKSAMRecord read5 = 
buildSAMRecord("read5","chr1", 5, 9); + GATKSAMRecord read6 = buildSAMRecord("read6","chr1", 6, 10); SAMRecordIterator iterator = new SAMRecordIterator(read1, read2, read3, read4, read5, read6); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 1, 10))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); - List expectedReads = new ArrayList(); + List expectedReads = new ArrayList(); Collections.addAll(expectedReads, read1, read2, read3, read4, read5, read6); testReadsInContext(view, shard.getGenomeLocs(), expectedReads); } @Test public void cascadingReadsOverlappingBoundsTest() { - SAMRecord read01 = buildSAMRecord("chr1", 1, 5); - SAMRecord read02 = buildSAMRecord("chr1", 2, 6); - SAMRecord read03 = buildSAMRecord("chr1", 3, 7); - SAMRecord read04 = buildSAMRecord("chr1", 4, 8); - SAMRecord read05 = buildSAMRecord("chr1", 5, 9); - SAMRecord read06 = buildSAMRecord("chr1", 6, 10); - SAMRecord read07 = buildSAMRecord("chr1", 7, 11); - SAMRecord read08 = buildSAMRecord("chr1", 8, 12); - SAMRecord read09 = buildSAMRecord("chr1", 9, 13); - SAMRecord read10 = buildSAMRecord("chr1", 10, 14); - SAMRecord read11 = buildSAMRecord("chr1", 11, 15); - SAMRecord read12 = buildSAMRecord("chr1", 12, 16); + GATKSAMRecord read01 = buildSAMRecord("read1","chr1", 1, 5); + GATKSAMRecord read02 = buildSAMRecord("read2","chr1", 2, 6); + GATKSAMRecord read03 = buildSAMRecord("read3","chr1", 3, 7); + GATKSAMRecord read04 = buildSAMRecord("read4","chr1", 4, 8); + GATKSAMRecord read05 = 
buildSAMRecord("read5","chr1", 5, 9); + GATKSAMRecord read06 = buildSAMRecord("read6","chr1", 6, 10); + GATKSAMRecord read07 = buildSAMRecord("read7","chr1", 7, 11); + GATKSAMRecord read08 = buildSAMRecord("read8","chr1", 8, 12); + GATKSAMRecord read09 = buildSAMRecord("read9","chr1", 9, 13); + GATKSAMRecord read10 = buildSAMRecord("read10","chr1", 10, 14); + GATKSAMRecord read11 = buildSAMRecord("read11","chr1", 11, 15); + GATKSAMRecord read12 = buildSAMRecord("read12","chr1", 12, 16); SAMRecordIterator iterator = new SAMRecordIterator(read01, read02, read03, read04, read05, read06, read07, read08, read09, read10, read11, read12); Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc("chr1", 6, 15))); - WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(), new SampleDataSource()); + WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, window.getSourceInfo(), genomeLocParser, window.getLocus(), window, null, null); LocusView view = createView(dataProvider); - List expectedReads = new ArrayList(); + List expectedReads = new ArrayList(); Collections.addAll(expectedReads, read01, read02, read03, read04, read05, read06, read07, read08, read09, read10, read11, read12); testReadsInContext(view, shard.getGenomeLocs(), expectedReads); @@ -279,7 +278,7 @@ public abstract class LocusViewTemplate extends BaseTest { * @param bounds * @param reads */ - protected abstract void testReadsInContext(LocusView view, List bounds, List reads); + protected abstract void testReadsInContext(LocusView view, List bounds, List reads); /** * Fake a reference sequence file. Essentially, seek a header with a bunch of dummy data. 
@@ -323,12 +322,13 @@ public abstract class LocusViewTemplate extends BaseTest { * * @return New SAM Record */ - protected SAMRecord buildSAMRecord(String contig, int alignmentStart, int alignmentEnd) { + protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { SAMFileHeader header = new SAMFileHeader(); header.setSequenceDictionary(sequenceSourceFile.getSequenceDictionary()); - SAMRecord record = new SAMRecord(header); + GATKSAMRecord record = new GATKSAMRecord(header); + record.setReadName(readName); record.setReferenceIndex(sequenceSourceFile.getSequenceDictionary().getSequenceIndex(contig)); record.setAlignmentStart(alignmentStart); Cigar cigar = new Cigar(); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 2ecd75754..5ee373e4f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.datasources.reads; import com.google.caliper.Param; import net.sf.picard.filter.FilteringIterator; -import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; @@ -34,15 +33,12 @@ import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.filters.ReadFilter; import 
org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.baq.BAQ; -import java.io.File; import java.util.Collections; import java.util.Iterator; @@ -88,12 +84,9 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { (byte)0); GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); - SampleDataSource sampleDataSource = new SampleDataSource(); - sampleDataSource.addSamplesFromSAMHeader(reader.getFileHeader()); - // Filter unmapped reads. TODO: is this always strictly necessary? Who in the GATK normally filters these out? Iterator readIterator = new FilteringIterator(reader.iterator(),new UnmappedReadFilter()); - LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser,sampleDataSource); + LocusIteratorByState locusIteratorByState = new LocusIteratorByState(readIterator,readProperties,genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); while(locusIteratorByState.hasNext()) { locusIteratorByState.next().getLocation(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java index 31458f835..564d1e2a3 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java @@ -25,13 +25,10 @@ package org.broadinstitute.sting.gatk.datasources.reads; import com.google.caliper.Param; -import net.sf.picard.filter.SamRecordFilter; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import 
org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; @@ -41,9 +38,9 @@ import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.qc.CountLociWalker; import org.broadinstitute.sting.gatk.walkers.qc.CountReadsWalker; import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; -import java.lang.reflect.Field; import java.util.Collections; /** @@ -126,7 +123,7 @@ class CountBasesInReadPerformanceWalker extends ReadWalker { private long Gs; private long Ts; - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { for(byte base: read.getReadBases()) { switch(base) { case 'A': As++; break; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java deleted file mode 100644 index 59405c065..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleDataSourceUnitTest.java +++ /dev/null @@ -1,241 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -import net.sf.samtools.SAMFileHeader; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.StingException; - 
-import org.testng.annotations.Test; - -import java.io.File; -import java.util.*; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Sep 9, 2010 - * Time: 8:21:00 AM - */ -public class SampleDataSourceUnitTest extends BaseTest { - - // this empty header used to instantiate sampledatasource objects - private static SAMFileHeader header = new SAMFileHeader(); - - // all the test sample files are located here - private String sampleFilesDir = validationDataLocation + "samples/"; - - // make sure samples are created from the SAM file correctly - @Test() - public void loadSAMSamplesTest() { - SampleDataSource s = new SampleDataSource(header, null); - } - - // tests that a basic sample with relationships loads correctly - // Note that this is the only test for family relationships - we may want to expand this - @Test() - public void basicLoadSampleFileTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Assert.assertTrue(s.sampleCount() == 5); - Sample sampleA = s.getSampleById("sampleA"); - Sample sampleB = s.getSampleById("sampleB"); - Assert.assertTrue(sampleB.getMother() == sampleA); - Assert.assertTrue(s.getChildren(sampleA).contains(sampleB)); - Set family = s.getFamily("family1"); - Assert.assertTrue(family.size() == 2); - Assert.assertTrue(family.contains(sampleA)); - Assert.assertTrue(family.contains(sampleB)); - } - - // but that file should fail if it has an extra character in it... - @Test(expectedExceptions=StingException.class) - public void loadInvalidSampleExtraCharText() { - File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraChar.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // ...or a typo... 
- @Test(expectedExceptions=StingException.class) - public void loadInvalidSampleTypoText() { - File sampleFile = new File(sampleFilesDir + "invalidSyntaxTypo.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - - } - - // ...or an extra unrecognized array - @Test(expectedExceptions=StingException.class) - public void loadInvalidSampleExtraArrayText() { - File sampleFile = new File(sampleFilesDir + "invalidSyntaxExtraArray.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // make sure aliases work - @Test(expectedExceptions=StingException.class) - public void sampleAliasText() { - File sampleFile = new File(sampleFilesDir + "basicSampleFileWithAlias.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - // this file has two samples, but one has an alias. let's make sure that checks out... - Assert.assertTrue(s.sampleCount() == 3); - Assert.assertTrue(s.getSampleById("sampleA") == s.getSampleById("sampleC")); - } - - // error is thrown if property is included that's not in properties array - @Test(expectedExceptions=StingException.class) - public void unallowedPropertySampleTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedProperty.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // same as above, with relationship - @Test(expectedExceptions=StingException.class) - public void unallowedRelationshipSampleTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFileUnallowedRelationship.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - } - - // two sample files - @Test() - public void twoSampleFilesTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - File secondFile = new File(sampleFilesDir + "basicSampleFileExt.yaml"); - ArrayList files = new ArrayList(); - files.add(sampleFile); - 
files.add(secondFile); - SampleDataSource s = new SampleDataSource(header, files); - Assert.assertTrue(s.getSampleById("sampleA").getProperty("propC").equals("valC")); - Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA")); - } - - // two sample files, with contradictory properties - @Test(expectedExceptions=StingException.class) - public void twoContradictorySampleFilesTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - File secondFile = new File(sampleFilesDir + "basicSampleFileInvalidExt.yaml"); - ArrayList files = new ArrayList(); - files.add(sampleFile); - files.add(secondFile); - SampleDataSource s = new SampleDataSource(header, files); - } - - // three sample files - @Test() - public void threeSamplesTest() { - File sampleFile = new File(sampleFilesDir + "basicSampleFile.yaml"); - ArrayList files = new ArrayList(); - files.add(sampleFile); - files.add(new File(sampleFilesDir + "basicSampleFileExt.yaml")); - files.add(new File(sampleFilesDir + "basicSampleFileExt2.yaml")); - SampleDataSource s = new SampleDataSource(header, files); - Assert.assertTrue(s.sampleCount() == 6); - Assert.assertTrue(s.getSampleById("sampleE").getProperty("propC").equals("valC")); - Assert.assertTrue(s.getSampleById("sampleA").getProperty("propA").equals("valA")); - } - - /** - * testing getSamplesWithProperty - * in this file there are 5 samples - 2 with population "CEU", 1 with population "ABC", 1 with no population, - * and then the default null sample - */ - @Test() - public void getSamplesWithPropertyTest() { - File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Assert.assertTrue(s.sampleCount() == 5); - Set ceuSamples = s.getSamplesWithProperty("population", "CEU"); - Assert.assertTrue(ceuSamples.size() == 2); - - Iterator i = ceuSamples.iterator(); - ArrayList sampleNames = new ArrayList(); - 
sampleNames.add(i.next().getId()); - sampleNames.add(i.next().getId()); - Assert.assertTrue(sampleNames.contains("sampleA")); - Assert.assertTrue(sampleNames.contains("sampleB")); - } - - // make sure we can import data types other than Strings - @Test() - public void sampleTestPropertyType() { - File sampleFile = new File(sampleFilesDir + "sampleFileOtherTypes.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Sample sample = s.getSampleById("sampleA"); - Assert.assertTrue(sample.getProperty("a").getClass() == Integer.class); - Assert.assertTrue(sample.getProperty("b").getClass() == String.class); - Assert.assertTrue(sample.getProperty("c").getClass() == Double.class); - Assert.assertTrue(sample.getProperty("b").getClass() == String.class); - } - - /** - * check that getSamplesFromVariantContext works - * create a variant context with two sample names, and make sure the right samples are there - */ - @Test() - public void variantContextTest() { - SampleDataSource s = new SampleDataSource(header, null); - List alleleCollection = new ArrayList(); - Allele a1 = Allele.create("A", true); - alleleCollection.add(a1); - - Set genotypeCollection = new HashSet(); - genotypeCollection.add(new Genotype("NA123", alleleCollection)); - genotypeCollection.add(new Genotype("NA456", alleleCollection)); - - VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection); - - // make sure the set that's returned is the right size - HashSet set = (HashSet) s.getSamplesByVariantContext(v); - Assert.assertTrue(set.size() == 2); - - // make sure both samples are included - Iterator i = set.iterator(); - ArrayList sampleNames = new ArrayList(); - sampleNames.add(i.next().getId()); - sampleNames.add(i.next().getId()); - Assert.assertTrue(sampleNames.contains("NA123")); - Assert.assertTrue(sampleNames.contains("NA456")); - } - - /** - * checking subContextFromSampleProperty - */ - - /** - * check that 
subContextFromSampleProperty works - * create a variant context with four sample names, make sure that it filters correctly to 2 - */ - @Test() - public void subContextFromSamplePropertyTest() { - - File sampleFile = new File(sampleFilesDir + "sampleFileWithProperties.yaml"); - SampleDataSource s = new SampleDataSource(header, makeFileList(sampleFile)); - Assert.assertTrue(s.sampleCount() == 5); - - List alleleCollection = new ArrayList(); - Allele a1 = Allele.create("A", true); - alleleCollection.add(a1); - - Set genotypeCollection = new HashSet(); - genotypeCollection.add(new Genotype("NA123", alleleCollection)); - genotypeCollection.add(new Genotype("sampleA", alleleCollection)); - genotypeCollection.add(new Genotype("sampleB", alleleCollection)); - genotypeCollection.add(new Genotype("sampleC", alleleCollection)); - - VariantContext v = new VariantContext("contextName", "chr1", 1, 1, alleleCollection, genotypeCollection); - VariantContext subContext = s.subContextFromSampleProperty(v, "population", "CEU"); - - Assert.assertTrue(subContext.getSampleNames().contains("sampleA")); - Assert.assertTrue(subContext.getSampleNames().contains("sampleA")); - Assert.assertTrue(subContext.getSampleNames().size() == 2); - - } - - - // we create lots of single item lists... 
- private ArrayList makeFileList(File file) { - ArrayList a = new ArrayList(); - a.add(file); - return a; - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java deleted file mode 100644 index 67e84cdd8..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/sample/SampleUnitTest.java +++ /dev/null @@ -1,64 +0,0 @@ -package org.broadinstitute.sting.gatk.datasources.sample; - -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -/** - * Created by IntelliJ IDEA. - * User: brett - * Date: Sep 9, 2010 - * Time: 8:21:00 AM - */ -public class SampleUnitTest extends BaseTest { - - static Sample sampleA; - static Sample sampleA1; - static Sample sampleB; - static Sample sampleC; - - @BeforeClass - public void init() { - sampleA = new Sample("sampleA"); - sampleA.setProperty("uniqueProperty", "uniqueValue"); - sampleA1 = new Sample("sampleA"); - sampleA1.setProperty("uniqueProperty", "uniqueValue"); - sampleB = new Sample("sampleB"); - sampleC = new Sample("sampleC"); - sampleC.setProperty("population", "pop1"); - sampleC.setProperty("gender", Sample.Gender.MALE); - } - - /** - * Testing equality - */ - @Test() - public void equalsTest() { - Assert.assertTrue(sampleA.equals(sampleA1)); - Assert.assertFalse(sampleA == sampleA1); - Assert.assertFalse(sampleA.equals(sampleB)); - } - - /** - * And hash - */ - @Test() - public void basicHashTest() { - Assert.assertFalse(sampleA.hashCode() == sampleB.hashCode()); - Assert.assertTrue(sampleA.hashCode() == sampleA1.hashCode()); - } - - /** - * Now test the special getter methods - */ - @Test() - public void specialGettersTest() { - Assert.assertTrue(sampleC.getId().equals("sampleC")); - Assert.assertTrue(sampleC.getPopulation().equals("pop1")); - 
Assert.assertTrue(sampleC.isMale()); - Assert.assertFalse(sampleA.isMale()); // sample A doesn't have a gender, so this should be false - } - -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index 32d3675b7..c9727d904 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -1,6 +1,5 @@ package org.broadinstitute.sting.gatk.iterators; -import net.sf.picard.filter.SamRecordFilter; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; @@ -8,11 +7,11 @@ import net.sf.samtools.util.CloseableIterator; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.sample.SampleDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -29,11 +28,8 @@ import java.util.*; * testing of the LocusIteratorByState */ public class LocusIteratorByStateUnitTest extends BaseTest { - - private final int MAX_READS = 10; private static SAMFileHeader header; private LocusIteratorByState li; - private GenomeLocParser genomeLocParser; @BeforeClass @@ -42,6 +38,10 @@ public class LocusIteratorByStateUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } + private 
final LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); + } + @Test public void testIndelBaseQualityFiltering() { final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; @@ -68,7 +68,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(before,during,after); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser, new SampleDataSource()); + li = makeLTBS(reads,readAttributes); boolean foundExtendedEventPileup = false; while (li.hasNext()) { @@ -78,7 +78,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { ReadBackedExtendedEventPileup pileup = context.getExtendedEventPileup().getBaseFilteredPileup(10); Assert.assertEquals(pileup.getLocation().getStart(), 5, "Extended event pileup at wrong location"); - Assert.assertEquals(pileup.size(), 3, "Pileup size is incorrect"); + Assert.assertEquals(pileup.getNumberOfElements(), 3, "Pileup size is incorrect"); foundExtendedEventPileup = true; } @@ -120,7 +120,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(before,during,after); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser, new SampleDataSource()); + li = makeLTBS(reads,readAttributes); boolean foundExtendedEventPileup = false; while (li.hasNext()) { @@ -154,7 +154,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(indelOnlyRead); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new 
FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser,new SampleDataSource()); + li = makeLTBS(reads, readAttributes); // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read // and considers it to be an indel-containing read. @@ -167,7 +167,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { // Turn on extended events, and make sure the event is found. JVMUtils.setFieldValue(JVMUtils.findField(ReadProperties.class,"generateExtendedEvents"),readAttributes,true); - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser,new SampleDataSource()); + li = makeLTBS(reads, readAttributes); Assert.assertTrue(li.hasNext(),"LocusIteratorByState with extended events should contain exactly one pileup"); alignmentContext = li.next(); @@ -203,7 +203,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),createTestReadProperties(),genomeLocParser,new SampleDataSource()); + li = makeLTBS(reads, createTestReadProperties()); int currentLocus = firstLocus; int numAlignmentContextsFound = 0; @@ -212,12 +212,12 @@ public class LocusIteratorByStateUnitTest extends BaseTest { Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect"); if(currentLocus == firstLocus) { - List readsAtLocus = alignmentContext.getBasePileup().getReads(); + List readsAtLocus = alignmentContext.getBasePileup().getReads(); Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus); } else if(currentLocus == 
secondLocus) { - List readsAtLocus = alignmentContext.getBasePileup().getReads(); + List readsAtLocus = alignmentContext.getBasePileup().getReads(); Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus); Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus); Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus); @@ -260,12 +260,12 @@ public class LocusIteratorByStateUnitTest extends BaseTest { List reads = Arrays.asList(leadingRead,indelOnlyRead,fullMatchAfterIndel); // create the iterator by state with the fake reads and fake records - li = new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()),readAttributes,genomeLocParser,new SampleDataSource()); + li = makeLTBS(reads,readAttributes); Assert.assertTrue(li.hasNext(),"Missing first locus at " + firstLocus); AlignmentContext alignmentContext = li.next(); Assert.assertEquals(alignmentContext.getLocation().getStart(),firstLocus,"Incorrect locus at this position; should be " + firstLocus); - List readsAtLocus = alignmentContext.getBasePileup().getReads(); + List readsAtLocus = alignmentContext.getBasePileup().getReads(); Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + firstLocus); Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + firstLocus); diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java index bae8e99ed..e8799e2ab 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManagerUnitTest.java @@ -56,6 +56,7 @@ public class FeatureManagerUnitTest extends BaseTest { private static final File VCF3_FILE = 
new File(validationDataLocation + "vcfexample3.vcf"); private static final File VCF4_FILE = new File(testDir + "HiSeq.10000.vcf"); private static final File VCF4_FILE_GZ = new File(testDir + "HiSeq.10000.vcf.gz"); + private static final File VCF4_FILE_BGZIP = new File(testDir + "HiSeq.10000.bgzip.vcf.gz"); private FeatureManager manager; private GenomeLocParser genomeLocParser; @@ -109,6 +110,7 @@ public class FeatureManagerUnitTest extends BaseTest { new FMTest(VariantContext.class, VCF3Codec.class, "VCF3", VCF3_FILE); new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE); new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE_GZ); + new FMTest(VariantContext.class, VCFCodec.class, "VCF", VCF4_FILE_BGZIP); new FMTest(TableFeature.class, BedTableCodec.class, "bedtable", null); return FMTest.getTests(FMTest.class); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java index ae218e898..724c343e4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/tracks/RMDTrackBuilderUnitTest.java @@ -29,7 +29,6 @@ import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; -import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.utils.codecs.vcf.VCF3Codec; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -45,7 +44,6 @@ import org.testng.annotations.Test; import java.io.*; import java.nio.channels.FileChannel; -import java.util.Map; /** @@ -164,7 +162,7 @@ public class RMDTrackBuilderUnitTest extends BaseTest { try { Index idx = builder.loadIndex(vcfFile, new VCFCodec()); 
// catch any exception; this call should pass correctly - SAMSequenceDictionary dict = RMDTrackBuilder.getSequenceDictionaryFromProperties(idx); + SAMSequenceDictionary dict = IndexDictionaryUtils.getSequenceDictionaryFromProperties(idx); } catch (IOException e) { e.printStackTrace(); Assert.fail("IO exception unexpected" + e.getMessage()); diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java new file mode 100644 index 000000000..1601845cd --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/PedReaderUnitTest.java @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.samples; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.StringReader; +import java.util.*; + +/** + * UnitTest for PedReader + * + * @author Mark DePristo + * @since 2011 + */ +public class PedReaderUnitTest extends BaseTest { + private static Logger logger = Logger.getLogger(PedReaderUnitTest.class); + + private class PedReaderTest extends TestDataProvider { + public String fileContents; + public List expectedSamples; + EnumSet missing; + + private PedReaderTest(final String name, final List expectedSamples, final String fileContents) { + super(PedReaderTest.class, name); + this.fileContents = fileContents; + this.expectedSamples = expectedSamples; + } + } + +// Family ID +// Individual ID +// Paternal ID +// Maternal ID +// Sex (1=male; 2=female; other=unknown) +// Phenotype +// +// -9 missing +// 0 missing +// 1 unaffected +// 2 affected + + @DataProvider(name = "readerTest") + public Object[][] createPEDFiles() { + new PedReaderTest("singleRecordMale", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED)), + "fam1 kid 0 0 1 1"); + + new PedReaderTest("singleRecordFemale", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.FEMALE, Affection.UNAFFECTED)), + "fam1 kid 0 0 2 1"); + + new PedReaderTest("singleRecordMissingGender", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.UNKNOWN, Affection.UNKNOWN)), + "fam1 kid 0 0 0 0"); + + // Affection + new PedReaderTest("singleRecordAffected", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.AFFECTED)), + "fam1 kid 0 0 1 2"); + + new PedReaderTest("singleRecordUnaffected", + Arrays.asList(new Sample("kid", "fam1", null, null, 
Gender.MALE, Affection.UNAFFECTED)), + "fam1 kid 0 0 1 1"); + + new PedReaderTest("singleRecordMissingAffection-9", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNKNOWN)), + "fam1 kid 0 0 1 -9"); + + new PedReaderTest("singleRecordMissingAffection0", + Arrays.asList(new Sample("kid", "fam1", null, null, Gender.MALE, Affection.UNKNOWN)), + "fam1 kid 0 0 1 0"); + + new PedReaderTest("multipleUnrelated", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s%n%s", + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 2")); + + new PedReaderTest("multipleUnrelatedExtraLine", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s%n%s%n %n", // note extra newlines and whitespace + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 2")); + + new PedReaderTest("explicitTrio", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED)), + String.format("%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2")); + + new PedReaderTest("implicitTrio", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + "fam1 kid dad mom 1 2"); + + new PedReaderTest("partialTrio", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + 
String.format("%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1")); + + new PedReaderTest("bigPedigree", + Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", "granddad1", "grandma1", Gender.MALE, Affection.UNAFFECTED), + new Sample("granddad1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("grandma1", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN), + new Sample("mom", "fam1", "granddad2", "grandma2", Gender.FEMALE, Affection.AFFECTED), + new Sample("granddad2", "fam1", null, null, Gender.MALE, Affection.UNKNOWN), + new Sample("grandma2", "fam1", null, null, Gender.FEMALE, Affection.UNKNOWN)), + String.format("%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad granddad1 grandma1 1 1", + "fam1 mom granddad2 grandma2 2 2")); + + // Quantitative trait + new PedReaderTest("OtherPhenotype", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.OTHER, "1"), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10.0")), + String.format("%s%n%s", + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 10.0")); + + new PedReaderTest("OtherPhenotypeWithMissing", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.UNKNOWN, Sample.UNSET_QT), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10.0")), + String.format("%s%n%s", + "fam1 s1 0 0 1 -9", + "fam2 s2 0 0 2 10.0")); + + new PedReaderTest("OtherPhenotypeOnlyInts", + Arrays.asList( + new Sample("s1", "fam1", null, null, Gender.MALE, Affection.OTHER, "1"), + new Sample("s2", "fam2", null, null, Gender.FEMALE, Affection.OTHER, "10")), + String.format("%s%n%s", + "fam1 s1 0 0 1 1", + "fam2 s2 0 0 2 10")); + + return PedReaderTest.getTests(PedReaderTest.class); + } + + private static final void runTest(PedReaderTest test, String myFileContents, EnumSet missing) { + logger.warn("Test " + test); + PedReader reader = new PedReader(); + SampleDB sampleDB = new 
SampleDB(); + List readSamples = reader.parse(myFileContents, missing, sampleDB); + Assert.assertEquals(new HashSet(test.expectedSamples), new HashSet(readSamples)); + } + + @Test(enabled = true, dataProvider = "readerTest") + public void testPedReader(PedReaderTest test) { + runTest(test, test.fileContents, EnumSet.noneOf(PedReader.MissingPedField.class)); + } + + @Test(enabled = true, dataProvider = "readerTest") + public void testPedReaderWithComments(PedReaderTest test) { + runTest(test, String.format("#comment%n%s", test.fileContents), EnumSet.noneOf(PedReader.MissingPedField.class)); + } + + @Test(enabled = true, dataProvider = "readerTest") + public void testPedReaderWithSemicolons(PedReaderTest test) { + runTest(test, + test.fileContents.replace(String.format("%n"), ";"), + EnumSet.noneOf(PedReader.MissingPedField.class)); + } + + // ----------------------------------------------------------------- + // missing format field tests + // ----------------------------------------------------------------- + + private class PedReaderTestMissing extends TestDataProvider { + public EnumSet missingDesc; + public EnumSet missingFields; + public final String fileContents; + public Sample expected; + + + private PedReaderTestMissing(final String name, final String fileContents, + EnumSet missingDesc, + EnumSet missingFields, + final Sample expected) { + super(PedReaderTestMissing.class, name); + this.fileContents = fileContents; + this.missingDesc = missingDesc; + this.missingFields = missingFields; + this.expected = expected; + } + } + + @DataProvider(name = "readerTestMissing") + public Object[][] createPEDFilesWithMissing() { + new PedReaderTestMissing("missingFam", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID), + EnumSet.of(PedReader.Field.FAMILY_ID), + new Sample("kid", null, "dad", "mom", Gender.MALE, Affection.AFFECTED)); + + new PedReaderTestMissing("missingParents", + "fam1 kid dad mom 1 2", + 
EnumSet.of(PedReader.MissingPedField.NO_PARENTS), + EnumSet.of(PedReader.Field.PATERNAL_ID, PedReader.Field.MATERNAL_ID), + new Sample("kid", "fam1", null, null, Gender.MALE, Affection.AFFECTED)); + + new PedReaderTestMissing("missingSex", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_SEX), + EnumSet.of(PedReader.Field.GENDER), + new Sample("kid", "fam1", "dad", "mom", Gender.UNKNOWN, Affection.AFFECTED)); + + new PedReaderTestMissing("missingPhenotype", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE), + EnumSet.of(PedReader.Field.PHENOTYPE), + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.UNKNOWN)); + + new PedReaderTestMissing("missingEverythingButGender", + "fam1 kid dad mom 1 2", + EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE, PedReader.MissingPedField.NO_PARENTS, PedReader.MissingPedField.NO_FAMILY_ID), + EnumSet.of(PedReader.Field.FAMILY_ID, PedReader.Field.PATERNAL_ID, PedReader.Field.MATERNAL_ID, PedReader.Field.PHENOTYPE), + new Sample("kid", null, null, null, Gender.MALE, Affection.UNKNOWN)); + + + return PedReaderTestMissing.getTests(PedReaderTestMissing.class); + } + + @Test(enabled = true, dataProvider = "readerTestMissing") + public void testPedReaderWithMissing(PedReaderTestMissing test) { + final String contents = sliceContents(test.missingFields, test.fileContents); + logger.warn("Test " + test); + PedReader reader = new PedReader(); + SampleDB sampleDB = new SampleDB(); + reader.parse(new StringReader(contents), test.missingDesc, sampleDB); + final Sample missingSample = sampleDB.getSample("kid"); + Assert.assertEquals(test.expected, missingSample, "Missing field value not expected value for " + test); + } + + private final static String sliceContents(EnumSet missingFieldsSet, String full) { + List parts = new ArrayList(Arrays.asList(full.split("\\s+"))); + final List missingFields = new ArrayList(missingFieldsSet); + Collections.reverse(missingFields); + for ( 
PedReader.Field field : missingFields ) + parts.remove(field.ordinal()); + return Utils.join("\t", parts); + } + + // ----------------------------------------------------------------- + // parsing tags + // ----------------------------------------------------------------- + + private class PedReaderTestTagParsing extends TestDataProvider { + public EnumSet expected; + public final List tags; + + private PedReaderTestTagParsing(final List tags, EnumSet missingDesc) { + super(PedReaderTestTagParsing.class); + this.tags = tags; + this.expected = missingDesc; + } + } + + @DataProvider(name = "readerTestTagParsing") + public Object[][] createReaderTestTagParsing() { + new PedReaderTestTagParsing( + Collections.emptyList(), + EnumSet.noneOf(PedReader.MissingPedField.class)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_FAMILY_ID"), + EnumSet.of(PedReader.MissingPedField.NO_FAMILY_ID)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_PARENTS"), + EnumSet.of(PedReader.MissingPedField.NO_PARENTS)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_PHENOTYPE"), + EnumSet.of(PedReader.MissingPedField.NO_PHENOTYPE)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX"), + EnumSet.of(PedReader.MissingPedField.NO_SEX)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX", "NO_PHENOTYPE"), + EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE)); + + new PedReaderTestTagParsing( + Arrays.asList("NO_SEX", "NO_PHENOTYPE", "NO_PARENTS"), + EnumSet.of(PedReader.MissingPedField.NO_SEX, PedReader.MissingPedField.NO_PHENOTYPE, PedReader.MissingPedField.NO_PARENTS)); + + return PedReaderTestTagParsing.getTests(PedReaderTestTagParsing.class); + } + + @Test(enabled = true, dataProvider = "readerTestTagParsing") + public void testPedReaderTagParsing(PedReaderTestTagParsing test) { + EnumSet parsed = PedReader.parseMissingFieldTags("test", test.tags); + Assert.assertEquals(test.expected, parsed, "Failed to properly parse tags " + 
test.tags); + } + + @Test(enabled = true, expectedExceptions = UserException.class) + public void testPedReaderTagParsing1() { + EnumSet parsed = PedReader.parseMissingFieldTags("test", Arrays.asList("XXX")); + } + + @Test(enabled = true, expectedExceptions = UserException.class) + public void testPedReaderTagParsing2() { + EnumSet parsed = PedReader.parseMissingFieldTags("test", Arrays.asList("NO_SEX", "XXX")); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java new file mode 100644 index 000000000..d498ee61a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleDBUnitTest.java @@ -0,0 +1,157 @@ +package org.broadinstitute.sting.gatk.samples; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.*; + +/** + * Created by IntelliJ IDEA. 
+ * User: brett + * Date: Sep 9, 2010 + * Time: 8:21:00 AM + */ +public class SampleDBUnitTest extends BaseTest { + private static SampleDBBuilder builder; + // all the test sample files are located here + private File testPED = new File(testDir + "ceutrio.ped"); + + private static final Set testPEDSamples = new HashSet(Arrays.asList( + new Sample("kid", "fam1", "dad", "mom", Gender.MALE, Affection.AFFECTED), + new Sample("dad", "fam1", null, null, Gender.MALE, Affection.UNAFFECTED), + new Sample("mom", "fam1", null, null, Gender.FEMALE, Affection.AFFECTED))); + + private static final Set testSAMSamples = new HashSet(Arrays.asList( + new Sample("kid", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), + new Sample("mom", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN), + new Sample("dad", null, null, null, Gender.UNKNOWN, Affection.UNKNOWN))); + + private static final String testPEDString = + String.format("%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2"); + + private static final String testPEDMultipleFamilies = + String.format("%s%n%s%n%s%n%s%n%s", + "fam1 kid dad mom 1 2", + "fam1 dad 0 0 1 1", + "fam1 mom 0 0 2 2", + "fam3 s1 d1 m1 2 2", + "fam2 s2 d2 m2 2 2"); + + private static final String testPEDStringInconsistentGender = + "fam1 kid 0 0 2 2"; + + private static final Set testPEDSamplesAsSet = + new HashSet(testPEDSamples); + + + @BeforeMethod + public void before() { + builder = new SampleDBBuilder(PedigreeValidationType.STRICT); + } + + @Test() + public void loadPEDFile() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamplesAsSet, db.getSamples()); + } + + @Test() + public void loadPEDString() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDString)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamplesAsSet, db.getSamples()); + } + + private static final void addSAMHeader() { + 
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 10); + ArtificialSAMUtils.createEnumeratedReadGroups(header, Arrays.asList("1", "2", "3"), + Arrays.asList("kid", "mom", "dad")); + builder.addSamplesFromSAMHeader(header); + } + + @Test() + public void loadSAMHeader() { + addSAMHeader(); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testSAMSamples, db.getSamples()); + } + + @Test() + public void loadSAMHeaderPlusPED() { + addSAMHeader(); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamples, db.getSamples()); + } + + @Test() + public void loadDuplicateData() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testPEDSamples, db.getSamples()); + } + + @Test(expectedExceptions = UserException.class) + public void loadNonExistentFile() { + builder.addSamplesFromPedigreeFiles(Arrays.asList(new File("non-existence-file.txt"))); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(testSAMSamples, db.getSamples()); + } + + @Test(expectedExceptions = UserException.class) + public void loadInconsistentData() { + builder = new SampleDBBuilder(PedigreeValidationType.STRICT); + builder.addSamplesFromPedigreeFiles(Arrays.asList(testPED)); + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); + builder.getFinalSampleDB(); + } + + @Test(expectedExceptions = UserException.class) + public void sampleInSAMHeaderNotInSamplesDB() { + addSAMHeader(); + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDStringInconsistentGender)); + builder.getFinalSampleDB(); + } + + @Test() + public void getFamilyIDs() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + 
Assert.assertEquals(db.getFamilyIDs(), new TreeSet(Arrays.asList("fam1", "fam2", "fam3"))); + } + + @Test() + public void getFamily() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Assert.assertEquals(db.getFamily("fam1"), testPEDSamplesAsSet); + } + + @Test() + public void loadFamilyIDs() { + builder.addSamplesFromPedigreeStrings(Arrays.asList(testPEDMultipleFamilies)); + SampleDB db = builder.getFinalSampleDB(); + Map> families = db.getFamilies(); + Assert.assertEquals(families.size(), 3); + Assert.assertEquals(families.keySet(), new TreeSet(Arrays.asList("fam1", "fam2", "fam3"))); + + for ( final String famID : families.keySet() ) { + final Set fam = families.get(famID); + Assert.assertEquals(fam.size(), 3); + for ( final Sample sample : fam ) { + Assert.assertEquals(sample.getFamilyID(), famID); + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java new file mode 100644 index 000000000..3af40adbe --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/samples/SampleUnitTest.java @@ -0,0 +1,64 @@ +package org.broadinstitute.sting.gatk.samples; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +/** + * + */ +public class SampleUnitTest extends BaseTest { + SampleDB db; + static Sample fam1A, fam1B, fam1C; + static Sample s1, s2; + static Sample trait1, trait2, trait3, trait4, trait5; + + @BeforeClass + public void init() { + db = new SampleDB(); + + fam1A = new Sample("1A", db, "fam1", "1B", "1C", Gender.UNKNOWN); + fam1B = new Sample("1B", db, "fam1", null, null, Gender.MALE); + fam1C = new Sample("1C", db, "fam1", null, null, Gender.FEMALE); + + s1 = new Sample("s1", db); + s2 = new Sample("s2", db); + + trait1 = new Sample("t1", db, 
Affection.AFFECTED, Sample.UNSET_QT); + trait2 = new Sample("t2", db, Affection.UNAFFECTED, Sample.UNSET_QT); + trait3 = new Sample("t3", db, Affection.UNKNOWN, Sample.UNSET_QT); + trait4 = new Sample("t4", db, Affection.OTHER, "1.0"); + trait5 = new Sample("t4", db, Affection.OTHER, "CEU"); + } + + /** + * Now basic getters + */ + @Test() + public void normalGettersTest() { + Assert.assertEquals("1A", fam1A.getID()); + Assert.assertEquals("fam1", fam1A.getFamilyID()); + Assert.assertEquals("1B", fam1A.getPaternalID()); + Assert.assertEquals("1C", fam1A.getMaternalID()); + Assert.assertEquals(null, fam1B.getPaternalID()); + Assert.assertEquals(null, fam1B.getMaternalID()); + + Assert.assertEquals(Affection.AFFECTED, trait1.getAffection()); + Assert.assertEquals(Sample.UNSET_QT, trait1.getOtherPhenotype()); + Assert.assertEquals(Affection.UNAFFECTED, trait2.getAffection()); + Assert.assertEquals(Sample.UNSET_QT, trait2.getOtherPhenotype()); + Assert.assertEquals(Affection.UNKNOWN, trait3.getAffection()); + Assert.assertEquals(Sample.UNSET_QT, trait3.getOtherPhenotype()); + Assert.assertEquals(Affection.OTHER, trait4.getAffection()); + Assert.assertEquals("1.0", trait4.getOtherPhenotype()); + Assert.assertEquals("CEU", trait5.getOtherPhenotype()); + } + + @Test() + public void testGenders() { + Assert.assertTrue(fam1A.getGender() == Gender.UNKNOWN); + Assert.assertTrue(fam1B.getGender() == Gender.MALE); + Assert.assertTrue(fam1C.getGender() == Gender.FEMALE); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java index 702ba9f4f..c7eb4d88b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/BAQIntegrationTest.java @@ -18,7 +18,7 @@ public class BAQIntegrationTest extends WalkerTest { // 
-------------------------------------------------------------------------------------------------------------- @Test public void testPrintReadsNoBAQ() { - WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("902197bf77ed5a828d50e08771685928")); + WalkerTestSpec spec = new WalkerTestSpec( baseCommand +" -baq OFF", 1, Arrays.asList("d97340a2bba2c6320d1ebeb86024a27c")); executeTest(String.format("testPrintReadsNoBAQ"), spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java index 1565c419b..216026a52 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java @@ -36,7 +36,7 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-R " + hg18Reference + " -T ClipReads " + - "-I " + validationDataLocation + "clippingReadsTest.bam " + + "-I " + validationDataLocation + "clippingReadsTest.withRG.bam " + "-os %s " + "-o %s " + args, 2, // just one output file @@ -46,23 +46,22 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { } final static String Q10ClipOutput = "b29c5bc1cb9006ed9306d826a11d444f"; - @Test public void testQClip0() { testClipper("clipQSum0", "-QT 0", "117a4760b54308f81789c39b1c9de578", "4deca83d80dfa3f093e0dc27d27d1352"); } - @Test public void testQClip2() { testClipper("clipQSum2", "-QT 2", Q10ClipOutput, "1e123233ebd2f35ac3f41b1b7d2c8199"); } - @Test public void testQClip10() { testClipper("clipQSum10", "-QT 10", "b29c5bc1cb9006ed9306d826a11d444f", "1e123233ebd2f35ac3f41b1b7d2c8199"); } - @Test public void testQClip20() { testClipper("clipQSum20", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "b950538d2d8fac1bcee11850c452bd6a"); } - @Test public void 
testQClip30() { testClipper("clipQSum30", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "b950538d2d8fac1bcee11850c452bd6a"); } + @Test public void testQClip0() { testClipper("clipQSum0", "-QT 0", "117a4760b54308f81789c39b1c9de578", "33e781084379aae538954e30919e8fd3"); } + @Test public void testQClip2() { testClipper("clipQSum2", "-QT 2", Q10ClipOutput, "57c05b6241db7110148a91fde2d431d0"); } + @Test public void testQClip10() { testClipper("clipQSum10", "-QT 10", "b29c5bc1cb9006ed9306d826a11d444f", "57c05b6241db7110148a91fde2d431d0"); } + @Test public void testQClip20() { testClipper("clipQSum20", "-QT 20", "6c3434dce66ae5c9eeea502f10fb9bee", "67263a39d5127f2660a5b638ff32056a"); } - @Test public void testClipRange1() { testClipper("clipRange1", "-CT 1-5", "b5acd753226e25b1e088838c1aab9117", "9f70540b795f227668dcf78edcb35c09"); } - @Test public void testClipRange2() { testClipper("clipRange2", "-CT 1-5,11-15", "be4fcad5b666a5540028b774169cbad7", "a22347a741640fc6df92700e0e8d6f61"); } + @Test public void testClipRange1() { testClipper("clipRange1", "-CT 1-5", "b5acd753226e25b1e088838c1aab9117", "764846d0592f346a33525af674fd7a10"); } + @Test public void testClipRange2() { testClipper("clipRange2", "-CT 1-5,11-15", "be4fcad5b666a5540028b774169cbad7", "3061cf742f9e5526a61130128ae761a3"); } - @Test public void testClipSeq() { testClipper("clipSeqX", "-X CCCCC", "db199bd06561c9f2122f6ffb07941fbc", "f49e9e61a44115e2be59330259966f53"); } - @Test public void testClipSeqFile() { testClipper("clipSeqXF", "-XF " + validationDataLocation + "seqsToClip.fasta", "d011a3152b31822475afbe0281491f8d", "5c977f261442ab6122d5198fa4086e67"); } + @Test public void testClipSeq() { testClipper("clipSeqX", "-X CCCCC", "db199bd06561c9f2122f6ffb07941fbc", "b89459f373e40f0b835c1faff2208839"); } + @Test public void testClipSeqFile() { testClipper("clipSeqXF", "-XF " + validationDataLocation + "seqsToClip.fasta", "d011a3152b31822475afbe0281491f8d", "24e19116ef16a37a6d095ed5c22c2466"); } - @Test 
public void testClipMulti() { testClipper("clipSeqMulti", "-QT 10 -CT 1-5 -XF " + validationDataLocation + "seqsToClip.fasta -X CCCCC", "a23187bd9bfb06557f799706d98441de", "38d5f33d198aeee7eebec9feb7b11199"); } + @Test public void testClipMulti() { testClipper("clipSeqMulti", "-QT 10 -CT 1-5 -XF " + validationDataLocation + "seqsToClip.fasta -X CCCCC", "a23187bd9bfb06557f799706d98441de", "ad8d30300cb43d5e300fcc4d2450da8e"); } - @Test public void testClipNs() { testClipper("testClipNs", "-QT 10 -CR WRITE_NS", Q10ClipOutput, "1e123233ebd2f35ac3f41b1b7d2c8199"); } - @Test public void testClipQ0s() { testClipper("testClipQs", "-QT 10 -CR WRITE_Q0S", Q10ClipOutput, "d44cab2e3b70f5492a0f5b59f0b80043"); } - @Test public void testClipSoft() { testClipper("testClipSoft", "-QT 10 -CR SOFTCLIP_BASES", Q10ClipOutput, "b86374a7e6f59e3dd35781e9e8006702"); } + @Test public void testClipNs() { testClipper("testClipNs", "-QT 10 -CR WRITE_NS", Q10ClipOutput, "57c05b6241db7110148a91fde2d431d0"); } + @Test public void testClipQ0s() { testClipper("testClipQs", "-QT 10 -CR WRITE_Q0S", Q10ClipOutput, "2a1a3153e0942ab355fd8a6e082b30e0"); } + @Test public void testClipSoft() { testClipper("testClipSoft", "-QT 10 -CR SOFTCLIP_BASES", Q10ClipOutput, "50d43d63d8e39f67a87a6359963c6f52"); } @Test public void testUseOriginalQuals() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java index 5990f1a06..8cd10048a 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsWalkerUnitTest.java @@ -10,6 +10,8 @@ import net.sf.samtools.SAMFileHeader; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; + +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.BeforeMethod; import 
org.testng.annotations.Test; @@ -94,8 +96,8 @@ public class PrintReadsWalkerUnitTest extends BaseTest { walker.out = writer; SAMFileHeader head = ArtificialSAMUtils.createArtificialSamHeader(3,1,1000); - SAMRecord rec = ArtificialSAMUtils.createArtificialRead(head, "FakeRead", 1, 1, 50); - SAMRecord ret = walker.map(bases, rec,null); + GATKSAMRecord rec = ArtificialSAMUtils.createArtificialRead(head, "FakeRead", 1, 1, 50); + SAMRecord ret = walker.map(bases, rec, null); assertTrue(ret == rec); assertTrue(ret.getReadName().equals(rec.getReadName())); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 04bff8d41..8e887c32a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -31,7 +31,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, Arrays.asList("8e7de435105499cd71ffc099e268a83e")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -39,7 +39,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + 
validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, Arrays.asList("64b6804cb1e27826e3a47089349be581")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -63,7 +63,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, Arrays.asList("fd1ffb669800c2e07df1e2719aa38e49")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -71,15 +71,23 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, + baseTestString() + " -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, Arrays.asList("09f8e840770a9411ff77508e0ed0837f")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } + @Test + public void testExcludeAnnotations() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -G Standard -XA 
FisherStrand -XA ReadPosRankSumTest --variant:VCF3 " + validationDataLocation + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, + Arrays.asList("b49fe03aa4b675db80a9db38a3552c95")); + executeTest("test exclude annotations", spec); + } + @Test public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" --variant:VCF " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, + baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, Arrays.asList("78d2c19f8107d865970dbaf3e12edd92")); executeTest("test overwriting header", spec); } @@ -87,7 +95,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, + baseTestString() + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, Arrays.asList("16e3a1403fc376320d7c69492cad9345")); executeTest("not passing it any reads", spec); } @@ -95,7 +103,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --dbsnp " + b36dbSNP129 + " -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, + baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, Arrays.asList("3da8ca2b6bdaf6e92d94a8c77a71313d")); executeTest("getting DB tag with 
dbSNP", spec); } @@ -103,7 +111,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --comp:H3 " + validationDataLocation + "fakeHM3.vcf -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -BTI variant", 1, + baseTestString() + " --comp:H3 " + validationDataLocation + "fakeHM3.vcf -G Standard --variant " + validationDataLocation + "vcfexample3empty.vcf -L " + validationDataLocation + "vcfexample3empty.vcf", 1, Arrays.asList("1bc01c5b3bd0b7aef75230310c3ce688")); executeTest("getting DB tag with HM3", spec); } @@ -111,8 +119,8 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { @Test public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G \"Standard\" --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -BTI variant", 1, - Arrays.asList("e9c0d832dc6b4ed06c955060f830c140")); + baseTestString() + " --resource:foo " + validationDataLocation + "targetAnnotations.vcf -G Standard --variant:VCF3 " + validationDataLocation + "vcfexample3empty.vcf -E foo.AF -L " + validationDataLocation + "vcfexample3empty.vcf", 1, + Arrays.asList("ae30a1ac7bfbc3d22a327f8b689cad31")); executeTest("using expression", spec); } @@ -121,7 +129,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { final String MD5 = "13269d5a2e16f06fd755cc0fb9271acf"; for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { WalkerTestSpec spec = new WalkerTestSpec( - baseTestString() + " -A HomopolymerRun --variant:VCF " + validationDataLocation + "/" + file + " -BTI variant -NO_HEADER", 1, + baseTestString() + " -A HomopolymerRun --variant:vcf " + validationDataLocation + file + " -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf 
-NO_HEADER", 1, Arrays.asList(MD5)); executeTest("Testing lookup vcf tabix vs. vcf tribble", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 59ac1a41e..646fb5e77 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -57,11 +57,11 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { // now add the expected files that get generated spec.addAuxFile("423571e4c05e7934322172654ac6dbb7", baseOutputFile); spec.addAuxFile("9df5e7e07efeb34926c94a724714c219", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_counts")); - spec.addAuxFile("b9a7748e5aec4dc06daed893c901c00d", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); + spec.addAuxFile("229b9b5bc2141c86dbc69c8acc9eba6a", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_cumulative_coverage_proportions")); spec.addAuxFile("9cd395f47b329b9dd00ad024fcac9929", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_statistics")); - spec.addAuxFile("aec669d64d9dd652dd088a5341835ea5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); - spec.addAuxFile("f6dbd74d32a48abe71ce08d300bce983", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); - spec.addAuxFile("e3a3467ed259ee3680f8d01980f525b7", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); + spec.addAuxFile("471c34ad2e4f7228efd20702d5941ba9", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("9667c77284c2c08e647b162d0e9652d4", 
createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_statistics")); + spec.addAuxFile("5a96c75f96d6fa6ee617451d731dae37", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_summary")); spec.addAuxFile("b82846df660f0aac8429aec57c2a62d6", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_counts")); spec.addAuxFile("d32a8c425fadcc4c048bd8b48d0f61e5", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_cumulative_coverage_proportions")); spec.addAuxFile("7b9d0e93bf5b5313995be7010ef1f528", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_interval_statistics")); @@ -69,11 +69,11 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.addAuxFile("e70952f241eebb9b5448f2e7cb288131", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_statistics")); spec.addAuxFile("054ed1e184f46d6a170dc9bf6524270c", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".read_group_summary")); spec.addAuxFile("d53431022f7387fe9ac47814ab1fcd88", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_counts")); - spec.addAuxFile("650ee3714da7fbad7832c9d4ad49eb51", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); + spec.addAuxFile("a395dafde101971d2b9e5ddb6cd4b7d0", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_cumulative_coverage_proportions")); spec.addAuxFile("df0ba76e0e6082c0d29fcfd68efc6b77", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_statistics")); - spec.addAuxFile("7dcac2e8962c778081486332a4576dc3", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); - spec.addAuxFile("a50011571334f17e950ad3ed1149e350", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); - spec.addAuxFile("6f3260504295695d765af639539585c9", 
createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); + spec.addAuxFile("e013cb5b11b0321a81c8dbd7c1863787", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_interval_summary")); + spec.addAuxFile("661160f571def8c323345b5859cfb9da", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_statistics")); + spec.addAuxFile("c95a7a6840334cadd0e520939615c77b", createTempFileFromBase(baseOutputFile.getAbsolutePath()+".sample_summary")); execute("testBaseOutputNoFiltering",spec); } @@ -90,7 +90,7 @@ public class DepthOfCoverageIntegrationTest extends WalkerTest { spec.setOutputFileLocation(baseOutputFile); spec.addAuxFile("6ccd7d8970ba98cb95fe41636a070c1c",baseOutputFile); - spec.addAuxFile("0ee40f3e5091536c14e077b77557083a",createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); + spec.addAuxFile("7d87783b3d98b928cac16d383ceca807",createTempFileFromBase(baseOutputFile.getAbsolutePath()+".library_interval_summary")); execute("testNoCoverageDueToFiltering",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java index 1f11b5886..c8a25c97b 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffObjectsIntegrationTest.java @@ -50,8 +50,8 @@ public class DiffObjectsIntegrationTest extends WalkerTest { @DataProvider(name = "data") public Object[][] createData() { - new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "dc1ca75c6ecf32641967d61e167acfff"); - new TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "df0fcb568a3a49fc74830103b2e26f6c"); + new TestParams(testDir + "diffTestMaster.vcf", testDir + "diffTestTest.vcf", "ed377322c615abc7dceb97025076078d"); + new 
TestParams(testDir + "exampleBAM.bam", testDir + "exampleBAM.simple.bam", "02e46f5d2ebb3d49570850595b3f792e"); return TestParams.getTests(TestParams.class); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java index dee7bbd88..46b0df5b4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diffengine/DiffableReaderUnitTest.java @@ -70,7 +70,7 @@ public class DiffableReaderUnitTest extends BaseTest { private static void testLeaf(DiffNode rec, String field, Object expected) { DiffElement value = rec.getElement(field); Assert.assertNotNull(value, "Expected to see leaf named " + field + " in rec " + rec); - Assert.assertEquals(value.getValue().getValue(), expected, "Expected to leaf named " + field + " to have value " + expected + " in rec " + rec); + Assert.assertEquals(value.getValue().getValue(), expected, "Expected to see leaf named " + field + " to have value " + expected + " in rec " + rec + " but got instead " + value.getValue().getValue()); } @Test(enabled = true, dependsOnMethods = "testPluggableDiffableReaders") @@ -95,7 +95,7 @@ public class DiffableReaderUnitTest extends BaseTest { testLeaf(rec1, "POS", 2646); testLeaf(rec1, "ID", "rs62635284"); testLeaf(rec1, "REF", Allele.create("G", true)); - testLeaf(rec1, "ALT", new HashSet(Arrays.asList(Allele.create("A")))); + testLeaf(rec1, "ALT", Arrays.asList(Allele.create("A"))); testLeaf(rec1, "QUAL", 0.15); testLeaf(rec1, "FILTER", Collections.emptySet()); testLeaf(rec1, "AC", "2"); diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/DiploidGenotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeUnitTest.java similarity index 95% rename from 
public/java/test/org/broadinstitute/sting/utils/genotype/DiploidGenotypeUnitTest.java rename to public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeUnitTest.java index e4f8b12e3..4e72b37a4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/DiploidGenotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidGenotypeUnitTest.java @@ -1,5 +1,6 @@ -package org.broadinstitute.sting.utils.genotype; +package org.broadinstitute.sting.gatk.walkers.genotyper; +import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidGenotype; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java index 9882ce869..425b969e2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsUnitTest.java @@ -1,7 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.testng.Assert; -import org.broadinstitute.sting.utils.genotype.DiploidGenotype; import org.broadinstitute.sting.BaseTest; import org.testng.annotations.Test; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7ef75ec53..b80f214b1 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -18,8 +18,8 @@ import java.util.Map; public class UnifiedGenotyperIntegrationTest extends WalkerTest { private final static String baseCommand = "-T 
UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm BOTH --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b36dbSNP129; - private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -NO_HEADER -glm INDEL --dbsnp " + b37dbSNP132; + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -NO_HEADER -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; // -------------------------------------------------------------------------------------------------------------- // @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("e6639ea2dc81635c706e6c35921406d7")); + Arrays.asList("b27939251539439a382538e507e03507")); executeTest("test MultiSample Pilot1", spec); } @@ -43,7 +43,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("ec43daadfb15b00b41aeb0017a45df0b")); + Arrays.asList("6458f3b8fe4954e2ffc2af972aaab19e")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -51,7 +51,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { 
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("d1cbd1fb9f3f7323941a95bc2def7e5a")); + Arrays.asList("6762b72ae60155ad71738d7c76b80e4b")); executeTest("test SingleSample Pilot2", spec); } @@ -61,7 +61,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "2732b169cdccb21eb3ea00429619de79"; + private final static String COMPRESSED_OUTPUT_MD5 = "bc71dba7bbdb23e7d5cc60461fdd897b"; @Test public void testCompressedOutput() { @@ -82,7 +82,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "cbac3960bbcb9d6192c57549208c182c"; + String md5 = "b9504e446b9313559c3ed97add7e8dc1"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -113,9 +113,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testCallingParameters() { HashMap e = new HashMap(); - e.put( "--min_base_quality_score 26", "531966aee1cd5dced61c96c4fedb59a9" ); - e.put( "--min_mapping_quality_score 26", "c71ca370947739cb7d87b59452be7a07" ); - e.put( "--computeSLOD", "1a5648f26c18ced27df4be031b44e72d" ); + e.put( "--min_base_quality_score 26", "bb3f294eab3e2cf52c70e63b23aac5ee" ); + e.put( "--computeSLOD", "eb34979efaadba1e34bd82bcacf5c722" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -161,8 +160,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public 
void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "aed69402ddffe7f2ed5ca98563bfba02" ); - e.put( 1.0 / 1850, "fa94a059f08c1821b721335d93ed2ea5" ); + e.put( 0.01, "f84da90c310367bd51f2ab6e346fa3d8" ); + e.put( 1.0 / 1850, "5791e7fef40d4412b6d8f84e0a809c6c" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -186,7 +185,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("1c080e6596d4c830bb5d147b04e2a82c")); + Arrays.asList("9cc9538ac83770e12bd0830d285bfbd0")); executeTest(String.format("test multiple technologies"), spec); } @@ -205,7 +204,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("9129ad748ca3be2d3b321d2d7e83ae5b")); + Arrays.asList("eaf8043edb46dfbe9f97ae03baa797ed")); executeTest(String.format("test calling with BAQ"), spec); } @@ -224,7 +223,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("0bece77ce6bc447438ef9b2921b2dc41")); + Arrays.asList("eeba568272f9b42d5450da75c7cc6d2d")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -241,7 +240,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { 1, Arrays.asList("5fe98ee853586dc9db58f0bc97daea63")); - executeTest(String.format("test indel caller in SLX witn low min allele count"), spec); + executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @Test @@ -252,38 +251,47 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("790b1a1d6ab79eee8c24812bb8ca6fae")); + Arrays.asList("19ff9bd3139480bdf79dcbf117cf2b24")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @Test - public void 
testWithIndelAllelesPassedIn() { + public void testWithIndelAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("408d3aba4d094c067fc00a43992c2292")); + Arrays.asList("118918f2e9e56a3cfc5ccb2856d529c8")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); + } + @Test + public void testWithIndelAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("94977d6e42e764280e9deaf4e3ac8c80")); + Arrays.asList("a20799237accd52c1b8c2ac096309c8f")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); + } + + @Test + public void testWithIndelAllelesPassedIn3() { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("e66b7321e2ac91742ad3ef91040daafd")); + Arrays.asList("18ef8181157b4ac3eb8492f538467f92")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); + } + @Test + public void testWithIndelAllelesPassedIn4() { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + 
"ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("4be308fd9e8167ebee677f62a7a753b7")); + Arrays.asList("ad884e511a751b05e64db5314314365a")); executeTest("test MultiSample 1000G Phase1 indels with complicated records emitting all sites", spec4); - } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java index 1873ccbe2..26e23e016 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -8,22 +8,43 @@ import java.util.Arrays; public class RealignerTargetCreatorIntegrationTest extends WalkerTest { @Test - public void testIntervals() { + public void testIntervals1() { + String md5 = "3f0b63a393104d0c4158c7d1538153b8"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( "-T RealignerTargetCreator -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam --mismatchFraction 0.15 -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("e7accfa58415d6da80383953b1a3a986")); - executeTest("test standard", spec1); + Arrays.asList(md5)); + executeTest("test standard nt=1", spec1); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - "-T RealignerTargetCreator --known " + b36dbSNP129 + " -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", + "-nt 4 -T RealignerTargetCreator -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam --mismatchFraction 0.15 -L 1:10,000,000-10,050,000 -o %s", 1, - 
Arrays.asList("0367d39a122c8ac0899fb868a82ef728")); - executeTest("test dbsnp", spec2); + Arrays.asList(md5)); + executeTest("test standard nt=4", spec2); + } + @Test + public void testIntervals2() { + String md5 = "e0f745b79b679c225314a2abef4919ff"; + + WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( + "-T RealignerTargetCreator --known " + b36dbSNP129 + " -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,200,000 -o %s", + 1, + Arrays.asList(md5)); + executeTest("test with dbsnp nt=1", spec1); + + WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( + "-nt 4 -T RealignerTargetCreator --known " + b36dbSNP129 + " -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,200,000 -o %s", + 1, + Arrays.asList(md5)); + executeTest("test with dbsnp nt=4", spec2); + } + + @Test + public void testKnownsOnly() { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - "-T RealignerTargetCreator -R " + b36KGReference + " --known " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -BTI known -o %s", + "-T RealignerTargetCreator -R " + b36KGReference + " --known " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -L " + validationDataLocation + "NA12878.chr1_10mb_11mb.slx.indels.vcf4 -o %s", 1, Arrays.asList("5206cee6c01b299417bf2feeb8b3dc96")); executeTest("test rods only", spec3); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java index 0a0d8c5b2..155d3c4db 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmpliconsIntegrationTest.java @@ -20,7 +20,7 @@ 
public class ValidationAmpliconsIntegrationTest extends WalkerTest { String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf"; String intervalTable = validationDataLocation + "amplicon_interval_table1.table"; String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons --ValidateAlleles:VCF "+siteVCF+" -o %s"; - testArgs += " --ProbeIntervals:table "+intervalTable+" -BTI ProbeIntervals --MaskAlleles:VCF "+maskVCF; + testArgs += " --ProbeIntervals:table "+intervalTable+" -L:table "+intervalTable+" --MaskAlleles:VCF "+maskVCF; testArgs += " --virtualPrimerSize 30"; WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, Arrays.asList("27f9450afa132888a8994167f0035fd7")); @@ -33,7 +33,7 @@ public class ValidationAmpliconsIntegrationTest extends WalkerTest { String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf"; String intervalTable = validationDataLocation + "amplicon_interval_table1.table"; String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons --ValidateAlleles:VCF "+siteVCF+" -o %s"; - testArgs += " --ProbeIntervals:table "+intervalTable+" -BTI ProbeIntervals --MaskAlleles:VCF "+maskVCF; + testArgs += " --ProbeIntervals:table "+intervalTable+" -L:table "+intervalTable+" --MaskAlleles:VCF "+maskVCF; testArgs += " --virtualPrimerSize 30 --doNotUseBWA"; WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, Arrays.asList("f2611ff1d9cd5bedaad003251fed8bc1")); @@ -46,7 +46,7 @@ public class ValidationAmpliconsIntegrationTest extends WalkerTest { String maskVCF = validationDataLocation + "amplicon_mask_sites.vcf"; String intervalTable = validationDataLocation + "amplicon_interval_table1.table"; String testArgs = "-R " + b37KGReference + " -T ValidationAmplicons --ValidateAlleles:VCF "+siteVCF+" -o %s"; - testArgs += " --ProbeIntervals:table "+intervalTable+" -BTI ProbeIntervals --MaskAlleles:VCF "+maskVCF; + testArgs += " --ProbeIntervals:table "+intervalTable+" -L:table "+intervalTable+" --MaskAlleles:VCF "+maskVCF; testArgs += 
" --virtualPrimerSize 30 --filterMonomorphic"; WalkerTestSpec spec = new WalkerTestSpec(testArgs, 1, Arrays.asList("77b3f30e38fedad812125bdf6cf3255f")); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index 9fe253ecb..cd2493dde 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -26,7 +26,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-EV TiTvVariantEvaluator", "-noST", "-ST FunctionalClass", - "-BTI eval", + "-L " + validationDataLocation + "snpEff.AFR.unfiltered.VariantAnnotator.output.vcf", "-o %s" ), 1, @@ -46,7 +46,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-noEV", "-EV TiTvVariantEvaluator", "-ST Sample", - "-BTI eval", + "-L " + variantEvalTestDataRoot + "/CEU.trio.callsForVE.vcf", "-o %s" ), 1, @@ -66,7 +66,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-noEV", "-EV CountVariants", "-noST", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 1, @@ -87,7 +87,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-EV CountVariants", "-noST", "-ST Novelty", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 1, @@ -109,7 +109,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-noST", "-ST Novelty", "-ST Filter", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 1, @@ -130,7 +130,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-EV CountVariants", "-noST", "-ST CpG", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 1, @@ -151,7 +151,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-EV CountVariants", "-noST", "-ST FunctionalClass", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 
1, @@ -172,7 +172,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-EV CountVariants", "-noST", "-ST Degeneracy", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 1, @@ -193,7 +193,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-EV CountVariants", "-noST", "-ST Sample", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 1, @@ -216,7 +216,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-ST JexlExpression", "-select 'DP < 20'", "-selectName DepthSelect", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 1, @@ -241,7 +241,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-selectName DepthLt20", "-select 'DP > 20'", "-selectName DepthGt20", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 1, @@ -260,7 +260,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-noEV", "-EV CountVariants", "-noST", - "-BTI eval", + "-L " + fundamentalTestVCF, "-o %s" ), 1, @@ -371,7 +371,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-EV CompOverlap", "-sn HG00625", "-noST", - "-BTI eval", + "-L " + fundamentalTestSNPsVCF, "-o %s" ), 1, @@ -388,7 +388,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-noEV", "-EV CompOverlap", "-noST", - "-BTI eval", + "-L " + fundamentalTestSNPsOneSampleVCF, "-o %s" ), 1, @@ -410,7 +410,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { "-EV CountVariants", "-noST", "-ST AlleleCount", - "-BTI eval", + "-L " + fundamentalTestSNPsVCF, "-o %s" ), 1, diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index b65de9d36..5a4d6e6a1 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -78,26 +78,26 @@ public class CombineVariantsIntegrationTest extends WalkerTest { executeTest("combine PLs 1:" + new File(file1).getName() + " 2:" + new File(file2).getName(), spec); } - @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "c608b9fc1e36dba6cebb4f259883f9f0"); } - @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "20caad94411d6ab48153b214de916df8", " -setKey foo"); } - @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "004f3065cb1bc2ce2f9afd695caf0b48", " -setKey null"); } + @Test public void test1SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "ea0a660cd04101ce7b534aba0310721d"); } + @Test public void test2SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "cb0350e7a9d2483993482b69f5432b64", " -setKey foo"); } + @Test public void test3SNP() { test1InOut("pilot2.snps.vcf4.genotypes.vcf", "0571c48cc59cf244779caae52d562e79", " -setKey null"); } @Test public void testOfficialCEUPilotCalls() { test1InOut("CEU.trio.2010_03.genotypes.vcf.gz", "c9c901ff9ef2a982624b203a8086dff0"); } // official project VCF files in tabix format - @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "7593be578d4274d672fc22fced38012b"); } + @Test public void test1Indel1() { test1InOut("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "75901304abc1daa41b1906f881aa7bbc"); } @Test public void test1Indel2() { test1InOut("CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "1cd467863c4e948fadd970681552d57e"); } - @Test public void combineWithPLs() { combinePLs("combine.3.vcf", "combine.4.vcf", "0f873fed02aa99db5b140bcd6282c10a"); } + @Test public void combineWithPLs() { combinePLs("combine.3.vcf", "combine.4.vcf", "d08e933b6c81246e998d3ece50ddfdcc"); } - @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", 
"YRI.trio.2010_03.genotypes.vcf.gz", "", "1d5a021387a8a86554db45a29f66140f"); } // official project VCF files in tabix format - @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "96941ee177b0614a9879af0ac3218963"); } // official project VCF files in tabix format - @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "a8a6e7589f22e0b6c5d222066b9a2093"); } + @Test public void combineTrioCalls() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", "", "01967686e0e02dbccd2590b70f2d049b"); } // official project VCF files in tabix format + @Test public void combineTrioCallsMin() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "YRI.trio.2010_03.genotypes.vcf.gz", " -minimalVCF", "8c113199c4a93a4a408104b735d59044"); } // official project VCF files in tabix format + @Test public void combine2Indels() { combine2("CEU.dindel.vcf4.trio.2010_06.indel.genotypes.vcf", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "30e96a0cb614cd5bc056e1f7ec6d10bd"); } @Test public void combineSNPsAndIndels() { combine2("CEU.trio.2010_03.genotypes.vcf.gz", "CEU.dindel.vcf4.low_coverage.2010_06.indel.genotypes.vcf", "", "e144b6283765494bfe8189ac59965083"); } - @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "89f55abea8f59e39d1effb908440548c"); } + @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "78a49597f1abf1c738e67d50c8fbed2b"); } - @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "c6adeda751cb2a08690dd9202356629f"); } - @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType 
KEEP_IF_ALL_UNFILTERED", "3a08fd5ee18993dfc8882156ccf5d2e9"); } + @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "4c63bfa5f73793aaca42e130ec49f238"); } + @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "86e326acbd8d2af8a6040eb146d92fc6"); } @Test public void threeWayWithRefs() { WalkerTestSpec spec = new WalkerTestSpec( @@ -110,7 +110,7 @@ public class CombineVariantsIntegrationTest extends WalkerTest { " -priority NA19240_BGI,NA19240_ILLUMINA,NA19240_WUGSC,denovoInfo" + " -genotypeMergeOptions UNIQUIFY -L 1"), 1, - Arrays.asList("212d9d3df10bb29e2c7fb226da422dc0")); + Arrays.asList("b14f8cbb5d03a2e613b12da4da9efd9a")); executeTest("threeWayWithRefs", spec); } @@ -127,17 +127,17 @@ public class CombineVariantsIntegrationTest extends WalkerTest { executeTest("combineComplexSites 1:" + new File(file1).getName() + " 2:" + new File(file2).getName() + " args = " + args, spec); } - @Test public void complexTestFull() { combineComplexSites("", "b5a53ee92bdaacd2bb3327e9004ae058"); } - @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "df96cb3beb2dbb5e02f80abec7d3571e"); } - @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "f704caeaaaed6711943014b847fe381a"); } - @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "f704caeaaaed6711943014b847fe381a"); } + @Test public void complexTestFull() { combineComplexSites("", "2842337e9943366f7a4d5f148f701b8c"); } + @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "39724318e6265d0318a3fe4609612785"); } + @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "fe9bb02ab8b3d0dd2ad6373ebdb6d915"); } + @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "fe9bb02ab8b3d0dd2ad6373ebdb6d915"); } @Test public void 
combineDBSNPDuplicateSites() { WalkerTestSpec spec = new WalkerTestSpec( "-T CombineVariants -NO_HEADER -L 1:902000-903000 -o %s -R " + b37KGReference + " -V:v1 " + b37dbSNP132, 1, - Arrays.asList("")); + Arrays.asList("5969446769cb8377daa2db29304ae6b5")); executeTest("combineDBSNPDuplicateSites:", spec); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 20409d4ca..6e994be3a 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -16,7 +16,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant:VCF3 " + testfile), + baseTestString(" -sn A -se '[CDH]' -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), 1, Arrays.asList("d18516c1963802e92cb9e425c0b75fd6") ); @@ -30,7 +30,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant:VCF3 " + testfile, + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s -NO_HEADER -xl_sn A -xl_sf " + samplesFile + " --variant " + testfile, 1, Arrays.asList("730f021fd6ecf1d195dabbee2e233bfd") ); @@ -43,7 +43,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testfile = validationDataLocation + "test.dup.vcf"; WalkerTestSpec 
spec = new WalkerTestSpec( - baseTestString(" -sn A -sn B -sn C --variant:VCF3 " + testfile), + baseTestString(" -sn A -sn B -sn C --variant " + testfile), 1, Arrays.asList("b74038779fe6485dbb8734ae48178356") ); @@ -56,7 +56,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant:VCF " + b37hapmapGenotypes + " -disc:VCF " + testFile + " -o %s -NO_HEADER", + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s -NO_HEADER", 1, Arrays.asList("78e6842325f1f1bc9ab30d5e7737ee6e") ); @@ -64,12 +64,25 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testDiscordance--" + testFile, spec); } + @Test + public void testDiscordanceNoSampleSpecified() { + String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + hg19Reference + " -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s -NO_HEADER", + 1, + Arrays.asList("5d7d899c0c4954ec59104aebfe4addd5") + ); + + executeTest("testDiscordanceNoSampleSpecified--" + testFile, spec); + } + @Test public void testConcordance() { String testFile = validationDataLocation + "NA12878.hg19.example1.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc:VCF " + b37hapmapGenotypes + " --variant " + testFile + " -o %s -NO_HEADER", + "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 -conc " + b37hapmapGenotypes + " --variant " + testFile + " -o %s -NO_HEADER", 1, Arrays.asList("d2ba3ea30a810f6f0fbfb1b643292b6a") ); @@ -90,16 +103,16 @@ public class SelectVariantsIntegrationTest extends 
WalkerTest { executeTest("testVariantTypeSelection--" + testFile, spec); } - @Test(enabled=false) - public void testRemovePLs() { + @Test + public void testUsingDbsnpName() { String testFile = validationDataLocation + "combine.3.vcf"; WalkerTestSpec spec = new WalkerTestSpec( - "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant " + testFile + " -o %s -NO_HEADER", + "-T SelectVariants -R " + b36KGReference + " -sn NA12892 --variant:dbsnp " + testFile + " -o %s -NO_HEADER", 1, - Arrays.asList("") + Arrays.asList("167a1265df820978a74c267df44d5c43") ); - executeTest("testWithPLs--" + testFile, spec); + executeTest("testUsingDbsnpName--" + testFile, spec); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java index 836a4473f..f597694bb 100644 --- a/public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/R/RScriptExecutorUnitTest.java @@ -26,60 +26,84 @@ package org.broadinstitute.sting.utils.R; import org.apache.commons.io.FileUtils; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.io.IOUtils; +import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; /** * Basic unit test for RScriptExecutor in reduced reads */ public class RScriptExecutorUnitTest extends BaseTest { - final static String testrscript = "print(\"hello, world\")\n"; - final static String publicRScript = "plot_Tranches.R"; - // -------------------------------------------------------------------------------- - // - // Difference testing routines - // - // -------------------------------------------------------------------------------- - - private void testOne(String script, String pathToRscript, 
String anotherSearchPath, boolean exceptOnError) { - RScriptExecutor.RScriptArgumentCollection collection = - new RScriptExecutor.RScriptArgumentCollection(); - if ( pathToRscript != null ) - collection.PATH_TO_RSCRIPT = pathToRscript; - if ( anotherSearchPath != null ) { - List x = new ArrayList(collection.PATH_TO_RESOURCES); - x.add(anotherSearchPath); - collection.PATH_TO_RESOURCES = x; - } - RScriptExecutor executor = new RScriptExecutor(collection, exceptOnError); - executor.callRScripts(script); - } + private static final String HELLO_WORLD_SCRIPT = "print('hello, world')"; + private static final String GSALIB_LOADED_SCRIPT = "if (!'package:gsalib' %in% search()) stop('gsalib not loaded')"; @Test - public void testPublic() { testOne(publicRScript, null, null, true); } - - @Test(expectedExceptions = UserException.class) - public void testNonExistantScriptException() { testOne("does_not_exist.R", null, null, true); } - - @Test() - public void testNonExistantScriptNoException() { testOne("does_not_exist.R", null, null, false); } - - @Test(expectedExceptions = UserException.class) - public void testNonExistantRScriptException() { testOne(publicRScript, "badRScriptValue", null, true); } - - @Test() - public void testNonExistantRScriptNoException() { testOne(publicRScript, "badRScriptValue", null, false); } - - @Test() - public void testScriptInNewPath() throws IOException { - File t = createTempFile("myTestScript", ".R"); - FileUtils.writeStringToFile(t, testrscript); - testOne(t.getName(), null, t.getParent(), true); + public void testRscriptExists() { + Assert.assertTrue(RScriptExecutor.RSCRIPT_EXISTS, "Rscript not found in environment ${PATH}"); } -} \ No newline at end of file + + @Test(dependsOnMethods = "testRscriptExists") + public void testExistingScript() { + File script = writeScript(HELLO_WORLD_SCRIPT); + try { + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(script); + executor.setExceptOnError(true); + 
Assert.assertTrue(executor.exec(), "Exec failed"); + } finally { + FileUtils.deleteQuietly(script); + } + } + + @Test(dependsOnMethods = "testRscriptExists", expectedExceptions = RScriptExecutorException.class) + public void testNonExistantScriptException() { + RScriptExecutor executor = new RScriptExecutor(); + executor.setExceptOnError(true); + executor.addScript(new File("does_not_exists.R")); + executor.exec(); + } + + @Test(dependsOnMethods = "testRscriptExists") + public void testNonExistantScriptNoException() { + logger.warn("Testing that warning is printed an no exception thrown for missing script."); + RScriptExecutor executor = new RScriptExecutor(); + executor.setExceptOnError(false); + executor.addScript(new File("does_not_exists.R")); + Assert.assertFalse(executor.exec(), "Exec should have returned false when the job failed"); + } + + @Test(dependsOnMethods = "testRscriptExists") + public void testLibrary() { + File script = writeScript(GSALIB_LOADED_SCRIPT); + try { + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(script); + executor.addLibrary(RScriptLibrary.GSALIB); + executor.setExceptOnError(true); + Assert.assertTrue(executor.exec(), "Exec failed"); + } finally { + FileUtils.deleteQuietly(script); + } + } + + @Test(dependsOnMethods = "testRscriptExists", expectedExceptions = RScriptExecutorException.class) + public void testLibraryMissing() { + File script = writeScript(GSALIB_LOADED_SCRIPT); + try { + RScriptExecutor executor = new RScriptExecutor(); + executor.addScript(script); + // GSALIB is not added nor imported in the script + executor.setExceptOnError(true); + executor.exec(); + } finally { + FileUtils.deleteQuietly(script); + } + } + + private File writeScript(String content) { + return IOUtils.writeTempFile(content, "myTestScript", ".R"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java 
new file mode 100644 index 000000000..19fd5b316 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/R/RScriptLibraryUnitTest.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.R; + +import org.apache.commons.io.FileUtils; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; + +public class RScriptLibraryUnitTest { + @Test + public void testProperties() { + Assert.assertEquals(RScriptLibrary.GSALIB.getLibraryName(), "gsalib"); + Assert.assertEquals(RScriptLibrary.GSALIB.getResourcePath(), "gsalib.tar.gz"); + } + + @Test + public void testWriteTemp() { + File file = RScriptLibrary.GSALIB.writeTemp(); + Assert.assertTrue(file.exists(), "R library was not written to temp file: " + file); + FileUtils.deleteQuietly(file); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java index 7cb7fec98..bc39d714e 100755 --- a/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java @@ -3,18 +3,20 @@ package org.broadinstitute.sting.utils; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; public class ReadUtilsUnitTest extends BaseTest { - SAMRecord read; + GATKSAMRecord read, reducedRead; final static String BASES = "ACTG"; final static String QUALS = "!+5?"; + final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40}; @BeforeTest public void init() { @@ -23,9 +25,14 @@ public class ReadUtilsUnitTest extends BaseTest { read.setReadUnmappedFlag(true); read.setReadBases(new String(BASES).getBytes()); 
read.setBaseQualityString(new String(QUALS)); + + reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); + reducedRead.setReadBases(BASES.getBytes()); + reducedRead.setBaseQualityString(QUALS); + reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_QUALITY_TAG, REDUCED_READ_COUNTS); } - private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) { + private void testReadBasesAndQuals(GATKSAMRecord read, int expectedStart, int expectedStop) { SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null); String expectedBases = BASES.substring(expectedStart, expectedStop); String expectedQuals = QUALS.substring(expectedStart, expectedStop); @@ -38,4 +45,27 @@ public class ReadUtilsUnitTest extends BaseTest { @Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); } @Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); } @Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); } + + @Test + public void testReducedReads() { + Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); + Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); + + Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); + for ( int i = 0; i < reducedRead.getReadLength(); i++) { + Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); + } + } + + @Test + public void testReducedReadPileupElement() { + PileupElement readp = new PileupElement(read,0); + PileupElement reducedreadp = new PileupElement(reducedRead,0); + + Assert.assertFalse(readp.isReducedRead()); + + Assert.assertTrue(reducedreadp.isReducedRead()); + Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); + Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); + } } diff 
--git a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java index 76dd5d341..0f19e2f90 100644 --- a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.utils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; import org.testng.annotations.Test; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; @@ -28,7 +29,7 @@ public class ReservoirDownsamplerUnitTest { @Test public void testOneElementWithPoolSizeOne() { - List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); ReservoirDownsampler downsampler = new ReservoirDownsampler(1); downsampler.addAll(reads); @@ -40,7 +41,7 @@ public class ReservoirDownsamplerUnitTest { @Test public void testOneElementWithPoolSizeGreaterThanOne() { - List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); + List reads = Collections.singletonList(ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,76)); ReservoirDownsampler downsampler = new ReservoirDownsampler(5); downsampler.addAll(reads); diff --git a/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java deleted file mode 100644 index 56bf66f53..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java +++ /dev/null @@ -1,68 +0,0 @@ -package org.broadinstitute.sting.utils.bed; - -import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import 
org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.List; - -import net.sf.picard.reference.IndexedFastaSequenceFile; - - -public class BedParserUnitTest extends BaseTest { - - private static IndexedFastaSequenceFile seq; - private GenomeLocParser genomeLocParser; - private File bedFile = new File("public/testdata/sampleBedFile.bed"); - - @BeforeClass - public void beforeTests() { - File referenceFile = new File(b36KGReference); - try { - seq = new CachingIndexedFastaSequenceFile(referenceFile); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(referenceFile,ex); - } - - genomeLocParser = new GenomeLocParser(seq); - } - - @Test - public void testLoadBedFile() { - BedParser parser = new BedParser(genomeLocParser,bedFile); - List location = parser.getLocations(); - Assert.assertEquals(location.size(), 4); - } - - @Test - public void testBedParsing() { - BedParser parser = new BedParser(genomeLocParser,bedFile); - List location = parser.getLocations(); - Assert.assertEquals(location.size(), 4); - Assert.assertTrue(location.get(0).getContig().equals("20")); - Assert.assertTrue(location.get(1).getContig().equals("20")); - Assert.assertTrue(location.get(2).getContig().equals("22")); - Assert.assertTrue(location.get(3).getContig().equals("22")); - - // now check the the start positions - Assert.assertEquals(location.get(0).getStart(), 1); - Assert.assertEquals(location.get(1).getStart(), 1002); - Assert.assertEquals(location.get(2).getStart(), 1001); - Assert.assertEquals(location.get(3).getStart(), 2001); - - // now check the the stop positions - Assert.assertEquals(location.get(0).getStop(), 999); - Assert.assertEquals(location.get(1).getStop(), 
2000); - Assert.assertEquals(location.get(2).getStop(), 5000); - Assert.assertEquals(location.get(3).getStop(), 6000); - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java new file mode 100644 index 000000000..f625af23c --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.clipreads; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +/** + * Created by IntelliJ IDEA. + * User: roger + * Date: 9/28/11 + * Time: 9:54 PM + * To change this template use File | Settings | File Templates. + */ +public class ReadClipperUnitTest extends BaseTest { + + // TODO: Add error messages on failed tests + + GATKSAMRecord read, expected; + ReadClipper readClipper; + final static String BASES = "ACTG"; + final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 + + @BeforeClass + public void init() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); + read.setReadUnmappedFlag(true); + read.setReadBases(new String(BASES).getBytes()); + read.setBaseQualityString(new String(QUALS)); + + readClipper = new ReadClipper(read); + } + + @Test ( enabled = false ) + public void testHardClipBothEndsByReferenceCoordinates() { + logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(0,0), new GATKSAMRecord(read.getHeader())); + //clip 1 base + expected = readClipper.hardClipBothEndsByReferenceCoordinates(0,3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,3)); + Assert.assertEquals(expected.getCigarString(), "1H2M1H"); + + } + + @Test ( enabled = false ) + public void testHardClipByReadCoordinates() { + logger.warn("Executing testHardClipByReadCoordinates"); + + //Clip whole read + 
Assert.assertEquals(readClipper.hardClipByReadCoordinates(0,3), new GATKSAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipByReadCoordinates(0,0); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 1 base at end + expected = readClipper.hardClipByReadCoordinates(3,3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at start + expected = readClipper.hardClipByReadCoordinates(0,1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + //clip 2 bases at end + expected = readClipper.hardClipByReadCoordinates(2,3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + } + + @Test ( enabled = false ) + public void testHardClipByReferenceCoordinates() { + logger.warn("Executing testHardClipByReferenceCoordinates"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(1,4), new GATKSAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipByReferenceCoordinates(-1,1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 1 base at end + expected = readClipper.hardClipByReferenceCoordinates(3,-1); + 
Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at start + expected = readClipper.hardClipByReferenceCoordinates(-1,2); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + //clip 2 bases at end + expected = readClipper.hardClipByReferenceCoordinates(2,-1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + } + + @Test ( enabled = false ) + public void testHardClipByReferenceCoordinatesLeftTail() { + logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail"); + + //Clip whole read + Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(4), new GATKSAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipByReferenceCoordinatesLeftTail(1); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 2 bases at start + expected = readClipper.hardClipByReferenceCoordinatesLeftTail(2); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + } + + @Test ( enabled = false ) + public void testHardClipByReferenceCoordinatesRightTail() { + logger.warn("Executing testHardClipByReferenceCoordinatesRightTail"); + + //Clip whole read + 
Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(1), new GATKSAMRecord(read.getHeader())); + + //clip 1 base at end + expected = readClipper.hardClipByReferenceCoordinatesRightTail(3); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), "3M1H"); + + //clip 2 bases at end + expected = readClipper.hardClipByReferenceCoordinatesRightTail(2); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + } + + @Test ( enabled = false ) + public void testHardClipLowQualEnds() { + logger.warn("Executing testHardClipByReferenceCoordinates"); + + + //Clip whole read + Assert.assertEquals(readClipper.hardClipLowQualEnds((byte)64), new GATKSAMRecord(read.getHeader())); + + //clip 1 base at start + expected = readClipper.hardClipLowQualEnds((byte)34); + Assert.assertEquals(expected.getReadBases(), BASES.substring(1,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(1,4)); + Assert.assertEquals(expected.getCigarString(), "1H3M"); + + //clip 2 bases at start + expected = readClipper.hardClipLowQualEnds((byte)44); + Assert.assertEquals(expected.getReadBases(), BASES.substring(2,4).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(2,4)); + Assert.assertEquals(expected.getCigarString(), "2H2M"); + + // Reverse Quals sequence + readClipper.getRead().setBaseQualityString("?5+!"); // 63,53,43,33 + + //clip 1 base at end + expected = readClipper.hardClipLowQualEnds((byte)34); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,3).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,3)); + Assert.assertEquals(expected.getCigarString(), 
"3M1H"); + + //clip 2 bases at end + expected = readClipper.hardClipLowQualEnds((byte)44); + Assert.assertEquals(expected.getReadBases(), BASES.substring(0,2).getBytes()); + Assert.assertEquals(expected.getBaseQualityString(), QUALS.substring(0,2)); + Assert.assertEquals(expected.getCigarString(), "2M2H"); + + // revert Qual sequence + readClipper.getRead().setBaseQualityString(QUALS); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java index d08cda949..55bd4783b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java @@ -1,53 +1,43 @@ package org.broadinstitute.sting.utils.codecs.vcf; +import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.Tribble; import org.broad.tribble.index.*; import org.broad.tribble.iterators.CloseableTribbleIterator; import org.broad.tribble.source.BasicFeatureSource; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.testng.Assert; +import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; import java.io.File; +import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; /** * tests out the various functions in the index factory class */ -public class IndexFactoryUnitTest { +public class IndexFactoryUnitTest extends BaseTest { File inputFile = new File("public/testdata/HiSeq.10000.vcf"); File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf"); File outputFileIndex = Tribble.indexFile(outputFile); - /** - * test out scoring the indexes - */ - 
@Test - public void testScoreIndexes() { - /*// make a list of indexes to score - Map creators = new HashMap(); - // add a linear index with the default bin size - LinearIndexCreator linearNormal = new LinearIndexCreator(); - linearNormal.initialize(inputFile, linearNormal.defaultBinSize()); - creators.add(LInearIndexlinearNormal); + private SAMSequenceDictionary dict; - // create a tree index with a small index size - IntervalIndexCreator treeSmallBin = new IntervalIndexCreator(); - treeSmallBin.initialize(inputFile, Math.max(200,treeSmallBin.defaultBinSize()/10)); - creators.add(treeSmallBin); - - List indexes = new ArrayList(); - for (IndexCreator creator : creators) - indexes.add(creator.finalizeIndex(0)); - - ArrayList scores = IndexFactory.scoreIndexes(0.5,indexes,100, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); - System.err.println("scores are : "); - for (Double score : scores) { - System.err.println(score); -*/ + @BeforeTest + public void setup() { + try { + dict = new CachingIndexedFastaSequenceFile(new File(b37KGReference)).getSequenceDictionary(); + } + catch(FileNotFoundException ex) { + throw new UserException.CouldNotReadInputFile(b37KGReference,ex); + } } // @@ -65,7 +55,7 @@ public class IndexFactoryUnitTest { BasicFeatureSource source = new BasicFeatureSource(inputFile.getAbsolutePath(), indexFromInputFile, new VCFCodec()); int counter = 0; - VCFWriter writer = new StandardVCFWriter(outputFile); + VCFWriter writer = new StandardVCFWriter(outputFile, dict); writer.writeHeader((VCFHeader)source.getHeader()); CloseableTribbleIterator it = source.iterator(); while (it.hasNext() && (counter++ < maxRecords || maxRecords == -1) ) { diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java index 2ef116708..c8a0c0ed6 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/VCFIntegrationTest.java @@ -17,7 +17,7 @@ public class VCFIntegrationTest extends WalkerTest { String baseCommand = "-R " + b37KGReference + " -NO_HEADER -o %s "; - String test1 = baseCommand + "-T VariantAnnotator --variant " + testVCF + " -BTI variant"; + String test1 = baseCommand + "-T VariantAnnotator --variant " + testVCF + " -L " + testVCF; WalkerTestSpec spec1 = new WalkerTestSpec(test1, 1, Arrays.asList(md5ofInputVCF)); List result = executeTest("Test Variant Annotator with no changes", spec1).getFirst(); diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java new file mode 100644 index 000000000..2771a7e45 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsBenchmark.java @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.fragments; + +import com.google.caliper.Param; +import com.google.caliper.SimpleBenchmark; +import com.google.caliper.runner.CaliperMain; +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; + +import java.util.*; + +/** + * Caliper microbenchmark of fragment pileup + */ +public class FragmentUtilsBenchmark extends SimpleBenchmark { + List pileups; + + @Param({"0", "4", "30", "150", "1000"}) + int pileupSize; // set automatically by framework + + @Param({"200", "400"}) + int insertSize; // set automatically by framework + + @Override protected void setUp() { + final int nPileupsToGenerate = 100; + pileups = new ArrayList(nPileupsToGenerate); + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + GenomeLocParser genomeLocParser; + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + GenomeLoc loc = genomeLocParser.createGenomeLoc("chr1", 50); + final int readLen = 100; + + for ( int pileupN = 0; pileupN < nPileupsToGenerate; pileupN++ ) { + ReadBackedPileup rbp = ArtificialSAMUtils.createReadBackedPileup(header, loc, readLen, insertSize, pileupSize); + pileups.add(rbp); + } + } + +// public void timeOriginal(int rep) { +// run(rep, FragmentUtils.FragmentMatchingAlgorithm.ORIGINAL); +// } + + public void timeSkipNonOverlapping(int rep) { + int nFrags = 0; + for ( int i = 0; i < rep; i++ ) { + for ( ReadBackedPileup rbp : pileups ) + nFrags += 
FragmentUtils.create(rbp).getOverlappingPairs().size(); + } + } + + public static void main(String[] args) { + CaliperMain.main(FragmentUtilsBenchmark.class, args); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java new file mode 100644 index 000000000..cbe580809 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/fragments/FragmentUtilsUnitTest.java @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.fragments; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Test routines for read-backed pileup. + */ +public class FragmentUtilsUnitTest extends BaseTest { + private static SAMFileHeader header; + + private class FragmentUtilsTest extends TestDataProvider { + List statesForPileup = new ArrayList(); + List statesForReads = new ArrayList(); + + private FragmentUtilsTest(String name, int readLen, int leftStart, int rightStart, + boolean leftIsFirst, boolean leftIsNegative) { + super(FragmentUtilsTest.class, String.format("%s-leftIsFirst:%b-leftIsNegative:%b", name, leftIsFirst, leftIsNegative)); + + List pair = ArtificialSAMUtils.createPair(header, "readpair", readLen, leftStart, rightStart, leftIsFirst, leftIsNegative); + GATKSAMRecord left = pair.get(0); + GATKSAMRecord right = pair.get(1); + + for ( int pos = leftStart; pos < rightStart + readLen; pos++) { + boolean posCoveredByLeft = pos >= left.getAlignmentStart() && pos <= left.getAlignmentEnd(); + boolean posCoveredByRight = pos >= right.getAlignmentStart() && pos <= right.getAlignmentEnd(); + + if ( posCoveredByLeft || posCoveredByRight ) { + List reads = new ArrayList(); + List offsets = new ArrayList(); + + if ( posCoveredByLeft ) { + reads.add(left); + offsets.add(pos - left.getAlignmentStart()); + } + + if ( posCoveredByRight ) { + reads.add(right); + offsets.add(pos - right.getAlignmentStart()); + } + + boolean shouldBeFragment = 
posCoveredByLeft && posCoveredByRight; + ReadBackedPileup pileup = new ReadBackedPileupImpl(null, reads, offsets); + TestState testState = new TestState(shouldBeFragment ? 0 : 1, shouldBeFragment ? 1 : 0, pileup, null); + statesForPileup.add(testState); + } + + TestState testState = left.getAlignmentEnd() >= right.getAlignmentStart() ? new TestState(0, 1, null, pair) : new TestState(2, 0, null, pair); + statesForReads.add(testState); + } + } + } + + private class TestState { + int expectedSingletons, expectedPairs; + ReadBackedPileup pileup; + List rawReads; + + private TestState(final int expectedSingletons, final int expectedPairs, final ReadBackedPileup pileup, final List rawReads) { + this.expectedSingletons = expectedSingletons; + this.expectedPairs = expectedPairs; + this.pileup = pileup; + this.rawReads = rawReads; + } + } + + @DataProvider(name = "fragmentUtilsTest") + public Object[][] createTests() { + for ( boolean leftIsFirst : Arrays.asList(true, false) ) { + for ( boolean leftIsNegative : Arrays.asList(true, false) ) { + // Overlapping pair + // ----> [first] + // <--- [second] + new FragmentUtilsTest("overlapping-pair", 10, 1, 5, leftIsFirst, leftIsNegative); + + // Non-overlapping pair + // ----> + // <---- + new FragmentUtilsTest("nonoverlapping-pair", 10, 1, 15, leftIsFirst, leftIsNegative); + } + } + + return FragmentUtilsTest.getTests(FragmentUtilsTest.class); + } + + @Test(enabled = true, dataProvider = "fragmentUtilsTest") + public void testAsPileup(FragmentUtilsTest test) { + for ( TestState testState : test.statesForPileup ) { + ReadBackedPileup rbp = testState.pileup; + FragmentCollection fp = FragmentUtils.create(rbp); + Assert.assertEquals(fp.getOverlappingPairs().size(), testState.expectedPairs); + Assert.assertEquals(fp.getSingletonReads().size(), testState.expectedSingletons); + } + } + + @Test(enabled = true, dataProvider = "fragmentUtilsTest") + public void testAsListOfReadsFromPileup(FragmentUtilsTest test) { + for ( TestState 
testState : test.statesForPileup ) { + FragmentCollection fp = FragmentUtils.create(testState.pileup.getReads()); + Assert.assertEquals(fp.getOverlappingPairs().size(), testState.expectedPairs); + Assert.assertEquals(fp.getSingletonReads().size(), testState.expectedSingletons); + } + } + + @Test(enabled = true, dataProvider = "fragmentUtilsTest") + public void testAsListOfReads(FragmentUtilsTest test) { + for ( TestState testState : test.statesForReads ) { + FragmentCollection fp = FragmentUtils.create(testState.rawReads); + Assert.assertEquals(fp.getOverlappingPairs().size(), testState.expectedPairs); + Assert.assertEquals(fp.getSingletonReads().size(), testState.expectedSingletons); + } + } + + @Test(enabled = true, expectedExceptions = IllegalArgumentException.class) + public void testOutOfOrder() { + final List pair = ArtificialSAMUtils.createPair(header, "readpair", 100, 1, 50, true, true); + final GATKSAMRecord left = pair.get(0); + final GATKSAMRecord right = pair.get(1); + final List reads = Arrays.asList(right, left); // OUT OF ORDER! 
+ final List offsets = Arrays.asList(0, 50); + final ReadBackedPileup pileup = new ReadBackedPileupImpl(null, reads, offsets); + FragmentUtils.create(pileup); // should throw exception + } + + @BeforeTest + public void setup() { + header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java index e3a926fb9..35c6a4993 100644 --- a/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/genotype/vcf/VCFWriterUnitTest.java @@ -38,12 +38,13 @@ public class VCFWriterUnitTest extends BaseTest { private Set additionalColumns = new HashSet(); private File fakeVCFFile = new File("FAKEVCFFILEFORTESTING.vcf"); private GenomeLocParser genomeLocParser; + private IndexedFastaSequenceFile seq; @BeforeClass public void beforeTests() { File referenceFile = new File(hg18Reference); try { - IndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(referenceFile); + seq = new CachingIndexedFastaSequenceFile(referenceFile); genomeLocParser = new GenomeLocParser(seq); } catch(FileNotFoundException ex) { @@ -55,7 +56,7 @@ public class VCFWriterUnitTest extends BaseTest { @Test public void testBasicWriteAndRead() { VCFHeader header = createFakeHeader(metaData,additionalColumns); - VCFWriter writer = new StandardVCFWriter(fakeVCFFile); + VCFWriter writer = new StandardVCFWriter(fakeVCFFile, seq.getSequenceDictionary()); writer.writeHeader(header); writer.add(createVC(header)); writer.add(createVC(header)); @@ -104,7 +105,6 @@ public class VCFWriterUnitTest extends BaseTest { public static VCFHeader createFakeHeader(Set metaData, Set additionalColumns) { metaData.add(new VCFHeaderLine(VCFHeaderVersion.VCF4_0.getFormatString(), VCFHeaderVersion.VCF4_0.getVersionString())); metaData.add(new VCFHeaderLine("two", 
"2")); - additionalColumns.add("FORMAT"); additionalColumns.add("extra1"); additionalColumns.add("extra2"); return new VCFHeader(metaData, additionalColumns); @@ -158,6 +158,6 @@ public class VCFWriterUnitTest extends BaseTest { Assert.assertTrue(additionalColumns.contains(key)); index++; } - Assert.assertEquals(index+1, additionalColumns.size() /* for the header field we don't see */); + Assert.assertEquals(index, additionalColumns.size()); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java deleted file mode 100644 index 752695052..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalFileMergingIteratorUnitTest.java +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.interval; - -import org.testng.Assert; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLoc; -import net.sf.picard.reference.ReferenceSequenceFileFactory; - -import java.io.File; -import java.util.Iterator; -import java.util.List; -import java.util.ArrayList; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Jun 14, 2010 - * Time: 10:15:52 AM - * To change this template use File | Settings | File Templates. - */ -public class IntervalFileMergingIteratorUnitTest extends BaseTest { - - private static File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); - private static String intervalFileNameGATK = validationDataLocation+"test.gatk.intervals"; - private static String intervalFileNameBED = validationDataLocation+"test.bed"; - private static List results1 = null; - private static List results2 = null; - - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void init() { - genomeLocParser = new GenomeLocParser(ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile)); - - results1 = new ArrayList(); - results2 = new ArrayList(); - - results1.add(genomeLocParser.createGenomeLoc("chr1",1554)); - results1.add(genomeLocParser.createGenomeLoc("chr1",2538,2568)); - results1.add(genomeLocParser.createGenomeLoc("chr1",18932,19000)); - results1.add(genomeLocParser.createGenomeLoc("chr1",19001,25000)); - 
results1.add(genomeLocParser.createGenomeLoc("chr5",7415,7600)); - - results2.add(genomeLocParser.createGenomeLoc("chr1",1554)); - results2.add(genomeLocParser.createGenomeLoc("chr1",2538,2568)); - results2.add(genomeLocParser.createGenomeLoc("chr1",18932,25000)); - results2.add(genomeLocParser.createGenomeLoc("chr5",7415,7600)); - - } - - @Test - public void testGATKIntervalFileIterator_Overlap() { - logger.warn("Executing testGATKIntervalFileIterator_Overlap"); - - Iterator it = new IntervalFileMergingIterator(genomeLocParser,new File(intervalFileNameGATK),IntervalMergingRule.OVERLAPPING_ONLY); - Iterator check_it = results1.iterator(); - while(it.hasNext()) { - GenomeLoc l = it.next(); - GenomeLoc l_expected = check_it.next(); - //System.out.println("int: "+l+" expected: "+l_expected) ; - Assert.assertEquals(l_expected, l, "Unexpected location returned by the iterator: "+l); - } - } - - @Test - public void testGATKIntervalFileIterator_OverlapWithException() { - logger.warn("Executing testGATKIntervalFileIterator_OverlapWithException"); - - Iterator it = new IntervalFileMergingIterator(genomeLocParser,new File(intervalFileNameGATK),IntervalMergingRule.OVERLAPPING_ONLY); - Iterator check_it = results1.iterator(); - try { - while(it.hasNext()) { - GenomeLoc l = it.next(); - GenomeLoc l_expected = check_it.next(); -// System.out.println("int: "+l+" expected: "+l_expected) ; - } - } catch ( ReviewedStingException e) { - Assert.assertEquals("Interval chr5:7414 in the interval file is out of order.", e.getMessage()); - } - } - - @Test - public void testGATKIntervalFileIterator_All() { - logger.warn("Executing testGATKIntervalFileIterator_All"); - - Iterator it = new IntervalFileMergingIterator(genomeLocParser,new File(intervalFileNameGATK),IntervalMergingRule.ALL); - Iterator check_it = results2.iterator(); - while(it.hasNext()) { - GenomeLoc l = it.next(); - GenomeLoc l_expected = check_it.next(); -// System.out.println("int: "+l+" expected: "+l_expected) ; - 
Assert.assertEquals(l_expected, l, "Unexpected location returned by the iterator: "+l); - } - } - - @Test - public void testBEDIntervalFileIterator_Overlap() { - logger.warn("Executing testBEDIntervalFileIterator_Overlap"); - - Iterator it = new IntervalFileMergingIterator(genomeLocParser,new File(intervalFileNameBED),IntervalMergingRule.OVERLAPPING_ONLY); - Iterator check_it = results1.iterator(); - while(it.hasNext()) { - GenomeLoc l = it.next(); - GenomeLoc l_expected = check_it.next(); -// System.out.println("int: "+l+" expected: "+l_expected) ; - Assert.assertEquals(l_expected, l, "Unexpected location returned by the iterator: "+l); - } - } - -} diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java index 379d79c84..75bdc3142 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalIntegrationTest.java @@ -48,19 +48,20 @@ public class IntervalIntegrationTest extends WalkerTest { executeTest("testAllIntervalsImplicit",spec); } - @Test(enabled = true) - public void testAllExplicitIntervalParsing() { - String md5 = "7821db9e14d4f8e07029ff1959cd5a99"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T CountLoci" + - " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + - " -R " + hg18Reference + - " -L all" + - " -o %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testAllIntervalsExplicit",spec); - } +// '-L all' is no longer supported +// @Test(enabled = true) +// public void testAllExplicitIntervalParsing() { +// String md5 = "7821db9e14d4f8e07029ff1959cd5a99"; +// WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( +// "-T CountLoci" + +// " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + +// " -R " + hg18Reference + +// " -L all" + +// " -o 
%s", +// 1, // just one output file +// Arrays.asList(md5)); +// executeTest("testAllIntervalsExplicit",spec); +// } @Test public void testUnmappedReadInclusion() { @@ -82,7 +83,7 @@ public class IntervalIntegrationTest extends WalkerTest { executeTest("testUnmappedReadInclusion",spec); } - @Test(enabled = true) + @Test(enabled = false) public void testUnmappedReadExclusion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T PrintReads" + @@ -102,5 +103,122 @@ public class IntervalIntegrationTest extends WalkerTest { executeTest("testUnmappedReadExclusion",spec); } + @Test(enabled = true) + public void testIntervalParsingFromFile() { + String md5 = "48a24b70a0b376535542b996af517398"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntervalParsingFromFile", spec); + } + + @Test(enabled = true) + public void testIntervalMergingFromFiles() { + String md5 = "9ae0ea9e3c9c6e1b9b6252c8395efdc1"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -L " + validationDataLocation + "intervalTest.2.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntervalMergingFromFiles", spec); + } + + @Test(enabled = true) + public void testIntervalExclusionsFromFiles() { + String md5 = "26ab0db90d72e28ad0ba1e22ee510510"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -XL " + 
validationDataLocation + "intervalTest.2.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIntervalExclusionsFromFiles", spec); + } + + @Test(enabled = true) + public void testMixedIntervalMerging() { + String md5 = "7c5aba41f53293b712fd86d08ed5b36e"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -L chr1:1677524-1677528", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testMixedIntervalMerging", spec); + } + + @Test(enabled = true) + public void testComplexVCF() { + String md5 = "166d77ac1b46a1ec38aa35ab7e628ab5"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.3.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testComplexVCF", spec); + } + + @Test(enabled = true) + public void testMergingWithComplexVCF() { + String md5 = "6d7fce9fee471194aa8b5b6e47267f03"; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -XL " + validationDataLocation + "intervalTest.3.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testMergingWithComplexVCF", spec); + } + + @Test(enabled = true) + public void testEmptyVCF() { + String md5 = ""; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.empty.vcf", + 1, // just one 
output file + Arrays.asList(md5)); + executeTest("testEmptyVCFError", spec); + } + + @Test(enabled = true) + public void testIncludeExcludeIsTheSame() { + String md5 = ""; + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T CountLoci" + + " -I " + validationDataLocation + "OV-0930.normal.chunk.bam" + + " -R " + hg18Reference + + " -o %s" + + " -L " + validationDataLocation + "intervalTest.1.vcf" + + " -XL " + validationDataLocation + "intervalTest.1.vcf", + 1, // just one output file + Arrays.asList(md5)); + executeTest("testIncludeExcludeIsTheSame", spec); + } + } diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java index bb892eec8..9c3b905c2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/interval/IntervalUtilsUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.interval; import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.util.IntervalUtil; import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; @@ -30,6 +31,20 @@ public class IntervalUtilsUnitTest extends BaseTest { private SAMFileHeader hg19Header; private GenomeLocParser hg19GenomeLocParser; private List hg19ReferenceLocs; + private List hg19exomeIntervals; + + private List getLocs(String... 
intervals) { + return getLocs(Arrays.asList(intervals)); + } + + private List getLocs(List intervals) { + if (intervals.size() == 0) + return hg18ReferenceLocs; + List locs = new ArrayList(); + for (String interval: intervals) + locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); + return locs; + } @BeforeClass public void init() { @@ -54,12 +69,198 @@ public class IntervalUtilsUnitTest extends BaseTest { ReferenceSequenceFile seq = new CachingIndexedFastaSequenceFile(hg19Ref); hg19GenomeLocParser = new GenomeLocParser(seq); hg19ReferenceLocs = Collections.unmodifiableList(GenomeLocSortedSet.createSetFromSequenceDictionary(referenceDataSource.getReference().getSequenceDictionary()).toList()) ; + + hg19exomeIntervals = Collections.unmodifiableList(IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(hg19Intervals))); } catch(FileNotFoundException ex) { throw new UserException.CouldNotReadInputFile(hg19Ref,ex); } } + // ------------------------------------------------------------------------------------- + // + // tests to ensure the quality of the interval cuts of the interval cutting functions + // + // ------------------------------------------------------------------------------------- + + private class IntervalSlicingTest extends TestDataProvider { + public int parts; + public double maxAllowableVariance; + + private IntervalSlicingTest(final int parts, final double maxAllowableVariance) { + super(IntervalSlicingTest.class); + this.parts = parts; + this.maxAllowableVariance = maxAllowableVariance; + } + + public String toString() { + return String.format("IntervalSlicingTest parts=%d maxVar=%.2f", parts, maxAllowableVariance); + } + } + + @DataProvider(name = "intervalslicingdata") + public Object[][] createTrees() { + new IntervalSlicingTest(1, 0); + new IntervalSlicingTest(2, 1); + new IntervalSlicingTest(5, 1); + new IntervalSlicingTest(10, 1); + new IntervalSlicingTest(67, 1); + new IntervalSlicingTest(100, 1); + new 
IntervalSlicingTest(500, 1); + new IntervalSlicingTest(1000, 1); + return IntervalSlicingTest.getTests(IntervalSlicingTest.class); + } + + @Test(enabled = true, dataProvider = "intervalslicingdata") + public void testFixedScatterIntervalsAlgorithm(IntervalSlicingTest test) { + List> splits = IntervalUtils.splitFixedIntervals(hg19exomeIntervals, test.parts); + + long totalSize = IntervalUtils.intervalSize(hg19exomeIntervals); + long idealSplitSize = totalSize / test.parts; + + long sumOfSplitSizes = 0; + int counter = 0; + for ( final List split : splits ) { + long splitSize = IntervalUtils.intervalSize(split); + double sigma = (splitSize - idealSplitSize) / (1.0 * idealSplitSize); + //logger.warn(String.format("Split %d size %d ideal %d sigma %.2f", counter, splitSize, idealSplitSize, sigma)); + counter++; + sumOfSplitSizes += splitSize; + Assert.assertTrue(Math.abs(sigma) <= test.maxAllowableVariance, String.format("Interval %d (size %d ideal %d) has a variance %.2f outside of the tolerated range %.2f", counter, splitSize, idealSplitSize, sigma, test.maxAllowableVariance)); + } + + Assert.assertEquals(totalSize, sumOfSplitSizes, "Split intervals don't contain the exact number of bases in the origianl intervals"); + } + + // ------------------------------------------------------------------------------------- + // + // splitLocusIntervals tests + // + // ------------------------------------------------------------------------------------- + + /** large scale tests for many intervals */ + private class SplitLocusIntervalsTest extends TestDataProvider { + final List originalIntervals; + final public int parts; + + private SplitLocusIntervalsTest(final String name, List originalIntervals, final int parts) { + super(SplitLocusIntervalsTest.class, name); + this.parts = parts; + this.originalIntervals = originalIntervals; + } + + public String toString() { + return String.format("%s parts=%d", super.toString(), parts); + } + } + + @DataProvider(name = 
"IntervalRepartitionTest") + public Object[][] createIntervalRepartitionTest() { + for ( int parts : Arrays.asList(1, 2, 3, 10, 13, 100, 151, 1000, 10000) ) { + //for ( int parts : Arrays.asList(10) ) { + new SplitLocusIntervalsTest("hg19RefLocs", hg19ReferenceLocs, parts); + new SplitLocusIntervalsTest("hg19ExomeLocs", hg19exomeIntervals, parts); + } + + return SplitLocusIntervalsTest.getTests(SplitLocusIntervalsTest.class); + } + + @Test(enabled = true, dataProvider = "IntervalRepartitionTest") + public void testIntervalRepartition(SplitLocusIntervalsTest test) { + List> splitByLocus = IntervalUtils.splitLocusIntervals(test.originalIntervals, test.parts); + Assert.assertEquals(splitByLocus.size(), test.parts, "SplitLocusIntervals failed to generate correct number of intervals"); + List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); + + // test overall size + final long originalSize = IntervalUtils.intervalSize(test.originalIntervals); + final long flatSize = IntervalUtils.intervalSize(flat); + Assert.assertEquals(flatSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases"); + + // test size of each split + final long ideal = (long)Math.floor(originalSize / (1.0 * test.parts)); + final long maxSize = ideal + (originalSize % test.parts) * test.parts; // no more than N * rounding error in size + for ( final List split : splitByLocus ) { + final long splitSize = IntervalUtils.intervalSize(split); + Assert.assertTrue(splitSize >= ideal && splitSize <= maxSize, + String.format("SplitLocusIntervals interval (start=%s) has size %d outside of bounds ideal=%d, max=%d", + split.get(0), splitSize, ideal, maxSize)); + } + + // test that every base in original is covered once by a base in split by locus intervals + String diff = IntervalUtils.equateIntervals(test.originalIntervals, flat); + Assert.assertNull(diff, diff); + } + + /** small scale tests where the expected cuts are enumerated upfront for testing */ + private class 
SplitLocusIntervalsSmallTest extends TestDataProvider { + final List original; + final public int parts; + final public int expectedParts; + final List expected; + + private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected) { + this(name, originalIntervals, parts, expected, parts); + } + + private SplitLocusIntervalsSmallTest(final String name, List originalIntervals, final int parts, List expected, int expectedParts) { + super(SplitLocusIntervalsSmallTest.class, name); + this.parts = parts; + this.expectedParts = expectedParts; + this.original = originalIntervals; + this.expected = expected; + } + + public String toString() { + return String.format("%s parts=%d", super.toString(), parts); + } + } + + @DataProvider(name = "SplitLocusIntervalsSmallTest") + public Object[][] createSplitLocusIntervalsSmallTest() { + GenomeLoc bp01_10 = hg19GenomeLocParser.createGenomeLoc("1", 1, 10); + + GenomeLoc bp1_5 = hg19GenomeLocParser.createGenomeLoc("1", 1, 5); + GenomeLoc bp6_10 = hg19GenomeLocParser.createGenomeLoc("1", 6, 10); + new SplitLocusIntervalsSmallTest("cut into two", Arrays.asList(bp01_10), 2, Arrays.asList(bp1_5, bp6_10)); + + GenomeLoc bp20_30 = hg19GenomeLocParser.createGenomeLoc("1", 20, 30); + new SplitLocusIntervalsSmallTest("two in two", Arrays.asList(bp01_10, bp20_30), 2, Arrays.asList(bp01_10, bp20_30)); + + GenomeLoc bp1_7 = hg19GenomeLocParser.createGenomeLoc("1", 1, 7); + GenomeLoc bp8_10 = hg19GenomeLocParser.createGenomeLoc("1", 8, 10); + GenomeLoc bp20_23 = hg19GenomeLocParser.createGenomeLoc("1", 20, 23); + GenomeLoc bp24_30 = hg19GenomeLocParser.createGenomeLoc("1", 24, 30); + new SplitLocusIntervalsSmallTest("two in three", Arrays.asList(bp01_10, bp20_30), 3, + Arrays.asList(bp1_7, bp8_10, bp20_23, bp24_30)); + + GenomeLoc bp1_2 = hg19GenomeLocParser.createGenomeLoc("1", 1, 2); + GenomeLoc bp1_1 = hg19GenomeLocParser.createGenomeLoc("1", 1, 1); + GenomeLoc bp2_2 = 
hg19GenomeLocParser.createGenomeLoc("1", 2, 2); + new SplitLocusIntervalsSmallTest("too many pieces", Arrays.asList(bp1_2), 5, Arrays.asList(bp1_1, bp2_2), 2); + + new SplitLocusIntervalsSmallTest("emptyList", Collections.emptyList(), 5, Collections.emptyList(), 0); + + return SplitLocusIntervalsSmallTest.getTests(SplitLocusIntervalsSmallTest.class); + } + + @Test(enabled = true, dataProvider = "SplitLocusIntervalsSmallTest") + public void splitLocusIntervalsSmallTest(SplitLocusIntervalsSmallTest test) { + List> splitByLocus = IntervalUtils.splitLocusIntervals(test.original, test.parts); + Assert.assertEquals(splitByLocus.size(), test.expectedParts, "SplitLocusIntervals failed to generate correct number of intervals"); + List flat = IntervalUtils.flattenSplitIntervals(splitByLocus); + + // test sizes + final long originalSize = IntervalUtils.intervalSize(test.original); + final long splitSize = IntervalUtils.intervalSize(flat); + Assert.assertEquals(splitSize, originalSize, "SplitLocusIntervals locs cover an incorrect number of bases"); + + Assert.assertEquals(flat, test.expected, "SplitLocusIntervals locs not expected intervals"); + } + + // + // Misc. 
tests + // + @Test(expectedExceptions=UserException.class) public void testMergeListsBySetOperatorNoOverlap() { // a couple of lists we'll use for the testing @@ -119,6 +320,22 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals(ret.size(), 20); } + @Test + public void testOverlappingIntervalsFromSameSourceWithIntersection() { + // a couple of lists we'll use for the testing + List source1 = new ArrayList(); + List source2 = new ArrayList(); + + source1.add(hg18GenomeLocParser.createGenomeLoc("chr1", 10, 20)); + source1.add(hg18GenomeLocParser.createGenomeLoc("chr1", 15, 25)); + + source2.add(hg18GenomeLocParser.createGenomeLoc("chr1", 16, 18)); + source2.add(hg18GenomeLocParser.createGenomeLoc("chr1", 22, 24)); + + List ret = IntervalUtils.mergeListsBySetOperator(source1, source2, IntervalSetRule.INTERSECTION); + Assert.assertEquals(ret.size(), 2); + } + @Test public void testGetContigLengths() { Map lengths = IntervalUtils.getContigSizes(new File(BaseTest.hg18Reference)); @@ -129,19 +346,6 @@ public class IntervalUtilsUnitTest extends BaseTest { Assert.assertEquals((long)lengths.get("chrX"), 154913754); } - private List getLocs(String... 
intervals) { - return getLocs(Arrays.asList(intervals)); - } - - private List getLocs(List intervals) { - if (intervals.size() == 0) - return hg18ReferenceLocs; - List locs = new ArrayList(); - for (String interval: intervals) - locs.add(hg18GenomeLocParser.parseGenomeLoc(interval)); - return locs; - } - @Test public void testParseIntervalArguments() { Assert.assertEquals(getLocs().size(), 45); @@ -174,12 +378,12 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("basic.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); @@ -200,12 +404,12 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("less.", 3, ".intervals"); List locs = getLocs("chr1", "chr2", "chr3", "chr4"); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = 
IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); @@ -228,8 +432,8 @@ public class IntervalUtilsUnitTest extends BaseTest { public void testScatterFixedIntervalsMoreFiles() { List files = testFiles("more.", 3, ".intervals"); List locs = getLocs("chr1", "chr2"); - List splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, locs.size()); // locs.size() instead of files.size() + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); } @Test public void testScatterFixedIntervalsStart() { @@ -242,12 +446,12 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - List locs1 = 
IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); @@ -270,12 +474,12 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); @@ -298,12 
+502,12 @@ public class IntervalUtilsUnitTest extends BaseTest { List files = testFiles("split.", 3, ".intervals"); List locs = getLocs(intervals); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 2); Assert.assertEquals(locs2.size(), 1); @@ -318,8 +522,8 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsFile() { List files = testFiles("sg.", 20, ".intervals"); - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list"), false); - List splits = IntervalUtils.splitFixedIntervals(locs, files.size()); + List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(BaseTest.GATKDataLocation + "whole_exome_agilent_designed_120.targets.hg18.chr20.interval_list")); + List> splits = IntervalUtils.splitFixedIntervals(locs, files.size()); int[] counts = { 125, 138, 287, 291, 312, 105, 155, 324, @@ -332,21 +536,18 @@ public class 
IntervalUtilsUnitTest extends BaseTest { }; //String splitCounts = ""; - for (int lastIndex = 0, i = 0; i < splits.size(); i++) { - int splitIndex = splits.get(i); - int splitCount = (splitIndex - lastIndex); - //splitCounts += ", " + splitCount; - lastIndex = splitIndex; + for (int i = 0; i < splits.size(); i++) { + int splitCount = splits.get(i).size(); Assert.assertEquals(splitCount, counts[i], "Num intervals in split " + i); } //System.out.println(splitCounts.substring(2)); - IntervalUtils.scatterFixedIntervals(hg18Header, locs, splits, files); + IntervalUtils.scatterFixedIntervals(hg18Header, splits, files); int locIndex = 0; for (int i = 0; i < files.size(); i++) { String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file), false); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(file)); Assert.assertEquals(parsedLocs.size(), counts[i], "Intervals in " + file); for (GenomeLoc parsedLoc: parsedLocs) Assert.assertEquals(parsedLoc, locs.get(locIndex), String.format("Genome loc %d from file %d", locIndex++, i)); @@ -357,12 +558,12 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test public void testScatterFixedIntervalsMax() { List files = testFiles("sg.", 85, ".intervals"); - List splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); - IntervalUtils.scatterFixedIntervals(hg19Header, hg19ReferenceLocs, splits, files); + List> splits = IntervalUtils.splitFixedIntervals(hg19ReferenceLocs, files.size()); + IntervalUtils.scatterFixedIntervals(hg19Header, splits, files); for (int i = 0; i < files.size(); i++) { String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), false); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file)); Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + 
"].size()"); Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); } @@ -379,9 +580,9 @@ public class IntervalUtilsUnitTest extends BaseTest { IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); @@ -402,9 +603,9 @@ public class IntervalUtilsUnitTest extends BaseTest { IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3"), files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); @@ -426,9 
+627,9 @@ public class IntervalUtilsUnitTest extends BaseTest { IntervalUtils.scatterContigIntervals(hg18Header, getLocs("chr1", "chr2", "chr3", "chr4"), files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); @@ -458,9 +659,9 @@ public class IntervalUtilsUnitTest extends BaseTest { IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 2); Assert.assertEquals(locs2.size(), 1); @@ -484,9 +685,9 @@ public class IntervalUtilsUnitTest extends BaseTest { IntervalUtils.scatterContigIntervals(hg18Header, 
getLocs(intervals), files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 2); @@ -510,9 +711,9 @@ public class IntervalUtilsUnitTest extends BaseTest { IntervalUtils.scatterContigIntervals(hg18Header, getLocs(intervals), files); - List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString()), false); - List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString()), false); - List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString()), false); + List locs1 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(0).toString())); + List locs2 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(1).toString())); + List locs3 = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Arrays.asList(files.get(2).toString())); Assert.assertEquals(locs1.size(), 1); Assert.assertEquals(locs2.size(), 1); @@ -531,7 +732,7 @@ public class IntervalUtilsUnitTest extends BaseTest { for (int i = 0; i < files.size(); i++) { String file = files.get(i).toString(); - List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file), 
false); + List parsedLocs = IntervalUtils.parseIntervalArguments(hg19GenomeLocParser, Arrays.asList(file)); Assert.assertEquals(parsedLocs.size(), 1, "parsedLocs[" + i + "].size()"); Assert.assertEquals(parsedLocs.get(0), hg19ReferenceLocs.get(i), "parsedLocs[" + i + "].get()"); } @@ -555,7 +756,7 @@ public class IntervalUtilsUnitTest extends BaseTest { @Test(dataProvider="unmergedIntervals") public void testUnmergedIntervals(String unmergedIntervals) { - List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals), false); + List locs = IntervalUtils.parseIntervalArguments(hg18GenomeLocParser, Collections.singletonList(validationDataLocation + unmergedIntervals)); Assert.assertEquals(locs.size(), 2); List merged = IntervalUtils.mergeIntervalLocations(locs, IntervalMergingRule.ALL); diff --git a/public/java/test/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIteratorUnitTest.java deleted file mode 100644 index 0b4e52a3d..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/interval/NwayIntervalMergingIteratorUnitTest.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils.interval; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.GenomeLoc; -import net.sf.picard.reference.ReferenceSequenceFileFactory; -import org.testng.Assert; -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.ArrayList; -import java.util.List; -import java.util.Iterator; -import java.io.File; - -/** - * Created by IntelliJ IDEA. - * User: asivache - * Date: Oct 28, 2010 - * Time: 2:46:03 PM - * To change this template use File | Settings | File Templates. 
- */ -public class NwayIntervalMergingIteratorUnitTest extends BaseTest { - - private static File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); - private GenomeLocParser genomeLocParser; - - private static List stream1 = null; - private static List stream2 = null; - private static List expected = null; - - @BeforeClass - public static void init() { - GenomeLocParser genomeLocParser = new GenomeLocParser(ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile)); - - stream1 = new ArrayList(); - stream2 = new ArrayList(); - expected = new ArrayList(); - - stream1.add(genomeLocParser.createGenomeLoc("chr1",1554,1560)); // 1 - stream1.add(genomeLocParser.createGenomeLoc("chr1",2538,2568)); // 3 - stream1.add(genomeLocParser.createGenomeLoc("chr1",2600,2610)); // 4 - stream1.add(genomeLocParser.createGenomeLoc("chr1",2609,2625)); // 4 - stream1.add(genomeLocParser.createGenomeLoc("chr1",18932,19000)); // 6 - stream1.add(genomeLocParser.createGenomeLoc("chr1",19001,25000)); //6 - - stream2.add(genomeLocParser.createGenomeLoc("chr1",1565,1570)); //2 - stream2.add(genomeLocParser.createGenomeLoc("chr1",2598,2604)); // 4 - stream2.add(genomeLocParser.createGenomeLoc("chr1",7415,7600)); // 5 - stream2.add(genomeLocParser.createGenomeLoc("chr1",18932,25000)); // 6 - stream2.add(genomeLocParser.createGenomeLoc("chr1",30000,35000)); // 7 - - expected.add(genomeLocParser.createGenomeLoc("chr1",1554,1560)); // 1 - expected.add(genomeLocParser.createGenomeLoc("chr1",1565,1570)); //2 - expected.add(genomeLocParser.createGenomeLoc("chr1",2538,2568)); // 3 - expected.add(genomeLocParser.createGenomeLoc("chr1",2598,2625)); // 4 - expected.add(genomeLocParser.createGenomeLoc("chr1",7415,7600)); // 5 - expected.add(genomeLocParser.createGenomeLoc("chr1",18932,25000)); // 6 - expected.add(genomeLocParser.createGenomeLoc("chr1",30000,35000)); // 7 - - - } - - @Test - public void testNwayIntervalMergingIterator() { - 
logger.warn("testNwayIntervalMergingIterator"); - - Iterator it1 = stream1.iterator(); - Iterator it2 = stream2.iterator(); - - Iterator e_it = expected.iterator(); - - - - NwayIntervalMergingIterator it = new NwayIntervalMergingIterator(IntervalMergingRule.OVERLAPPING_ONLY); - it.add(it1); - it.add(it2); - - while(it.hasNext()) { - GenomeLoc l = it.next(); - GenomeLoc l_expected = e_it.next(); - //System.out.println("int: "+l+" expected: "+l_expected) ; - Assert.assertEquals(l,l_expected,"Unexpected location returned by the iterator: "+l); - } - } - - -} diff --git a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java new file mode 100644 index 000000000..4caf7f485 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java @@ -0,0 +1,197 @@ +package org.broadinstitute.sting.utils.io; + +import org.apache.commons.io.FileUtils; +import org.broadinstitute.sting.BaseTest; +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class IOUtilsUnitTest extends BaseTest { + @Test + public void testGoodTempDir() { + IOUtils.checkTempDir(new File("/tmp/queue")); + } + + @Test(expectedExceptions=UserException.BadTmpDir.class) + public void testBadTempDir() { + IOUtils.checkTempDir(new File("/tmp")); + } + + @Test + public void testAbsoluteSubDir() { + File subDir = IOUtils.absolute(new File("."), new File("/path/to/file")); + Assert.assertEquals(subDir, new File("/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/path"), new File("/path/to/file")); + Assert.assertEquals(subDir, new File("/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/path"), new File(".")); + Assert.assertEquals(subDir, new File("/different/path")); + } + + @Test 
+ public void testRelativeSubDir() throws IOException { + File subDir = IOUtils.absolute(new File("."), new File("path/to/file")); + Assert.assertEquals(subDir.getCanonicalFile(), new File("path/to/file").getCanonicalFile()); + + subDir = IOUtils.absolute(new File("/different/path"), new File("path/to/file")); + Assert.assertEquals(subDir, new File("/different/path/path/to/file")); + } + + @Test + public void testDottedSubDir() throws IOException { + File subDir = IOUtils.absolute(new File("."), new File("path/../to/file")); + Assert.assertEquals(subDir.getCanonicalFile(), new File("path/../to/./file").getCanonicalFile()); + + subDir = IOUtils.absolute(new File("."), new File("/path/../to/file")); + Assert.assertEquals(subDir, new File("/path/../to/file")); + + subDir = IOUtils.absolute(new File("/different/../path"), new File("path/to/file")); + Assert.assertEquals(subDir, new File("/different/../path/path/to/file")); + + subDir = IOUtils.absolute(new File("/different/./path"), new File("/path/../to/file")); + Assert.assertEquals(subDir, new File("/path/../to/file")); + } + + @Test + public void testTempDir() { + File tempDir = IOUtils.tempDir("Q-Unit-Test", "", new File("queueTempDirToDelete")); + Assert.assertTrue(tempDir.exists()); + Assert.assertFalse(tempDir.isFile()); + Assert.assertTrue(tempDir.isDirectory()); + boolean deleted = IOUtils.tryDelete(tempDir); + Assert.assertTrue(deleted); + Assert.assertFalse(tempDir.exists()); + } + + @Test + public void testDirLevel() { + File dir = IOUtils.dirLevel(new File("/path/to/directory"), 1); + Assert.assertEquals(dir, new File("/path")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 2); + Assert.assertEquals(dir, new File("/path/to")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 3); + Assert.assertEquals(dir, new File("/path/to/directory")); + + dir = IOUtils.dirLevel(new File("/path/to/directory"), 4); + Assert.assertEquals(dir, new File("/path/to/directory")); + } + + @Test + public 
void testAbsolute() { + File dir = IOUtils.absolute(new File("/path/./to/./directory/.")); + Assert.assertEquals(dir, new File("/path/to/directory")); + + dir = IOUtils.absolute(new File("/")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/.")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/././.")); + Assert.assertEquals(dir, new File("/")); + + dir = IOUtils.absolute(new File("/./directory/.")); + Assert.assertEquals(dir, new File("/directory")); + + dir = IOUtils.absolute(new File("/./directory/./")); + Assert.assertEquals(dir, new File("/directory")); + + dir = IOUtils.absolute(new File("/./directory./")); + Assert.assertEquals(dir, new File("/directory.")); + + dir = IOUtils.absolute(new File("/./.directory/")); + Assert.assertEquals(dir, new File("/.directory")); + } + + @Test + public void testTail() throws IOException { + List lines = Arrays.asList( + "chr18_random 4262 3154410390 50 51", + "chr19_random 301858 3154414752 50 51", + "chr21_random 1679693 3154722662 50 51", + "chr22_random 257318 3156435963 50 51", + "chrX_random 1719168 3156698441 50 51"); + List tail = IOUtils.tail(new File(BaseTest.hg18Reference + ".fai"), 5); + Assert.assertEquals(tail.size(), 5); + for (int i = 0; i < 5; i++) + Assert.assertEquals(tail.get(i), lines.get(i)); + } + + @Test + public void testWriteSystemFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("StingText.properties", null), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteSystemTempFile() throws IOException { + File temp = IOUtils.writeTempResource(new Resource("StingText.properties", null)); + try { + Assert.assertTrue(temp.getName().startsWith("StingText"), "File does not start with 'StingText.': " + temp); + Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); + } 
finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMissingSystemFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("MissingStingText.properties", null), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteRelativeFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + IOUtils.writeResource(new Resource("/StingText.properties", IOUtils.class), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testWriteRelativeTempFile() throws IOException { + File temp = IOUtils.writeTempResource(new Resource("/StingText.properties", IOUtils.class)); + try { + Assert.assertTrue(temp.getName().startsWith("StingText"), "File does not start with 'StingText.': " + temp); + Assert.assertTrue(temp.getName().endsWith(".properties"), "File does not end with '.properties': " + temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testMissingRelativeFile() throws IOException { + File temp = createTempFile("temp.", ".properties"); + try { + // Looking for /org/broadinstitute/sting/utils/file/StingText.properties + IOUtils.writeResource(new Resource("StingText.properties", IOUtils.class), temp); + } finally { + FileUtils.deleteQuietly(temp); + } + } + + @Test + public void testResourceProperties() { + Resource resource = new Resource("foo", Resource.class); + Assert.assertEquals(resource.getPath(), "foo"); + Assert.assertEquals(resource.getRelativeClass(), Resource.class); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java index fb479ab47..6e955289c 100644 --- 
a/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/pileup/ReadBackedPileupUnitTest.java @@ -26,9 +26,8 @@ package org.broadinstitute.sting.utils.pileup; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; -import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.annotations.Test; @@ -51,27 +50,25 @@ public class ReadBackedPileupUnitTest { header.addReadGroup(readGroupOne); header.addReadGroup(readGroupTwo); - SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,10); + GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,10); read1.setAttribute("RG",readGroupOne.getId()); - SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); + GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); read2.setAttribute("RG",readGroupTwo.getId()); - SAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,10); + GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,10); read3.setAttribute("RG",readGroupOne.getId()); - SAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,10); + GATKSAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,10); read4.setAttribute("RG",readGroupTwo.getId()); - SAMRecord read5 = ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,10); + GATKSAMRecord read5 = ArtificialSAMUtils.createArtificialRead(header,"read5",0,1,10); read5.setAttribute("RG",readGroupTwo.getId()); - SAMRecord read6 = ArtificialSAMUtils.createArtificialRead(header,"read6",0,1,10); + GATKSAMRecord read6 = 
ArtificialSAMUtils.createArtificialRead(header,"read6",0,1,10); read6.setAttribute("RG",readGroupOne.getId()); - SAMRecord read7 = ArtificialSAMUtils.createArtificialRead(header,"read7",0,1,10); + GATKSAMRecord read7 = ArtificialSAMUtils.createArtificialRead(header,"read7",0,1,10); read7.setAttribute("RG",readGroupOne.getId()); - ReadBackedPileup pileup = new ReadBackedPileupImpl(null, - Arrays.asList(read1,read2,read3,read4,read5,read6,read7), - Arrays.asList(1,1,1,1,1,1,1)); + ReadBackedPileup pileup = new ReadBackedPileupImpl(null, Arrays.asList(read1,read2,read3,read4,read5,read6,read7), Arrays.asList(1,1,1,1,1,1,1)); ReadBackedPileup rg1Pileup = pileup.getPileupForReadGroup("rg1"); - List rg1Reads = rg1Pileup.getReads(); + List rg1Reads = rg1Pileup.getReads(); Assert.assertEquals(rg1Reads.size(), 4, "Wrong number of reads in read group rg1"); Assert.assertEquals(rg1Reads.get(0), read1, "Read " + read1.getReadName() + " should be in rg1 but isn't"); Assert.assertEquals(rg1Reads.get(1), read3, "Read " + read3.getReadName() + " should be in rg1 but isn't"); @@ -79,7 +76,7 @@ public class ReadBackedPileupUnitTest { Assert.assertEquals(rg1Reads.get(3), read7, "Read " + read7.getReadName() + " should be in rg1 but isn't"); ReadBackedPileup rg2Pileup = pileup.getPileupForReadGroup("rg2"); - List rg2Reads = rg2Pileup.getReads(); + List rg2Reads = rg2Pileup.getReads(); Assert.assertEquals(rg2Reads.size(), 3, "Wrong number of reads in read group rg2"); Assert.assertEquals(rg2Reads.get(0), read2, "Read " + read2.getReadName() + " should be in rg2 but isn't"); Assert.assertEquals(rg2Reads.get(1), read4, "Read " + read4.getReadName() + " should be in rg2 but isn't"); @@ -93,17 +90,17 @@ public class ReadBackedPileupUnitTest { public void testSplitByNullReadGroups() { SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); - SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,10); - SAMRecord read2 = 
ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); - SAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,10); + GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,10); + GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); + GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,10); ReadBackedPileup pileup = new ReadBackedPileupImpl(null, Arrays.asList(read1,read2,read3), Arrays.asList(1,1,1)); ReadBackedPileup nullRgPileup = pileup.getPileupForReadGroup(null); - List nullRgReads = nullRgPileup.getReads(); - Assert.assertEquals(nullRgPileup.size(), 3, "Wrong number of reads in null read group"); + List nullRgReads = nullRgPileup.getReads(); + Assert.assertEquals(nullRgPileup.getNumberOfElements(), 3, "Wrong number of reads in null read group"); Assert.assertEquals(nullRgReads.get(0), read1, "Read " + read1.getReadName() + " should be in null rg but isn't"); Assert.assertEquals(nullRgReads.get(1), read2, "Read " + read2.getReadName() + " should be in null rg but isn't"); Assert.assertEquals(nullRgReads.get(2), read3, "Read " + read3.getReadName() + " should be in null rg but isn't"); @@ -126,13 +123,13 @@ public class ReadBackedPileupUnitTest { header.addReadGroup(readGroupOne); header.addReadGroup(readGroupTwo); - SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,10); + GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,10); read1.setAttribute("RG",readGroupOne.getId()); - SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); + GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); read2.setAttribute("RG",readGroupTwo.getId()); - SAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,10); + GATKSAMRecord read3 = ArtificialSAMUtils.createArtificialRead(header,"read3",0,1,10); 
read3.setAttribute("RG",readGroupOne.getId()); - SAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,10); + GATKSAMRecord read4 = ArtificialSAMUtils.createArtificialRead(header,"read4",0,1,10); read4.setAttribute("RG",readGroupTwo.getId()); ReadBackedPileupImpl sample1Pileup = new ReadBackedPileupImpl(null, @@ -141,21 +138,21 @@ public class ReadBackedPileupUnitTest { ReadBackedPileupImpl sample2Pileup = new ReadBackedPileupImpl(null, Arrays.asList(read2,read4), Arrays.asList(1,1)); - Map sampleToPileupMap = new HashMap(); - sampleToPileupMap.put(new Sample(readGroupOne.getSample()),sample1Pileup); - sampleToPileupMap.put(new Sample(readGroupTwo.getSample()),sample2Pileup); + Map sampleToPileupMap = new HashMap(); + sampleToPileupMap.put(readGroupOne.getSample(),sample1Pileup); + sampleToPileupMap.put(readGroupTwo.getSample(),sample2Pileup); ReadBackedPileup compositePileup = new ReadBackedPileupImpl(null,sampleToPileupMap); ReadBackedPileup rg1Pileup = compositePileup.getPileupForReadGroup("rg1"); - List rg1Reads = rg1Pileup.getReads(); + List rg1Reads = rg1Pileup.getReads(); Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg1"); Assert.assertEquals(rg1Reads.get(0), read1, "Read " + read1.getReadName() + " should be in rg1 but isn't"); Assert.assertEquals(rg1Reads.get(1), read3, "Read " + read3.getReadName() + " should be in rg1 but isn't"); ReadBackedPileup rg2Pileup = compositePileup.getPileupForReadGroup("rg2"); - List rg2Reads = rg2Pileup.getReads(); + List rg2Reads = rg2Pileup.getReads(); Assert.assertEquals(rg1Reads.size(), 2, "Wrong number of reads in read group rg2"); Assert.assertEquals(rg2Reads.get(0), read2, "Read " + read2.getReadName() + " should be in rg2 but isn't"); @@ -164,41 +161,37 @@ public class ReadBackedPileupUnitTest { @Test public void testGetPileupForSample() { - Sample sample1 = new Sample("sample1"); - Sample sample2 = new Sample("sample2"); + String sample1 = "sample1"; + String 
sample2 = "sample2"; SAMReadGroupRecord readGroupOne = new SAMReadGroupRecord("rg1"); - readGroupOne.setSample(sample1.getId()); + readGroupOne.setSample(sample1); SAMReadGroupRecord readGroupTwo = new SAMReadGroupRecord("rg2"); - readGroupTwo.setSample(sample2.getId()); + readGroupTwo.setSample(sample2); SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); header.addReadGroup(readGroupOne); header.addReadGroup(readGroupTwo); - SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,10); + GATKSAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,1,10); read1.setAttribute("RG",readGroupOne.getId()); - SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); + GATKSAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,1,10); read2.setAttribute("RG",readGroupTwo.getId()); - Map sampleToPileupMap = new HashMap(); + Map sampleToPileupMap = new HashMap(); sampleToPileupMap.put(sample1,new ReadBackedPileupImpl(null,Collections.singletonList(read1),0)); sampleToPileupMap.put(sample2,new ReadBackedPileupImpl(null,Collections.singletonList(read2),0)); ReadBackedPileup pileup = new ReadBackedPileupImpl(null,sampleToPileupMap); - ReadBackedPileup sample1Pileup = pileup.getPileupForSample(sample1); - Assert.assertEquals(sample1Pileup.size(),1,"Sample 1 pileup has wrong number of elements"); - Assert.assertEquals(sample1Pileup.getReads().get(0),read1,"Sample 1 pileup has incorrect read"); - - ReadBackedPileup sample2Pileup = pileup.getPileupForSampleName(sample2.getId()); - Assert.assertEquals(sample2Pileup.size(),1,"Sample 2 pileup has wrong number of elements"); + ReadBackedPileup sample2Pileup = pileup.getPileupForSample(sample2); + Assert.assertEquals(sample2Pileup.getNumberOfElements(),1,"Sample 2 pileup has wrong number of elements"); Assert.assertEquals(sample2Pileup.getReads().get(0),read2,"Sample 2 pileup has incorrect read"); - ReadBackedPileup 
missingSamplePileup = pileup.getPileupForSample(new Sample("missing")); + ReadBackedPileup missingSamplePileup = pileup.getPileupForSample("missing"); Assert.assertNull(missingSamplePileup,"Pileup for sample 'missing' should be null but isn't"); - missingSamplePileup = pileup.getPileupForSampleName("not here"); + missingSamplePileup = pileup.getPileupForSample("not here"); Assert.assertNull(missingSamplePileup,"Pileup for sample 'not here' should be null but isn't"); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java new file mode 100644 index 000000000..6db9d77ef --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/runtime/ProcessControllerUnitTest.java @@ -0,0 +1,517 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.runtime; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang.StringUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.io.IOUtils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; + +public class ProcessControllerUnitTest extends BaseTest { + private static final String NL = String.format("%n"); + + @Test(timeOut = 60 * 1000) + public void testDestroyThreadLocal() throws InterruptedException { + for (int i = 0; i < 3; i++) { + final ProcessController controller = ProcessController.getThreadLocal(); + final ProcessSettings job = new ProcessSettings( + new String[] {"sh", "-c", "echo Hello World && sleep 600 && echo Goodbye"}); + job.getStdoutSettings().setBufferSize(-1); + + Thread t = new Thread(new Runnable() { + @Override + public void run() { + System.out.println("BACK: Starting on background thread"); + ProcessOutput result = controller.exec(job); + // Assert in background thread doesn't make it to main thread but does print a trace. + Assert.assertTrue(result.getExitValue() != 0, "Destroy-attempted job returned zero exit status"); + System.out.println("BACK: Background thread exiting"); + } + }); + + System.out.println("MAIN: Starting background thread"); + t.start(); + System.out.println("MAIN: Sleeping main thread 3s"); + Thread.sleep(3000); + System.out.println("MAIN: Destroying job"); + controller.tryDestroy(); + System.out.println("MAIN: Not waiting on background thread to exit"); + // Using standard java.io this was blocking on linux. + // TODO: try again with NIO. 
+ //t.join(); + //System.out.println("MAIN: Background thread exited"); + } + } + + @Test + public void testReuseAfterError() { + ProcessController controller = new ProcessController(); + + ProcessSettings job; + + for (int i = 0; i < 3; i++) { + // Test bad command + job = new ProcessSettings(new String[] {"no_such_command"}); + try { + controller.exec(job); + } catch (ReviewedStingException e) { + /* Was supposed to throw an exception */ + } + + // Test exit != 0 + job = new ProcessSettings(new String[] {"cat", "non_existent_file"}); + int exitValue = controller.exec(job).getExitValue(); + Assert.assertTrue(exitValue != 0, "'cat' non existent file returned 0"); + + // Text success + job = new ProcessSettings(new String[] {"echo", "Hello World"}); + exitValue = controller.exec(job).getExitValue(); + Assert.assertEquals(exitValue, 0, "Echo failed"); + } + } + + @Test + public void testEnvironment() { + String key = "MY_NEW_VAR"; + String value = "value is here"; + + ProcessSettings job = new ProcessSettings(new String[] {"sh", "-c", "echo $"+key}); + job.getStdoutSettings().setBufferSize(-1); + job.setRedirectErrorStream(true); + + Map env = new HashMap(System.getenv()); + env.put(key, value); + job.setEnvironment(env); + + ProcessController controller = new ProcessController(); + ProcessOutput result = controller.exec(job); + int exitValue = result.getExitValue(); + + Assert.assertEquals(exitValue, 0, "Echo environment variable failed"); + Assert.assertEquals(result.getStdout().getBufferString(), value + NL, "Echo environment returned unexpected output"); + } + + @Test + public void testDirectory() throws IOException { + File dir = null; + try { + dir = IOUtils.tempDir("temp.", "").getCanonicalFile(); + + ProcessSettings job = new ProcessSettings(new String[] {"pwd"}); + job.getStdoutSettings().setBufferSize(-1); + job.setRedirectErrorStream(true); + job.setDirectory(dir); + + ProcessController controller = new ProcessController(); + ProcessOutput result = 
controller.exec(job); + int exitValue = result.getExitValue(); + + Assert.assertEquals(exitValue, 0, "Getting working directory failed"); + + Assert.assertEquals(result.getStdout().getBufferString(), dir.getAbsolutePath() + NL, + "Setting/getting working directory returned unexpected output"); + } finally { + FileUtils.deleteQuietly(dir); + } + } + + @Test + public void testReadStdInBuffer() { + String bufferText = "Hello from buffer"; + ProcessSettings job = new ProcessSettings(new String[] {"cat"}); + job.getStdoutSettings().setBufferSize(-1); + job.setRedirectErrorStream(true); + job.getStdinSettings().setInputBuffer(bufferText); + + ProcessController controller = new ProcessController(); + ProcessOutput output = controller.exec(job); + + Assert.assertEquals(output.getStdout().getBufferString(), bufferText, + "Unexpected output from cat stdin buffer"); + } + + @Test + public void testReadStdInFile() { + File input = null; + try { + String fileText = "Hello from file"; + input = IOUtils.writeTempFile(fileText, "stdin.", ".txt"); + + ProcessSettings job = new ProcessSettings(new String[] {"cat"}); + job.getStdoutSettings().setBufferSize(-1); + job.setRedirectErrorStream(true); + job.getStdinSettings().setInputFile(input); + + ProcessController controller = new ProcessController(); + ProcessOutput output = controller.exec(job); + + Assert.assertEquals(output.getStdout().getBufferString(), fileText, + "Unexpected output from cat stdin file"); + } finally { + FileUtils.deleteQuietly(input); + } + } + + @Test + public void testWriteStdOut() { + ProcessSettings job = new ProcessSettings(new String[] {"echo", "Testing to stdout"}); + // Not going to call the System.setOut() for now. Just running a basic visual test. 
+ job.getStdoutSettings().printStandard(true); + job.setRedirectErrorStream(true); + + System.out.println("testWriteStdOut: Writing two lines to std out..."); + ProcessController controller = new ProcessController(); + controller.exec(job); + job.setCommand(new String[]{"cat", "non_existent_file"}); + controller.exec(job); + System.out.println("testWriteStdOut: ...two lines should have been printed to std out"); + } + + @Test + public void testErrorToOut() throws IOException { + File outFile = null; + File errFile = null; + try { + outFile = BaseTest.createTempFile("temp", ""); + errFile = BaseTest.createTempFile("temp", ""); + + ProcessSettings job = new ProcessSettings(new String[]{"cat", "non_existent_file"}); + job.getStdoutSettings().setOutputFile(outFile); + job.getStdoutSettings().setBufferSize(-1); + job.getStderrSettings().setOutputFile(errFile); + job.getStderrSettings().setBufferSize(-1); + job.setRedirectErrorStream(true); + + ProcessOutput result = new ProcessController().exec(job); + int exitValue = result.getExitValue(); + + Assert.assertTrue(exitValue != 0, "'cat' non existent file returned 0"); + + String fileString, bufferString; + + fileString = FileUtils.readFileToString(outFile); + Assert.assertTrue(fileString.length() > 0, "Out file was length 0"); + + bufferString = result.getStdout().getBufferString(); + Assert.assertTrue(bufferString.length() > 0, "Out buffer was length 0"); + + Assert.assertFalse(result.getStdout().isBufferTruncated(), "Out buffer was truncated"); + Assert.assertEquals(bufferString.length(), fileString.length(), "Out buffer length did not match file length"); + + fileString = FileUtils.readFileToString(errFile); + Assert.assertEquals(fileString, "", "Unexpected output to err file"); + + bufferString = result.getStderr().getBufferString(); + Assert.assertEquals(bufferString, "", "Unexepected output to err buffer"); + } finally { + FileUtils.deleteQuietly(outFile); + FileUtils.deleteQuietly(errFile); + } + } + + @Test + 
public void testErrorToErr() throws IOException { + File outFile = null; + File errFile = null; + try { + outFile = BaseTest.createTempFile("temp", ""); + errFile = BaseTest.createTempFile("temp", ""); + + ProcessSettings job = new ProcessSettings(new String[]{"cat", "non_existent_file"}); + job.getStdoutSettings().setOutputFile(outFile); + job.getStdoutSettings().setBufferSize(-1); + job.getStderrSettings().setOutputFile(errFile); + job.getStderrSettings().setBufferSize(-1); + job.setRedirectErrorStream(false); + + ProcessOutput result = new ProcessController().exec(job); + int exitValue = result.getExitValue(); + + Assert.assertTrue(exitValue != 0, "'cat' non existent file returned 0"); + + String fileString, bufferString; + + fileString = FileUtils.readFileToString(errFile); + Assert.assertTrue(fileString.length() > 0, "Err file was length 0"); + + bufferString = result.getStderr().getBufferString(); + Assert.assertTrue(bufferString.length() > 0, "Err buffer was length 0"); + + Assert.assertFalse(result.getStderr().isBufferTruncated(), "Err buffer was truncated"); + Assert.assertEquals(bufferString.length(), fileString.length(), "Err buffer length did not match file length"); + + fileString = FileUtils.readFileToString(outFile); + Assert.assertEquals(fileString, "", "Unexpected output to out file"); + + bufferString = result.getStdout().getBufferString(); + Assert.assertEquals(bufferString, "", "Unexepected output to out buffer"); + } finally { + FileUtils.deleteQuietly(outFile); + FileUtils.deleteQuietly(errFile); + } + } + + private static final String TRUNCATE_TEXT = "Hello World"; + private static final byte[] TRUNCATE_OUTPUT_BYTES = (TRUNCATE_TEXT + NL).getBytes(); + + /** + * @return Test truncating content vs. 
not truncating (run at -1/+1 size) + */ + @DataProvider(name = "truncateSizes") + public Object[][] getTruncateBufferSizes() { + int l = TRUNCATE_OUTPUT_BYTES.length; + return new Object[][]{ + new Object[]{0, 0}, + new Object[]{l, l}, + new Object[]{l + 1, l}, + new Object[]{l - 1, l - 1} + }; + } + + @Test(dataProvider = "truncateSizes") + public void testTruncateBuffer(int truncateLen, int expectedLen) { + byte[] expected = Arrays.copyOf(TRUNCATE_OUTPUT_BYTES, expectedLen); + + String[] command = {"echo", TRUNCATE_TEXT}; + ProcessController controller = new ProcessController(); + + ProcessSettings job = new ProcessSettings(command); + job.getStdoutSettings().setBufferSize(truncateLen); + ProcessOutput result = controller.exec(job); + + int exitValue = result.getExitValue(); + + Assert.assertEquals(exitValue, 0, + String.format("Echo returned %d: %s", exitValue, TRUNCATE_TEXT)); + + byte[] bufferBytes = result.getStdout().getBufferBytes(); + + Assert.assertEquals(bufferBytes, expected, + String.format("Output buffer didn't match (%d vs %d)", expected.length, bufferBytes.length)); + + boolean truncated = result.getStdout().isBufferTruncated(); + + Assert.assertEquals(truncated, TRUNCATE_OUTPUT_BYTES.length > truncateLen, + "Unexpected buffer truncation result"); + } + + private static final String[] LONG_COMMAND = getLongCommand(); + private static final String LONG_COMMAND_STRING = StringUtils.join(LONG_COMMAND, " "); + private static final String LONG_COMMAND_DESCRIPTION = ""; + + @DataProvider(name = "echoCommands") + public Object[][] getEchoCommands() { + + new EchoCommand(new String[]{"echo", "Hello", "World"}, "Hello World" + NL); + new EchoCommand(new String[]{"echo", "'Hello", "World"}, "'Hello World" + NL); + new EchoCommand(new String[]{"echo", "Hello", "World'"}, "Hello World'" + NL); + new EchoCommand(new String[]{"echo", "'Hello", "World'"}, "'Hello World'" + NL); + + String[] longCommand = new String[LONG_COMMAND.length + 1]; + longCommand[0] = 
"echo"; + System.arraycopy(LONG_COMMAND, 0, longCommand, 1, LONG_COMMAND.length); + new EchoCommand(longCommand, LONG_COMMAND_STRING + NL) { + @Override + public String toString() { + return LONG_COMMAND_DESCRIPTION; + } + }; + + return TestDataProvider.getTests(EchoCommand.class); + } + + @Test(dataProvider = "echoCommands") + public void testEcho(EchoCommand script) throws IOException { + File outputFile = null; + try { + outputFile = BaseTest.createTempFile("temp", ""); + + ProcessSettings job = new ProcessSettings(script.command); + if (script.output != null) { + job.getStdoutSettings().setOutputFile(outputFile); + job.getStdoutSettings().setBufferSize(script.output.getBytes().length); + } + + ProcessOutput result = new ProcessController().exec(job); + int exitValue = result.getExitValue(); + + Assert.assertEquals(exitValue, 0, + String.format("Echo returned %d: %s", exitValue, script)); + + if (script.output != null) { + + String fileString = FileUtils.readFileToString(outputFile); + Assert.assertEquals(fileString, script.output, + String.format("Output file didn't match (%d vs %d): %s", + fileString.length(), script.output.length(), script)); + + String bufferString = result.getStdout().getBufferString(); + Assert.assertEquals(bufferString, script.output, + String.format("Output content didn't match (%d vs %d): %s", + bufferString.length(), script.output.length(), script)); + + Assert.assertFalse(result.getStdout().isBufferTruncated(), + "Output content was truncated: " + script); + } + } finally { + FileUtils.deleteQuietly(outputFile); + } + } + + @Test(expectedExceptions = ReviewedStingException.class) + public void testUnableToStart() { + ProcessSettings job = new ProcessSettings(new String[]{"no_such_command"}); + new ProcessController().exec(job); + } + + @DataProvider(name = "scriptCommands") + public Object[][] getScriptCommands() { + new ScriptCommand(true, "echo Hello World", "Hello World" + NL); + new ScriptCommand(false, "echo 'Hello World", null); 
+ new ScriptCommand(false, "echo Hello World'", null); + new ScriptCommand(true, "echo 'Hello World'", "Hello World" + NL); + new ScriptCommand(true, "echo \"Hello World\"", "Hello World" + NL); + new ScriptCommand(false, "no_such_echo Hello World", null); + new ScriptCommand(true, "echo #", NL); + new ScriptCommand(true, "echo \\#", "#" + NL); + new ScriptCommand(true, "echo \\\\#", "\\#" + NL); + + new ScriptCommand(true, "echo " + LONG_COMMAND_STRING, LONG_COMMAND_STRING + NL) { + @Override + public String toString() { + return LONG_COMMAND_DESCRIPTION; + } + }; + + return TestDataProvider.getTests(ScriptCommand.class); + } + + @Test(dataProvider = "scriptCommands") + public void testScript(ScriptCommand script) throws IOException { + File scriptFile = null; + File outputFile = null; + try { + scriptFile = writeScript(script.content); + outputFile = BaseTest.createTempFile("temp", ""); + + ProcessSettings job = new ProcessSettings(new String[]{"sh", scriptFile.getAbsolutePath()}); + if (script.output != null) { + job.getStdoutSettings().setOutputFile(outputFile); + job.getStdoutSettings().setBufferSize(script.output.getBytes().length); + } + + ProcessOutput result = new ProcessController().exec(job); + int exitValue = result.getExitValue(); + + Assert.assertEquals(exitValue == 0, script.succeed, + String.format("Script returned %d: %s", exitValue, script)); + + if (script.output != null) { + + String fileString = FileUtils.readFileToString(outputFile); + Assert.assertEquals(fileString, script.output, + String.format("Output file didn't match (%d vs %d): %s", + fileString.length(), script.output.length(), script)); + + String bufferString = result.getStdout().getBufferString(); + Assert.assertEquals(bufferString, script.output, + String.format("Output content didn't match (%d vs %d): %s", + bufferString.length(), script.output.length(), script)); + + Assert.assertFalse(result.getStdout().isBufferTruncated(), + "Output content was truncated: " + script); + } + } 
finally { + FileUtils.deleteQuietly(scriptFile); + FileUtils.deleteQuietly(outputFile); + } + } + + private static String[] getLongCommand() { + // This command fails on some systems with a 4096 character limit when run via the old sh -c "echo ...", + // but works on the same systems when run via sh