From 1f19afe1d9e8724ea998ac62c481e2ebdc4af7a5 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 29 Jun 2011 15:54:09 -0400 Subject: [PATCH 01/21] Fixed bug in the IndelRealigner: now that variants are correctly typed in VariantContext, it is possible that a variant can be an indel but neither an insertion or a deletion; added a isComplexIndel() method and now we check for such an event in the realigner (we don't use them to generate alternate consenses). Also, added a isMNP() method while I was there so that it would be consistent with other variant types. --- .../sting/gatk/walkers/indels/IndelRealigner.java | 10 +++++----- .../gatk/walkers/phasing/AnnotateMNPsWalker.java | 2 +- .../gatk/walkers/sequenom/PickSequenomProbes.java | 2 +- .../gatk/walkers/variantutils/ValidateVariants.java | 4 ++-- .../sting/utils/variantcontext/VariantContext.java | 11 +++++++++++ 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 28b88f9f2..33de08bb6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -798,11 +798,11 @@ public class IndelRealigner extends ReadWalker { private void generateAlternateConsensesFromKnownIndels(final Set altConsensesToPopulate, final int leftmostIndex, final byte[] reference) { for ( VariantContext knownIndel : knownIndelsToTry ) { - if ( knownIndel == null || !knownIndel.isIndel() ) + if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() ) continue; byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length()); int start = knownIndel.getStart() - leftmostIndex + 1; - Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel.isDeletion()); + Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel); if ( c != null ) altConsensesToPopulate.add(c); } @@ -988,7 +988,7 @@ public class IndelRealigner extends ReadWalker { } // create a Consensus from just the indel string that falls on the reference - private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final boolean isDeletion) { + private Consensus createAlternateConsensus(final int indexOnRef, final byte[] reference, final byte[] indelStr, final VariantContext indel) { if ( indexOnRef < 0 || indexOnRef >= reference.length ) return null; @@ -1002,11 +1002,11 @@ public class IndelRealigner extends ReadWalker { if ( indexOnRef > 0 ) cigar.add(new CigarElement(indexOnRef, CigarOperator.M)); - if ( isDeletion ) { + if ( indel.isDeletion() ) { refIdx += indelStr.length; cigar.add(new CigarElement(indelStr.length, CigarOperator.D)); } - else { + else if ( indel.isInsertion() ) { for ( byte b : indelStr ) sb.append((char)b); cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java index 7989f52db..81d9b4ddb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/AnnotateMNPsWalker.java @@ -161,7 +161,7 @@ public class AnnotateMNPsWalker extends RodWalker { boolean atStartOfVc = curLocus.getStart() == vcLoc.getStart(); boolean atEndOfVc = curLocus.getStart() == vcLoc.getStop(); - if (vc.getType() == VariantContext.Type.MNP) { + if (vc.isMNP()) { logger.debug("Observed MNP at " + vcLoc); if (isChrM(vc)) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java index f350e7531..fde233b5d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/sequenom/PickSequenomProbes.java @@ -209,7 +209,7 @@ public class PickSequenomProbes extends RodWalker { String assay_sequence; if ( vc.isSNP() ) assay_sequence = leading_bases + "[" + (char)ref.getBase() + "/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases; - else if ( vc.getType() == VariantContext.Type.MNP ) + else if ( vc.isMNP() ) assay_sequence = leading_bases + "[" + new String(vc.getReference().getBases()) + "/" + new String(vc.getAlternateAllele(0).getBases())+"]"+trailing_bases.substring(vc.getReference().length()-1); else if ( vc.isInsertion() ) assay_sequence = leading_bases + (char)ref.getBase() + "[-/" + vc.getAlternateAllele(0).toString() + "]" + trailing_bases; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index ec7395f85..1bd73414c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -113,7 +113,7 @@ public class ValidateVariants extends RodWalker { observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING); } // deletions - else if ( vc.isDeletion() || vc.isMixed() || vc.getType() == VariantContext.Type.MNP ) { + else if ( vc.isDeletion() || vc.isMixed() || vc.isMNP() ) { // we can't validate arbitrarily long deletions if ( reportedRefAllele.length() > 100 ) { logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart())); @@ -123,7 +123,7 @@ public class ValidateVariants extends RodWalker { // deletions are associated with the (position of) the last (preceding) non-deleted base; // hence to get actually deleted bases we need offset = 1 int offset = 1 ; - if ( vc.getType() == VariantContext.Type.MNP ) offset = 0; // if it's an MNP, the reported position IS the first modified base + if ( vc.isMNP() ) offset = 0; // if it's an MNP, the reported position IS the first modified base byte[] refBytes = ref.getBases(); byte[] trueRef = new byte[reportedRefAllele.length()]; for (int i = 0; i < reportedRefAllele.length(); i++) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 87466e21b..92262f1bf 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -566,10 +566,21 @@ public class VariantContext implements Feature { // to enable tribble intergrati return getType() == Type.INDEL && getAlternateAllele(0).isNull(); } + /** + * @return true if the alleles indicate neither a simple deletion nor a simple insertion + */ + public boolean isComplexIndel() { + return isIndel() && !isDeletion() && !isInsertion(); + } + public boolean isSymbolic() { return getType() == Type.SYMBOLIC; } + public boolean isMNP() { + return getType() == Type.MNP; + } + /** * convenience method for indels * From 70ba851478a1805509d306a8514c192b57b38e6b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 29 Jun 2011 15:59:10 -0400 Subject: [PATCH 02/21] Might as well check for the illegal state and throw an exception --- .../sting/gatk/walkers/indels/IndelRealigner.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 33de08bb6..3cfa58c02 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -1010,6 +1010,8 @@ public class IndelRealigner extends ReadWalker { for ( byte b : indelStr ) sb.append((char)b); cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); + } else { + throw new ReviewedStingException("Creating an alternate consensus from a complex indel is not allowed"); } if ( reference.length - refIdx > 0 ) From 1085df8b7b3f23e853444074bf9d202c9a54be1a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 29 Jun 2011 16:05:32 -0400 Subject: [PATCH 04/21] Making the BQSR pipeline publicly available and supported. this is for all the Pacbio validation that is going on right no in the cancer group. They are all using this script, and I'm happy to support it. --- .../core/RecalibrateBaseQualities.scala | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100755 public/scala/qscript/core/RecalibrateBaseQualities.scala diff --git a/public/scala/qscript/core/RecalibrateBaseQualities.scala b/public/scala/qscript/core/RecalibrateBaseQualities.scala new file mode 100755 index 000000000..ba13d267a --- /dev/null +++ b/public/scala/qscript/core/RecalibrateBaseQualities.scala @@ -0,0 +1,91 @@ +package core + +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.extensions.gatk._ +import net.sf.samtools.SAMFileReader + +/** + * Created by IntelliJ IDEA. + * User: carneiro + * Date: 4/20/11 + * Time: 16:29 PM + */ + + +class RecalibrateBaseQualities extends QScript { + + @Input(doc="path to GenomeAnalysisTK.jar", shortName="gatk", required=true) + var GATKjar: File = _ + + @Input(doc="input BAM file - or list of BAM files", shortName="i", required=true) + var input: File = _ + + @Input(doc="path to R resources folder inside the Sting repository", fullName="path_to_r", shortName="r", required=false) + var R: String = new File("/humgen/gsa-scr1/carneiro/stable/R") + + @Input(doc="Reference fasta file", shortName="R", required=false) + var reference: File = new File("/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta") + + @Input(doc="dbsnp ROD to use (VCF)", shortName="D", required=false) + var dbSNP: File = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_132_b37.leftAligned.vcf") + + val queueLogDir: String = ".qlog/" + var nContigs: Int = 0 + + def getNumberOfContigs(bamFile: File): Int = { + val samReader = new SAMFileReader(new File(bamFile)) + return samReader.getFileHeader.getSequenceDictionary.getSequences.size() + } + + def script = { + + nContigs = getNumberOfContigs(input) + + val recalFile1: File = new File("recal1.csv") + val recalFile2: File = new File("recal2.csv") + val recalBam: File = swapExt(input, ".bam", "recal.bam") + val path1: String = "before" + val path2: String = "after" + + add(cov(input, recalFile1), + recal(input, recalFile1, recalBam), + cov(recalBam, recalFile2), + analyzeCovariates(recalFile1, path1), + analyzeCovariates(recalFile2, path2)) + } + + trait CommandLineGATKArgs extends CommandLineGATK { + this.jarFile = GATKjar + this.reference_sequence = reference + this.memoryLimit = 4 + this.isIntermediate = true + } + + case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs { + this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP) + this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate") + this.input_file :+= inBam + this.recal_file = outRecalFile + this.analysisName = queueLogDir + outRecalFile + ".covariates" + this.jobName = queueLogDir + outRecalFile + ".covariates" + this.scatterCount = nContigs + } + + case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs { + this.input_file :+= inBam + this.recal_file = inRecalFile + this.out = outBam + this.isIntermediate = false + this.analysisName = queueLogDir + outBam + ".recalibration" + this.jobName = queueLogDir + outBam + ".recalibration" + this.scatterCount = nContigs + } + + case class analyzeCovariates (inRecalFile: File, outPath: String) extends AnalyzeCovariates { + this.resources = R + this.recal_file = inRecalFile + this.output_dir = outPath.toString + this.analysisName = queueLogDir + inRecalFile + ".analyze_covariates" + this.jobName = queueLogDir + inRecalFile + ".analyze_covariates" + } +} \ No newline at end of file From f18fffd6254a9973f9f95b521278b6444d7fb80e Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 29 Jun 2011 17:36:47 -0400 Subject: [PATCH 05/21] Fixing broken paths to the testdata directory throughout the codebase. --- .../test/org/broadinstitute/sting/BaseTest.java | 2 +- .../sting/gatk/GenomeAnalysisEngineUnitTest.java | 14 +++++++------- .../sting/utils/bed/BedParserUnitTest.java | 2 +- .../sting/utils/codecs/hapmap/HapMapUnitTest.java | 2 +- .../utils/codecs/vcf/IndexFactoryUnitTest.java | 4 ++-- .../sting/utils/text/ListFileUtilsUnitTest.java | 12 ++++++------ .../GenomeLocProcessingTrackerUnitTest.java | 2 +- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 07f46c1c3..61bb8b34b 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -83,7 +83,7 @@ public abstract class BaseTest { */ public static final String MD5_FILE_DB_SUBDIR = "integrationtests"; - public static final String testDir = "testdata/"; + public static final String testDir = "public/testdata/"; /** before the class starts up */ static { diff --git a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java index a17c162f5..1e4625bf0 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/GenomeAnalysisEngineUnitTest.java @@ -60,8 +60,8 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); testEngine.setSAMFileIDs(samFiles); testEngine.checkForDuplicateSamFiles(); @@ -72,10 +72,10 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { GenomeAnalysisEngine testEngine = new GenomeAnalysisEngine(); Collection samFiles = new ArrayList(); - samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("testdata/exampleNORG.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); - samFiles.add(new SAMReaderID(new File("testdata/exampleNORG.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); + samFiles.add(new SAMReaderID(new File("public/testdata/exampleNORG.bam"), new Tags())); testEngine.setSAMFileIDs(samFiles); testEngine.checkForDuplicateSamFiles(); @@ -97,7 +97,7 @@ public class GenomeAnalysisEngineUnitTest extends BaseTest { GATKArgumentCollection argCollection = new GATKArgumentCollection(); testEngine.setArguments(argCollection); - File fastaFile = new File("testdata/exampleFASTA.fasta"); + File fastaFile = new File("public/testdata/exampleFASTA.fasta"); GenomeLocParser genomeLocParser = new GenomeLocParser(new IndexedFastaSequenceFile(fastaFile)); testEngine.setGenomeLocParser(genomeLocParser); diff --git a/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java index e592c99b9..56bf66f53 100644 --- a/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/bed/BedParserUnitTest.java @@ -21,7 +21,7 @@ public class BedParserUnitTest extends BaseTest { private static IndexedFastaSequenceFile seq; private GenomeLocParser genomeLocParser; - private File bedFile = new File("testdata/sampleBedFile.bed"); + private File bedFile = new File("public/testdata/sampleBedFile.bed"); @BeforeClass public void beforeTests() { diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java index 2e618d5eb..e912f97e9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/hapmap/HapMapUnitTest.java @@ -39,7 +39,7 @@ import java.io.IOException; */ public class HapMapUnitTest { // our sample hapmap file - private final static File hapMapFile = new File("testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap"); + private final static File hapMapFile = new File("public/testdata/genotypes_chr1_ASW_phase3.3_first500.hapmap"); private final static String knownLine = "rs2185539 C/T chr1 556738 + ncbi_b36 bbs urn:lsid:bbs.hapmap.org:Protocol:Phase3.r3:1 urn:lsid:bbs.hapmap.org:Assay:Phase3.r3_r" + "s2185539:1 urn:lsid:dcc.hapmap.org:Panel:US_African-30-trios:4 QC+ CC TC TT CT CC CC CC CC CC CC CC CC CC"; /** diff --git a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java index 3929ff388..2f6b589f4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/codecs/vcf/IndexFactoryUnitTest.java @@ -17,8 +17,8 @@ import java.util.*; */ public class IndexFactoryUnitTest { - File inputFile = new File("testdata/HiSeq.10000.vcf"); - File outputFile = new File("testdata/onTheFlyOutputTest.vcf"); + File inputFile = new File("public/testdata/HiSeq.10000.vcf"); + File outputFile = new File("public/testdata/onTheFlyOutputTest.vcf"); File outputFileIndex = Tribble.indexFile(outputFile); /** diff --git a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java index f85a2f599..f0b1de6fe 100644 --- a/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/text/ListFileUtilsUnitTest.java @@ -49,12 +49,12 @@ public class ListFileUtilsUnitTest extends BaseTest { public void testIgnoreBlankLinesInBAMListFiles() throws Exception { File tempListFile = createTempListFile("testIgnoreBlankLines", "", - "testdata/exampleBAM.bam", + "public/testdata/exampleBAM.bam", " " ); List expectedBAMFileListAfterUnpacking = new ArrayList(); - expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); + expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); } @@ -63,13 +63,13 @@ public class ListFileUtilsUnitTest extends BaseTest { public void testCommentSupportInBAMListFiles() throws Exception { File tempListFile = createTempListFile("testCommentSupport", "#", - "testdata/exampleBAM.bam", - "#testdata/foo.bam", - " # testdata/bar.bam" + "public/testdata/exampleBAM.bam", + "#public/testdata/foo.bam", + " # public/testdata/bar.bam" ); List expectedBAMFileListAfterUnpacking = new ArrayList(); - expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("testdata/exampleBAM.bam"), new Tags())); + expectedBAMFileListAfterUnpacking.add(new SAMReaderID(new File("public/testdata/exampleBAM.bam"), new Tags())); performBAMListFileUnpackingTest(tempListFile, expectedBAMFileListAfterUnpacking); } diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java index df4e1660d..78ab916db 100644 --- a/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/GenomeLocProcessingTrackerUnitTest.java @@ -29,7 +29,7 @@ public class GenomeLocProcessingTrackerUnitTest extends BaseTest { IndexedFastaSequenceFile fasta = null; GenomeLocParser genomeLocParser = null; String chr1 = null; - private final static String FILE_ROOT = "testdata/GLPTFile"; + private final static String FILE_ROOT = "public/testdata/GLPTFile"; @BeforeTest public void before() { From efd99c3c117e2a423c93e9390076d1b9437d406b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 11:32:06 -0400 Subject: [PATCH 10/21] new home for the core qscripts --- .../sting/queue/qscripts}/DataProcessingPipeline.scala | 2 +- .../sting/queue/qscripts}/GATKResourcesBundle.scala | 2 +- .../queue/qscripts}/MethodsDevelopmentCallingPipeline.scala | 2 ++ .../sting/queue/qscripts}/RecalibrateBaseQualities.scala | 2 +- .../sting/queue/qscripts}/StandardVariantEvaluation.scala | 2 +- 5 files changed, 6 insertions(+), 4 deletions(-) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/DataProcessingPipeline.scala (99%) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/GATKResourcesBundle.scala (99%) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/MethodsDevelopmentCallingPipeline.scala (99%) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/RecalibrateBaseQualities.scala (98%) rename public/scala/qscript/{core => org/broadinstitute/sting/queue/qscripts}/StandardVariantEvaluation.scala (99%) diff --git a/public/scala/qscript/core/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala similarity index 99% rename from public/scala/qscript/core/DataProcessingPipeline.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index cad19bd23..fdde5e8a2 100755 --- a/public/scala/qscript/core/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript diff --git a/public/scala/qscript/core/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala similarity index 99% rename from public/scala/qscript/core/GATKResourcesBundle.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 24ae045d3..150d78019 100755 --- a/public/scala/qscript/core/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ diff --git a/public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala similarity index 99% rename from public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index 632c03499..d71c6ae5d 100755 --- a/public/scala/qscript/core/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -1,3 +1,5 @@ +package org.broadinstitute.sting.queue.qscripts + import org.broadinstitute.sting.commandline.Hidden import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript diff --git a/public/scala/qscript/core/RecalibrateBaseQualities.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala similarity index 98% rename from public/scala/qscript/core/RecalibrateBaseQualities.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala index ba13d267a..678b356ee 100755 --- a/public/scala/qscript/core/RecalibrateBaseQualities.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/RecalibrateBaseQualities.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk._ diff --git a/public/scala/qscript/core/StandardVariantEvaluation.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala similarity index 99% rename from public/scala/qscript/core/StandardVariantEvaluation.scala rename to public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala index d6af7019e..d333e1dc0 100755 --- a/public/scala/qscript/core/StandardVariantEvaluation.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/StandardVariantEvaluation.scala @@ -1,4 +1,4 @@ -package core +package org.broadinstitute.sting.queue.qscripts import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.extensions.gatk.RodBind From 352c38fc0b48d8910ad3346eff0313b2f0890b06 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Jun 2011 11:55:56 -0400 Subject: [PATCH 12/21] Updated to reflect dbsnp conversion fix --- .../walkers/indels/RealignerTargetCreatorIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java index acfcbab7b..4b225aaea 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorIntegrationTest.java @@ -19,7 +19,7 @@ public class RealignerTargetCreatorIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( "-T RealignerTargetCreator -D /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -R " + b36KGReference + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000 -o %s", 1, - Arrays.asList("fd2d9dbf718f7a6d82ae1787ac1fe61e")); + Arrays.asList("f23ba17ee0f9573dd307708175d90cd2")); executeTest("test dbsnp", spec2); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( From 9e234cf5d6b8cf3439fc0703248c637c3119f1e2 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Jun 2011 13:17:14 -0400 Subject: [PATCH 13/21] This is a temporary commit for Picard. It will absolutely break integration tests, but I'm going to revert it in 1 minute. Because we don't want them in unstable, I need to push this into stable. --- .../sting/gatk/walkers/indels/IndelRealigner.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 3cfa58c02..9816e17f7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -99,7 +99,7 @@ public class IndelRealigner extends ReadWalker { protected SAMFileWriter writerToUse = null; @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false) - public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; + public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_SW; // ADVANCED OPTIONS FOLLOW @@ -167,7 +167,8 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false) protected boolean DEPRECATED_REALIGN_MATES = false; - @Deprecated + //@Deprecated + @Hidden @Argument(fullName="useOnlyKnownIndels", shortName="knownsOnly", doc="This argument is no longer used. See --consensusDeterminationModel instead.", required=false) protected boolean DEPRECATED_KNOWNS_ONLY = false; @@ -261,6 +262,9 @@ public class IndelRealigner extends ReadWalker { public void initialize() { + if ( DEPRECATED_KNOWNS_ONLY ) + consensusModel = ConsensusDeterminationModel.KNOWNS_ONLY; + if ( N_WAY_OUT == null && writer == null ) { throw new UserException.CommandLineException("Either -o or -nWayOut must be specified"); } From 804d5f22d522516ea787e3873a7bfbaf3c2035a0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Jun 2011 13:18:30 -0400 Subject: [PATCH 14/21] Reverting previous change, as promised. --- .../sting/gatk/walkers/indels/IndelRealigner.java | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 9816e17f7..a53665d64 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -99,7 +99,7 @@ public class IndelRealigner extends ReadWalker { protected SAMFileWriter writerToUse = null; @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false) - public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_SW; + public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS; // ADVANCED OPTIONS FOLLOW @@ -167,8 +167,7 @@ public class IndelRealigner extends ReadWalker { @Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false) protected boolean DEPRECATED_REALIGN_MATES = false; - //@Deprecated - @Hidden + @Deprecated @Argument(fullName="useOnlyKnownIndels", shortName="knownsOnly", doc="This argument is no longer used. See --consensusDeterminationModel instead.", required=false) protected boolean DEPRECATED_KNOWNS_ONLY = false; @@ -262,9 +261,6 @@ public class IndelRealigner extends ReadWalker { public void initialize() { - if ( DEPRECATED_KNOWNS_ONLY ) - consensusModel = ConsensusDeterminationModel.KNOWNS_ONLY; - if ( N_WAY_OUT == null && writer == null ) { throw new UserException.CommandLineException("Either -o or -nWayOut must be specified"); } @@ -1015,7 +1011,7 @@ public class IndelRealigner extends ReadWalker { sb.append((char)b); cigar.add(new CigarElement(indelStr.length, CigarOperator.I)); } else { - throw new ReviewedStingException("Creating an alternate consensus from a complex indel is not allowed"); + throw new IllegalStateException("Creating an alternate consensus from a complex indel is not allows"); } if ( reference.length - refIdx > 0 ) From f4463d38ca28209ad59b3c5cb7579c3f104fab5c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 14:29:21 -0400 Subject: [PATCH 15/21] BWA requires pair ended reads to be sorted by read names when operating over BAM files, but Picard sorts by coordinate, so in case we use BWA in pair ended reads, the pipeline now resorts the BAM in read name order, realigns it then sorts it in coordinate order. --- .../queue/qscripts/DataProcessingPipeline.scala | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index fdde5e8a2..623181de6 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -4,12 +4,12 @@ import org.broadinstitute.sting.queue.extensions.gatk._ import org.broadinstitute.sting.queue.QScript import org.broadinstitute.sting.queue.function.ListWriterFunction -import net.sf.samtools.{SAMFileReader,SAMReadGroupRecord} - import scala.io.Source._ import collection.JavaConversions._ import org.broadinstitute.sting.gatk.walkers.indels.IndelRealigner.ConsensusDeterminationModel import org.broadinstitute.sting.queue.extensions.picard._ +import net.sf.samtools.{SAMFileReader, SAMReadGroupRecord} +import net.sf.samtools.SAMFileHeader.SortOrder class DataProcessingPipeline extends QScript { @@ -180,6 +180,7 @@ class DataProcessingPipeline extends QScript { var realignedBams: List[File] = List() var index = 1 for (bam <- bams) { + val readSortedBam = swapExt(bam, ".bam", "." + index + ".sorted.bam" ) val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai") val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai") val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam") @@ -190,11 +191,12 @@ class DataProcessingPipeline extends QScript { bwa_sam_se(bam, saiFile1, realignedSamFile)) } else { - add(bwa_aln_pe(bam, saiFile1, 1), - bwa_aln_pe(bam, saiFile2, 2), - bwa_sam_pe(bam, saiFile1, saiFile2, realignedSamFile)) + add(sortSam(bam, readSortedBam, SortOrder.queryname), + bwa_aln_pe(readSortedBam, saiFile1, 1), + bwa_aln_pe(readSortedBam, saiFile2, 2), + bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile)) } - add(sortSam(realignedSamFile, realignedBamFile)) + add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate)) addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam)) realignedBams :+= rgRealignedBamFile index = index + 1 @@ -385,9 +387,10 @@ class DataProcessingPipeline extends QScript { this.jobName = queueLogDir + outBam + ".joinBams" } - case class sortSam (inSam: File, outBam: File) extends SortSam { + case class sortSam (inSam: File, outBam: File, sortOrderP: SortOrder) extends SortSam { this.input = List(inSam) this.output = outBam + this.sortOrder = sortOrderP this.memoryLimit = 4 this.isIntermediate = true this.analysisName = queueLogDir + outBam + ".sortSam" From 197b7141c198b40b23c058414379fdba2fb47450 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 14:41:57 -0400 Subject: [PATCH 16/21] Added an optional argument -bt for BWA to run multithreaded. --- .../sting/queue/qscripts/DataProcessingPipeline.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 623181de6..f9369ee3f 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -69,6 +69,8 @@ class DataProcessingPipeline extends QScript { @Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false) var useBWApe: Boolean = false + @Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false) + var bwaThreads: Int = 1 /**************************************************************************** @@ -433,7 +435,7 @@ class DataProcessingPipeline extends QScript { case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with BWACommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file") var sai = outSai - def commandLine = bwaPath + " aln -q 5 " + reference + " -b " + bam + " > " + sai + def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b " + bam + " > " + sai this.analysisName = queueLogDir + outSai + ".bwa_aln_se" this.jobName = queueLogDir + outSai + ".bwa_aln_se" } @@ -441,7 +443,7 @@ class DataProcessingPipeline extends QScript { case class bwa_aln_pe (inBam: File, outSai1: File, index: Int) extends CommandLineFunction with BWACommonArgs { @Input(doc="bam file to be aligned") var bam = inBam @Output(doc="output sai file for 1st mating pair") var sai = outSai1 - def commandLine = bwaPath + " aln -q 5 " + reference + " -b" + index + " " + bam + " > " + sai + def commandLine = bwaPath + " aln -t " + bwaThreads + " -q 5 " + reference + " -b" + index + " " + bam + " > " + sai this.analysisName = queueLogDir + outSai1 + ".bwa_aln_pe1" this.jobName = queueLogDir + outSai1 + ".bwa_aln_pe1" } From f4ae6edb9240ee80debba64693c678d782b23ca1 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 30 Jun 2011 14:55:25 -0400 Subject: [PATCH 17/21] Moving some of the released R scripts into public from private --- .../R/plot_Annotations_BinnedTruthMetrics.R | 190 ++++++++++++++++++ public/R/plot_Tranches.R | 87 ++++++++ public/R/plot_residualError_OtherCovariate.R | 108 ++++++++++ ...plot_residualError_QualityScoreCovariate.R | 70 +++++++ public/R/titvFPEst.R | 138 +++++++++++++ 5 files changed, 593 insertions(+) create mode 100644 public/R/plot_Annotations_BinnedTruthMetrics.R create mode 100755 public/R/plot_Tranches.R create mode 100644 public/R/plot_residualError_OtherCovariate.R create mode 100644 public/R/plot_residualError_QualityScoreCovariate.R create mode 100755 public/R/titvFPEst.R diff --git a/public/R/plot_Annotations_BinnedTruthMetrics.R b/public/R/plot_Annotations_BinnedTruthMetrics.R new file mode 100644 index 000000000..9f9ee290c --- /dev/null +++ b/public/R/plot_Annotations_BinnedTruthMetrics.R @@ -0,0 +1,190 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +annotationName = args[2] +minBinCutoff = as.numeric(args[3]) +medianNumVariants = args[4] + +c <- read.table(input, header=T) + +all = c[c$numVariants>minBinCutoff & c$category=="all",] +novel = c[c$numVariants>minBinCutoff & c$category=="novel",] +dbsnp = c[c$numVariants>minBinCutoff & c$category=="dbsnp",] +truth = c[c$numVariants>minBinCutoff & c$category=="truth",] + +# +# Calculate min, max, medians +# + +d = c[c$numVariants>minBinCutoff,] +ymin = min(d$titv) +ymax = max(d$titv) +xmin = min(d$value) +xmax = max(d$value) +m = weighted.mean(all$value,all$numVariants/sum(all$numVariants)) +ma = all[all$value > m,] +mb = all[all$value < m,] +m75 = weighted.mean(ma$value,ma$numVariants/sum(ma$numVariants)) +m25 = weighted.mean(mb$value,mb$numVariants/sum(mb$numVariants)) +if(medianNumVariants == "true") { +vc = cumsum( all$numVariants/sum(all$numVariants) ) +m10 = all$value[ max(which(vc<=0.10)) ] +m25 = all$value[ max(which(vc<=0.25)) ] +m = all$value[ max(which(vc<=0.5)) ] +m75 = all$value[ min(which(vc>=0.75)) ] +m90 = all$value[ min(which(vc>=0.90)) ] +} + +# +# Plot TiTv ratio as a function of the annotation +# + +outfile = paste(input, ".TiTv.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$titv,xlab=annotationName,ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +points(novel$value,novel$titv,col="green",pch=20) +points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { +legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} +dev.off() + +# +# Plot TiTv ratio as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".TiTv_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$titv,xlab=annotationName,log="x",ylab="Ti/Tv Ratio",pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +points(novel$value,novel$titv,col="green",pch=20) +points(dbsnp$value,dbsnp$titv,col="blue",pch=20) +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(truth$value,truth$titv,col="magenta",pch=20) +legend("topleft", c("all","novel","dbsnp","truth"),col=c("black","green","blue","magenta"),pch=c(20,20,20,20)) +} else { +legend("topleft", c("all","novel","dbsnp"),col=c("black","green","blue"),pch=c(20,20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation +# + +ymin = min(all$dbsnp) +ymax = max(all$dbsnp) +outfile = paste(input, ".truthRate.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +t = all[all$truePositive>0,] +yLabel = "DBsnp/True Positive Rate" +ymin = min(min(all$dbsnp),min(t$truePositive)) +ymax = max(max(all$dbsnp),max(t$truePositive)) +} +plot(all$value,all$dbsnp,xlab=annotationName,ylab=yLabel,pch=20,ylim=c(ymin,ymax),xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(t$value,t$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot dbsnp and true positive rate as a function of the annotation, log scale on the x-axis +# + +outfile = paste(input, ".truthRate_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +yLabel = "DBsnp Rate" +if( sum(all$truePositive==0) != length(all$truePositive) ) { +yLabel = "DBsnp/Truth Rate" +} +plot(all$value,all$dbsnp,xlab=annotationName,log="x",ylab=yLabel,ylim=c(ymin,ymax),pch=20,xaxt="n",ps=14); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +abline(v=m,lty=2,col="red") +abline(v=m75,lty=3) +abline(v=m25,lty=3) +text(m, ymin, "50", col="red", cex=0.6); +text(m75, ymin, "75", col="black", cex=0.6); +text(m25, ymin, "25", col="black", cex=0.6); +if(medianNumVariants == "true") { +abline(v=m90,lty=3) +abline(v=m10,lty=3) +text(m10, ymin, "10", col="black", cex=0.6); +text(m90, ymin, "90", col="black", cex=0.6); +} +if( sum(all$truePositive==0) != length(all$truePositive) ) { +points(t$value,t$truePositive,col="magenta",pch=20); +legend("topleft", c("dbsnp","truth"),col=c("black","magenta"),pch=c(20,20)) +} +dev.off() + +# +# Plot histogram of the annotation's value +# + +outfile = paste(input, ".Histogram.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$numVariants,xlab=annotationName,ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +dev.off() + +# +# Plot histogram of the annotation's value, log scale on x-axis +# + +outfile = paste(input, ".Histogram_log.pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +plot(all$value,all$numVariants,xlab=annotationName,log="x",ylab="Num variants in bin",type="h",xaxt="n",ps=14,lwd=4); +axis(1,axTicks(1), format(axTicks(1), scientific=F)) +dev.off() diff --git a/public/R/plot_Tranches.R b/public/R/plot_Tranches.R new file mode 100755 index 000000000..a79ddd3ab --- /dev/null +++ b/public/R/plot_Tranches.R @@ -0,0 +1,87 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +tranchesFile = args[1] +targetTITV = as.numeric(args[2]) +targetSensitivity = as.numeric(args[3]) +suppressLegend = ! is.na(args[4]) + +# ----------------------------------------------------------------------------------------------- +# Useful general routines +# ----------------------------------------------------------------------------------------------- + +MIN_FP_RATE = 0.001 # 1 / 1000 is min error rate + +titvFPEst <- function(titvExpected, titvObserved) { + max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), MIN_FP_RATE) +} + +titvFPEstV <- function(titvExpected, titvs) { + sapply(titvs, function(x) titvFPEst(titvExpected, x)) +} + +nTPFP <- function(nVariants, FDR) { + return(list(TP = nVariants * (1 - FDR/100), FP = nVariants * (FDR / 100))) +} + +leftShift <- function(x, leftValue = 0) { + r = rep(leftValue, length(x)) + for ( i in 1:(length(x)-1) ) { + #print(list(i=i)) + r[i] = x[i+1] + } + r +} + +# ----------------------------------------------------------------------------------------------- +# Tranches plot +# ----------------------------------------------------------------------------------------------- +data2 = read.table(tranchesFile,sep=",",head=T) +data2 = data2[order(data2$novelTiTv, decreasing=F),] +#data2 = data2[order(data2$FDRtranche, decreasing=T),] +cols = c("cornflowerblue", "cornflowerblue", "darkorange", "darkorange") +density=c(20, -1, -1, 20) +outfile = paste(tranchesFile, ".pdf", sep="") +pdf(outfile, height=5, width=8) +par(mar = c(5, 5, 4, 2) + 0.1) +novelTiTv = c(data2$novelTITV,data2$novelTiTv) +alpha = 1 - titvFPEstV(targetTITV, novelTiTv) +#print(alpha) + +numGood = round(alpha * data2$numNovel); + +#numGood = round(data2$numNovel * (1-data2$targetTruthSensitivity/100)) +numBad = data2$numNovel - numGood; + +numPrevGood = leftShift(numGood, 0) +numNewGood = numGood - numPrevGood +numPrevBad = leftShift(numBad, 0) +numNewBad = numBad - numPrevBad + +d=matrix(c(numPrevGood,numNewGood, numNewBad, numPrevBad),4,byrow=TRUE) +#print(d) +barplot(d/1000,horiz=TRUE,col=cols,space=0.2,xlab="Number of Novel Variants (1000s)", density=density, cex.axis=1.25, cex.lab=1.25) # , xlim=c(250000,350000)) +#abline(v= d[2,dim(d)[2]], lty=2) +#abline(v= d[1,3], lty=2) +if ( ! suppressLegend ) + legend(3, length(data2$targetTruthSensitivity)/3 +1, c('Cumulative TPs','Tranch-specific TPs', 'Tranch-specific FPs', 'Cumulative FPs' ), fill=cols, density=density, bg='white', cex=1.25) + +mtext("Ti/Tv",2,line=2.25,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) +mtext("truth",2,line=0,at=length(data2$targetTruthSensitivity)*1.2,las=1, cex=1) +axis(2,line=-1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=data2$targetTruthSensitivity, las=1, cex.axis=1.0) +axis(2,line=1,at=0.7+(0:(length(data2$targetTruthSensitivity)-1))*1.2,tick=FALSE,labels=round(novelTiTv,3), las=1, cex.axis=1.0) + +# plot sensitivity vs. specificity +sensitivity = data2$truthSensitivity +if ( ! is.null(sensitivity) ) { + #specificity = titvFPEstV(targetTITV, novelTiTv) + specificity = novelTiTv + plot(sensitivity, specificity, type="b", col="cornflowerblue", xlab="Tranche truth sensitivity", ylab="Specificity (Novel Ti/Tv ratio)") + abline(h=targetTITV, lty=2) + abline(v=targetSensitivity, lty=2) + #text(max(sensitivity), targetTITV-0.05, labels="Expected novel Ti/Tv", pos=2) +} + +dev.off() diff --git a/public/R/plot_residualError_OtherCovariate.R b/public/R/plot_residualError_OtherCovariate.R new file mode 100644 index 000000000..a1385ff3f --- /dev/null +++ b/public/R/plot_residualError_OtherCovariate.R @@ -0,0 +1,108 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) +verbose = TRUE + +input = args[1] +covariateName = args[2] + +outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +c <- read.table(input, header=T) +c <- c[sort.list(c[,1]),] + +# +# Plot residual error as a function of the covariate +# + +d.good <- c[c$nBases >= 1000,] +d.1000 <- c[c$nBases < 1000,] +rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh +rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) ) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) +if( length(d.good$nBases) == length(c$nBases) ) { + theTitle = paste("RMSE =", round(rmseAll,digits=3)) +} +# Don't let residual error go off the edge of the plot +d.good$residualError = d.good$Qempirical-d.good$Qreported +d.good$residualError[which(d.good$residualError > 10)] = 10 +d.good$residualError[which(d.good$residualError < -10)] = -10 +d.1000$residualError = d.1000$Qempirical-d.1000$Qreported +d.1000$residualError[which(d.1000$residualError > 10)] = 10 +d.1000$residualError[which(d.1000$residualError < -10)] = -10 +c$residualError = c$Qempirical-c$Qreported +c$residualError[which(c$residualError > 10)] = 10 +c$residualError[which(c$residualError < -10)] = -10 +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10)) + points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue") +} +dev.off() + + +# +# Plot mean quality versus the covariate +# + +outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="") +pdf(outfile, height=7, width=7) +par(cex=1.1) +pointType = "p" +if( length(c$Covariate) <= 20 ) { + pointType = "o" +} +theTitle = paste("Quality By", covariateName); +if( is.numeric(c$Covariate) ) { + plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate))) + points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40)) + points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue") +} +dev.off() + +# +# Plot histogram of the covariate +# + +e = d.good +f = d.1000 +outfile = paste(input, ".", covariateName,"_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0) + +lwdSize=2 +if( length(c$Covariate) <= 20 ) { + lwdSize=7 +} else if( length(c$Covariate) <= 70 ) { + lwdSize=4 +} + +if( is.numeric(c$Covariate) ) { + if( length(hst$e.Covariate) == 0 ) { + plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) + } else { + plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate))) + points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue") + } + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice + hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0) + plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n") + if( length(hst$c.Covariate) > 9 ) { + axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)]) + } else { + axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate) + } + axis(2,axTicks(2), format(axTicks(2), scientific=F)) +} +dev.off() diff --git a/public/R/plot_residualError_QualityScoreCovariate.R b/public/R/plot_residualError_QualityScoreCovariate.R new file mode 100644 index 000000000..81bc9460d --- /dev/null +++ b/public/R/plot_residualError_QualityScoreCovariate.R @@ -0,0 +1,70 @@ +#!/bin/env Rscript + +args <- commandArgs(TRUE) + +input = args[1] +Qcutoff = as.numeric(args[2]) +maxQ = as.numeric(args[3]) +maxHist = as.numeric(args[4]) + +t=read.table(input, header=T) + +# +# Plot of reported quality versus empirical quality +# + +outfile = paste(input, ".quality_emp_v_stated.pdf", sep="") +pdf(outfile, height=7, width=7) +d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,] +d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,] +d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,] +f <- t[t$Qreported < Qcutoff,] +e <- rbind(d.good, d.1000, d.10000) +rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh +rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) ) +theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3)) +if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) { + theTitle = paste("RMSE =", round(rmseAll,digits=3)); +} +plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score") +points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16) +points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16) +points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16) +abline(0,1, lty=2) +dev.off() + +# +# Plot Q empirical histogram +# + +outfile = paste(input, ".quality_emp_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0) +percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases)) +entropy = -sum(log2(percentBases)*percentBases) +yMax = max(hst$e.nBases) +if(maxHist != 0) { +yMax = maxHist +} +plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n") +points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() + +# +# Plot Q reported histogram +# + +outfile = paste(input, ".quality_rep_hist.pdf", sep="") +pdf(outfile, height=7, width=7) +hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0) +hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0) +yMax = max(hst$e.nBases) +if(maxHist != 0) { +yMax = maxHist +} +plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n") +points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1") +axis(2,axTicks(2), format(axTicks(2), scientific=F)) +dev.off() diff --git a/public/R/titvFPEst.R b/public/R/titvFPEst.R new file mode 100755 index 000000000..7af5e8bbb --- /dev/null +++ b/public/R/titvFPEst.R @@ -0,0 +1,138 @@ +titvFPEst <- function(titvExpected, titvObserved) { max(min(1 - (titvObserved - 0.5) / (titvExpected - 0.5), 1), 0.001) } + +titvFPEstV <- function(titvExpected, titvs) { + sapply(titvs, function(x) titvFPEst(titvExpected, x)) +} + +calcHet <- function(nknown, knownTiTv, nnovel, novelTiTv, callable) { + TP <- nknown + (1-titvFPEst(knownTiTv, novelTiTv)) * nnovel + 2 * TP / 3 / callable +} + +marginalTiTv <- function( nx, titvx, ny, titvy ) { + tvx = nx / (titvx + 1) + tix = nx - tvx + tvy = ny / (titvy + 1) + tiy = ny - tvy + tiz = tix - tiy + tvz = tvx - tvy + return(tiz / tvz) +} +marginaldbSNPRate <- function( nx, dbx, ny, dby ) { + knownx = nx * dbx / 100 + novelx = nx - knownx + knowny = ny * dby / 100 + novely = ny - knowny + knownz = knownx - knowny + novelz = novelx - novely + return(knownz / ( knownz + novelz ) * 100) +} + +numExpectedCalls <- function(L, theta, calledFractionOfRegion, nIndividuals, dbSNPRate) { + nCalls <- L * theta * calledFractionOfRegion * sum(1 / seq(1, 2 * nIndividuals)) + return(list(nCalls = nCalls, nKnown = dbSNPRate * nCalls, nNovel = (1-dbSNPRate) * nCalls)) +} + +normalize <- function(x) { + x / sum(x) +} + +normcumsum <- function(x) { + cumsum(normalize(x)) +} + +cumhist <- function(d, ...) { + plot(d[order(d)], type="b", col="orange", lwd=2, ...) +} + +revcumsum <- function(x) { + return(rev(cumsum(rev(x)))) +} + +phred <- function(x) { + log10(max(x,10^(-9.9)))*-10 +} + +pOfB <- function(b, B, Q) { + #print(paste(b, B, Q)) + p = 1 - 10^(-Q/10) + if ( b == B ) + return(p) + else + return(1 - p) +} + +pOfG <- function(bs, qs, G) { + a1 = G[1] + a2 = G[2] + + log10p = 0 + for ( i in 1:length(bs) ) { + b = bs[i] + q = qs[i] + p1 = pOfB(b, a1, q) / 2 + pOfB(b, a2, q) / 2 + log10p = log10p + log10(p1) + } + + return(log10p) +} + +pOfGs <- function(nAs, nBs, Q) { + bs = c(rep("a", nAs), rep("t", nBs)) + qs = rep(Q, nAs + nBs) + G1 = c("a", "a") + G2 = c("a", "t") + G3 = c("t", "t") + + log10p1 = pOfG(bs, qs, G1) + log10p2 = pOfG(bs, qs, G2) + log10p3 = pOfG(bs, qs, G3) + Qsample = phred(1 - 10^log10p2 / sum(10^(c(log10p1, log10p2, log10p3)))) + + return(list(p1=log10p1, p2=log10p2, p3=log10p3, Qsample=Qsample)) +} + +QsampleExpected <- function(depth, Q) { + weightedAvg = 0 + for ( d in 1:(depth*3) ) { + Qsample = 0 + pOfD = dpois(d, depth) + for ( nBs in 0:d ) { + pOfnB = dbinom(nBs, d, 0.5) + nAs = d - nBs + Qsample = pOfGs(nAs, nBs, Q)$Qsample + #Qsample = 1 + weightedAvg = weightedAvg + Qsample * pOfD * pOfnB + print(as.data.frame(list(d=d, nBs = nBs, pOfD=pOfD, pOfnB = pOfnB, Qsample=Qsample, weightedAvg = weightedAvg))) + } + } + + return(weightedAvg) +} + +plotQsamples <- function(depths, Qs, Qmax) { + cols = rainbow(length(Qs)) + plot(depths, rep(Qmax, length(depths)), type="n", ylim=c(0,Qmax), xlab="Average sequencing coverage", ylab="Qsample", main = "Expected Qsample values, including depth and allele sampling") + + for ( i in 1:length(Qs) ) { + Q = Qs[i] + y = as.numeric(lapply(depths, function(x) QsampleExpected(x, Q))) + points(depths, y, col=cols[i], type="b") + } + + legend("topleft", paste("Q", Qs), fill=cols) +} + +pCallHetGivenDepth <- function(depth, nallelesToCall) { + depths = 0:(2*depth) + pNoAllelesToCall = apply(as.matrix(depths),1,function(d) sum(dbinom(0:nallelesToCall,d,0.5))) + dpois(depths,depth)*(1-pNoAllelesToCall) +} + +pCallHets <- function(depth, nallelesToCall) { + sum(pCallHetGivenDepth(depth,nallelesToCall)) +} + +pCallHetMultiSample <- function(depth, nallelesToCall, nsamples) { + 1-(1-pCallHets(depth,nallelesToCall))^nsamples +} From 2cb1376ed0acb2fea84d5d38a6c524e603ed269b Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 14:55:39 -0400 Subject: [PATCH 18/21] VCFStreaming was failing integration tests because now select variants outputs the samples in alphabetical order, instead of random as before. Fixed the MD5. --- .../gatk/walkers/variantutils/VCFStreamingIntegrationTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 15733e104..4ed9718fd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " --NO_HEADER" + " -o %s", 1, - Arrays.asList("2cae3d16f9ed00b07d87e9c49272d877") + Arrays.asList("995c07ccd593fe1c35d0d28155112a55") ); executeTest("testSimpleVCFStreaming", spec); From defa3cfe852c1a1e6a0a01a4cffb59353e2dbeed Mon Sep 17 00:00:00 2001 From: "Mark A. DePristo" Date: Thu, 30 Jun 2011 14:59:58 -0400 Subject: [PATCH 19/21] Moved around private walkers into appropriate directories in private gatk.walkers. Moved a few public walkers into private qc package, and some private qc walkers into the public directory. Removed several obviously broken and/or unused walkers. --- .../walkers/indels/RealignedReadCounter.java | 143 +++++++++ .../sting/gatk/walkers/qc/CountIntervals.java | 62 ++++ .../gatk/walkers/qc/ProfileRodSystem.java | 157 --------- .../walkers/qc/RodSystemValidationWalker.java | 153 +++++++++ .../gatk/walkers/qc/ValidateBAQWalker.java | 298 ------------------ .../sting/utils/baq/BAQUnitTest.java | 21 +- 6 files changed, 373 insertions(+), 461 deletions(-) create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java delete mode 100755 public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java new file mode 100755 index 000000000..fc196e712 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignedReadCounter.java @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.indels; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.filters.BadMateFilter; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.broadinstitute.sting.commandline.Argument; + +import java.io.File; +import java.util.*; + +@By(DataSource.READS) +// walker to count realigned reads +public class RealignedReadCounter extends ReadWalker { + + public static final String ORIGINAL_CIGAR_TAG = "OC"; + public static final String ORIGINAL_POSITION_TAG = "OP"; + + @Argument(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true) + protected String intervalsFile = null; + + // the intervals input by the user + private Iterator intervals = null; + + // the current interval in the list + private GenomeLoc currentInterval = null; + + private long updatedIntervals = 0, updatedReads = 0, affectedBases = 0; + private boolean intervalWasUpdated = false; + + public void initialize() { + // prepare to read intervals one-by-one, as needed (assuming they are sorted). + intervals = new IntervalFileMergingIterator( getToolkit().getGenomeLocParser(), new File(intervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); + currentInterval = intervals.hasNext() ? intervals.next() : null; + } + + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + if ( currentInterval == null ) { + return 0; + } + + GenomeLoc readLoc = ref.getGenomeLocParser().createGenomeLoc(read); + // hack to get around unmapped reads having screwy locations + if ( readLoc.getStop() == 0 ) + readLoc = ref.getGenomeLocParser().createGenomeLoc(readLoc.getContig(), readLoc.getStart(), readLoc.getStart()); + + if ( readLoc.isBefore(currentInterval) || ReadUtils.is454Read(read) ) + return 0; + + if ( readLoc.overlapsP(currentInterval) ) { + if ( doNotTryToClean(read) ) + return 0; + + if ( read.getAttribute(ORIGINAL_CIGAR_TAG) != null ) { + String newCigar = (String)read.getAttribute(ORIGINAL_CIGAR_TAG); + // deal with an old bug + if ( read.getCigar().toString().equals(newCigar) ) { + //System.out.println(currentInterval + ": " + read.getReadName() + " " + read.getCigarString() + " " + newCigar); + return 0; + } + + if ( !intervalWasUpdated ) { + intervalWasUpdated = true; + updatedIntervals++; + affectedBases += 20 + getIndelSize(read); + } + updatedReads++; + + } + } else { + do { + intervalWasUpdated = false; + currentInterval = intervals.hasNext() ? intervals.next() : null; + } while ( currentInterval != null && currentInterval.isBefore(readLoc) ); + } + + return 0; + } + + private int getIndelSize(SAMRecord read) { + for ( CigarElement ce : read.getCigar().getCigarElements() ) { + if ( ce.getOperator() == CigarOperator.I ) + return 0; + if ( ce.getOperator() == CigarOperator.D ) + return ce.getLength(); + } + logger.warn("We didn't see an indel for this read: " + read.getReadName() + " " + read.getAlignmentStart() + " " + read.getCigar()); + return 0; + } + + private boolean doNotTryToClean(SAMRecord read) { + return read.getReadUnmappedFlag() || + read.getNotPrimaryAlignmentFlag() || + read.getReadFailsVendorQualityCheckFlag() || + read.getMappingQuality() == 0 || + read.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START || + (BadMateFilter.hasBadMate(read)); + } + + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer value, Integer sum) { + return sum + value; + } + + public void onTraversalDone(Integer result) { + System.out.println(updatedIntervals + " intervals were updated"); + System.out.println(updatedReads + " reads were updated"); + System.out.println(affectedBases + " bases were affected"); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java new file mode 100755 index 000000000..feb5f62af --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java @@ -0,0 +1,62 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.RefWalker; +import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.collections.Pair; + +import java.util.List; +import java.io.PrintStream; + +/** + * Counts the number of contiguous regions the walker traverses over. Slower than it needs to be, but + * very useful since overlapping intervals get merged, so you can count the number of intervals the GATK merges down to. + * This was its very first use. + */ +public class CountIntervals extends RefWalker { + @Output + PrintStream out; + + @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false) + int numOverlaps = 2; + + public Long reduceInit() { + return 0l; + } + + public boolean isReduceByInterval() { return true; } + + public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) { + return null; + } + + List checkIntervals = tracker.getGATKFeatureMetaData("check",false); + return (long) checkIntervals.size(); + } + + public Long reduce(Long loc, Long prev) { + if ( loc == null ) { + return 0l; + } else { + return Math.max(prev,loc); + } + } + + public void onTraversalDone(List> finalReduce) { + long count = 0; + for ( Pair g : finalReduce ) { + if ( g.second >= numOverlaps) { + count ++; + } + } + out.printf("Number of contiguous intervals: %d",count); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java deleted file mode 100755 index 67879a442..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ProfileRodSystem.java +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2010, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.qc; - -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; -import org.broad.tribble.readers.AsciiLineReader; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.Requires; -import org.broadinstitute.sting.gatk.walkers.RodWalker; -import org.broadinstitute.sting.utils.SimpleTimer; - -import java.io.PrintStream; -import java.io.File; -import java.io.FileInputStream; -import java.util.*; - -/** - * Emits specific fields as dictated by the user from one or more VCF files. - */ -@Requires(value={}) -public class ProfileRodSystem extends RodWalker { - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(fullName="nIterations", shortName="N", doc="Number of raw reading iterations to perform", required=false) - int nIterations = 1; - - @Argument(fullName="maxRecords", shortName="M", doc="Max. number of records to process", required=false) - int MAX_RECORDS = -1; - - SimpleTimer timer = new SimpleTimer("myTimer"); - - public void initialize() { - File rodFile = getRodFile(); - - out.printf("# walltime is in seconds%n"); - out.printf("# file is %s%n", rodFile); - out.printf("# file size is %d bytes%n", rodFile.length()); - out.printf("operation\titeration\twalltime%n"); - for ( int i = 0; i < nIterations; i++ ) { - out.printf("read.bytes\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_BYTE)); - out.printf("read.line\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_LINE)); - out.printf("line.and.parts\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.BY_PARTS)); - out.printf("decode.loc\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE_LOC)); - out.printf("full.decode\t%d\t%.2f%n", i, readFile(rodFile, ReadMode.DECODE)); - } - - timer.start(); // start up timer for map itself - } - - private enum ReadMode { BY_BYTE, BY_LINE, BY_PARTS, DECODE_LOC, DECODE }; - - private final double readFile(File f, ReadMode mode) { - timer.start(); - - try { - byte[] data = new byte[100000]; - FileInputStream s = new FileInputStream(f); - - if ( mode == ReadMode.BY_BYTE ) { - while (true) { - if ( s.read(data) == -1 ) - break; - } - } else { - int counter = 0; - VCFCodec codec = new VCFCodec(); - String[] parts = new String[100000]; - AsciiLineReader lineReader = new AsciiLineReader(s); - - if ( mode == ReadMode.DECODE_LOC || mode == ReadMode.DECODE ) - codec.readHeader(lineReader); - - while (counter++ < MAX_RECORDS || MAX_RECORDS == -1) { - String line = lineReader.readLine(); - if ( line == null ) - break; - else if ( mode == ReadMode.BY_PARTS ) { - ParsingUtils.split(line, parts, VCFConstants.FIELD_SEPARATOR_CHAR); - } - else if ( mode == ReadMode.DECODE_LOC ) { - codec.decodeLoc(line); - } - else if ( mode == ReadMode.DECODE ) { - processOneVC((VariantContext)codec.decode(line)); - } - } - } - } catch ( Exception e ) { - throw new RuntimeException(e); - } - - return timer.getElapsedTime(); - } - - private File getRodFile() { - List rods = this.getToolkit().getRodDataSources(); - ReferenceOrderedDataSource rod = rods.get(0); - return rod.getFile(); - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( tracker == null ) // RodWalkers can make funky map calls - return 0; - - VariantContext vc = tracker.getVariantContext(ref, "rod", context.getLocation()); - processOneVC(vc); - - return 0; - } - - private static final void processOneVC(VariantContext vc) { - vc.getNSamples(); // force us to parse the samples - } - - public Integer reduceInit() { - return 0; - } - - public Integer reduce(Integer counter, Integer sum) { - return counter + sum; - } - - public void onTraversalDone(Integer sum) { - out.printf("gatk.traversal\t%d\t%.2f%n", 0, timer.getElapsedTime()); - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java new file mode 100644 index 000000000..9cb715507 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/RodSystemValidationWalker.java @@ -0,0 +1,153 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.*; +import java.math.BigInteger; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.Collection; +import java.util.List; + +/** + * a walker for validating (in the style of validating pile-up) the ROD system. + */ +@Reference(window=@Window(start=-40,stop=40)) +public class RodSystemValidationWalker extends RodWalker { + + // the divider to use in some of the text output + private static final String DIVIDER = ","; + + @Output + public PrintStream out; + + @Argument(fullName="PerLocusEqual",required=false,doc="Should we check that all records at the same site produce equivilent variant contexts") + public boolean allRecordsVariantContextEquivalent = false; + + // used to calculate the MD5 of a file + MessageDigest digest = null; + + // we sometimes need to know what rods the engine's seen + List rodList; + + /** + * emit the md5 sums for each of the input ROD files (will save up a lot of time if and when the ROD files change + * underneath us). + */ + public void initialize() { + // setup the MD5-er + try { + digest = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new ReviewedStingException("Unable to find MD5 checksumer"); + } + out.println("Header:"); + // enumerate the list of ROD's we've loaded + rodList = this.getToolkit().getRodDataSources(); + for (ReferenceOrderedDataSource rod : rodList) { + out.println(rod.getName() + DIVIDER + rod.getType()); + out.println(rod.getName() + DIVIDER + rod.getFile()); + out.println(rod.getName() + DIVIDER + md5sum(rod.getFile())); + } + out.println("Data:"); + } + + /** + * + * @param tracker the ref meta data tracker to get RODs + * @param ref reference context + * @param context the reads + * @return an 1 for each site with a rod(s), 0 otherwise + */ + @Override + public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + int ret = 0; + if (tracker != null && tracker.getAllRods().size() > 0) { + out.print(context.getLocation() + DIVIDER); + Collection features = tracker.getAllRods(); + for (GATKFeature feat : features) + out.print(feat.getName() + DIVIDER); + out.println(";"); + ret++; + } + + // if the argument was set, check for equivalence + if (allRecordsVariantContextEquivalent && tracker != null) { + Collection col = tracker.getAllVariantContexts(ref); + VariantContext con = null; + for (VariantContext contextInList : col) + if (con == null) con = contextInList; + else if (!con.equals(col)) out.println("FAIL: context " + col + " doesn't match " + con); + } + return ret; + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + @Override + public Integer reduceInit() { + return 0; + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + @Override + public Integer reduce(Integer value, Integer sum) { + return value + sum; + } + + @Override + public void onTraversalDone(Integer result) { + // Double check traversal result to make count is the same. + // TODO: Is this check necessary? + out.println("[REDUCE RESULT] Traversal result is: " + result); + } + + // shamelessly absconded and adapted from http://www.javalobby.org/java/forums/t84420.html + private String md5sum(File f) { + InputStream is; + try { + is = new FileInputStream(f); + } catch (FileNotFoundException e) { + return "Not a file"; + } + byte[] buffer = new byte[8192]; + int read = 0; + try { + while ((read = is.read(buffer)) > 0) { + digest.update(buffer, 0, read); + } + byte[] md5sum = digest.digest(); + BigInteger bigInt = new BigInteger(1, md5sum); + return bigInt.toString(16); + } + catch (IOException e) { + throw new RuntimeException("Unable to process file for MD5", e); + } + finally { + try { + is.close(); + } + catch (IOException e) { + throw new RuntimeException("Unable to close input stream for MD5 calculation", e); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java deleted file mode 100755 index eda01bdb4..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidateBAQWalker.java +++ /dev/null @@ -1,298 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.qc; - -import net.sf.samtools.SAMRecord; -import net.sf.samtools.CigarElement; -import net.sf.samtools.CigarOperator; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.utils.exceptions.StingException; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.SimpleTimer; -import org.broadinstitute.sting.commandline.Output; -import org.broadinstitute.sting.commandline.Argument; - -import java.io.PrintStream; - -/** - * Walks over the input data set, calculating the number of reads seen for diagnostic purposes. - * Can also count the number of reads matching a given criterion using read filters (see the - * --read-filter command line argument). Simplest example of a read-backed analysis. - */ -@BAQMode(QualityMode = BAQ.QualityMode.DONT_MODIFY, ApplicationTime = BAQ.ApplicationTime.HANDLED_IN_WALKER) -@Reference(window=@Window(start=-5,stop=5)) -@Requires({DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES}) -public class ValidateBAQWalker extends ReadWalker { - @Output(doc="File to which results should be written",required=true) - protected PrintStream out; - - @Argument(doc="maximum read length to apply the BAQ calculation too",required=false) - protected int maxReadLen = 1000; - - @Argument(doc="",required=false) - protected int bw = 7; - - @Argument(doc="",required=false) - protected boolean samtoolsMode = false; - - @Argument(doc="only operates on reads with this name",required=false) - protected String readName = null; - - @Argument(doc="If true, all differences are errors", required=false) - protected boolean strict = false; - - @Argument(doc="prints info for each read", required=false) - protected boolean printEachRead = false; - - @Argument(doc="Also prints out detailed comparison information when for known calculation differences", required=false) - protected boolean alsoPrintWarnings = false; - - @Argument(doc="Include reads without BAQ tag", required=false) - protected boolean includeReadsWithoutBAQTag = false; - - @Argument(doc="x each read is processed", required=false) - protected int magnification = 1; - - @Argument(doc="Profile performance", required=false) - protected boolean profile = false; - - int counter = 0; - - BAQ baqHMM = null; // matches current samtools parameters - - public void initialize() { - if ( samtoolsMode ) - baqHMM = new BAQ(1e-3, 0.1, bw, (byte)0, true); - else - baqHMM = new BAQ(); - } - - long goodReads = 0, badReads = 0; - - public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker tracker) { - - if ( (readName == null || readName.equals(read.getReadName())) && read.getReadLength() <= maxReadLen && (includeReadsWithoutBAQTag || BAQ.hasBAQTag(read) ) ) { - if ( baqHMM.excludeReadFromBAQ(read) ) - return 0; - - if ( profile ) { - profileBAQ(ref, read); - } else { - validateBAQ(ref, read); - } - - return 1; - } - - return 0; - } - - SimpleTimer tagTimer = new SimpleTimer("from.tag"); - SimpleTimer baqReadTimer = new SimpleTimer("baq.read"); - SimpleTimer glocalTimer = new SimpleTimer("hmm.glocal"); - - private void profileBAQ(ReferenceContext ref, SAMRecord read) { - IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference(); - BAQ.BAQCalculationResult baq = null; - - tagTimer.restart(); - for ( int i = 0; i < magnification; i++ ) { BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); } - tagTimer.stop(); - - baqReadTimer.restart(); - for ( int i = 0; i < magnification; i++ ) { baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY ); } - baqReadTimer.stop(); - - glocalTimer.restart(); - for ( int i = 0; i < magnification; i++ ) - baqHMM.baqRead(read, refReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.DONT_MODIFY); - glocalTimer.stop(); - } - - - private void validateBAQ(ReferenceContext ref, SAMRecord read) { - IndexedFastaSequenceFile refReader = this.getToolkit().getReferenceDataSource().getReference(); - byte[] baqFromTag = BAQ.calcBAQFromTag(read, false, includeReadsWithoutBAQTag); - if (counter++ % 1000 == 0 || printEachRead) out.printf("Checking read %s (%d)%n", read.getReadName(), counter); - - BAQ.BAQCalculationResult baq = baqHMM.calcBAQFromHMM(read, refReader); - - boolean fail = false; - boolean print = false; - int badi = 0; - - if ( BAQ.hasBAQTag(read) ) { - for ( badi = 0; badi < baqFromTag.length; badi++ ) { - if ( baqFromTag[badi] != baq.bq[badi] ) { - if ( cigarLength(read) != read.getReadLength() ) { - print = true; - fail = false; - out.printf(" different, but cigar length != read length%n"); - break; - } - if (MathUtils.arrayMin(read.getBaseQualities()) == 0) { - print = true; - fail = strict; - out.printf(" different, but Q0 base detected%n"); - break; - } - else if (readHasSoftClip(read) && ! samtoolsMode) { - print = true; - fail = strict; - out.printf(" different, but soft clip detected%n"); - break; - } else if (readHasDeletion(read) ) { // && ! samtoolsMode) { - print = true; - fail = strict; - out.printf(" different, but deletion detected%n"); - break; - } else if ( baq.bq[badi] < baqHMM.getMinBaseQual() ) { - print = fail = true; - out.printf(" Base quality %d < min %d", baq.bq[badi], baqHMM.getMinBaseQual()); - break; - } else { - print = fail = true; - break; - } - } - } - if ( fail || print ) - badReads++; - else - goodReads++; - } - - if ( fail || printEachRead || ( print && alsoPrintWarnings ) ) { - byte[] pos = new byte[baq.bq.length]; - for ( int i = 0; i < pos.length; i++ ) pos[i] = (byte)i; - - out.printf(" read length : %d%n", read.getReadLength()); - out.printf(" read start : %d (%d unclipped)%n", read.getAlignmentStart(), read.getUnclippedStart()); - out.printf(" cigar : %s%n", read.getCigarString()); - out.printf(" ref bases : %s%n", new String(baq.refBases)); - out.printf(" read bases : %s%n", new String(read.getReadBases())); - out.printf(" ref length : %d%n", baq.refBases.length); - out.printf(" BQ tag : %s%n", read.getStringAttribute(BAQ.BAQ_TAG)); - if ( BAQ.hasBAQTag(read) ) printQuals(" BQ deltas : ", getBAQDeltas(read), true); - printQuals(" original quals: ", read.getBaseQualities(), true); - printQuals(" baq quals: ", baq.bq, true); - printQuals(" positions : ", pos, true); - printQuals(" original quals: ", read.getBaseQualities()); - if ( BAQ.hasBAQTag(read) ) printQuals(" tag quals: ", baqFromTag); - printQuals(" hmm quals: ", baq.bq); - out.printf(" read bases : %s%n", new String(read.getReadBases())); - out.println(Utils.dupString('-', 80)); - } - - - if ( fail ) - throw new StingException(String.format("BAQ from read and from HMM differ in read %s at position %d: tag qual = %d, hmm qual = %d", - read.getReadName(), badi, baqFromTag[badi], baq.bq[badi])); - } - - private final static boolean readHasSoftClip(SAMRecord read) { - for (CigarElement e : read.getCigar().getCigarElements()) { - if ( e.getOperator() == CigarOperator.SOFT_CLIP ) - return true; - } - - return false; - } - - private final static boolean readHasDeletion(SAMRecord read) { - for (CigarElement e : read.getCigar().getCigarElements()) { - if ( e.getOperator() == CigarOperator.DELETION ) - return true; - } - - return false; - } - - public final void printQuals( String prefix, byte[] quals ) { - printQuals(prefix, quals, false); - } - - public final void printQuals( String prefix, byte[] quals, boolean asInt ) { - printQuals(out, prefix, quals, asInt); - } - - public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) { - out.print(prefix); - for ( int i = 0; i < quals.length; i++) { - if ( asInt ) { - out.printf("%2d", (int)quals[i]); - if ( i+1 != quals.length ) out.print(","); - } else - out.print((char)(quals[i]+33)); - } - out.println(); - } - - /** - * Get the BAQ delta bytes from the tag in read. Returns null if no BAQ tag is present. - * @param read - * @return - */ - public static byte[] getBAQDeltas(SAMRecord read) { - byte[] baq = BAQ.getBAQTag(read); - if ( baq != null ) { - byte[] deltas = new byte[baq.length]; - for ( int i = 0; i < deltas.length; i++) - deltas[i] = (byte)(-1 * (baq[i] - 64)); - return deltas; - } else - return null; - } - - private int cigarLength(SAMRecord read) { - int readI = 0; - for ( CigarElement elt : read.getCigar().getCigarElements() ) { - int l = elt.getLength(); - switch (elt.getOperator()) { - case N: // cannot handle these - return 0; - case H : case P : // ignore pads and hard clips - break; - case S : - case I : - readI += l; - break; - case D : break; - case M : - readI += l; - break; - default: - throw new ReviewedStingException("BUG: Unexpected CIGAR element " + elt + " in read " + read.getReadName()); - } - } - return readI; - } - - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } - - public void onTraversalDone(Integer nreads) { - if ( profile ) { - out.printf("n.reads baq.per.read calculation time.in.secs%n"); - printTimer(nreads, tagTimer); - printTimer(nreads, glocalTimer); - printTimer(nreads, baqReadTimer); - } else { - out.printf("total reads BAQ'd %d; concordant BAQ reads %d %.4f; discordant BAQ reads %d %.4f%n", nreads, - goodReads, (100.0 * goodReads) / nreads, - badReads, (100.0 * badReads) / nreads); - } - } - - private final void printTimer(int nreads, SimpleTimer timer) { - out.printf("%d %d %s %.2f%n", nreads, magnification, timer.getName(), timer.getElapsedTime()); - } -} - diff --git a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java index 08be4f742..f4c3163cf 100644 --- a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java @@ -12,19 +12,16 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.BeforeMethod; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.walkers.qc.ValidateBAQWalker; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; -import org.broadinstitute.sting.utils.sam.ArtificialSAMFileWriter; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.Utils; import java.io.File; import java.io.FileNotFoundException; -import java.util.Arrays; +import java.io.PrintStream; import java.util.List; import java.util.ArrayList; import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.*; /** @@ -188,8 +185,8 @@ public class BAQUnitTest extends BaseTest { System.out.println(Utils.dupString('-', 40)); System.out.println("reads : " + new String(test.readBases)); - ValidateBAQWalker.printQuals(System.out, "in-quals:", test.quals, false); - ValidateBAQWalker.printQuals(System.out, "bq-quals:", result.bq, false); + printQuals(System.out, "in-quals:", test.quals, false); + printQuals(System.out, "bq-quals:", result.bq, false); for (int i = 0; i < test.quals.length; i++) { //result.bq[i] = baqHMM.capBaseByBAQ(result.rawQuals[i], result.bq[i], result.state[i], i); Assert.assertTrue(result.bq[i] >= baqHMM.getMinBaseQual() || test.expected[i] < baqHMM.getMinBaseQual(), "BQ < min base quality"); @@ -197,4 +194,16 @@ public class BAQUnitTest extends BaseTest { } } + + public final static void printQuals( PrintStream out, String prefix, byte[] quals, boolean asInt ) { + out.print(prefix); + for ( int i = 0; i < quals.length; i++) { + if ( asInt ) { + out.printf("%2d", (int)quals[i]); + if ( i+1 != quals.length ) out.print(","); + } else + out.print((char)(quals[i]+33)); + } + out.println(); + } } \ No newline at end of file From 64048a67e890be2c479a597daa6d48f6890af671 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 30 Jun 2011 15:20:43 -0400 Subject: [PATCH 20/21] cleaning up ghost scala scripts. Deleting clearly unused one and moving others to qscripts.archive --- .../scala/BaseTransitionTableCalculator.scala | 211 ------------------ .../scala/IntervalAnnotationWalker.scala | 120 ---------- .../sting/scala/ScalaCountLoci.scala | 24 -- 3 files changed, 355 deletions(-) delete mode 100755 public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala delete mode 100755 public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala delete mode 100755 public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala diff --git a/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala b/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala deleted file mode 100755 index 7fe249beb..000000000 --- a/public/scala/src/org/broadinstitute/sting/scala/BaseTransitionTableCalculator.scala +++ /dev/null @@ -1,211 +0,0 @@ -package org.broadinstitute.sting.scala -/** -import gatk.walkers.genotyper.{UnifiedGenotyper, GenotypeCall} -import java.io.File -import net.sf.samtools.SAMRecord -import org.broadinstitute.sting.gatk.walkers.LocusWalker -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker -import org.broadinstitute.sting.gatk.contexts.ReferenceContext -import org.broadinstitute.sting.gatk.contexts.AlignmentContext -import org.broadinstitute.sting.utils.Pair -import org.broadinstitute.sting.utils.genotype.GenotypeMetaData -import utils._ -import cmdLine.Argument - -class TransitionTable() { - var table: Array[Array[Double]] = new Array[Array[Double]](4,4) - - def makeKey(b: Char): Int = { - return BaseUtils.simpleBaseToBaseIndex(b) - } - - def fromKey(i: Int): Char = { - return BaseUtils.baseIndexToSimpleBase(i) - } - - def add(ref: Char, subst: Char) = { - table(makeKey(ref))(makeKey(subst)) += 1 - } - - def get(i: Int, j: Int): Double = { - return table(i)(j) - } - - def set(i: Int, j: Int, d: Double) = { - //printf("Setting %d / %d => %f%n", i, j, d) - table(i)(j) = d - } - - def mapEntries[A](f: ((Int, Int, Double) => A)): List[A] = { - return for { i <- List.range(0, 4); j <- List.range(0,4) } yield f(i, j, get(i,j)) - } - - def header(): String = { - //return Utils.join("\t", mapEntries((i,j,x) => "P(%c|true=%c)" format (fromKey(j), fromKey(i))).toArray) - return Utils.join("\t", mapEntries((i,j,x) => "[%c|%c]" format (fromKey(j), fromKey(i))).toArray) - } - - override def toString(): String = { - return Utils.join("\t", mapEntries((i,j,x) => "%.2f" format x).toArray) - } - - def sum(): Double = { - return sumList(table.toList map (x => sumList(x.toList))) - } - - def sumList(xs: List[Double]): Double = { - return (0.0 :: xs) reduceLeft {(x, y) => x + y} - } - - def getRateMatrix(): TransitionTable = { - var sums = (table.toList map (x => sumList(x.toList))).toArray - var t = new TransitionTable() - - //println(sums.toList) - this mapEntries ((i,j,x) => t.set(i, j, x / sums(i))) - - return t - } -} - -**/class BaseTransitionTableCalculator /**{ // extends LocusWalker[Unit,Int] { - private var MIN_MAPPING_QUALITY = 30 - private var MIN_BASE_QUALITY = 20 - private var MIN_LOD = 5 - private var PRINT_FREQ = 1000000 - private var CARE_ABOUT_STRAND = true - - private val SSG = new UnifiedGenotyper() - private val table1 = new TransitionTable() - private val tableFWD = new TransitionTable() - private val tableREV = new TransitionTable() - private val table2 = new TransitionTable() - - @Argument() { val shortName = "v", val doc = "Print out verbose output", val required = false } - var VERBOSE: Boolean = false - - override def initialize() { - SSG.initialize(); - } - - override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Unit = { - val callPair = SSG.map(tracker, ref, context) - //val call = callPair.getFirst().get(0) - val pileup = new ReadBackedPileup(ref.getBase(), context) - - def hasNoNs(): Boolean = { - return ! (pileup.getBases() exists ('N' ==)) - } - - if ( isDefinitelyHomRef(callPair) && hasNoNs() ) { - var (refBases, nonRefBases) = splitBases(ref.getBase, pileup.getBases) - - //printf("nonRefBases %s%n", nonRefBases) - if ( nonRefBases.length > 0 ) { - var (nonRefReads, offsets) = getNonRefReads(ref.getBase, pileup) - var nonRefRead: SAMRecord = nonRefReads.head - - nonRefBases.length match { - case 1 => - //if ( goodRead(nonRefRead,offsets.head) ) { - //printf("Including site:%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head) - //} - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table1) - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, if ( nonRefRead.getReadNegativeStrandFlag ) tableREV else tableFWD ) - case 2 => - addRead(nonRefRead, offsets.head, nonRefBases.head, ref, table2) - addRead(nonRefReads.tail.head, offsets.tail.head, nonRefBases.tail.head, ref, table2) - case x => - } - - if ( VERBOSE && pileup.size > 30 && nonRefReads.length < 4 ) - printNonRefBases(ref, nonRefReads, offsets, context.getReads.size()) - } - } - } - - implicit def listToJavaList[T](l: List[T]) = java.util.Arrays.asList(l.toArray: _*) - - def printNonRefBases(ref: ReferenceContext, nonRefReads: List[SAMRecord], offsets: List[Integer], depth: Integer): Unit = { - out.println(new ReadBackedPileup(ref.getLocus(), ref.getBase(), listToJavaList(nonRefReads), offsets) + " " + depth) - } - - def addRead(nonRefRead: SAMRecord, offset: Integer, nonRefBase: Char, ref: ReferenceContext, table: TransitionTable): Unit = { - if ( goodRead(nonRefRead, offset) ) { - //printf("Including site:%n call=%s%n nonRefBases=%s%n refBases=%s%n nonRefRead=%s%n offset=%s%n", nonRefBases, refBases, nonRefRead.format, offsets.head) - //println(call.getReadDepth, call.getReferencebase, nonRefBases, refBases, nonRefRead.getMappingQuality) - if ( CARE_ABOUT_STRAND && nonRefRead.getReadNegativeStrandFlag() ) { - // it's on the reverse strand - val refComp: Char = BaseUtils.simpleComplement(ref.getBase) - val nonRefComp: Char = BaseUtils.simpleComplement(nonRefBase) - - //printf("Negative: %c/%c is actually %c/%c%n", ref.getBase, nonRefBases.head, refComp, nonRefComp) - table.add(refComp, nonRefComp) - } else { - table.add(ref.getBase, nonRefBase) - } - } - } - - def getNonRefReads(ref: Char, pileup: ReadBackedPileup): (List[SAMRecord], List[Integer]) = { - def getBase(i: Int): Char = { - var offset = pileup.getOffsets().get(i) - return pileup.getReads().get(i).getReadString().charAt(offset.asInstanceOf[Int]) - } - - var subset = for { i <- List.range(0, pileup.size) if getBase(i) != ref } yield i - var reads = subset map (i => pileup.getReads().get(i)) - var offsets = subset map (i => pileup.getOffsets().get(i)) - - //println(reads, offsets, subset) - return (reads, offsets) - } - - - def hasFewNonRefBases(refBase: Char, pileup: String): List[Char] = { - splitBases(refBase, pileup) match { - case (refs, nonRefs) => - //printf("nrf=%s, rb=%s%n", nonRefs.mkString, refs.mkString) - return nonRefs - } - } - - def goodRead( read: SAMRecord, offset: Integer ): Boolean = { - return read.getMappingQuality() > MIN_MAPPING_QUALITY && read.getBaseQualities()(offset.intValue) > MIN_BASE_QUALITY - } - - def splitBases(refBase: Char, pileup: String): (List[Char], List[Char]) = { - val refBases = pileup.toList filter (refBase ==) - val nonRefBases = pileup.toList filter (refBase !=) - return (refBases, nonRefBases) - } - - def isDefinitelyHomRef(call: Pair[java.util.List[GenotypeCall],GenotypeMetaData]): Boolean = { - if ( call == null ) - return false; - - return (! call.getFirst().get(0).isVariant()) && call.getFirst().get(0).getNegLog10PError() > MIN_LOD - } - - def reduceInit(): Int = { - return 0; - } - - def reduce(m: Unit, r: Int): Int = { - return r; - } - - override def onTraversalDone(r: Int) = { - out.println("-------------------- FINAL RESULT --------------------") - out.println("type\t" + table1.header) - def print1(t: String, x: TransitionTable) = { - out.println(t + "\t" + x) - } - - print1("1-mismatch", table1) - print1("2-mismatch", table2) - print1("1-mismatch-fwd", tableFWD) - print1("1-mismatch-rev", tableREV) - } -} -*/ \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala b/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala deleted file mode 100755 index fa30fceff..000000000 --- a/public/scala/src/org/broadinstitute/sting/scala/IntervalAnnotationWalker.scala +++ /dev/null @@ -1,120 +0,0 @@ -package org.broadinstitute.sting.scala - -import java.io.PrintStream -import org.broadinstitute.sting.gatk.contexts.{ReferenceContext, AlignmentContext} -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker -import collection.JavaConversions._ -import collection.immutable.List -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature -import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature -import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature -import org.broadinstitute.sting.gatk.walkers.{TreeReducible, RefWalker} -import org.broadinstitute.sting.commandline.{Output, Argument} -import org.broadinstitute.sting.utils.{BaseUtils, GenomeLoc} -import collection.mutable.{ListBuffer, HashSet} -import java.lang.Math - -class IntervalAnnotationWalker extends RefWalker[AnnotationMetaData,List[IntervalInfoBuilder]] { - @Argument(doc="Min proportion of bases overlapping between an interval of interest and an annotation interval for annotation to occur",shortName="mpb") - var minPropOverlap : Double = 0.50 // default to 50% - @Output var out : PrintStream = _ - - override def reduceInit : List[IntervalInfoBuilder] = { - Nil - } - - override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext) : AnnotationMetaData = { - val ivalsOfInterest : List[GATKFeature] = tracker.getGATKFeatureMetaData("interval_list",false).toList - val refSeqData : List[Object] = tracker.getReferenceMetaData("refseq").toList - val tableMetaData : List[Object] = tracker.getReferenceMetaData("table",false).toList - var amd : AnnotationMetaData = new AnnotationMetaData(ref.getLocus,ref.getBase) - if ( refSeqData != null ) { - amd.refSeqFeatures = refSeqData.map( (o: Object) => o.asInstanceOf[RefSeqFeature]) - } - if ( tableMetaData != null ) { - amd.tableFeatures = tableMetaData.map( (o: Object) => o.asInstanceOf[TableFeature]).map( (u: TableFeature) => - new Tuple2(u.getLocation,u.getAllValues.toList.reduceLeft(_ + "," + _))) - } - amd.newIvals = ivalsOfInterest.map(_.getLocation) - return amd - } - - override def reduce(mapMetaData : AnnotationMetaData, rList : List[IntervalInfoBuilder]) : List[IntervalInfoBuilder] = { - rList.filter( (u : IntervalInfoBuilder) => u.location.isBefore(mapMetaData.locus)).foreach(u => out.print("%s%n".format(u.done))) - val newList : List[IntervalInfoBuilder] = rList.filter( ! _.finalized ) ::: - mapMetaData.newIvals.filter( (g: GenomeLoc) => g.getStart == mapMetaData.locus.getStart ).map( u => new IntervalInfoBuilder(u,minPropOverlap) ) - newList.foreach(_.update(mapMetaData)) - return newList - } - - override def onTraversalDone(finalList : List[IntervalInfoBuilder]) = { - finalList.foreach(u => out.print("%s%n".format(u.done))) - } - -} - -class AnnotationMetaData(loc: GenomeLoc, ref: Byte) { - val locus : GenomeLoc = loc - var newIvals : List[GenomeLoc] = Nil - var refSeqFeatures : List[RefSeqFeature] = Nil - var tableFeatures : List[(GenomeLoc,String)] = Nil - val refBase : Byte = ref -} - -class IntervalInfoBuilder(loc : GenomeLoc, minProp : Double) { - - val OVERLAP_PROP : Double = minProp - - var metaData : HashSet[String] = new HashSet[String] - var seenTranscripts : HashSet[String] = new HashSet[String] - var baseContent : ListBuffer[Byte] = new ListBuffer[Byte] - var location : GenomeLoc = loc - var gcContent : Double = _ - var entropy : Double = _ - var finalized : Boolean = false - - - def update(mmd : AnnotationMetaData) = { - baseContent += mmd.refBase - mmd.refSeqFeatures.filter( p => ! seenTranscripts.contains(p.getTranscriptUniqueGeneName)).view.foreach(u => addRefSeq(u)) - mmd.tableFeatures.filter( (t : (GenomeLoc,String)) => - ! metaData.contains(t._2) && (t._1.intersect(location).size.asInstanceOf[Double]/location.size() >= OVERLAP_PROP) ).foreach( t => metaData.add(t._2)) - } - - def addRefSeq( rs: RefSeqFeature ) : Unit = { - seenTranscripts.add(rs.getTranscriptUniqueGeneName) - val exons : List[(GenomeLoc,Int)] = rs.getExons.toList.zipWithIndex.filter( p => ((p._1.intersect(location).size+0.0)/location.size) >= minProp ) - if ( exons.size > 0 ) { - metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,exons.map( p => "exon%d".format(p._2) ).reduceLeft(_ + "," + _))) - } else { - if ( location.isBefore(rs.getExons.get(0)) || location.isPast(rs.getExons.get(rs.getNumExons-1)) ) { - metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"UTR")) - } else { - metaData.add("%s[%s]".format(rs.getTranscriptUniqueGeneName,"Intron")) - } - } - - } - - def done : String = { - finalized = true - def isGC(b : Byte) : Int = if ( BaseUtils.gIndex == b || BaseUtils.cIndex == b ) { 1 } else { 0 } - gcContent = baseContent.foldLeft[Int](0)( (a,b) => a + isGC(b)).asInstanceOf[Double]/location.size() - entropy = calcEntropy(baseContent.map(b => ListBuffer(b))) + calcEntropy(baseContent.reverse.map(b => ListBuffer(b))) - val meta : String = metaData.reduceLeft(_ + "\t" + _) - return "%s\t%d\t%d\t%.2f\t%.2f\t%s".format(location.getContig,location.getStart,location.getStop,gcContent,entropy,meta) - } - - def calcEntropy(byteList : ListBuffer[ListBuffer[Byte]]) : Double = { - if(byteList.size == 1) return 0 - Math.log(1+byteList.tail.size-byteList.tail.dropWhile( u => u.equals(byteList(1))).size) + - calcEntropy(byteList.tail.foldLeft(ListBuffer(byteList(0)))( (a,b) => { - if ( b.equals(byteList(1)) ) { - a.dropRight(1) :+ (a.last ++ b) - } else { - a :+ b - } - })) - } - -} \ No newline at end of file diff --git a/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala b/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala deleted file mode 100755 index 658aabc18..000000000 --- a/public/scala/src/org/broadinstitute/sting/scala/ScalaCountLoci.scala +++ /dev/null @@ -1,24 +0,0 @@ -package org.broadinstitute.sting.scala - -import org.broadinstitute.sting.gatk.walkers.LocusWalker -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker -import org.broadinstitute.sting.gatk.contexts.ReferenceContext -import org.broadinstitute.sting.gatk.contexts.AlignmentContext - -class ScalaCountLoci extends LocusWalker[Int,Int] { - override def map(tracker: RefMetaDataTracker, ref: ReferenceContext, context: AlignmentContext): Int = { - return 1 - } - - def reduceInit(): Int = { - return 0; - } - - def reduce(m: Int, r: Int): Int = { - return m + r; - } - - def main(args: Array[String]) { - println("Hello, world!") - } -} From 761347b8d5b71e54bc1db01ec686efe323602c67 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 30 Jun 2011 15:26:09 -0400 Subject: [PATCH 21/21] The VariantContext utility method used by SelectVariants wasn't checking the filter status (unfiltered vs. passing filters) and always returned a VC that was passing filters. This is fixed and the md5 from the VCF Streaming test has been re-updated. --- .../sting/utils/variantcontext/VariantContext.java | 2 +- .../walkers/variantutils/VCFStreamingIntegrationTest.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 92262f1bf..3d375aba2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -414,7 +414,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati * @return vc subcontext */ public VariantContext subContextFromGenotypes(Collection genotypes, Set alleles) { - return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), getFilters(), getAttributes()); + return new VariantContext(getSource(), contig, start, stop, alleles, genotypes, getNegLog10PError(), filtersWereApplied() ? getFilters() : null, getAttributes()); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java index 4ed9718fd..cf0673ee6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VCFStreamingIntegrationTest.java @@ -60,7 +60,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " --NO_HEADER" + " -o %s", 1, - Arrays.asList("995c07ccd593fe1c35d0d28155112a55") + Arrays.asList("debbbf3e661b6857cc8d99ff7635bb1d") ); executeTest("testSimpleVCFStreaming", spec); @@ -98,7 +98,7 @@ public class VCFStreamingIntegrationTest extends WalkerTest { " -EV CompOverlap -noEV -noST" + " -o %s", 1, - Arrays.asList("f60729c900bc8368717653b3fad80d1e") + Arrays.asList("f60729c900bc8368717653b3fad80d1e") //"f60729c900bc8368717653b3fad80d1e" ); executeTest("testVCFStreamingChain", selectTestSpec);