From e0803eabd914dc31f44c4f3ce112cee79d011744 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 29 May 2009 14:51:08 +0000 Subject: [PATCH] enabled underlying filtering of zero mapping quality reads, vastly improves system performance git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@853 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GATKArgumentCollection.java | 8 + .../sting/gatk/GenomeAnalysisEngine.java | 1 + .../org/broadinstitute/sting/gatk/Reads.java | 6 + .../simpleDataSources/SAMDataSource.java | 2 + .../gatk/traversals/TraversalEngine.java | 25 +++- .../varianteval/VariantEvalWalker.java | 16 +- python/picard_utils.py | 137 ++++++++++++++++++ 7 files changed, 192 insertions(+), 3 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java index 1dce36cc8..b67b7738c 100755 --- a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java +++ b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java @@ -112,6 +112,10 @@ public class GATKArgumentCollection { @Argument(fullName = "sort_on_the_fly", shortName = "sort", doc = "Maximum number of reads to sort on the fly", required = false) public Integer maximumReadSorts = null; + @Element(required=false) + @Argument(fullName = "filterZeroMappingQualityReads", shortName = "fmq0", doc = "If true, mapping quality zero reads will be filtered at the lowest GATK level. Vastly improves performance at areas with abnormal depth due to mapping Q0 reads", required = false) + public Boolean filterZeroMappingQualityReads = false; + @Element(required=false) @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false) public Double downsampleFraction = null; @@ -241,6 +245,10 @@ public class GATKArgumentCollection { (other.maximumReadSorts != null && !other.maximumReadSorts.equals(this.maximumReadSorts))) { return false; } + if ((other.filterZeroMappingQualityReads == null && this.filterZeroMappingQualityReads != null) || + (other.filterZeroMappingQualityReads != null && !other.filterZeroMappingQualityReads.equals(this.filterZeroMappingQualityReads))) { + return false; + } if ((other.downsampleFraction == null && this.downsampleFraction != null) || (other.downsampleFraction != null && !other.downsampleFraction.equals(this.downsampleFraction))) { return false; diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index eb2651956..47e985ae2 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -195,6 +195,7 @@ public class GenomeAnalysisEngine { engine.setStrictness(strictness); engine.setMaxReads(Integer.parseInt(argCollection.maximumReads)); + engine.setFilterZeroMappingQualityReads(argCollection.filterZeroMappingQualityReads); // we default interval files over the genome region string if (argCollection.intervals != null) { diff --git a/java/src/org/broadinstitute/sting/gatk/Reads.java b/java/src/org/broadinstitute/sting/gatk/Reads.java index bca047b3f..83d836653 100755 --- a/java/src/org/broadinstitute/sting/gatk/Reads.java +++ b/java/src/org/broadinstitute/sting/gatk/Reads.java @@ -30,6 +30,7 @@ public class Reads { private Integer downsampleToCoverage = null; private Integer maxOnFlySorts = null; private Boolean beSafe = null; + private Boolean filterZeroMappingQualityReads = null; /** * Gets a list of the files acting as sources of reads. @@ -71,6 +72,10 @@ public class Reads { return beSafe; } + public Boolean getFilterZeroMappingQualityReads() { + return filterZeroMappingQualityReads; + } + /** * Simple constructor for unit testing. * @param readsFiles List of reads files to open. @@ -90,6 +95,7 @@ public class Reads { if (arguments.downsampleFraction != null) downsamplingFraction = arguments.downsampleFraction; if (arguments.downsampleCoverage != null) downsampleToCoverage = arguments.downsampleCoverage; if (arguments.maximumReadSorts != null) maxOnFlySorts = arguments.maximumReadSorts; + if (arguments.filterZeroMappingQualityReads != null) filterZeroMappingQualityReads = arguments.filterZeroMappingQualityReads; beSafe = !arguments.unsafe; } } diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java index 84eb3e954..9372c815e 100755 --- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java +++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java @@ -147,6 +147,7 @@ public class SAMDataSource implements SimpleDataSource { iterator, reads.getDownsamplingFraction(), reads.getMaxOnTheFlySorts(), + reads.getFilterZeroMappingQualityReads(), reads.getSafetyChecking()); } else if (shard.getShardType() == Shard.ShardType.LOCUS || shard.getShardType() == Shard.ShardType.INTERVAL) { @@ -155,6 +156,7 @@ public class SAMDataSource implements SimpleDataSource { iterator, reads.getDownsamplingFraction(), reads.getMaxOnTheFlySorts(), + reads.getFilterZeroMappingQualityReads(), reads.getSafetyChecking()); } else { diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index f149ec22b..2d3b16129 100755 --- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.traversals; import net.sf.picard.filter.SamRecordFilter; +import net.sf.picard.filter.FilteringIterator; import net.sf.picard.reference.ReferenceSequence; import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.samtools.SAMFileHeader; @@ -75,7 +76,7 @@ public abstract class TraversalEngine { protected long N_RECORDS_TO_PRINT = 100000; protected boolean THREADED_IO = false; protected int THREADED_IO_BUFFER_SIZE = 10000; - + protected boolean filterZeroMappingQualityReads = false; // the stored header @@ -124,6 +125,10 @@ public abstract class TraversalEngine { this.maxReads = maxReads; } + public void setFilterZeroMappingQualityReads(final boolean filterZeroMappingQualityReads) { + this.filterZeroMappingQualityReads = filterZeroMappingQualityReads; + } + public void setThreadedIO(final boolean threadedIO) { this.THREADED_IO = threadedIO; } @@ -387,6 +392,7 @@ public abstract class TraversalEngine { wrappedIterator, DOWNSAMPLE_BY_FRACTION ? downsamplingFraction : null, SORT_ON_FLY ? maxOnFlySorts : null, + filterZeroMappingQualityReads, beSafeP); } @@ -398,6 +404,7 @@ public abstract class TraversalEngine { StingSAMIterator wrappedIterator, Double downsamplingFraction, Integer maxOnFlySorts, + Boolean filterZeroMappingQualityReads, Boolean beSafeP) { // NOTE: this (and other filtering) should be done before on-the-fly sorting // as there is no reason to sort something that we will end of throwing away @@ -409,9 +416,25 @@ public abstract class TraversalEngine { if (beSafeP != null && beSafeP && enableVerification) wrappedIterator = new VerifyingSamIterator(wrappedIterator); + + if ( filterZeroMappingQualityReads ) + wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(), + new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc())); + return wrappedIterator; } + private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter { + public boolean filterOut(SAMRecord rec) { + if (rec.getMappingQuality() == 0) { + //System.out.printf("Filtering 0 mapping quality read %s%n", rec.format()); + return true; + } else { + return false; + } + } + } + protected SAMFileReader initializeSAMFile(File samFile) { // todo: fixme, this is a hack to try out dynamic merging if ( samFile.toString().endsWith(".list") ) { diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java index 99a646bf2..90bd51283 100755 --- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java @@ -22,6 +22,10 @@ import java.util.EnumMap; public class VariantEvalWalker extends RefWalker { @Argument(shortName="minDiscoveryQ", doc="Phred-scaled minimum LOD to consider an evaluation SNP a variant", required=false) public int minDiscoveryQ = -1; + + @Argument(shortName="printVariants", doc="If true, prints the variants in all of the variant tracks that are examined", required=false) + public boolean printVariants = false; + int nBasesCovered = 0; VariantDBCoverage dbSNPStats = new VariantDBCoverage("dbSNP"); @@ -51,8 +55,16 @@ public class VariantEvalWalker extends RefWalker { AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null); AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null); - //if ( eval != null || dbsnp != null ) - // System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval); + if ( printVariants && ( eval != null || dbsnp != null ) ) { + String matchFlag = " "; + if ( eval != null && dbsnp != null ) matchFlag = "*** "; + if ( eval == null && dbsnp != null ) matchFlag = ">>> "; + if ( eval != null && dbsnp == null ) matchFlag = "<<< "; + + System.out.printf("%sDBSNP: %s%n%sEVAL:%s%n", + matchFlag, dbsnp, + matchFlag, eval); + } if ( eval != null && eval.isSNP() && eval.getVariationConfidence() >= minDiscoveryQ ) { //System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval); diff --git a/python/picard_utils.py b/python/picard_utils.py index fe271bbf7..85b5c9a49 100755 --- a/python/picard_utils.py +++ b/python/picard_utils.py @@ -5,6 +5,8 @@ from optparse import OptionParser import string import re import glob +import unittest +import itertools #lanes = ["30JW3AAXX.6", "30KRNAAXX.1", "30KRNAAXX.6", "30PYMAAXX.5"] #idsList = ['NA12843', 'NA19065', 'NA19064', 'NA18637'] @@ -16,6 +18,69 @@ gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar" ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" analysis = "CombineDuplicates" +MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar' +CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar' + +def unique(l): + # NotÊorderÊpreservingÊÊÊÊ + return list(set(l)) + +def genotypes2heterozyosity(genotypes, nIndividuals = -1): + def isHET(genotype): + return genotype[0] <> genotype[1] + + if nIndividuals == -1: + n = len(genotypes) + else: + n = nIndividuals + + hets = filter( isHET, genotypes ) + nhets = len(hets) + # print genotypes, ' => hets', hets + return [nhets / (1.0*n), nhets, n] + +def genotypes2allele(genotypes, nIndividuals = -1): + def isHET(genotype): + return genotype[0] <> genotype[1] + + if nIndividuals == -1: + n = len(genotypes) + else: + n = nIndividuals + + hets = filter( isHET, genotypes ) + nhets = len(hets) + print genotypes, ' => hets', hets + return [nhets / (1.0*n), nhets, n] + +def aggregatedGeliCalls2SNP( geliCallsAtSite, nIndividuals ): + #print 'geliCallsAtSite', geliCallsAtSite + loc = geliCallsAtSite[0] + refBases = map( lambda call: call[2], geliCallsAtSite[1] ) + #print 'refBases', refBases + genotypes = map( lambda call: call[5], geliCallsAtSite[1] ) + #print 'genotypes', genotypes + polymorphism = unique(list(refBases[0] + genotypes[0])) + if polymorphism[0] <> refBases[0]: polymorphism.reverse() + assert polymorphism[0] == refBases[0] + #print 'polymorphism', polymorphism + genotype = list(geliCallsAtSite[1][0][5]) + + return [loc, polymorphism, genotypes2heterozyosity(genotypes, nIndividuals), genotypes] + + #return '%s %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite)) + +def call2loc(call): + return call[0] + ':' + call[1] + +def aggregateGeliCalls( sortedGeliCalls ): + return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)] + +def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN ): + if type(inputFiles) <> list: + inputFiles = list(inputFiles) + return 'java -Xmx4096m -jar ' + mergeBin + ' MSD=true AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles)) + def getPicardPath(lane, picardRoot = '/seq/picard/'): flowcell, laneNo = lane.split('.') filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*') @@ -35,6 +100,9 @@ def getReferenceGenotypeFileFromConcordanceFile(concordFile): return match.group(1) return None +def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''): + return "java -jar %s INPUT=%s OUTPUT=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, options) + def concord(options, geli, output, genotypeFile): return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) ) @@ -49,3 +117,72 @@ def readPicardConcordance(file): return [f(x) for f, x in zip(types, line.split())] data = [parse1(line) for line in open(file) if p.match(line) <> None] return data + +# ------------------------------------------------------------------------------------------ +# +# Unit testing! +# +# ------------------------------------------------------------------------------------------ +class TestPicardUnils(unittest.TestCase): + def setUp(self): + import cStringIO + dataString = """chr1 1105366 T 52 99 CT 10.559975 10.559975 -117.68 -93.107178 -116.616493 -45.536842 -88.591728 -92.043671 -20.964022 -116.014435 -44.473339 -31.523996 +chr1 1105411 G 22 99 AG 12.484722 12.484722 -23.995817 -27.909206 -10.875731 -27.909206 -46.579994 -29.546518 -46.579994 -23.360453 -29.546518 -46.579994 +chr1 1105411 G 29 99 AG 12.033216 12.033216 -30.641142 -34.376297 -14.483982 -35.457623 -53.197636 -32.525024 -53.498665 -26.517199 -33.606354 -54.579994 +chr1 1105857 G 6 99 AG 7.442399 2.096584 -7.55462 -9.3608 -5.458036 -9.3608 -20.279993 -16.37723 -20.279993 -12.900434 -16.37723 -20.279993 +chr1 1105857 G 7 99 AG 10.889011 1.795554 -7.406977 -9.514187 -5.611423 -9.514187 -23.879993 -19.97723 -23.879993 -16.500435 -19.97723 -23.879993 +chr1 1106094 T 20 99 CT 6.747106 6.747106 -56.979992 -43.143734 -56.806652 -23.699236 -41.036522 -42.97039 -9.862975 -56.505623 -23.525892 -16.610081 +chr1 1110294 G 42 99 AG 21.076984 21.076984 -44.285015 -49.702267 -17.869579 -49.831242 -80.276649 -48.442669 -80.404335 -38.946564 -48.571644 -80.405624 +chr1 1111204 C 26 99 CT 11.364679 11.364679 -55.479992 -31.040928 -55.479992 -36.424099 -23.349712 -31.040928 -11.985033 -55.479992 -36.424099 -32.811741 +chr1 1111204 C 29 99 TT 34.740601 3.890135 -52.282055 -48.646442 -52.704597 -17.954794 -44.565525 -48.464859 -13.715057 -52.100475 -17.773212 -9.824923 +chr1 1111204 C 31 99 CT 18.784479 18.784479 -71.079994 -39.870823 -71.079994 -44.303268 -31.878578 -39.870823 -13.094099 -71.079994 -44.303268 -39.48679 +""" + dataFile = cStringIO.StringIO(dataString) + self.nIndividuals = 10 + + self.genotypesSets = aggregateGeliCalls(map( string.split, dataFile.readlines() ) ) + self.genotypes = map(lambda x: aggregatedGeliCalls2SNP(x, self.nIndividuals), self.genotypesSets ) + self.locs = ["chr1:1105366", "chr1:1105411", "chr1:1105857", "chr1:1106094", "chr1:1110294", "chr1:1111204"] + self.nhets = [1, 2, 2, 1, 1, 2] + self.altAlleles = [1, 2, 2, 1, 1, 4] + + self.aaf = map( lambda x: (1.0*x) / self.nIndividuals, self.altAlleles ) + self.hets = map( lambda x: (1.0*x) / self.nIndividuals, self.nhets ) + + def testGenotypesSize(self): + self.assertEqual(len(self.genotypesSets), 6) + + def testGenotypes2Het(self): + print 'testGenotypes2Het...' + self.assertEqual(genotypes2heterozyosity(['AT']), [1, 1, 1]) + self.assertEqual(genotypes2heterozyosity(['AA']), [0, 0, 1]) + self.assertEqual(genotypes2heterozyosity(['TT']), [0, 0, 1]) + self.assertEqual(genotypes2heterozyosity(['AT', 'AT']), [1, 2, 2]) + self.assertEqual(genotypes2heterozyosity(['AA', 'AA']), [0, 0, 2]) + self.assertEqual(genotypes2heterozyosity(['AT', 'AA']), [0.5, 1, 2]) + self.assertEqual(genotypes2heterozyosity(['AT', 'TT']), [0.5, 1, 2]) + self.assertEqual(genotypes2heterozyosity(['AT', 'TT', 'AA']), [1.0/3, 1, 3]) + self.assertEqual(genotypes2heterozyosity(['AT', 'AT', 'AA']), [2.0/3, 2, 3]) + + self.assertEqual(genotypes2heterozyosity(['AT', 'AT'], 10), [2.0/10, 2, 10]) + self.assertEqual(genotypes2heterozyosity(['AT', 'AA'], 10), [1.0/10, 1, 10]) + + + def testGenotypeSetLocs(self): + for set, loc in zip(self.genotypesSets, self.locs): + #print loc, set + self.assertEqual(set[0], loc) + + def testGenotypeLocs(self): + for genotype, loc in zip(self.genotypes, self.locs): + self.assertEqual(genotype[0], loc) + + def testGenotypeHets(self): + print 'testGenotypeHets:' + for genotype, het in zip(self.genotypes, self.hets): + print ' => ', genotype, het + self.assertEqual(genotype[2][0], het) + + +if __name__ == '__main__': + unittest.main()