From e0803eabd914dc31f44c4f3ce112cee79d011744 Mon Sep 17 00:00:00 2001
From: depristo <depristo@348d0f76-0448-11de-a6fe-93d51630548a>
Date: Fri, 29 May 2009 14:51:08 +0000
Subject: [PATCH] enabled underlying filtering of zero mapping quality reads,
 vastly improves system performance

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@853 348d0f76-0448-11de-a6fe-93d51630548a
---
 .../sting/gatk/GATKArgumentCollection.java    |   8 +
 .../sting/gatk/GenomeAnalysisEngine.java      |   1 +
 .../org/broadinstitute/sting/gatk/Reads.java  |   6 +
 .../simpleDataSources/SAMDataSource.java      |   2 +
 .../gatk/traversals/TraversalEngine.java      |  25 +++-
 .../varianteval/VariantEvalWalker.java        |  16 +-
 python/picard_utils.py                        | 137 ++++++++++++++++++
 7 files changed, 192 insertions(+), 3 deletions(-)

diff --git a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java
index 1dce36cc8..b67b7738c 100755
--- a/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java
+++ b/java/src/org/broadinstitute/sting/gatk/GATKArgumentCollection.java
@@ -112,6 +112,10 @@ public class GATKArgumentCollection {
     @Argument(fullName = "sort_on_the_fly", shortName = "sort", doc = "Maximum number of reads to sort on the fly", required = false)
     public Integer maximumReadSorts = null;
 
+    @Element(required=false)
+    @Argument(fullName = "filterZeroMappingQualityReads", shortName = "fmq0", doc = "If true, mapping quality zero reads will be filtered at the lowest GATK level.  Vastly improves performance at areas with abnormal depth due to mapping Q0 reads", required = false)
+    public Boolean filterZeroMappingQualityReads = false;
+
     @Element(required=false)
     @Argument(fullName = "downsample_to_fraction", shortName = "dfrac", doc = "Fraction [0.0-1.0] of reads to downsample to", required = false)
     public Double downsampleFraction = null;
@@ -241,6 +245,10 @@ public class GATKArgumentCollection {
             (other.maximumReadSorts != null && !other.maximumReadSorts.equals(this.maximumReadSorts))) {
             return false;
         }
+        if ((other.filterZeroMappingQualityReads == null && this.filterZeroMappingQualityReads != null) ||
+            (other.filterZeroMappingQualityReads != null && !other.filterZeroMappingQualityReads.equals(this.filterZeroMappingQualityReads))) {
+            return false;
+        }
         if ((other.downsampleFraction == null && this.downsampleFraction != null) ||
             (other.downsampleFraction != null && !other.downsampleFraction.equals(this.downsampleFraction))) {
             return false;
diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index eb2651956..47e985ae2 100755
--- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -195,6 +195,7 @@ public class GenomeAnalysisEngine {
         engine.setStrictness(strictness);
 
         engine.setMaxReads(Integer.parseInt(argCollection.maximumReads));
+        engine.setFilterZeroMappingQualityReads(argCollection.filterZeroMappingQualityReads);
 
         // we default interval files over the genome region string
         if (argCollection.intervals != null) {
diff --git a/java/src/org/broadinstitute/sting/gatk/Reads.java b/java/src/org/broadinstitute/sting/gatk/Reads.java
index bca047b3f..83d836653 100755
--- a/java/src/org/broadinstitute/sting/gatk/Reads.java
+++ b/java/src/org/broadinstitute/sting/gatk/Reads.java
@@ -30,6 +30,7 @@ public class Reads {
     private Integer downsampleToCoverage = null;
     private Integer maxOnFlySorts = null;
     private Boolean beSafe = null;
+    private Boolean filterZeroMappingQualityReads = null;
 
     /**
      * Gets a list of the files acting as sources of reads.
@@ -71,6 +72,10 @@ public class Reads {
         return beSafe;
     }
 
+    public Boolean getFilterZeroMappingQualityReads() {
+        return filterZeroMappingQualityReads;
+    }
+
     /**
      * Simple constructor for unit testing.
      * @param readsFiles List of reads files to open.
@@ -90,6 +95,7 @@ public class Reads {
         if (arguments.downsampleFraction != null) downsamplingFraction = arguments.downsampleFraction;
         if (arguments.downsampleCoverage != null) downsampleToCoverage = arguments.downsampleCoverage;
         if (arguments.maximumReadSorts != null) maxOnFlySorts = arguments.maximumReadSorts;
+        if (arguments.filterZeroMappingQualityReads != null) filterZeroMappingQualityReads = arguments.filterZeroMappingQualityReads;
         beSafe = !arguments.unsafe;
     }
 }
diff --git a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java
index 84eb3e954..9372c815e 100755
--- a/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java
+++ b/java/src/org/broadinstitute/sting/gatk/dataSources/simpleDataSources/SAMDataSource.java
@@ -147,6 +147,7 @@ public class SAMDataSource implements SimpleDataSource {
                     iterator,
                     reads.getDownsamplingFraction(),
                     reads.getMaxOnTheFlySorts(),
+                    reads.getFilterZeroMappingQualityReads(),
                     reads.getSafetyChecking());
         } else if (shard.getShardType() == Shard.ShardType.LOCUS ||
                    shard.getShardType() == Shard.ShardType.INTERVAL) {
@@ -155,6 +156,7 @@ public class SAMDataSource implements SimpleDataSource {
                     iterator,
                     reads.getDownsamplingFraction(),
                     reads.getMaxOnTheFlySorts(),
+                    reads.getFilterZeroMappingQualityReads(),
                     reads.getSafetyChecking());
         }
         else {
diff --git a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
index f149ec22b..2d3b16129 100755
--- a/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
+++ b/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java
@@ -1,6 +1,7 @@
 package org.broadinstitute.sting.gatk.traversals;
 
 import net.sf.picard.filter.SamRecordFilter;
+import net.sf.picard.filter.FilteringIterator;
 import net.sf.picard.reference.ReferenceSequence;
 import net.sf.picard.sam.SamFileHeaderMerger;
 import net.sf.samtools.SAMFileHeader;
@@ -75,7 +76,7 @@ public abstract class TraversalEngine {
     protected long N_RECORDS_TO_PRINT = 100000;
     protected boolean THREADED_IO = false;
     protected int THREADED_IO_BUFFER_SIZE = 10000;
-
+    protected boolean filterZeroMappingQualityReads = false;
 
 
     // the stored header
@@ -124,6 +125,10 @@ public abstract class TraversalEngine {
         this.maxReads = maxReads;
     }
 
+    public void setFilterZeroMappingQualityReads(final boolean filterZeroMappingQualityReads) {
+        this.filterZeroMappingQualityReads = filterZeroMappingQualityReads;
+    }
+
     public void setThreadedIO(final boolean threadedIO) {
         this.THREADED_IO = threadedIO;
     }
@@ -387,6 +392,7 @@ public abstract class TraversalEngine {
                                         wrappedIterator,
                                         DOWNSAMPLE_BY_FRACTION ? downsamplingFraction : null,
                                         SORT_ON_FLY ? maxOnFlySorts : null,
+                                        filterZeroMappingQualityReads,
                                         beSafeP);
     }
 
@@ -398,6 +404,7 @@ public abstract class TraversalEngine {
                                                             StingSAMIterator wrappedIterator,
                                                             Double downsamplingFraction,
                                                             Integer maxOnFlySorts,
+                                                            Boolean filterZeroMappingQualityReads, 
                                                             Boolean beSafeP) {
         // NOTE: this (and other filtering) should be done before on-the-fly sorting
         //  as there is no reason to sort something that we will end of throwing away
@@ -409,9 +416,25 @@ public abstract class TraversalEngine {
 
         if (beSafeP != null && beSafeP && enableVerification)
             wrappedIterator = new VerifyingSamIterator(wrappedIterator);
+
+        if ( filterZeroMappingQualityReads )
+            wrappedIterator = StingSAMIteratorAdapter.adapt(wrappedIterator.getSourceInfo(),
+                                                            new FilteringIterator(wrappedIterator, new ZeroMappingQualityReadFilterFunc()));
+
         return wrappedIterator;
     }
 
+    private static class ZeroMappingQualityReadFilterFunc implements SamRecordFilter {
+        public boolean filterOut(SAMRecord rec) {
+            if (rec.getMappingQuality() == 0) {
+                //System.out.printf("Filtering 0 mapping quality read %s%n", rec.format());
+                return true;
+            } else {
+                return false;
+            }
+        }
+    }
+
     protected SAMFileReader initializeSAMFile(File samFile) {
         // todo: fixme, this is a hack to try out dynamic merging
         if ( samFile.toString().endsWith(".list") ) {
diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java
index 99a646bf2..90bd51283 100755
--- a/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -22,6 +22,10 @@ import java.util.EnumMap;
 public class VariantEvalWalker extends RefWalker<Integer, Integer> {
     @Argument(shortName="minDiscoveryQ", doc="Phred-scaled minimum LOD to consider an evaluation SNP a variant", required=false)
     public int minDiscoveryQ = -1;
+
+    @Argument(shortName="printVariants", doc="If true, prints the variants in all of the variant tracks that are examined", required=false)
+    public boolean printVariants = false;
+
     int nBasesCovered = 0;
     VariantDBCoverage dbSNPStats = new VariantDBCoverage("dbSNP");
 
@@ -51,8 +55,16 @@ public class VariantEvalWalker extends RefWalker<Integer, Integer> {
         AllelicVariant dbsnp = (AllelicVariant)tracker.lookup("dbSNP", null);
         AllelicVariant eval = (AllelicVariant)tracker.lookup("eval", null);
 
-        //if ( eval != null || dbsnp != null )
-        //    System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval);
+        if ( printVariants && ( eval != null || dbsnp != null ) ) {
+            String matchFlag = "    ";
+            if ( eval != null && dbsnp != null ) matchFlag = "*** ";
+            if ( eval == null && dbsnp != null ) matchFlag = ">>> ";
+            if ( eval != null && dbsnp == null ) matchFlag = "<<< ";
+
+            System.out.printf("%sDBSNP: %s%n%sEVAL:%s%n",
+                    matchFlag, dbsnp,
+                    matchFlag, eval);
+        }
 
         if ( eval != null && eval.isSNP() && eval.getVariationConfidence() >= minDiscoveryQ ) {
             //System.out.printf("%s has: %nDBSNP: %s%nEVAL:%s%n", context.getLocation(), dbsnp, eval);
diff --git a/python/picard_utils.py b/python/picard_utils.py
index fe271bbf7..85b5c9a49 100755
--- a/python/picard_utils.py
+++ b/python/picard_utils.py
@@ -5,6 +5,8 @@ from optparse import OptionParser
 import string
 import re
 import glob
+import unittest
+import itertools
 
 #lanes = ["30JW3AAXX.6", "30KRNAAXX.1", "30KRNAAXX.6", "30PYMAAXX.5"]
 #idsList = ['NA12843', 'NA19065', 'NA19064', 'NA18637']
@@ -16,6 +18,69 @@ gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
 ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
 analysis = "CombineDuplicates"
 
+MERGE_BIN = '/seq/software/picard/current/bin/MergeSamFiles.jar'
+CALL_GENOTYPES_BIN = '/seq/software/picard/current/bin/CallGenotypes.jar'
+
+def unique(l):
+    # NotÊorderÊpreservingÊÊÊÊ
+    return list(set(l))
+
+def genotypes2heterozyosity(genotypes, nIndividuals = -1):
+    def isHET(genotype):
+        return genotype[0] <> genotype[1]
+       
+    if nIndividuals == -1:
+        n = len(genotypes)
+    else:
+        n = nIndividuals
+    
+    hets = filter( isHET, genotypes )
+    nhets = len(hets)
+    # print genotypes, ' => hets', hets
+    return [nhets / (1.0*n), nhets, n]
+
+def genotypes2allele(genotypes, nIndividuals = -1):
+    def isHET(genotype):
+        return genotype[0] <> genotype[1]
+       
+    if nIndividuals == -1:
+        n = len(genotypes)
+    else:
+        n = nIndividuals
+    
+    hets = filter( isHET, genotypes )
+    nhets = len(hets)
+    print genotypes, ' => hets', hets
+    return [nhets / (1.0*n), nhets, n]
+
+def aggregatedGeliCalls2SNP( geliCallsAtSite, nIndividuals ):
+    #print 'geliCallsAtSite', geliCallsAtSite
+    loc = geliCallsAtSite[0]
+    refBases = map( lambda call: call[2], geliCallsAtSite[1] )
+    #print 'refBases', refBases
+    genotypes = map( lambda call: call[5], geliCallsAtSite[1] )
+    #print 'genotypes', genotypes
+    polymorphism = unique(list(refBases[0] + genotypes[0]))
+    if polymorphism[0] <> refBases[0]: polymorphism.reverse()
+    assert polymorphism[0] == refBases[0]
+    #print 'polymorphism', polymorphism
+    genotype  = list(geliCallsAtSite[1][0][5])
+    
+    return [loc, polymorphism, genotypes2heterozyosity(genotypes, nIndividuals), genotypes]
+
+    #return '%s   %s %s 0.002747 -411.622578 -420.661738 0.000000 9.039160 364.000000 %d 1 0' % (loc, genotype[0], genotype[1], len(geliCallsAtSite))
+    
+def call2loc(call):
+    return call[0] + ':' + call[1]
+
+def aggregateGeliCalls( sortedGeliCalls ):
+    return [[loc, list(sharedCallsGroup)] for (loc, sharedCallsGroup) in itertools.groupby(sortedGeliCalls, call2loc)]
+
+def mergeBAMCmd( output_filename, inputFiles, mergeBin = MERGE_BIN ):
+    if type(inputFiles) <> list:
+        inputFiles = list(inputFiles)
+    return 'java -Xmx4096m -jar ' + mergeBin + ' MSD=true AS=true SO=coordinate O=' + output_filename + ' VALIDATION_STRINGENCY=SILENT ' + ' I=' + (' I='.join(inputFiles))
+
 def getPicardPath(lane, picardRoot = '/seq/picard/'):
     flowcell, laneNo = lane.split('.')
     filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*')
@@ -35,6 +100,9 @@ def getReferenceGenotypeFileFromConcordanceFile(concordFile):
             return match.group(1)
     return None
 
+def callGenotypesCmd( inputBam, outputFilename, callGenotypesBin = CALL_GENOTYPES_BIN, options = ''):
+    return "java -jar %s INPUT=%s OUTPUT=%s CALLER_ALGORITHM=QUALITY_SCORE PRIOR_MODEL=SNP_FREQUENCY %s" % ( callGenotypesBin, inputBam, outputFilename, options)
+
 def concord(options, geli, output, genotypeFile):
     return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) )
 
@@ -49,3 +117,72 @@ def readPicardConcordance(file):
         return [f(x) for f, x in zip(types, line.split())]
     data = [parse1(line) for line in open(file) if p.match(line) <> None]
     return data
+
+# ------------------------------------------------------------------------------------------
+#
+# Unit testing!
+#
+# ------------------------------------------------------------------------------------------
+class TestPicardUnils(unittest.TestCase):
+    def setUp(self):
+        import cStringIO
+        dataString = """chr1    1105366 T       52      99      CT      10.559975       10.559975               -117.68 -93.107178      -116.616493     -45.536842      -88.591728      -92.043671      -20.964022      -116.014435     -44.473339      -31.523996
+chr1    1105411 G       22      99      AG      12.484722       12.484722               -23.995817      -27.909206      -10.875731      -27.909206      -46.579994      -29.546518      -46.579994      -23.360453      -29.546518      -46.579994
+chr1    1105411 G       29      99      AG      12.033216       12.033216               -30.641142      -34.376297      -14.483982      -35.457623      -53.197636      -32.525024      -53.498665      -26.517199      -33.606354      -54.579994
+chr1    1105857 G       6       99      AG      7.442399        2.096584                -7.55462        -9.3608 -5.458036       -9.3608 -20.279993      -16.37723       -20.279993      -12.900434      -16.37723       -20.279993
+chr1    1105857 G       7       99      AG      10.889011       1.795554                -7.406977       -9.514187       -5.611423       -9.514187       -23.879993      -19.97723       -23.879993      -16.500435      -19.97723       -23.879993
+chr1    1106094 T       20      99      CT      6.747106        6.747106                -56.979992      -43.143734      -56.806652      -23.699236      -41.036522      -42.97039       -9.862975       -56.505623      -23.525892      -16.610081
+chr1    1110294 G       42      99      AG      21.076984       21.076984               -44.285015      -49.702267      -17.869579      -49.831242      -80.276649      -48.442669      -80.404335      -38.946564      -48.571644      -80.405624
+chr1    1111204 C       26      99      CT      11.364679       11.364679               -55.479992      -31.040928      -55.479992      -36.424099      -23.349712      -31.040928      -11.985033      -55.479992      -36.424099      -32.811741
+chr1    1111204 C       29      99      TT      34.740601       3.890135                -52.282055      -48.646442      -52.704597      -17.954794      -44.565525      -48.464859      -13.715057      -52.100475      -17.773212      -9.824923
+chr1    1111204 C       31      99      CT      18.784479       18.784479               -71.079994      -39.870823      -71.079994      -44.303268      -31.878578      -39.870823      -13.094099      -71.079994      -44.303268      -39.48679
+"""
+        dataFile = cStringIO.StringIO(dataString)
+        self.nIndividuals = 10
+
+        self.genotypesSets = aggregateGeliCalls(map( string.split, dataFile.readlines() ) )
+        self.genotypes = map(lambda x: aggregatedGeliCalls2SNP(x, self.nIndividuals), self.genotypesSets )
+        self.locs = ["chr1:1105366", "chr1:1105411", "chr1:1105857", "chr1:1106094", "chr1:1110294", "chr1:1111204"]
+        self.nhets = [1, 2, 2, 1, 1, 2]
+        self.altAlleles = [1, 2, 2, 1, 1, 4] 
+
+        self.aaf = map( lambda x: (1.0*x) / self.nIndividuals, self.altAlleles )
+        self.hets = map( lambda x: (1.0*x) / self.nIndividuals, self.nhets )
+
+    def testGenotypesSize(self):
+        self.assertEqual(len(self.genotypesSets), 6)
+
+    def testGenotypes2Het(self):
+        print 'testGenotypes2Het...'
+        self.assertEqual(genotypes2heterozyosity(['AT']), [1, 1, 1])
+        self.assertEqual(genotypes2heterozyosity(['AA']), [0, 0, 1])
+        self.assertEqual(genotypes2heterozyosity(['TT']), [0, 0, 1])
+        self.assertEqual(genotypes2heterozyosity(['AT', 'AT']), [1, 2, 2])
+        self.assertEqual(genotypes2heterozyosity(['AA', 'AA']), [0, 0, 2])
+        self.assertEqual(genotypes2heterozyosity(['AT', 'AA']), [0.5, 1, 2])
+        self.assertEqual(genotypes2heterozyosity(['AT', 'TT']), [0.5, 1, 2])
+        self.assertEqual(genotypes2heterozyosity(['AT', 'TT', 'AA']), [1.0/3, 1, 3])
+        self.assertEqual(genotypes2heterozyosity(['AT', 'AT', 'AA']), [2.0/3, 2, 3])
+
+        self.assertEqual(genotypes2heterozyosity(['AT', 'AT'], 10), [2.0/10, 2, 10])
+        self.assertEqual(genotypes2heterozyosity(['AT', 'AA'], 10), [1.0/10, 1, 10])
+
+
+    def testGenotypeSetLocs(self):
+        for set, loc in zip(self.genotypesSets, self.locs):
+            #print loc, set
+            self.assertEqual(set[0], loc)
+
+    def testGenotypeLocs(self):
+        for genotype, loc in zip(self.genotypes, self.locs):
+            self.assertEqual(genotype[0], loc)
+
+    def testGenotypeHets(self):
+        print 'testGenotypeHets:'
+        for genotype, het in zip(self.genotypes, self.hets):
+            print ' => ', genotype, het
+            self.assertEqual(genotype[2][0], het)
+
+
+if __name__ == '__main__':
+    unittest.main()