From 154f65e0dec1e75fc318b501014e65b2f833a73c Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Mon, 20 Aug 2012 12:43:17 -0400
Subject: [PATCH 1/4] Temporarily disabling multi-threaded usage of
 BaseRecalibrator for performance reasons.

---
 .../sting/gatk/walkers/bqsr/BQSRIntegrationTest.java     | 9 +++++----
 .../sting/gatk/walkers/bqsr/BaseRecalibrator.java        | 4 ++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
index bd75806dd..580667ee2 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
@@ -75,10 +75,11 @@ public class BQSRIntegrationTest extends WalkerTest {
                 Arrays.asList(params.md5));
         executeTest("testBQSR-"+params.args, spec).getFirst();
 
-        WalkerTestSpec specNT2 = new WalkerTestSpec(
-                params.getCommandLine() + " -nt 2",
-                Arrays.asList(params.md5));
-        executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
+        // TODO -- re-enable once parallelization is fixed in BaseRecalibrator
+        //WalkerTestSpec specNT2 = new WalkerTestSpec(
+        //        params.getCommandLine() + " -nt 2",
+        //        Arrays.asList(params.md5));
+        //executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
     }
 
     @Test
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
index e45cad971..91d982f20 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java
@@ -136,6 +136,10 @@ public class BaseRecalibrator extends LocusWalker<Long, Long> implements TreeRed
      */
     public void initialize() {
 
+        // TODO -- remove me after the 2.1 release
+        if ( getToolkit().getArguments().numberOfThreads > 1 )
+            throw new UserException("We have temporarily disabled the ability to run BaseRecalibrator multi-threaded for performance reasons.  We hope to have this fixed for the next GATK release (2.2) and apologize for the inconvenience.");
+
         // check for unsupported access
         if (getToolkit().isGATKLite() && !getToolkit().getArguments().disableIndelQuals)
             throw new UserException.NotSupportedInGATKLite("base insertion/deletion recalibration is not supported, please use the --disable_indel_quals argument");

From c67d708c5173ea0f960e60f381b962b046faa12d Mon Sep 17 00:00:00 2001
From: Ryan Poplin <rpoplin@broadinstitute.org>
Date: Mon, 20 Aug 2012 13:41:08 -0400
Subject: [PATCH 2/4] Bug fix in HaplotypeCaller for non-regular bases in the
 reference or reads. Those events don't get created any more. Bug fix for
 advanced GenotypeFullActiveRegion mode: custom variant annotations created by
 the HC don't make sense when in this mode so don't try to calculate them.

---
 .../haplotypecaller/GenotypingEngine.java     | 47 ++++++++++----
 .../haplotypecaller/HaplotypeCaller.java      | 61 ++++++++++---------
 .../LikelihoodCalculationEngine.java          |  1 -
 .../genotyper/UnifiedArgumentCollection.java  |  1 -
 .../broadinstitute/sting/utils/BaseUtils.java | 11 +++-
 5 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
index 6afdc58ea..c56cf5bf2 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
@@ -56,8 +56,12 @@ public class GenotypingEngine {
 
     // This function is the streamlined approach, currently not being used
     @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
-    public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc,
-                                                                                                                             final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser ) {
+    public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
+                                                                                                                             final ArrayList<Haplotype> haplotypes,
+                                                                                                                             final byte[] ref,
+                                                                                                                             final GenomeLoc refLoc,
+                                                                                                                             final GenomeLoc activeRegionWindow,
+                                                                                                                             final GenomeLocParser genomeLocParser ) {
         // Prepare the list of haplotype indices to genotype
         final ArrayList<Allele> allelesToGenotype = new ArrayList<Allele>();
 
@@ -224,7 +228,6 @@ public class GenotypingEngine {
             }
         }
 
-
         // Walk along each position in the key set and create each event to be outputted
         for( final int loc : startPosKeySet ) {
             if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
@@ -533,24 +536,36 @@ public class GenotypingEngine {
             final int elementLength = ce.getLength();
             switch( ce.getOperator() ) {
                 case I:
+                {
                     final ArrayList<Allele> insertionAlleles = new ArrayList<Allele>();
                     final int insertionStart = refLoc.getStart() + refPos - 1;
-                    insertionAlleles.add( Allele.create(ref[refPos-1], true) );
+                    final byte refByte = ref[refPos-1];
+                    if( BaseUtils.isRegularBase(refByte) ) {
+                        insertionAlleles.add( Allele.create(refByte, true) );
+                    }
                     if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) {
                         insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
                     } else {
                         byte[] insertionBases = new byte[]{};
                         insertionBases = ArrayUtils.add(insertionBases, ref[refPos-1]); // add the padding base
                         insertionBases = ArrayUtils.addAll(insertionBases, Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength ));
-                        insertionAlleles.add( Allele.create(insertionBases, false) );
+                        if( BaseUtils.isAllRegularBases(insertionBases) ) {
+                            insertionAlleles.add( Allele.create(insertionBases, false) );
+                        }
+                    }
+                    if( insertionAlleles.size() == 2 ) { // found a proper ref and alt allele
+                        vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
                     }
-                    vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
                     alignmentPos += elementLength;
                     break;
+                }
                 case S:
+                {
                     alignmentPos += elementLength;
                     break;
+                }
                 case D:
+                {
                     final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength );  // add padding base
                     final ArrayList<Allele> deletionAlleles = new ArrayList<Allele>();
                     final int deletionStart = refLoc.getStart() + refPos - 1;
@@ -561,15 +576,20 @@ public class GenotypingEngine {
                     //    deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
                     //    vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make());
                     //} else {
+                    final byte refByte = ref[refPos-1];
+                    if( BaseUtils.isRegularBase(refByte) && BaseUtils.isAllRegularBases(deletionBases) ) {
                         deletionAlleles.add( Allele.create(deletionBases, true) );
-                        deletionAlleles.add( Allele.create(ref[refPos-1], false) );
+                        deletionAlleles.add( Allele.create(refByte, false) );
                         vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).make());
+                    }
                     //}
                     refPos += elementLength;
                     break;
+                }
                 case M:
                 case EQ:
                 case X:
+                {
                     int numSinceMismatch = -1;
                     int stopOfMismatch = -1;
                     int startOfMismatch = -1;
@@ -592,11 +612,13 @@ public class GenotypingEngine {
                         if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) {
                             final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 );
                             final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 );
-                            final ArrayList<Allele> snpAlleles = new ArrayList<Allele>();
-                            snpAlleles.add( Allele.create( refBases, true ) );
-                            snpAlleles.add( Allele.create( mismatchBases, false ) );
-                            final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
-                            vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
+                            if( BaseUtils.isAllRegularBases(refBases) && BaseUtils.isAllRegularBases(mismatchBases) ) {
+                                final ArrayList<Allele> snpAlleles = new ArrayList<Allele>();
+                                snpAlleles.add( Allele.create( refBases, true ) );
+                                snpAlleles.add( Allele.create( mismatchBases, false ) );
+                                final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
+                                vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
+                            }
                             numSinceMismatch = -1;
                             stopOfMismatch = -1;
                             startOfMismatch = -1;
@@ -606,6 +628,7 @@ public class GenotypingEngine {
                         alignmentPos++;
                     }
                     break;
+                }
                 case N:
                 case H:
                 case P:
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index ec4fb3950..502711bb3 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -413,45 +413,48 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
 
         for( final Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>> callResult :
                 ( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
-                  ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser() )
+                  ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() )
                   : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
             if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
 
             final Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
             final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
-
-            // add some custom annotations to the calls
             final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
-            // Calculate the number of variants on the haplotype
-            int maxNumVar = 0;
-            for( final Allele allele : callResult.getFirst().getAlleles() ) {
-                if( !allele.isReference() ) {
-                    for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
-                        final int numVar = haplotype.getEventMap().size();
-                        if( numVar > maxNumVar ) { maxNumVar = numVar; }
+
+            if( !GENOTYPE_FULL_ACTIVE_REGION ) {
+                // add some custom annotations to the calls
+
+                // Calculate the number of variants on the haplotype
+                int maxNumVar = 0;
+                for( final Allele allele : callResult.getFirst().getAlleles() ) {
+                    if( !allele.isReference() ) {
+                        for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
+                            final int numVar = haplotype.getEventMap().size();
+                            if( numVar > maxNumVar ) { maxNumVar = numVar; }
+                        }
                     }
                 }
-            }
-            // Calculate the event length
-            int maxLength = 0;
-            for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
-                final int length = a.length() - annotatedCall.getReference().length();
-                if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
-            }
+                // Calculate the event length
+                int maxLength = 0;
+                for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
+                    final int length = a.length() - annotatedCall.getReference().length();
+                    if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
+                }
 
-            myAttributes.put("NVH", maxNumVar);
-            myAttributes.put("NumHapEval", bestHaplotypes.size());
-            myAttributes.put("NumHapAssembly", haplotypes.size());
-            myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
-            myAttributes.put("EVENTLENGTH", maxLength);
-            myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
-            myAttributes.put("extType", annotatedCall.getType().toString() );
+                myAttributes.put("NVH", maxNumVar);
+                myAttributes.put("NumHapEval", bestHaplotypes.size());
+                myAttributes.put("NumHapAssembly", haplotypes.size());
+                myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
+                myAttributes.put("EVENTLENGTH", maxLength);
+                myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
+                myAttributes.put("extType", annotatedCall.getType().toString() );
 
-            //if( likelihoodCalculationEngine.haplotypeScore != null ) {
-            //    myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
-            //}
-            if( annotatedCall.hasAttribute("QD") ) {
-                myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
+                //if( likelihoodCalculationEngine.haplotypeScore != null ) {
+                //    myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
+                //}
+                if( annotatedCall.hasAttribute("QD") ) {
+                    myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
+                }
             }
 
             vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
index fabf5633f..b5ce4b4bc 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@@ -179,7 +179,6 @@ public class LikelihoodCalculationEngine {
                             final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
                             for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
                                 // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
-                                // log10(10^(a*x1) + 10^(b*x2))  ???
                                 // First term is approximated by Jacobian log with table lookup.
                                 haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
                             }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index e755a1e36..f11f1e599 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -181,7 +181,6 @@ public class UnifiedArgumentCollection {
        Generalized ploidy argument (debug only): When building site error models, ignore lane information and build only
        sample-level error model
      */
-
     @Argument(fullName = "ignoreLaneInfo", shortName = "ignoreLane", doc = "Ignore lane when building error model, error model is then per-site", required = false)
     public boolean IGNORE_LANE_INFO = false;
 
diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java
index 13571df78..2d7f51c3f 100644
--- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java
+++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java
@@ -227,14 +227,21 @@ public class BaseUtils {
     }
 
     @Deprecated
-    static public boolean isRegularBase(char base) {
+    static public boolean isRegularBase( final char base ) {
         return simpleBaseToBaseIndex(base) != -1;
     }
 
-    static public boolean isRegularBase(byte base) {
+    static public boolean isRegularBase( final byte base ) {
         return simpleBaseToBaseIndex(base) != -1;
     }
 
+    static public boolean isAllRegularBases( final byte[] bases ) {
+        for( final byte base : bases) {
+            if( !isRegularBase(base) ) { return false; }
+        }
+        return true;
+    }
+
     static public boolean isNBase(byte base) {
         return base == 'N' || base == 'n';
     }

From 4450d66c647f949da89e9590d2c1e336e82da742 Mon Sep 17 00:00:00 2001
From: Eric Banks <ebanks@broadinstitute.org>
Date: Mon, 20 Aug 2012 15:10:24 -0400
Subject: [PATCH 3/4] Fixing the docs for DP and AD

---
 .../gatk/walkers/annotator/DepthOfCoverage.java | 17 ++++-------------
 .../annotator/DepthPerAlleleBySample.java       | 12 ++++++------
 2 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java
index 28ca77f18..39b5e84dc 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java
@@ -22,19 +22,10 @@ import java.util.Map;
 /**
  * Total (unfiltered) depth over all samples.
  *
- * This and AD are complementary fields that are two important ways of thinking about the depth of the data for this sample
- * at this site.  The DP field describe the total depth of reads that passed the Unified Genotypers internal
- * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
- * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
- * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
- * power I have to determine the genotype of the sample at this site, while the AD tells me how many times
- * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
- * the reads. If, for example, I believe there really is a an A/T polymorphism at a site, then I would like
- * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
- * normally be excluded from the statistical calculations going into GQ and QUAL.
- *
- * Note that the DP is affected by downsampling (-dcov) though, so the max value one can obtain for N samples with
- * -dcov D is N * D
+ * While the sample-level (FORMAT) DP field describes the total depth of reads that passed the Unified Genotyper's
+ * internal quality control metrics (like MAPQ > 17, for example), the INFO field DP represents the unfiltered depth
+ * over all samples.  Note though that the DP is affected by downsampling (-dcov), so the max value one can obtain for
+ * N samples with -dcov D is N * D
  */
 public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation {
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
index a9edab752..5d83ddd51 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java
@@ -24,10 +24,10 @@ import java.util.List;
 /**
  * The depth of coverage of each VCF allele in this sample.
  *
- * This and DP are complementary fields that are two important ways of thinking about the depth of the data for this sample
- * at this site. The DP field describe the total depth of reads that passed the Unified Genotypers internal
- * quality control metrics (like MAPQ > 17, for example), whatever base was present in the read at this site.
- * The AD values (one for each of REF and ALT fields) is the count of all reads that carried with them the
+ * The AD and DP are complementary fields that are two important ways of thinking about the depth of the data for this
+ * sample at this site.  While the sample-level (FORMAT) DP field describes the total depth of reads that passed the
+ * Unified Genotyper's internal quality control metrics (like MAPQ > 17, for example), the AD values (one for each of
+ * REF and ALT fields) is the unfiltered count of all reads that carried with them the
  * REF and ALT alleles. The reason for this distinction is that the DP is in some sense reflective of the
  * power I have to determine the genotype of the sample at this site, while the AD tells me how many times
  * I saw each of the REF and ALT alleles in the reads, free of any bias potentially introduced by filtering
@@ -35,10 +35,10 @@ import java.util.List;
  * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would
  * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that
  * the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that
- * are actually present and correctly left-aligned in the alignments themselves). Because of this fact and
+ * are unambiguously informative about the alternate allele). Because of this fact and
  * because the AD includes reads and bases that were filtered by the Unified Genotyper, <b>one should not base
  * assumptions about the underlying genotype based on it</b>; instead, the genotype likelihoods (PLs) are what
- * determine the genotype calls (see below).
+ * determine the genotype calls.
  */
 public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation {
 

From 464d49509a7f3acba38b89b64b7fed8e04972365 Mon Sep 17 00:00:00 2001
From: Ryan Poplin <rpoplin@broadinstitute.org>
Date: Mon, 20 Aug 2012 15:28:39 -0400
Subject: [PATCH 4/4] Pulling out common caller arguments into its own
 StandardCallerArgumentCollection base class so that every caller isn't
 exposed to the unused arguments from every other caller.

---
 .../haplotypecaller/HaplotypeCaller.java      |  5 +-
 .../StandardCallerArgumentCollection.java     | 62 +++++++++++++++++++
 .../genotyper/UnifiedArgumentCollection.java  | 58 ++++-------------
 3 files changed, 78 insertions(+), 47 deletions(-)
 create mode 100644 public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java

diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 502711bb3..4f434bba6 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
 
 import com.google.java.contract.Ensures;
 import net.sf.picard.reference.IndexedFastaSequenceFile;
+import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
 import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
 import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
 import org.broadinstitute.sting.commandline.*;
@@ -189,7 +190,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
     protected String[] annotationClassesToUse = { "Standard" };
 
     @ArgumentCollection
-    private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
+    private StandardCallerArgumentCollection SCAC = new StandardCallerArgumentCollection();
 
     // the calculation arguments
     private UnifiedGenotyperEngine UG_engine = null;
@@ -240,7 +241,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
         Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
         samplesList.addAll( samples );
         // initialize the UnifiedGenotyper Engine which is used to call into the exact model
-        UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; // the GLmodel isn't used by the HaplotypeCaller but it is dangerous to let the user change this argument
+        final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
         UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
         UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
         UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
new file mode 100644
index 000000000..f30fc0316
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@@ -0,0 +1,62 @@
+package org.broadinstitute.sting.gatk.arguments;
+
+import org.broadinstitute.sting.commandline.Advanced;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+/**
+ * Created with IntelliJ IDEA.
+ * User: rpoplin
+ * Date: 8/20/12
+ * A collection of arguments that are common to the various callers.
+ * This is pulled out so that every caller isn't exposed to the arguments from every other caller.
+ */
+
+public class StandardCallerArgumentCollection {
+    /**
+     * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are:
+     * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
+     */
+    @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
+    public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
+
+    @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
+    public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
+
+    @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
+    public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
+
+    /**
+     * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
+     * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
+     * is the default).
+     */
+    @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
+    public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
+
+    /**
+     * This argument allows you to emit low quality calls as filtered records.
+     */
+    @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
+    public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
+
+    /**
+     * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
+     */
+    @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
+    public RodBinding<VariantContext> alleles;
+
+    /**
+     * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
+     * then only this many alleles will be used.  Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
+     * scales exponentially based on the number of alternate alleles.  Unless there is a good reason to change the default value, we highly recommend
+     * that you not play around with this parameter.
+     */
+    @Advanced
+    @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
+    public int MAX_ALTERNATE_ALLELES = 3;
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index f11f1e599..30c0f3e18 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -26,11 +26,12 @@
 package org.broadinstitute.sting.gatk.walkers.genotyper;
 
 import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
 
 
-public class UnifiedArgumentCollection {
+public class UnifiedArgumentCollection extends StandardCallerArgumentCollection {
 
     @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
     public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
@@ -42,13 +43,6 @@ public class UnifiedArgumentCollection {
     @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
     protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
 
-    /**
-     * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are:
-     * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
-     */
-    @Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
-    public Double heterozygosity = UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY;
-
     /**
      * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily
      * distinguish between PCR errors vs. sequencing errors.  The practical implication for this value is that it
@@ -57,26 +51,6 @@ public class UnifiedArgumentCollection {
     @Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false)
     public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE;
 
-    @Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Specifies how to determine the alternate alleles to use for genotyping", required = false)
-    public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
-
-    @Argument(fullName = "output_mode", shortName = "out_mode", doc = "Specifies which type of calls we should output", required = false)
-    public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
-
-    /**
-     * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
-     * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
-     * is the default).
-     */
-    @Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be called", required = false)
-    public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
-
-    /**
-     * This argument allows you to emit low quality calls as filtered records.
-     */
-    @Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants should be emitted (and filtered with LowQual if less than the calling threshold)", required = false)
-    public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
-
     /**
      * Note that calculating the SLOD increases the runtime by an appreciable amount.
      */
@@ -90,12 +64,6 @@ public class UnifiedArgumentCollection {
     @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false)
     public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false;
 
-    /**
-     * When the UnifiedGenotyper is put into GENOTYPE_GIVEN_ALLELES mode it will genotype the samples using only the alleles provide in this rod binding
-     */
-    @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype when --genotyping_mode is GENOTYPE_GIVEN_ALLELES", required=false)
-    public RodBinding<VariantContext> alleles;
-
     /**
      * The minimum confidence needed in a given base for it to be used in variant calling.  Note that the base quality of a base
      * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value.
@@ -107,16 +75,6 @@ public class UnifiedArgumentCollection {
     @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
     public Double MAX_DELETION_FRACTION = 0.05;
 
-    /**
-     * If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN ALLELES),
-     * then only this many alleles will be used.  Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
-     * scales exponentially based on the number of alternate alleles.  Unless there is a good reason to change the default value, we highly recommend
-     * that you not play around with this parameter.
-     */
-    @Advanced
-    @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
-    public int MAX_ALTERNATE_ALLELES = 3;
-
     @Hidden
     @Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false)
     public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false;
@@ -139,7 +97,6 @@ public class UnifiedArgumentCollection {
     @Argument(fullName = "min_indel_fraction_per_sample", shortName = "minIndelFrac", doc = "Minimum fraction of all reads at a locus that must contain an indel (of any allele) for that sample to contribute to the indel count for alleles", required = false)
     public double MIN_INDEL_FRACTION_PER_SAMPLE = 0.25;
 
-
     /**
      * This argument informs the prior probability of having an indel at a site.
      */
@@ -274,5 +231,16 @@ public class UnifiedArgumentCollection {
         return uac;
     }
 
+    public UnifiedArgumentCollection() { }
 
+    public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) {
+        super();
+        this.alleles = SCAC.alleles;
+        this.GenotypingMode = SCAC.GenotypingMode;
+        this.heterozygosity = SCAC.heterozygosity;
+        this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
+        this.OutputMode = SCAC.OutputMode;
+        this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
+        this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
+    }
 }