Replaced dumb training function with a version that creates a training set slightly more sensibly.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@806 348d0f76-0448-11de-a6fe-93d51630548a
2009-05-22 19:34:33 +00:00 · 2009-05-22 19:34:33 +00:00 · cd80e3f372
parent 02c0afdb85
commit cd80e3f372
1 changed files with 20 additions and 3 deletions
--- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java
+++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java
@@ -58,9 +58,10 @@ public class BasecallingTrainer {
    }

    /**
-     * Take the first N reads that have no ambiguous bases and add them to the training set.
+     * Take the first N reads that have no ambiguous bases, have an average quality score of
+     * at least 15, and are not largely homopolymers, and add them to the training set.
     */
-    public void loadFirstNUnambiguousReadsTrainingSet() {
+    public void loadFirstNReasonableReadsTrainingSet() {
        this.trainingData = new ArrayList<RawRead>(trainingLimit);

        IlluminaParser iparser = new IlluminaParser(bustardDir, lane);
@@ -80,13 +81,29 @@ public class BasecallingTrainer {
                }
            }

-            if (numAmbiguous == 0) {
+            if (numAmbiguous == 0 && getAverageQualityScore(rawread) >= 15 && BaseUtils.mostFrequentBaseFraction(rawread.getSequence()) < 0.4) {
                trainingData.add(rawread);
                numreads++;
            }
        }
    }

+    /**
+     * Compute the mean quality score of a raw read (sum of its quals over its read length).
+     *
+     * @param read  the raw read whose quality scores are averaged
+     * @return  the mean quality score of the read
+     */
+    private double getAverageQualityScore(RawRead read) {
+        double qualSum = 0.0;
+        for ( byte q : read.getQuals() ) {
+            qualSum = qualSum + q;
+        }
+
+        final double readLength = (double) read.getReadLength();
+        return qualSum / readLength;
+    }
+
    /**
     * Load a training set from perfect reads in an already-aligned bam file.
     *