From cd80e3f372e7e8b9aed14af227f72f3e52607c0e Mon Sep 17 00:00:00 2001 From: kiran Date: Fri, 22 May 2009 19:34:33 +0000 Subject: [PATCH] Replaced dumb training function with a version that creates a training set slightly more sensibly. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@806 348d0f76-0448-11de-a6fe-93d51630548a --- .../secondarybase/BasecallingTrainer.java | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java b/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java index 8ad85aac4..222ba8a44 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java +++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java @@ -58,9 +58,10 @@ public class BasecallingTrainer { } /** - * Take the first N reads that have no ambiguous bases and add them to the training set. + * Take the first N reads that have no ambiguous bases, an average quality score greater + * than or equal to 15, and are not largely homopolymers and add them to the training set. */ - public void loadFirstNUnambiguousReadsTrainingSet() { + public void loadFirstNReasonableReadsTrainingSet() { this.trainingData = new ArrayList(trainingLimit); IlluminaParser iparser = new IlluminaParser(bustardDir, lane); @@ -80,13 +81,29 @@ public class BasecallingTrainer { } } - if (numAmbiguous == 0) { + if (numAmbiguous == 0 && getAverageQualityScore(rawread) >= 15 && BaseUtils.mostFrequentBaseFraction(rawread.getSequence()) < 0.4) { trainingData.add(rawread); numreads++; } } } + /** + * Return the average quality score of a raw read. + * + * @param read the raw read + * @return the average quality score + */ + private double getAverageQualityScore(RawRead read) { + double averageQual = 0; + + for ( byte qual : read.getQuals() ) { + averageQual += qual; + } + + return averageQual / ((double) read.getReadLength()); + } + /** * Load a training set from perfect reads in an already-aligned bam file. *