From 03fe166994366aa937d197d8573e70992a753f53 Mon Sep 17 00:00:00 2001 From: kiran Date: Thu, 18 Jun 2009 20:18:17 +0000 Subject: [PATCH] Wrote a public static version of loadFirstNReasonableReadsTrainingSet() so Alec can call it. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1046 348d0f76-0448-11de-a6fe-93d51630548a --- .../secondarybase/BasecallingTrainer.java | 37 ++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java b/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java index 222ba8a44..b9b805caf 100755 --- a/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java +++ b/java/src/org/broadinstitute/sting/secondarybase/BasecallingTrainer.java @@ -88,13 +88,48 @@ public class BasecallingTrainer { } } + /** + * Take the first N reads that have no ambiguous bases, an average quality score greater + * than or equal to 15, and are not largely homopolymers and add them to the training set. + * + * @param bustardDir the bustard directory + * @param lane the lane number + * @param trainingLimit how many reads should we use to train? 
+     * @return the accepted reads in the order they were parsed; never more than trainingLimit
+     */
+    public static ArrayList<RawRead> loadNReasonableReadsTrainingSet(File bustardDir, int lane, int trainingLimit) {
+        // Returned to the caller: a local list in a static method is otherwise discarded on exit.
+        ArrayList<RawRead> trainingData = new ArrayList<RawRead>(trainingLimit);
+
+        IlluminaParser iparser = new IlluminaParser(bustardDir, lane);
+
+        while (trainingData.size() < trainingLimit && iparser.next()) {
+            RawRead rawread = iparser.getRawRead();
+            byte[] sequence = rawread.getSequence();
+            // One ambiguous base is enough to reject the read, so stop at the first one found.
+            boolean hasAmbiguousBase = false;
+            for ( byte byteBase : sequence ) {
+                if (BaseUtils.simpleBaseToBaseIndex((char) byteBase) == -1) {
+                    hasAmbiguousBase = true;
+                    break;
+                }
+            }
+
+            if (!hasAmbiguousBase && getAverageQualityScore(rawread) >= 15 && BaseUtils.mostFrequentBaseFraction(sequence) < 0.4) {
+                trainingData.add(rawread);
+            }
+        }
+
+        return trainingData;
+    }
+
     /**
      * Return the average quality score of a raw read.
      *
      * @param read the raw read
      * @return the average quality score
      */
-    private double getAverageQualityScore(RawRead read) {
+    private static double getAverageQualityScore(RawRead read) {
         double averageQual = 0;
 
         for ( byte qual : read.getQuals() ) {