Replaced dumb training function with a version that creates a training set slightly more sensibly.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@806 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
02c0afdb85
commit
cd80e3f372
|
|
@ -58,9 +58,10 @@ public class BasecallingTrainer {
|
|||
}
|
||||
|
||||
/**
|
||||
* Take the first N reads that have no ambiguous bases and add them to the training set.
|
||||
* Take the first N reads that have no ambiguous bases, an average quality score greater
|
||||
* than or equal to 15, and that are not largely homopolymers, and add them to the training set.
|
||||
*/
|
||||
public void loadFirstNUnambiguousReadsTrainingSet() {
|
||||
public void loadFirstNReasonableReadsTrainingSet() {
|
||||
this.trainingData = new ArrayList<RawRead>(trainingLimit);
|
||||
|
||||
IlluminaParser iparser = new IlluminaParser(bustardDir, lane);
|
||||
|
|
@ -80,13 +81,29 @@ public class BasecallingTrainer {
|
|||
}
|
||||
}
|
||||
|
||||
if (numAmbiguous == 0) {
|
||||
if (numAmbiguous == 0 && getAverageQualityScore(rawread) >= 15 && BaseUtils.mostFrequentBaseFraction(rawread.getSequence()) < 0.4) {
|
||||
trainingData.add(rawread);
|
||||
numreads++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the average quality score of a raw read.
|
||||
*
|
||||
* @param read the raw read
|
||||
* @return the average quality score
|
||||
*/
|
||||
private double getAverageQualityScore(RawRead read) {
|
||||
double averageQual = 0;
|
||||
|
||||
for ( byte qual : read.getQuals() ) {
|
||||
averageQual += qual;
|
||||
}
|
||||
|
||||
return averageQual / ((double) read.getReadLength());
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a training set from perfect reads in an already-aligned bam file.
|
||||
*
|
||||
|
|
|
|||
Loading…
Reference in New Issue