Experimental refactoring.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@674 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-05-12 19:46:34 +00:00
parent 1518f8f9bf
commit 758f8aa89b
1 changed files with 131 additions and 0 deletions

View File

@ -0,0 +1,131 @@
package org.broadinstitute.sting.playground.fourbasecaller;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.QualityUtils;
import java.io.File;
import java.util.HashMap;
import java.util.ArrayList;
import java.util.Vector;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
public class AnnotateSecondaryBase extends CommandLineProgram {
public static AnnotateSecondaryBase Instance = null;
@Argument(fullName="dir", shortName="D", doc="Illumina Firecrest directory") public File FIRECREST_DIR;
@Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE;
@Argument(fullName="sam_in", shortName="SI", doc="The file to annotate") public File SAM_IN;
@Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT;
@Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE;
@Argument(fullName="cycle_begin", shortName="CB", doc="On what cycle does the read begin? (0-based inclusive)") public int CYCLE_BEGIN;
@Argument(fullName="cycle_end", shortName="CE", doc="On what cycle does the read end? (0-based inclusive)") public int CYCLE_END;
@Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 1000000;
@Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
@Argument(fullName="context", shortName="X", doc="Attempt to correct for context?", required=false) public Boolean CONTEXT = false;
public static void main(String[] argv) {
Instance = new AnnotateSecondaryBase();
start(Instance, argv);
}
protected int execute() {
// Iterate through bam file
// take an alignment
// verify that it has zero mismatches to the reference
// if so, store it
// continue until we've picked up TRAINING_LIMIT alignments
Vector< HashMap<String, SAMRecord> > trainingSet = loadPreAlignedTrainingSet();
// Iterate through the stored alignments
// find the corresponding firecrest intensity line
// add the info to the BasecallingReadModel
BasecallingReadModel model = train(trainingSet);
// Iterate through the bam file
// take an alignment
// call it
// stick the 2nd-best bases into the file
// spit the alignment back out
return 0;
}
private Vector< HashMap<String, SAMRecord> > loadPreAlignedTrainingSet() {
FastaSequenceFile2 ref = new FastaSequenceFile2(REFERENCE);
String currentContig = "none";
byte[] refbases = null;
SAMFileReader sf = new SAMFileReader(SAM_IN);
SAMRecord sr;
CloseableIterator<SAMRecord> sfit = sf.iterator();
int numTrainingReads = 0;
Vector< HashMap<String, SAMRecord> > trainingReads = new Vector< HashMap<String, SAMRecord> >(100);
while (numTrainingReads < TRAINING_LIMIT && (sr = sfit.next()) != null) {
if (sr.getCigar().numCigarElements() == 1) {
int offset = sr.getAlignmentStart();
if (!currentContig.matches(sr.getReferenceName())) {
ref.seekToContig(sr.getReferenceName());
currentContig = sr.getReferenceName();
refbases = ref.nextSequence().getBases();
}
int mismatches = 0;
String refString = "";
for (int i = offset, j = 0; i < offset + sr.getReadBases().length; i++, j++) {
refString += (char) refbases[i - 1];
mismatches += (BaseUtils.simpleBaseToBaseIndex((char) refbases[i - 1]) !=
BaseUtils.simpleBaseToBaseIndex((char) sr.getReadBases()[j]))
? 1 : 0;
}
if (mismatches == 0) {
Pattern p = Pattern.compile(":(\\d):(\\d+):(\\d+):(\\d+)#");
Matcher m = p.matcher(sr.getReadName());
if (m.find()) {
int tile = Integer.valueOf(m.group(2));
String readKey = String.format("%s:%s:%s:%s", m.group(1), m.group(2), m.group(3), m.group(4));
if (tile > trainingReads.size()) {
trainingReads.setSize(tile + 1);
}
if (trainingReads.get(tile) == null) {
HashMap<String, SAMRecord> tileTrainingReads = new HashMap<String, SAMRecord>();
trainingReads.set(tile, tileTrainingReads);
}
trainingReads.get(tile).put(readKey, sr);
numTrainingReads++;
} else {
throw new StingException("Pattern '" + p.pattern() + "' does not match against read name '" + sr.getReadName() + "'");
}
}
}
}
System.out.println("Collected " + numTrainingReads + " reads.");
return trainingReads;
}
private BasecallingReadModel train(Vector< HashMap<String, SAMRecord> > trainingSet) {
BasecallingReadModel model = new BasecallingReadModel(CYCLE_END - CYCLE_BEGIN + 1, CONTEXT);
return model;
}
}