From 758f8aa89bb3b5ce22e0d3cb3740c2a8e872284b Mon Sep 17 00:00:00 2001 From: kiran Date: Tue, 12 May 2009 19:46:34 +0000 Subject: [PATCH] Experimental refactoring. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@674 348d0f76-0448-11de-a6fe-93d51630548a --- .../fourbasecaller/AnnotateSecondaryBase.java | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100755 java/src/org/broadinstitute/sting/playground/fourbasecaller/AnnotateSecondaryBase.java diff --git a/java/src/org/broadinstitute/sting/playground/fourbasecaller/AnnotateSecondaryBase.java b/java/src/org/broadinstitute/sting/playground/fourbasecaller/AnnotateSecondaryBase.java new file mode 100755 index 000000000..d84eb2305 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/fourbasecaller/AnnotateSecondaryBase.java @@ -0,0 +1,131 @@ +package org.broadinstitute.sting.playground.fourbasecaller; + +import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.io.File; +import java.util.HashMap; +import java.util.ArrayList; +import java.util.Vector; +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; + +public class AnnotateSecondaryBase extends CommandLineProgram { + public static AnnotateSecondaryBase Instance = null; + + @Argument(fullName="dir", shortName="D", doc="Illumina Firecrest directory") public File FIRECREST_DIR; + @Argument(fullName="lane", shortName="L", doc="Illumina flowcell lane") public int LANE; + @Argument(fullName="sam_in", shortName="SI", doc="The file to annotate") public File SAM_IN; + @Argument(fullName="sam_out", shortName="SO", doc="Output path for sam file") public File SAM_OUT; + @Argument(fullName="reference", shortName="R", doc="Reference sequence to which sam_in is aligned (in fasta format)") public File REFERENCE; + @Argument(fullName="cycle_begin", shortName="CB", doc="On what cycle does the read begin? (0-based inclusive)") public int CYCLE_BEGIN; + @Argument(fullName="cycle_end", shortName="CE", doc="On what cycle does the read end? (0-based inclusive)") public int CYCLE_END; + @Argument(fullName="tlim", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 1000000; + @Argument(fullName="clim", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE; + @Argument(fullName="context", shortName="X", doc="Attempt to correct for context?", required=false) public Boolean CONTEXT = false; + + public static void main(String[] argv) { + Instance = new AnnotateSecondaryBase(); + start(Instance, argv); + } + + protected int execute() { + // Iterate through bam file + // take an alignment + // verify that it has zero mismatches to the reference + // if so, store it + // continue until we've picked up TRAINING_LIMIT alignments + Vector< HashMap > trainingSet = loadPreAlignedTrainingSet(); + + // Iterate through the stored alignments + // find the corresponding firecrest intensity line + // add the info to the BasecallingReadModel + BasecallingReadModel model = train(trainingSet); + + // Iterate through the bam file + // take an alignment + // call it + // stick the 2nd-best bases into the file + // spit the alignment back out + + return 0; + } + + private Vector< HashMap > loadPreAlignedTrainingSet() { + FastaSequenceFile2 ref = new FastaSequenceFile2(REFERENCE); + String currentContig = "none"; + byte[] refbases = null; + + SAMFileReader sf = new SAMFileReader(SAM_IN); + SAMRecord sr; + CloseableIterator sfit = sf.iterator(); + + int numTrainingReads = 0; + Vector< HashMap > trainingReads = new Vector< HashMap >(100); + + while (numTrainingReads < TRAINING_LIMIT && (sr = sfit.next()) != null) { + if (sr.getCigar().numCigarElements() == 1) { + int offset = sr.getAlignmentStart(); + + if (!currentContig.matches(sr.getReferenceName())) { + ref.seekToContig(sr.getReferenceName()); + currentContig = sr.getReferenceName(); + refbases = ref.nextSequence().getBases(); + } + + int mismatches = 0; + + String refString = ""; + for (int i = offset, j = 0; i < offset + sr.getReadBases().length; i++, j++) { + refString += (char) refbases[i - 1]; + + mismatches += (BaseUtils.simpleBaseToBaseIndex((char) refbases[i - 1]) != + BaseUtils.simpleBaseToBaseIndex((char) sr.getReadBases()[j])) + ? 1 : 0; + } + + if (mismatches == 0) { + Pattern p = Pattern.compile(":(\\d):(\\d+):(\\d+):(\\d+)#"); + Matcher m = p.matcher(sr.getReadName()); + + if (m.find()) { + int tile = Integer.valueOf(m.group(2)); + String readKey = String.format("%s:%s:%s:%s", m.group(1), m.group(2), m.group(3), m.group(4)); + + if (tile > trainingReads.size()) { + trainingReads.setSize(tile + 1); + } + + if (trainingReads.get(tile) == null) { + HashMap tileTrainingReads = new HashMap(); + trainingReads.set(tile, tileTrainingReads); + } + + trainingReads.get(tile).put(readKey, sr); + numTrainingReads++; + } else { + throw new StingException("Pattern '" + p.pattern() + "' does not match against read name '" + sr.getReadName() + "'"); + } + } + } + } + + System.out.println("Collected " + numTrainingReads + " reads."); + + return trainingReads; + } + + private BasecallingReadModel train(Vector< HashMap > trainingSet) { + BasecallingReadModel model = new BasecallingReadModel(CYCLE_END - CYCLE_BEGIN + 1, CONTEXT); + + return model; + } +}