From 5cdc5dffc6b083f3ff6a9119e53a54ee39fdf8c2 Mon Sep 17 00:00:00 2001 From: kiran Date: Mon, 23 Mar 2009 19:42:31 +0000 Subject: [PATCH] Added some code to handle pairs properly, extend IC solution beyond IC reference length, and allow output to a specified file. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@155 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/illumina/FirecrestFileParser.java | 15 ++++- .../fourbasecaller/FourBaseCaller.java | 60 +++++++++++++------ 2 files changed, 55 insertions(+), 20 deletions(-) diff --git a/playground/java/src/org/broadinstitute/sting/illumina/FirecrestFileParser.java b/playground/java/src/org/broadinstitute/sting/illumina/FirecrestFileParser.java index 24b63f981..210619221 100644 --- a/playground/java/src/org/broadinstitute/sting/illumina/FirecrestFileParser.java +++ b/playground/java/src/org/broadinstitute/sting/illumina/FirecrestFileParser.java @@ -27,6 +27,8 @@ public class FirecrestFileParser extends AbstractFirecrestFileParser { private BasicTextFileParser parser; private final FormatUtil formatter = new FormatUtil(); private final File[] intensityFiles; + private int cycle_start = -1; + private int cycle_stop = -1; /** * Constructor @@ -39,6 +41,13 @@ public class FirecrestFileParser extends AbstractFirecrestFileParser { intensityFiles = getFilesMatchingRegexp("s_" + lane + "_\\d{4}_int.txt(.gz)?"); } + public FirecrestFileParser(final File firecrestDirectory, final int lane, final int cycle_start, final int cycle_stop) { + super(firecrestDirectory, lane); + intensityFiles = getFilesMatchingRegexp("s_" + lane + "_\\d{4}_int.txt(.gz)?"); + this.cycle_start = cycle_start; + this.cycle_stop = cycle_stop; + } + @Override public boolean isValidFirecrestDirectory() { return (intensityFiles.length > 0); @@ -90,8 +99,12 @@ public class FirecrestFileParser extends AbstractFirecrestFileParser { intensities[cycle] = new FourIntensity(fIntensities); } + FourIntensity[] intensities2 = new FourIntensity[(cycle_start > 0 && cycle_stop > 0 && cycle_stop > cycle_start) ? numIntensities : (cycle_stop - cycle_start)]; + for (int cycle = 0, offset = (cycle_start >= 0 ? cycle_start : 0); cycle < intensities2.length; cycle++) { + intensities2[cycle] = intensities[offset + cycle]; + } - return new FirecrestReadData(lane, tile, x, y, intensities); + return new FirecrestReadData(lane, tile, x, y, intensities2); } /** diff --git a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java index 6daad50c1..38f54acb5 100644 --- a/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java +++ b/playground/java/src/org/broadinstitute/sting/projects/fourbasecaller/FourBaseCaller.java @@ -15,21 +15,29 @@ import net.sf.samtools.SAMFileWriterFactory; import net.sf.samtools.SAMRecord; public class FourBaseCaller { - public static void main(String[] args) { + public static void main(String[] argv) { // Parse args - File firecrestDirectory = new File(args[0]); - int lane = Integer.valueOf(args[1]); - String icrefpath = args[2]; + File FIRECREST_DIR = new File(argv[0]); + int LANE = Integer.valueOf(argv[1]); + File SAM_OUT = new File(argv[2]); + int CYCLE_START = Integer.valueOf(argv[3]); + int CYCLE_STOP = Integer.valueOf(argv[4]); - // Load ic reference - ManyNucleotideSequences controls = new ManyNucleotideSequences(icrefpath); - controls.reverseComplement(); + String IC_REF_PATH = "/seq/references/Synthetic_internal_controls_set1/v0/Synthetic_internal_controls_set1.fasta"; + + // Load ic reference in an ugly way + ManyNucleotideSequences fw_controls = new ManyNucleotideSequences(IC_REF_PATH); + ManyNucleotideSequences rc_controls = new ManyNucleotideSequences(IC_REF_PATH); + rc_controls.reverseComplement(); + ManyNucleotideSequences controls = new ManyNucleotideSequences(); + controls.addAll(fw_controls); + controls.addAll(rc_controls); // Loop through firecrest data, find ICs, and compute signal means Vector recoveredICs = new Vector(); Vector icids = new Vector(); - FirecrestFileParser ffp = new FirecrestFileParser(firecrestDirectory, lane); + FirecrestFileParser ffp = new FirecrestFileParser(FIRECREST_DIR, LANE, CYCLE_START, CYCLE_STOP); NucleotideChannelMeans[] cmeans = new NucleotideChannelMeans[controls.get(0).size()]; for (int i = 0; i < cmeans.length; i++) { @@ -37,9 +45,9 @@ public class FourBaseCaller { } int numReads = 0; - while (ffp.hasNext() && numReads < 100000) { - if (numReads % 10000 == 0) { - System.out.println("Processed " + numReads + " reads."); + while (ffp.hasNext()) { + if (numReads % 1000000 == 0) { + System.err.println("Processed " + numReads + " reads for parameters."); } FourIntensity[] intensities = ffp.next().getIntensities(); NucleotideSequence rawread = toNucleotideSequence(intensities); @@ -60,6 +68,8 @@ public class FourBaseCaller { cmeans[cycle].add(nuc, sig); } + System.err.print("Found " + icids.size() + " ICs in " + numReads + " reads " + (((double) icids.size())/((double) numReads)) + ".\r"); + break; } } @@ -87,6 +97,15 @@ public class FourBaseCaller { } } + System.out.println("Found " + recoveredICs.size() + " in " + numReads + " reads."); + + // Extend IC solutions to end of read length (if ICs are shorter than read) + System.err.println("Extending cycle " + (controls.get(0).size() - 1) + " parameters from cycle " + controls.get(0).size() + " to cycle " + recoveredICs.get(0).length); + for (int cycle = controls.get(0).size(); cycle < recoveredICs.get(0).length; cycle++) { + cmeans[cycle] = cmeans[controls.get(0).size() - 1]; + ccov[cycle] = ccov[controls.get(0).size() - 1]; + } + // Now compute probabilities for the bases Algebra alg = new Algebra(); @@ -94,14 +113,17 @@ public class FourBaseCaller { ccov[cycle].invert(); } - FirecrestFileParser ffp2 = new FirecrestFileParser(firecrestDirectory, lane); - - numReads = 0; + FirecrestFileParser ffp2 = new FirecrestFileParser(FIRECREST_DIR, LANE, CYCLE_START, CYCLE_STOP); SAMFileHeader sfh = new SAMFileHeader(); - SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, new File("/wga/scr1/GSA/kiran/basecalling/simulation/test.sam")); + SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT); + + numReads = 0; + while (ffp2.hasNext()) { + if (numReads % 1000000 == 0) { + System.err.println("Basecalled " + numReads + " reads."); + } - while (ffp2.hasNext() && numReads < 100000) { FourIntensity[] intensities = ffp2.next().getIntensities(); byte[] asciiseq = new byte[intensities.length]; @@ -157,8 +179,6 @@ public class FourBaseCaller { asciiseq[cycle] = (byte) call1.asChar(); bestqual[cycle] = toPhredScore(prob1); nextbestqual[cycle] = toCompressedQuality(call2, prob2); - - //System.out.print(call.asChar()); } SAMRecord sr = new SAMRecord(sfh); @@ -173,10 +193,12 @@ public class FourBaseCaller { } sfw.close(); + + System.out.println("Done."); } private static byte toPhredScore(double prob) { - byte qual = (prob - 1.0 < 0.00001) ? 40 : (byte) (-10*Math.log10(1.0 - prob)); + byte qual = (1.0 - prob < 0.00001) ? 40 : (byte) (-10*Math.log10(1.0 - prob)); return (qual > 40) ? 40 : qual; }