diff --git a/java/src/org/broadinstitute/sting/playground/piecemealannotator/BoundedScoringSet.java b/java/src/org/broadinstitute/sting/playground/piecemealannotator/BoundedScoringSet.java new file mode 100755 index 000000000..d815a3e8f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/piecemealannotator/BoundedScoringSet.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.playground.piecemealannotator; + +import java.util.PriorityQueue; + +public class BoundedScoringSet > { + private PriorityQueue pq; + private int maximumSize; + + public BoundedScoringSet(int maximumSize) { + pq = new PriorityQueue(maximumSize); + this.maximumSize = maximumSize; + } + + public boolean add(E o) { + if (canAdd(o)) { + pq.add(o); + + while (pq.size() > maximumSize) { + pq.poll(); + } + + return true; + } + + return false; + } + + private boolean canAdd(E o) { return pq.size() < maximumSize || o.compareTo(pq.peek()) == 1; } + + public void clear() { pq.clear(); } + + public boolean contains(E o) { return pq.contains(o); } + + public boolean offer(E o) { return pq.offer(o); } + + public E peek() { return pq.peek(); } + + public E poll() { return pq.poll(); } + + public boolean remove(E o) { return pq.remove(o); } + + public int size() { return pq.size(); } + + public E[] toArray(E[] os) { return pq.toArray(os); } +} diff --git a/java/src/org/broadinstitute/sting/playground/piecemealannotator/IlluminaTile.java b/java/src/org/broadinstitute/sting/playground/piecemealannotator/IlluminaTile.java new file mode 100755 index 000000000..8a7a419ff --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/piecemealannotator/IlluminaTile.java @@ -0,0 +1,77 @@ +package org.broadinstitute.sting.playground.piecemealannotator; + +import edu.mit.broad.picard.util.BasicTextFileParser; +import edu.mit.broad.picard.util.PasteParser; +import org.broadinstitute.sting.secondarybase.RawRead; +import org.broadinstitute.sting.utils.StingException; + +import java.io.Closeable; +import java.io.File; +import java.util.Iterator; + +public class IlluminaTile implements Iterator, Iterable, Closeable { + private PasteParser parser; + + private RawRead next; + private boolean isIterating = false; + + public IlluminaTile(File bustardDir, int lane, int tile) { + //BasicTextFileParser intparser = new BasicTextFileParser(true, new File(bustardDir.getParent() + "/" + String.format("s_%d_%04d_int.txt.gz", lane, tile))); + //BasicTextFileParser seqparser = new BasicTextFileParser(true, new File(bustardDir.getAbsolutePath() + "/" + String.format("s_%d_%04d_seq.txt.gz", lane, tile))); + //BasicTextFileParser prbparser = new BasicTextFileParser(true, new File(bustardDir.getAbsolutePath() + "/" + String.format("s_%d_%04d_prb.txt.gz", lane, tile))); + + BasicTextFileParser intparser = new BasicTextFileParser(true, findFile(bustardDir.getParentFile(), lane, tile, "int")); + BasicTextFileParser seqparser = new BasicTextFileParser(true, findFile(bustardDir, lane, tile, "seq")); + BasicTextFileParser prbparser = new BasicTextFileParser(true, findFile(bustardDir, lane, tile, "prb")); + + parser = new PasteParser(intparser, seqparser, prbparser); + } + + private File findFile(File dir, int lane, int tile, String ext) { + File file1 = new File(dir.getAbsolutePath() + "/" + String.format("s_%d_%04d_%s.txt.gz", lane, tile, ext)); + if (file1.exists()) { return file1; } + + File file2 = new File(dir.getAbsolutePath() + "/" + String.format("s_%d_%04d_%s.txt", lane, tile, ext)); + if (file2.exists()) { return file2; } + + throw new StingException(String.format("Can't find file '%s' or '%s'", file1.getName(), file2.getName())); + + } + + public boolean hasNext() { + if (!isIterating) { + iterator(); + } + + return parser.hasNext(); + } + + public RawRead next() { + if (hasNext()) { + next = new RawRead(parser.next()); + return next; + } + + return null; + } + + public void remove() { + throw new StingException("Remove is not implemented by IlluminaTile"); + } + + public void close() { + parser.close(); + isIterating = false; + } + + public Iterator iterator() { + if (isIterating) { + throw new StingException("IlluminaTile is already iterating"); + } + + isIterating = true; + next = new RawRead(parser.next()); + + return this; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/piecemealannotator/MergeAlignedAndSecondarySAMs.java b/java/src/org/broadinstitute/sting/playground/piecemealannotator/MergeAlignedAndSecondarySAMs.java new file mode 100755 index 000000000..070f8483c --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/piecemealannotator/MergeAlignedAndSecondarySAMs.java @@ -0,0 +1,74 @@ +package org.broadinstitute.sting.playground.piecemealannotator; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; + +import java.io.File; +import java.util.HashMap; + +public class MergeAlignedAndSecondarySAMs extends CommandLineProgram { + public static MergeAlignedAndSecondarySAMs instance = null; + + @Argument(fullName="unaligned_sam", shortName="US", doc="unaligned SAM with secondary bases") public File USAM; + @Argument(fullName="aligned_sam", shortName="AS", doc="aligned SAM without secondary bases") public File ASAM; + @Argument(fullName="sam_out", shortName="SO", doc="output SAM file") public File OSAM; + + public static void main(String[] argv) { + instance = new MergeAlignedAndSecondarySAMs(); + start(instance, argv); + } + + protected int execute() { + // Hash unaligned file + HashMap usamhash = new HashMap(500000); + + SAMFileReader usamr = new SAMFileReader(USAM); + for (SAMRecord usr : usamr) { + String key = String.format("%s:%b", usr.getReadName(), usr.getFirstOfPairFlag()); + + usamhash.put(key, usr); + } + usamr.close(); + + // Annotate aligned file + SAMFileReader asamr = new SAMFileReader(ASAM); + + SAMFileHeader asamh = asamr.getFileHeader(); + asamh.setSortOrder(SAMFileHeader.SortOrder.unsorted); + SAMFileWriter osamw = new SAMFileWriterFactory().makeSAMOrBAMWriter(asamh, true, OSAM); + + for (SAMRecord asr : asamr) { + String key = String.format("%s:%b", asr.getReadName(), asr.getFirstOfPairFlag()); + + if (usamhash.containsKey(key)) { + String abases = asr.getReadString(); + String ubases1 = usamhash.get(key).getReadString(); + String ubases2 = (String) usamhash.get(key).getAttribute("SB"); + + if (asr.getReadNegativeStrandFlag()) { + ubases1 = BaseUtils.simpleReverseComplement(ubases1); + ubases2 = BaseUtils.simpleReverseComplement(ubases2); + } + + byte[] sqbases = new byte[abases.length()]; + for (int cycle = 0; cycle < abases.length(); cycle++) { + char sqbase = (abases.charAt(cycle) == ubases1.charAt(cycle)) ? ubases2.charAt(cycle) : ubases1.charAt(cycle); + int sqBaseIndex = BaseUtils.simpleBaseToBaseIndex(sqbase); + sqbases[cycle] = QualityUtils.baseAndProbDiffToCompressedQuality(sqBaseIndex, 0.0); + } + + asr.setAttribute("SQ", sqbases); + + osamw.addAlignment(asr); + } + } + + osamw.close(); + asamr.close(); + + return 0; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/piecemealannotator/MergeAlignedSAMs.java b/java/src/org/broadinstitute/sting/playground/piecemealannotator/MergeAlignedSAMs.java new file mode 100755 index 000000000..e6fdc68aa --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/piecemealannotator/MergeAlignedSAMs.java @@ -0,0 +1,51 @@ +package org.broadinstitute.sting.playground.piecemealannotator; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMFileWriterFactory; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; + +import java.io.File; + +public class MergeAlignedSAMs extends CommandLineProgram { + public static MergeAlignedSAMs instance = null; + + @Argument(fullName="sam_tile_prefix", shortName="STP", doc="SAM tile prefix") public String SAM_TILE_PREFIX; + @Argument(fullName="sam_tile_suffix", shortName="STS", doc="SAM tile suffix") public String SAM_TILE_SUFFIX; + @Argument(fullName="sam_out", shortName="SO", doc="SAM output file") public File SAM_OUT; + + public static void main(String[] argv) { + instance = new MergeAlignedSAMs(); + start(instance, argv); + } + + protected int execute() { + SAMFileWriter swriter = null; + + for (int tile = 1; tile <= 100; tile++) { + File tileFile = new File(String.format("%s.%d.%s", SAM_TILE_PREFIX, tile, SAM_TILE_SUFFIX)); + + SAMFileReader sreader = new SAMFileReader(tileFile); + + if (swriter == null) { + swriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(sreader.getFileHeader(), true, SAM_OUT); + } + + System.out.println("Processing " + tileFile.getName() + " ..."); + + for (SAMRecord sr : sreader) { + swriter.addAlignment(sr); + } + + sreader.close(); + } + + if (swriter != null) { + swriter.close(); + } + + return 0; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/piecemealannotator/MergePieces.java b/java/src/org/broadinstitute/sting/playground/piecemealannotator/MergePieces.java new file mode 100755 index 000000000..f9c0c7cce --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/piecemealannotator/MergePieces.java @@ -0,0 +1,85 @@ +package org.broadinstitute.sting.playground.piecemealannotator; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMFileWriterFactory; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; + +import java.io.File; +import java.util.Date; +import java.util.HashMap; + +public class MergePieces extends CommandLineProgram { + public static MergePieces instance = null; + + @Argument(fullName="unaligned_sam_prefix", shortName="USP", doc="Prefix for unaligned SAM files") public String USAM_PREFIX; + @Argument(fullName="unaligned_sam_suffix", shortName="USS", doc="Suffix for unaligned SAM files") public String USAM_SUFFIX; + @Argument(fullName="tile_start", shortName="TS", doc="Starting tile number") public int TILE_START; + @Argument(fullName="tile_end", shortName="TE", doc="Ending tile number (inclusive)") public int TILE_END; + @Argument(fullName="aligned_sam_in", shortName="ASI", doc="Aligned SAM file") public File ALIGNED_SAM; + @Argument(fullName="annotated_sam_out", shortName="ASO", doc="Annotated SAM output file") public File ANNOTATED_SAM; + + public static void main(String[] argv) { + instance = new MergePieces(); + start(instance, argv); + } + + protected int execute() { + HashMap samHash = new HashMap(30000000); + + System.out.println("Hashing records..."); + for (int tile = TILE_START; tile <= TILE_END; tile++) { + System.out.printf(" %s: Hashing SQ tags from tile %d...\n", (new Date()).toString(), tile); + + File uTileFile = new File(String.format("%s.%d.%s", USAM_PREFIX, tile, USAM_SUFFIX)); + + SAMFileReader uTileReader = new SAMFileReader(uTileFile); + + for (SAMRecord sr : uTileReader) { + String key = String.format("%s:%b", sr.getReadName(), sr.getReadPairedFlag() && sr.getFirstOfPairFlag()); + + //System.out.printf("name=%s ispaired=%b negstrand=%b firstend=%b secondend=%b\n", sr.getReadName(), sr.getReadPairedFlag(), sr.getReadNegativeStrandFlag(), sr.getFirstOfPairFlag(), sr.getSecondOfPairFlag()); + + samHash.put(key, (byte[]) sr.getAttribute("SQ")); + } + + uTileReader.close(); + } + + SAMFileReader aReader = new SAMFileReader(ALIGNED_SAM); + SAMFileWriter aWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(aReader.getFileHeader(), true, ANNOTATED_SAM); + + System.out.println("Annotating reads..."); + int annotatedReads = 0; + for (SAMRecord sr : aReader) { + String key = String.format("%s:%b", sr.getReadName(), sr.getReadPairedFlag() && sr.getFirstOfPairFlag()); + + if (samHash.containsKey(key)) { + byte[] sqtag = samHash.get(key); + + if (sr.getReadNegativeStrandFlag()) { + sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag); + } + + sr.setAttribute("SQ", sqtag); + + annotatedReads++; + if (annotatedReads % 100000 == 0) { + System.out.printf(" %s: Annotated %d reads...\n", (new Date()).toString(), annotatedReads); + } + + //aWriter.addAlignment(sr); + } + + aWriter.addAlignment(sr); + } + + aReader.close(); + aWriter.close(); + + return 0; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/piecemealannotator/SplitAlignedSAMByTile.java b/java/src/org/broadinstitute/sting/playground/piecemealannotator/SplitAlignedSAMByTile.java new file mode 100755 index 000000000..be2b0753e --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/piecemealannotator/SplitAlignedSAMByTile.java @@ -0,0 +1,63 @@ +package org.broadinstitute.sting.playground.piecemealannotator; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; + +import java.io.File; +import java.util.Date; +import java.util.Vector; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class SplitAlignedSAMByTile extends CommandLineProgram { + public static SplitAlignedSAMByTile instance = null; + + @Argument(fullName="sam_in", shortName="SI", doc="SAM file to split") public File SAM_IN; + @Argument(fullName="sam_out_prefix", shortName="SOP", doc="Output prefix for split SAMs") public String SAM_OUT_PREFIX; + + public static void main(String[] argv) { + instance = new SplitAlignedSAMByTile(); + start(instance, argv); + } + + protected int execute() { + SAMFileReader sreader = new SAMFileReader(SAM_IN); + + SAMFileHeader sheader = sreader.getFileHeader(); + sheader.setSortOrder(SAMFileHeader.SortOrder.unsorted); + + Vector swriters = new Vector(); + swriters.setSize(101); + for (int tile = 1; tile <= 100; tile++) { + File tileFile = new File(String.format("%s.%d.sam", SAM_OUT_PREFIX, tile)); + swriters.add(tile, new SAMFileWriterFactory().makeSAMOrBAMWriter(sheader, true, tileFile)); + } + + int reads = 0; + for (SAMRecord sr : sreader) { + Pattern p = Pattern.compile(":\\d:(\\d+):\\d+:\\d+#"); + Matcher m = p.matcher(sr.getReadName()); + + if (m.find()) { + int tile = Integer.valueOf(m.group(1)); + + swriters.get(tile).addAlignment(sr); + + if (reads % 1000000 == 0) { + System.out.printf("%s: processed %d reads ... \n", (new Date()).toString(), reads); + } + reads++; + } + } + + sreader.close(); + for (SAMFileWriter sfw : swriters) { + if (sfw != null) { + sfw.close(); + } + } + + return 0; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/piecemealannotator/TileAnnotator.java b/java/src/org/broadinstitute/sting/playground/piecemealannotator/TileAnnotator.java new file mode 100755 index 000000000..3b29d87fd --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/piecemealannotator/TileAnnotator.java @@ -0,0 +1,298 @@ +package org.broadinstitute.sting.playground.piecemealannotator; + +import net.sf.samtools.*; +import org.broadinstitute.sting.secondarybase.BasecallingReadModel; +import org.broadinstitute.sting.secondarybase.BasecallingStats; +import org.broadinstitute.sting.secondarybase.FourProbRead; +import org.broadinstitute.sting.secondarybase.RawRead; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.Pair; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; +import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; + +import java.io.File; +import java.util.ArrayList; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class TileAnnotator extends CommandLineProgram { + public static TileAnnotator instance = null; + + private String currentContig = ""; + private byte[] refbases; + //private int lane; + //private int tile; + + @Argument(fullName="sam_tile_in", shortName="STI", doc="SAM tile file", required=false) public File SAM_TILE_IN; + @Argument(fullName="sam_tile_out", shortName="STO", doc="Annotated SAM tile output file") public File SAM_TILE_OUT; + @Argument(fullName="reference", shortName="R", doc="The fasta reference") public File REFERENCE; + @Argument(fullName="bustard_dir", shortName="D", doc="The Bustard directory") public File BUSTARD_DIR; + @Argument(fullName="training_limit", shortName="TL", doc="Number of reads to train from", required=false) public int TRAINING_LIMIT = 10000; + @Argument(fullName="run_barcode", shortName="RB", doc="Illumina run barcode") public String RUN_BARCODE; + @Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-75,76-151') (0-based, inclusive)") public String CYCLE_RANGES; + @Argument(fullName="lane", shortName="L", doc="The lane to process (if not specified, this will be read from the 'sam_tile_in' file)", required=false) public Integer lane; + @Argument(fullName="tile", shortName="T", doc="The tile to process (if not specified, this will be read from the 'sam_tile_in' file)", required=false) public Integer tile; + + public static void main(String[] argv) { + instance = new TileAnnotator(); + start(instance, argv); + } + + protected int execute() { + ArrayList> cycleRanges = getCycleRanges(CYCLE_RANGES); + + System.out.printf("%s: Loading training set...\n", (new Date()).toString()); + ArrayList trainingData = loadTrainingData(); + + System.out.printf("%s: Training model...\n", (new Date()).toString()); + BasecallingReadModel model = new BasecallingReadModel(trainingData); + + System.out.printf("%s: Calling bases...\n", (new Date()).toString()); + //callBases(model, cycleRanges, trainingData); + callBases(model, cycleRanges); + + System.out.println("Done."); + + return 0; + } + + //private void callBases(BasecallingReadModel model, ArrayList> cycleRanges, ArrayList trainingData) { + private void callBases(BasecallingReadModel model, ArrayList> cycleRanges) { + SAMFileHeader sheader = new SAMFileHeader(); + sheader.setSortOrder(SAMFileHeader.SortOrder.unsorted); + SAMFileWriter swriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(sheader, true, SAM_TILE_OUT); + + IlluminaTile tileParser = new IlluminaTile(BUSTARD_DIR, lane, tile); + + BasecallingStats bstats = new BasecallingStats(); + + //for (RawRead rr : trainingData) { + for (RawRead rr : tileParser) { + FourProbRead fpr = model.call(rr); + + //System.out.println(rr.getSequenceAsString()); + //System.out.println(fpr.getPrimaryBaseSequence()); + + for (int rangeIndex = 0; rangeIndex < cycleRanges.size(); rangeIndex++) { + FourProbRead fprEnd = fpr.getSubset(cycleRanges.get(rangeIndex).getFirst(), cycleRanges.get(rangeIndex).getSecond()); + RawRead rrEnd = rr.getSubset(cycleRanges.get(rangeIndex).getFirst(), cycleRanges.get(rangeIndex).getSecond()); + + //System.out.println(rangeIndex + " : " + cycleRanges.get(rangeIndex).getFirst() + " " + cycleRanges.get(rangeIndex).getSecond()); + //System.out.println(rrEnd.getSequenceAsString()); + //System.out.println(fprEnd.getPrimaryBaseSequence()); + + SAMRecord sr = constructSAMRecord(rrEnd, fprEnd, sheader, cycleRanges.size() > 1, rangeIndex == 1); + + swriter.addAlignment(sr); + } + + //System.out.println(); + + bstats.update(rr, fpr); + bstats.notifyOnInterval(10000); + } + + bstats.notifyNow(); + + tileParser.close(); + swriter.close(); + } + + private ArrayList> getCycleRanges(String cycleRangesString) { + ArrayList< Pair > cycleRanges = new ArrayList< Pair >(); + + String[] pieces = cycleRangesString.split(","); + + Pattern p = Pattern.compile("(\\d+)-(\\d+)"); + + for (String piece : pieces) { + Matcher m = p.matcher(piece); + + if (m.find()) { + Integer cycleStart = new Integer(m.group(1)); + Integer cycleStop = new Integer(m.group(2)); + + cycleRanges.add(new Pair(cycleStart, cycleStop)); + } + } + + if (cycleRanges.size() == 0) { + throw new StingException("At least one cycle range must be specified."); + } + + if (cycleRanges.size() > 2) { + throw new StingException(cycleRanges.size() + " specified, but we're unable to handle more than 2."); + } + + return cycleRanges; + } + + private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, boolean isPaired, boolean isSecondEndOfPair) { + SAMRecord sr = new SAMRecord(sfh); + + sr.setReadName(String.format("%s:%d:%d:%d:%d#0", RUN_BARCODE, lane, tile, rr.getXCoordinate(), rr.getYCoordinate())); + sr.setReadUmappedFlag(true); + sr.setReadString(rr.getSequenceAsString()); + sr.setBaseQualities(rr.getQuals()); + sr.setAttribute("SQ", fpr.getSQTag(rr)); + + sr.setReadPairedFlag(isPaired); + if (isPaired) { + sr.setMateUnmappedFlag(true); + sr.setFirstOfPairFlag(!isSecondEndOfPair); + sr.setSecondOfPairFlag(isSecondEndOfPair); + } + + return sr; + } + + private ArrayList loadTrainingData() { + FastaSequenceFile2 ref = new FastaSequenceFile2(REFERENCE); + HashMap srs = loadTileAlignments(ref); + return loadGoodReads(srs, BUSTARD_DIR); + } + + private HashMap loadTileAlignments(FastaSequenceFile2 ref) { + HashMap srs = new HashMap(); + HashSet seenEnds = new HashSet(); + + int numPerfect = 0; + + if (SAM_TILE_IN != null && SAM_TILE_IN.exists()) { + SAMFileReader sreader = new SAMFileReader(SAM_TILE_IN); + + for (SAMRecord sr : sreader) { + Pattern p = Pattern.compile(":(\\d+):(\\d+):(\\d+):(\\d+)#"); + Matcher m = p.matcher(sr.getReadName()); + + if (m.find()) { + this.lane = Integer.valueOf(m.group(1)); + this.tile = Integer.valueOf(m.group(2)); + int x = Integer.valueOf(m.group(3)); + int y = Integer.valueOf(m.group(4)); + boolean end = sr.getReadPairedFlag() && sr.getSecondOfPairFlag(); + + String otherKey = String.format("%d:%d:%b", x, y, !end); + String currentKey = String.format("%d:%d:%b", x, y, end); + + seenEnds.add(currentKey); + + if (isWellAligned(sr, ref)) { + if (srs.containsKey(otherKey) || !seenEnds.contains(otherKey)) { + srs.put(currentKey, sr); + } + + if (srs.containsKey(currentKey) && srs.containsKey(otherKey)) { + numPerfect++; + if (numPerfect % (TRAINING_LIMIT < 1000 ? TRAINING_LIMIT : 1000) == 0) { + System.out.println(" " + numPerfect + " well-aligned reads"); + } + } + } else { + if (srs.containsKey(otherKey)) { + srs.remove(otherKey); + } + } + } + + if (numPerfect >= TRAINING_LIMIT) { break; } + } + + sreader.close(); + } + + return srs; + } + + private boolean isWellAligned(SAMRecord sr, FastaSequenceFile2 ref) { + boolean valid = false; + int mismatches = 0; + + if (!sr.getReadUnmappedFlag() && sr.getCigar().numCigarElements() == 1) { + if (!currentContig.matches(sr.getReferenceName())) { + ref.seekToContig(sr.getReferenceName()); + currentContig = sr.getReferenceName(); + + refbases = ref.nextSequence().getBases(); + } + + byte[] readbases = sr.getReadBases(); + int offset = sr.getAlignmentStart(); + + if (offset + readbases.length < refbases.length) { + valid = true; + + //String refString = ""; + for (int i = offset, j = 0; i < offset + readbases.length; i++, j++) { + //refString += (char) refbases[i - 1]; + + int refbase = BaseUtils.simpleBaseToBaseIndex((char) refbases[i - 1]); + int readbase = BaseUtils.simpleBaseToBaseIndex((char) readbases[j]); + + mismatches += (refbase >= 0 && readbase >= 0 && refbase != readbase) ? 1 : 0; + } + } + } + + return (valid && mismatches == 0); + } + + private ArrayList loadGoodReads(HashMap srs, File bustardDir) { + ArrayList trainingData = new ArrayList(); + BoundedScoringSet additionalData = new BoundedScoringSet(TRAINING_LIMIT); + + IlluminaTile tileParser = new IlluminaTile(bustardDir, lane, tile); + + int correlatedReads = 0; + for (RawRead rr : tileParser) { + String key1 = String.format("%d:%d:%b", rr.getXCoordinate(), rr.getYCoordinate(), false); + String key2 = String.format("%d:%d:%b", rr.getXCoordinate(), rr.getYCoordinate(), true); + + if (srs.containsKey(key1) && srs.containsKey(key2)) { + byte[] quals = new byte[rr.getReadLength()]; + for (int cycle = 0; cycle < rr.getReadLength(); cycle++) { + quals[cycle] = (byte) (BaseUtils.simpleBaseToBaseIndex((char) rr.getSequence()[cycle]) >= 0 ? 50 : 0); + } + rr.setQuals(quals); + + trainingData.add(rr); + + correlatedReads++; + if (correlatedReads % (TRAINING_LIMIT < 1000 ? TRAINING_LIMIT : 1000) == 0) { + System.out.println(" " + correlatedReads + " intensity-correlated reads"); + } + } else { + additionalData.add(rr); + } + + //trainingData.add(rr); + + //if (trainingData.size() % 10000 == 0) { + // System.out.printf(" loaded %d reads\n", trainingData.size()); + //} + } + + //System.out.printf(" loaded %d reads\n", trainingData.size()); + + //for (RawRead rr : additionalData.toArray(new RawRead[0])) { + // System.out.println(rr.getSequenceAsString()); + //} + + tileParser.close(); + + System.out.printf(" found %d perfect reads with an optional reservoir of %d good reads\n", trainingData.size(), additionalData.size()); + + RawRead[] qrs = additionalData.toArray(new RawRead[0]); + int limit = (TRAINING_LIMIT - trainingData.size() < additionalData.size()) ? (TRAINING_LIMIT - trainingData.size()) : additionalData.size(); + for (int i = 0; i < limit; i++) { + trainingData.add(qrs[i]); + } + + return trainingData; + } +} diff --git a/java/src/org/broadinstitute/sting/playground/piecemealannotator/VerifySAM.java b/java/src/org/broadinstitute/sting/playground/piecemealannotator/VerifySAM.java new file mode 100755 index 000000000..836f44ef2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/piecemealannotator/VerifySAM.java @@ -0,0 +1,34 @@ +package org.broadinstitute.sting.playground.piecemealannotator; + +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram; + +import java.io.File; + +public class VerifySAM extends CommandLineProgram { + public static VerifySAM instance = null; + + @Argument(fullName="sam_in", shortName="SI", doc="SAM file to check") public File SAM_IN; + + public static void main(String[] argv) { + instance = new VerifySAM(); + start(instance, argv); + } + + protected int execute() { + SAMFileReader sfr = new SAMFileReader(SAM_IN); + + for ( SAMRecord sr : sfr ) { + if (sr.getAttribute("SQ") == null) { + throw new StingException(String.format("SAMRecord is missing an SQ tag\n%s", sr.format())); + } + } + + sfr.close(); + + return 0; + } +}