An experimental, tile-parallel version of the secondary base annotator.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1044 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-06-18 19:58:09 +00:00
parent e26df45e8e
commit 768a16e791
8 changed files with 727 additions and 0 deletions

View File

@ -0,0 +1,45 @@
package org.broadinstitute.sting.playground.piecemealannotator;
import java.util.PriorityQueue;
public class BoundedScoringSet<E extends Comparable<E> > {
private PriorityQueue<E> pq;
private int maximumSize;
public BoundedScoringSet(int maximumSize) {
pq = new PriorityQueue<E>(maximumSize);
this.maximumSize = maximumSize;
}
public boolean add(E o) {
if (canAdd(o)) {
pq.add(o);
while (pq.size() > maximumSize) {
pq.poll();
}
return true;
}
return false;
}
private boolean canAdd(E o) { return pq.size() < maximumSize || o.compareTo(pq.peek()) == 1; }
public void clear() { pq.clear(); }
public boolean contains(E o) { return pq.contains(o); }
public boolean offer(E o) { return pq.offer(o); }
public E peek() { return pq.peek(); }
public E poll() { return pq.poll(); }
public boolean remove(E o) { return pq.remove(o); }
public int size() { return pq.size(); }
public E[] toArray(E[] os) { return pq.toArray(os); }
}

View File

@ -0,0 +1,77 @@
package org.broadinstitute.sting.playground.piecemealannotator;
import edu.mit.broad.picard.util.BasicTextFileParser;
import edu.mit.broad.picard.util.PasteParser;
import org.broadinstitute.sting.secondarybase.RawRead;
import org.broadinstitute.sting.utils.StingException;
import java.io.Closeable;
import java.io.File;
import java.util.Iterator;
public class IlluminaTile implements Iterator<RawRead>, Iterable<RawRead>, Closeable {
private PasteParser parser;
private RawRead next;
private boolean isIterating = false;
public IlluminaTile(File bustardDir, int lane, int tile) {
//BasicTextFileParser intparser = new BasicTextFileParser(true, new File(bustardDir.getParent() + "/" + String.format("s_%d_%04d_int.txt.gz", lane, tile)));
//BasicTextFileParser seqparser = new BasicTextFileParser(true, new File(bustardDir.getAbsolutePath() + "/" + String.format("s_%d_%04d_seq.txt.gz", lane, tile)));
//BasicTextFileParser prbparser = new BasicTextFileParser(true, new File(bustardDir.getAbsolutePath() + "/" + String.format("s_%d_%04d_prb.txt.gz", lane, tile)));
BasicTextFileParser intparser = new BasicTextFileParser(true, findFile(bustardDir.getParentFile(), lane, tile, "int"));
BasicTextFileParser seqparser = new BasicTextFileParser(true, findFile(bustardDir, lane, tile, "seq"));
BasicTextFileParser prbparser = new BasicTextFileParser(true, findFile(bustardDir, lane, tile, "prb"));
parser = new PasteParser(intparser, seqparser, prbparser);
}
private File findFile(File dir, int lane, int tile, String ext) {
File file1 = new File(dir.getAbsolutePath() + "/" + String.format("s_%d_%04d_%s.txt.gz", lane, tile, ext));
if (file1.exists()) { return file1; }
File file2 = new File(dir.getAbsolutePath() + "/" + String.format("s_%d_%04d_%s.txt", lane, tile, ext));
if (file2.exists()) { return file2; }
throw new StingException(String.format("Can't find file '%s' or '%s'", file1.getName(), file2.getName()));
}
public boolean hasNext() {
if (!isIterating) {
iterator();
}
return parser.hasNext();
}
public RawRead next() {
if (hasNext()) {
next = new RawRead(parser.next());
return next;
}
return null;
}
public void remove() {
throw new StingException("Remove is not implemented by IlluminaTile");
}
public void close() {
parser.close();
isIterating = false;
}
public Iterator<RawRead> iterator() {
if (isIterating) {
throw new StingException("IlluminaTile is already iterating");
}
isIterating = true;
next = new RawRead(parser.next());
return this;
}
}

View File

@ -0,0 +1,74 @@
package org.broadinstitute.sting.playground.piecemealannotator;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
import java.util.HashMap;
public class MergeAlignedAndSecondarySAMs extends CommandLineProgram {
public static MergeAlignedAndSecondarySAMs instance = null;
@Argument(fullName="unaligned_sam", shortName="US", doc="unaligned SAM with secondary bases") public File USAM;
@Argument(fullName="aligned_sam", shortName="AS", doc="aligned SAM without secondary bases") public File ASAM;
@Argument(fullName="sam_out", shortName="SO", doc="output SAM file") public File OSAM;
public static void main(String[] argv) {
instance = new MergeAlignedAndSecondarySAMs();
start(instance, argv);
}
protected int execute() {
// Hash unaligned file
HashMap<String, SAMRecord> usamhash = new HashMap<String, SAMRecord>(500000);
SAMFileReader usamr = new SAMFileReader(USAM);
for (SAMRecord usr : usamr) {
String key = String.format("%s:%b", usr.getReadName(), usr.getFirstOfPairFlag());
usamhash.put(key, usr);
}
usamr.close();
// Annotate aligned file
SAMFileReader asamr = new SAMFileReader(ASAM);
SAMFileHeader asamh = asamr.getFileHeader();
asamh.setSortOrder(SAMFileHeader.SortOrder.unsorted);
SAMFileWriter osamw = new SAMFileWriterFactory().makeSAMOrBAMWriter(asamh, true, OSAM);
for (SAMRecord asr : asamr) {
String key = String.format("%s:%b", asr.getReadName(), asr.getFirstOfPairFlag());
if (usamhash.containsKey(key)) {
String abases = asr.getReadString();
String ubases1 = usamhash.get(key).getReadString();
String ubases2 = (String) usamhash.get(key).getAttribute("SB");
if (asr.getReadNegativeStrandFlag()) {
ubases1 = BaseUtils.simpleReverseComplement(ubases1);
ubases2 = BaseUtils.simpleReverseComplement(ubases2);
}
byte[] sqbases = new byte[abases.length()];
for (int cycle = 0; cycle < abases.length(); cycle++) {
char sqbase = (abases.charAt(cycle) == ubases1.charAt(cycle)) ? ubases2.charAt(cycle) : ubases1.charAt(cycle);
int sqBaseIndex = BaseUtils.simpleBaseToBaseIndex(sqbase);
sqbases[cycle] = QualityUtils.baseAndProbDiffToCompressedQuality(sqBaseIndex, 0.0);
}
asr.setAttribute("SQ", sqbases);
osamw.addAlignment(asr);
}
}
osamw.close();
asamr.close();
return 0;
}
}

View File

@ -0,0 +1,51 @@
package org.broadinstitute.sting.playground.piecemealannotator;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
public class MergeAlignedSAMs extends CommandLineProgram {
public static MergeAlignedSAMs instance = null;
@Argument(fullName="sam_tile_prefix", shortName="STP", doc="SAM tile prefix") public String SAM_TILE_PREFIX;
@Argument(fullName="sam_tile_suffix", shortName="STS", doc="SAM tile suffix") public String SAM_TILE_SUFFIX;
@Argument(fullName="sam_out", shortName="SO", doc="SAM output file") public File SAM_OUT;
public static void main(String[] argv) {
instance = new MergeAlignedSAMs();
start(instance, argv);
}
protected int execute() {
SAMFileWriter swriter = null;
for (int tile = 1; tile <= 100; tile++) {
File tileFile = new File(String.format("%s.%d.%s", SAM_TILE_PREFIX, tile, SAM_TILE_SUFFIX));
SAMFileReader sreader = new SAMFileReader(tileFile);
if (swriter == null) {
swriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(sreader.getFileHeader(), true, SAM_OUT);
}
System.out.println("Processing " + tileFile.getName() + " ...");
for (SAMRecord sr : sreader) {
swriter.addAlignment(sr);
}
sreader.close();
}
if (swriter != null) {
swriter.close();
}
return 0;
}
}

View File

@ -0,0 +1,85 @@
package org.broadinstitute.sting.playground.piecemealannotator;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
import java.util.Date;
import java.util.HashMap;
public class MergePieces extends CommandLineProgram {
public static MergePieces instance = null;
@Argument(fullName="unaligned_sam_prefix", shortName="USP", doc="Prefix for unaligned SAM files") public String USAM_PREFIX;
@Argument(fullName="unaligned_sam_suffix", shortName="USS", doc="Suffix for unaligned SAM files") public String USAM_SUFFIX;
@Argument(fullName="tile_start", shortName="TS", doc="Starting tile number") public int TILE_START;
@Argument(fullName="tile_end", shortName="TE", doc="Ending tile number (inclusive)") public int TILE_END;
@Argument(fullName="aligned_sam_in", shortName="ASI", doc="Aligned SAM file") public File ALIGNED_SAM;
@Argument(fullName="annotated_sam_out", shortName="ASO", doc="Annotated SAM output file") public File ANNOTATED_SAM;
public static void main(String[] argv) {
instance = new MergePieces();
start(instance, argv);
}
protected int execute() {
HashMap<String, byte[]> samHash = new HashMap<String, byte[]>(30000000);
System.out.println("Hashing records...");
for (int tile = TILE_START; tile <= TILE_END; tile++) {
System.out.printf(" %s: Hashing SQ tags from tile %d...\n", (new Date()).toString(), tile);
File uTileFile = new File(String.format("%s.%d.%s", USAM_PREFIX, tile, USAM_SUFFIX));
SAMFileReader uTileReader = new SAMFileReader(uTileFile);
for (SAMRecord sr : uTileReader) {
String key = String.format("%s:%b", sr.getReadName(), sr.getReadPairedFlag() && sr.getFirstOfPairFlag());
//System.out.printf("name=%s ispaired=%b negstrand=%b firstend=%b secondend=%b\n", sr.getReadName(), sr.getReadPairedFlag(), sr.getReadNegativeStrandFlag(), sr.getFirstOfPairFlag(), sr.getSecondOfPairFlag());
samHash.put(key, (byte[]) sr.getAttribute("SQ"));
}
uTileReader.close();
}
SAMFileReader aReader = new SAMFileReader(ALIGNED_SAM);
SAMFileWriter aWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(aReader.getFileHeader(), true, ANNOTATED_SAM);
System.out.println("Annotating reads...");
int annotatedReads = 0;
for (SAMRecord sr : aReader) {
String key = String.format("%s:%b", sr.getReadName(), sr.getReadPairedFlag() && sr.getFirstOfPairFlag());
if (samHash.containsKey(key)) {
byte[] sqtag = samHash.get(key);
if (sr.getReadNegativeStrandFlag()) {
sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag);
}
sr.setAttribute("SQ", sqtag);
annotatedReads++;
if (annotatedReads % 100000 == 0) {
System.out.printf(" %s: Annotated %d reads...\n", (new Date()).toString(), annotatedReads);
}
//aWriter.addAlignment(sr);
}
aWriter.addAlignment(sr);
}
aReader.close();
aWriter.close();
return 0;
}
}

View File

@ -0,0 +1,63 @@
package org.broadinstitute.sting.playground.piecemealannotator;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
import java.util.Date;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class SplitAlignedSAMByTile extends CommandLineProgram {
public static SplitAlignedSAMByTile instance = null;
@Argument(fullName="sam_in", shortName="SI", doc="SAM file to split") public File SAM_IN;
@Argument(fullName="sam_out_prefix", shortName="SOP", doc="Output prefix for split SAMs") public String SAM_OUT_PREFIX;
public static void main(String[] argv) {
instance = new SplitAlignedSAMByTile();
start(instance, argv);
}
protected int execute() {
SAMFileReader sreader = new SAMFileReader(SAM_IN);
SAMFileHeader sheader = sreader.getFileHeader();
sheader.setSortOrder(SAMFileHeader.SortOrder.unsorted);
Vector<SAMFileWriter> swriters = new Vector<SAMFileWriter>();
swriters.setSize(101);
for (int tile = 1; tile <= 100; tile++) {
File tileFile = new File(String.format("%s.%d.sam", SAM_OUT_PREFIX, tile));
swriters.add(tile, new SAMFileWriterFactory().makeSAMOrBAMWriter(sheader, true, tileFile));
}
int reads = 0;
for (SAMRecord sr : sreader) {
Pattern p = Pattern.compile(":\\d:(\\d+):\\d+:\\d+#");
Matcher m = p.matcher(sr.getReadName());
if (m.find()) {
int tile = Integer.valueOf(m.group(1));
swriters.get(tile).addAlignment(sr);
if (reads % 1000000 == 0) {
System.out.printf("%s: processed %d reads ... \n", (new Date()).toString(), reads);
}
reads++;
}
}
sreader.close();
for (SAMFileWriter sfw : swriters) {
if (sfw != null) {
sfw.close();
}
}
return 0;
}
}

View File

@ -0,0 +1,298 @@
package org.broadinstitute.sting.playground.piecemealannotator;
import net.sf.samtools.*;
import org.broadinstitute.sting.secondarybase.BasecallingReadModel;
import org.broadinstitute.sting.secondarybase.BasecallingStats;
import org.broadinstitute.sting.secondarybase.FourProbRead;
import org.broadinstitute.sting.secondarybase.RawRead;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Pair;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class TileAnnotator extends CommandLineProgram {
public static TileAnnotator instance = null;
private String currentContig = "";
private byte[] refbases;
//private int lane;
//private int tile;
@Argument(fullName="sam_tile_in", shortName="STI", doc="SAM tile file", required=false) public File SAM_TILE_IN;
@Argument(fullName="sam_tile_out", shortName="STO", doc="Annotated SAM tile output file") public File SAM_TILE_OUT;
@Argument(fullName="reference", shortName="R", doc="The fasta reference") public File REFERENCE;
@Argument(fullName="bustard_dir", shortName="D", doc="The Bustard directory") public File BUSTARD_DIR;
@Argument(fullName="training_limit", shortName="TL", doc="Number of reads to train from", required=false) public int TRAINING_LIMIT = 10000;
@Argument(fullName="run_barcode", shortName="RB", doc="Illumina run barcode") public String RUN_BARCODE;
@Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-75,76-151') (0-based, inclusive)") public String CYCLE_RANGES;
@Argument(fullName="lane", shortName="L", doc="The lane to process (if not specified, this will be read from the 'sam_tile_in' file)", required=false) public Integer lane;
@Argument(fullName="tile", shortName="T", doc="The tile to process (if not specified, this will be read from the 'sam_tile_in' file)", required=false) public Integer tile;
public static void main(String[] argv) {
instance = new TileAnnotator();
start(instance, argv);
}
protected int execute() {
ArrayList<Pair<Integer, Integer>> cycleRanges = getCycleRanges(CYCLE_RANGES);
System.out.printf("%s: Loading training set...\n", (new Date()).toString());
ArrayList<RawRead> trainingData = loadTrainingData();
System.out.printf("%s: Training model...\n", (new Date()).toString());
BasecallingReadModel model = new BasecallingReadModel(trainingData);
System.out.printf("%s: Calling bases...\n", (new Date()).toString());
//callBases(model, cycleRanges, trainingData);
callBases(model, cycleRanges);
System.out.println("Done.");
return 0;
}
//private void callBases(BasecallingReadModel model, ArrayList<Pair<Integer, Integer>> cycleRanges, ArrayList<RawRead> trainingData) {
private void callBases(BasecallingReadModel model, ArrayList<Pair<Integer, Integer>> cycleRanges) {
SAMFileHeader sheader = new SAMFileHeader();
sheader.setSortOrder(SAMFileHeader.SortOrder.unsorted);
SAMFileWriter swriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(sheader, true, SAM_TILE_OUT);
IlluminaTile tileParser = new IlluminaTile(BUSTARD_DIR, lane, tile);
BasecallingStats bstats = new BasecallingStats();
//for (RawRead rr : trainingData) {
for (RawRead rr : tileParser) {
FourProbRead fpr = model.call(rr);
//System.out.println(rr.getSequenceAsString());
//System.out.println(fpr.getPrimaryBaseSequence());
for (int rangeIndex = 0; rangeIndex < cycleRanges.size(); rangeIndex++) {
FourProbRead fprEnd = fpr.getSubset(cycleRanges.get(rangeIndex).getFirst(), cycleRanges.get(rangeIndex).getSecond());
RawRead rrEnd = rr.getSubset(cycleRanges.get(rangeIndex).getFirst(), cycleRanges.get(rangeIndex).getSecond());
//System.out.println(rangeIndex + " : " + cycleRanges.get(rangeIndex).getFirst() + " " + cycleRanges.get(rangeIndex).getSecond());
//System.out.println(rrEnd.getSequenceAsString());
//System.out.println(fprEnd.getPrimaryBaseSequence());
SAMRecord sr = constructSAMRecord(rrEnd, fprEnd, sheader, cycleRanges.size() > 1, rangeIndex == 1);
swriter.addAlignment(sr);
}
//System.out.println();
bstats.update(rr, fpr);
bstats.notifyOnInterval(10000);
}
bstats.notifyNow();
tileParser.close();
swriter.close();
}
private ArrayList<Pair<Integer, Integer>> getCycleRanges(String cycleRangesString) {
ArrayList< Pair<Integer, Integer> > cycleRanges = new ArrayList< Pair<Integer, Integer> >();
String[] pieces = cycleRangesString.split(",");
Pattern p = Pattern.compile("(\\d+)-(\\d+)");
for (String piece : pieces) {
Matcher m = p.matcher(piece);
if (m.find()) {
Integer cycleStart = new Integer(m.group(1));
Integer cycleStop = new Integer(m.group(2));
cycleRanges.add(new Pair<Integer, Integer>(cycleStart, cycleStop));
}
}
if (cycleRanges.size() == 0) {
throw new StingException("At least one cycle range must be specified.");
}
if (cycleRanges.size() > 2) {
throw new StingException(cycleRanges.size() + " specified, but we're unable to handle more than 2.");
}
return cycleRanges;
}
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, boolean isPaired, boolean isSecondEndOfPair) {
SAMRecord sr = new SAMRecord(sfh);
sr.setReadName(String.format("%s:%d:%d:%d:%d#0", RUN_BARCODE, lane, tile, rr.getXCoordinate(), rr.getYCoordinate()));
sr.setReadUmappedFlag(true);
sr.setReadString(rr.getSequenceAsString());
sr.setBaseQualities(rr.getQuals());
sr.setAttribute("SQ", fpr.getSQTag(rr));
sr.setReadPairedFlag(isPaired);
if (isPaired) {
sr.setMateUnmappedFlag(true);
sr.setFirstOfPairFlag(!isSecondEndOfPair);
sr.setSecondOfPairFlag(isSecondEndOfPair);
}
return sr;
}
private ArrayList<RawRead> loadTrainingData() {
FastaSequenceFile2 ref = new FastaSequenceFile2(REFERENCE);
HashMap<String, SAMRecord> srs = loadTileAlignments(ref);
return loadGoodReads(srs, BUSTARD_DIR);
}
private HashMap<String, SAMRecord> loadTileAlignments(FastaSequenceFile2 ref) {
HashMap<String, SAMRecord> srs = new HashMap<String, SAMRecord>();
HashSet<String> seenEnds = new HashSet<String>();
int numPerfect = 0;
if (SAM_TILE_IN != null && SAM_TILE_IN.exists()) {
SAMFileReader sreader = new SAMFileReader(SAM_TILE_IN);
for (SAMRecord sr : sreader) {
Pattern p = Pattern.compile(":(\\d+):(\\d+):(\\d+):(\\d+)#");
Matcher m = p.matcher(sr.getReadName());
if (m.find()) {
this.lane = Integer.valueOf(m.group(1));
this.tile = Integer.valueOf(m.group(2));
int x = Integer.valueOf(m.group(3));
int y = Integer.valueOf(m.group(4));
boolean end = sr.getReadPairedFlag() && sr.getSecondOfPairFlag();
String otherKey = String.format("%d:%d:%b", x, y, !end);
String currentKey = String.format("%d:%d:%b", x, y, end);
seenEnds.add(currentKey);
if (isWellAligned(sr, ref)) {
if (srs.containsKey(otherKey) || !seenEnds.contains(otherKey)) {
srs.put(currentKey, sr);
}
if (srs.containsKey(currentKey) && srs.containsKey(otherKey)) {
numPerfect++;
if (numPerfect % (TRAINING_LIMIT < 1000 ? TRAINING_LIMIT : 1000) == 0) {
System.out.println(" " + numPerfect + " well-aligned reads");
}
}
} else {
if (srs.containsKey(otherKey)) {
srs.remove(otherKey);
}
}
}
if (numPerfect >= TRAINING_LIMIT) { break; }
}
sreader.close();
}
return srs;
}
private boolean isWellAligned(SAMRecord sr, FastaSequenceFile2 ref) {
boolean valid = false;
int mismatches = 0;
if (!sr.getReadUnmappedFlag() && sr.getCigar().numCigarElements() == 1) {
if (!currentContig.matches(sr.getReferenceName())) {
ref.seekToContig(sr.getReferenceName());
currentContig = sr.getReferenceName();
refbases = ref.nextSequence().getBases();
}
byte[] readbases = sr.getReadBases();
int offset = sr.getAlignmentStart();
if (offset + readbases.length < refbases.length) {
valid = true;
//String refString = "";
for (int i = offset, j = 0; i < offset + readbases.length; i++, j++) {
//refString += (char) refbases[i - 1];
int refbase = BaseUtils.simpleBaseToBaseIndex((char) refbases[i - 1]);
int readbase = BaseUtils.simpleBaseToBaseIndex((char) readbases[j]);
mismatches += (refbase >= 0 && readbase >= 0 && refbase != readbase) ? 1 : 0;
}
}
}
return (valid && mismatches == 0);
}
private ArrayList<RawRead> loadGoodReads(HashMap<String, SAMRecord> srs, File bustardDir) {
ArrayList<RawRead> trainingData = new ArrayList<RawRead>();
BoundedScoringSet<RawRead> additionalData = new BoundedScoringSet<RawRead>(TRAINING_LIMIT);
IlluminaTile tileParser = new IlluminaTile(bustardDir, lane, tile);
int correlatedReads = 0;
for (RawRead rr : tileParser) {
String key1 = String.format("%d:%d:%b", rr.getXCoordinate(), rr.getYCoordinate(), false);
String key2 = String.format("%d:%d:%b", rr.getXCoordinate(), rr.getYCoordinate(), true);
if (srs.containsKey(key1) && srs.containsKey(key2)) {
byte[] quals = new byte[rr.getReadLength()];
for (int cycle = 0; cycle < rr.getReadLength(); cycle++) {
quals[cycle] = (byte) (BaseUtils.simpleBaseToBaseIndex((char) rr.getSequence()[cycle]) >= 0 ? 50 : 0);
}
rr.setQuals(quals);
trainingData.add(rr);
correlatedReads++;
if (correlatedReads % (TRAINING_LIMIT < 1000 ? TRAINING_LIMIT : 1000) == 0) {
System.out.println(" " + correlatedReads + " intensity-correlated reads");
}
} else {
additionalData.add(rr);
}
//trainingData.add(rr);
//if (trainingData.size() % 10000 == 0) {
// System.out.printf(" loaded %d reads\n", trainingData.size());
//}
}
//System.out.printf(" loaded %d reads\n", trainingData.size());
//for (RawRead rr : additionalData.toArray(new RawRead[0])) {
// System.out.println(rr.getSequenceAsString());
//}
tileParser.close();
System.out.printf(" found %d perfect reads with an optional reservoir of %d good reads\n", trainingData.size(), additionalData.size());
RawRead[] qrs = additionalData.toArray(new RawRead[0]);
int limit = (TRAINING_LIMIT - trainingData.size() < additionalData.size()) ? (TRAINING_LIMIT - trainingData.size()) : additionalData.size();
for (int i = 0; i < limit; i++) {
trainingData.add(qrs[i]);
}
return trainingData;
}
}

View File

@ -0,0 +1,34 @@
package org.broadinstitute.sting.playground.piecemealannotator;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
public class VerifySAM extends CommandLineProgram {
public static VerifySAM instance = null;
@Argument(fullName="sam_in", shortName="SI", doc="SAM file to check") public File SAM_IN;
public static void main(String[] argv) {
instance = new VerifySAM();
start(instance, argv);
}
protected int execute() {
SAMFileReader sfr = new SAMFileReader(SAM_IN);
for ( SAMRecord sr : sfr ) {
if (sr.getAttribute("SQ") == null) {
throw new StingException(String.format("SAMRecord is missing an SQ tag\n%s", sr.format()));
}
}
sfr.close();
return 0;
}
}