An experimental, tile-parallel version of the secondary base annotator.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1044 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e26df45e8e
commit
768a16e791
|
|
@ -0,0 +1,45 @@
|
|||
package org.broadinstitute.sting.playground.piecemealannotator;
|
||||
|
||||
import java.util.PriorityQueue;
|
||||
|
||||
public class BoundedScoringSet<E extends Comparable<E> > {
|
||||
private PriorityQueue<E> pq;
|
||||
private int maximumSize;
|
||||
|
||||
public BoundedScoringSet(int maximumSize) {
|
||||
pq = new PriorityQueue<E>(maximumSize);
|
||||
this.maximumSize = maximumSize;
|
||||
}
|
||||
|
||||
public boolean add(E o) {
|
||||
if (canAdd(o)) {
|
||||
pq.add(o);
|
||||
|
||||
while (pq.size() > maximumSize) {
|
||||
pq.poll();
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean canAdd(E o) { return pq.size() < maximumSize || o.compareTo(pq.peek()) == 1; }
|
||||
|
||||
public void clear() { pq.clear(); }
|
||||
|
||||
public boolean contains(E o) { return pq.contains(o); }
|
||||
|
||||
public boolean offer(E o) { return pq.offer(o); }
|
||||
|
||||
public E peek() { return pq.peek(); }
|
||||
|
||||
public E poll() { return pq.poll(); }
|
||||
|
||||
public boolean remove(E o) { return pq.remove(o); }
|
||||
|
||||
public int size() { return pq.size(); }
|
||||
|
||||
public E[] toArray(E[] os) { return pq.toArray(os); }
|
||||
}
|
||||
|
|
@ -0,0 +1,77 @@
|
|||
package org.broadinstitute.sting.playground.piecemealannotator;
|
||||
|
||||
import edu.mit.broad.picard.util.BasicTextFileParser;
|
||||
import edu.mit.broad.picard.util.PasteParser;
|
||||
import org.broadinstitute.sting.secondarybase.RawRead;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.File;
|
||||
import java.util.Iterator;
|
||||
|
||||
public class IlluminaTile implements Iterator<RawRead>, Iterable<RawRead>, Closeable {
|
||||
private PasteParser parser;
|
||||
|
||||
private RawRead next;
|
||||
private boolean isIterating = false;
|
||||
|
||||
public IlluminaTile(File bustardDir, int lane, int tile) {
|
||||
//BasicTextFileParser intparser = new BasicTextFileParser(true, new File(bustardDir.getParent() + "/" + String.format("s_%d_%04d_int.txt.gz", lane, tile)));
|
||||
//BasicTextFileParser seqparser = new BasicTextFileParser(true, new File(bustardDir.getAbsolutePath() + "/" + String.format("s_%d_%04d_seq.txt.gz", lane, tile)));
|
||||
//BasicTextFileParser prbparser = new BasicTextFileParser(true, new File(bustardDir.getAbsolutePath() + "/" + String.format("s_%d_%04d_prb.txt.gz", lane, tile)));
|
||||
|
||||
BasicTextFileParser intparser = new BasicTextFileParser(true, findFile(bustardDir.getParentFile(), lane, tile, "int"));
|
||||
BasicTextFileParser seqparser = new BasicTextFileParser(true, findFile(bustardDir, lane, tile, "seq"));
|
||||
BasicTextFileParser prbparser = new BasicTextFileParser(true, findFile(bustardDir, lane, tile, "prb"));
|
||||
|
||||
parser = new PasteParser(intparser, seqparser, prbparser);
|
||||
}
|
||||
|
||||
private File findFile(File dir, int lane, int tile, String ext) {
|
||||
File file1 = new File(dir.getAbsolutePath() + "/" + String.format("s_%d_%04d_%s.txt.gz", lane, tile, ext));
|
||||
if (file1.exists()) { return file1; }
|
||||
|
||||
File file2 = new File(dir.getAbsolutePath() + "/" + String.format("s_%d_%04d_%s.txt", lane, tile, ext));
|
||||
if (file2.exists()) { return file2; }
|
||||
|
||||
throw new StingException(String.format("Can't find file '%s' or '%s'", file1.getName(), file2.getName()));
|
||||
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if (!isIterating) {
|
||||
iterator();
|
||||
}
|
||||
|
||||
return parser.hasNext();
|
||||
}
|
||||
|
||||
public RawRead next() {
|
||||
if (hasNext()) {
|
||||
next = new RawRead(parser.next());
|
||||
return next;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new StingException("Remove is not implemented by IlluminaTile");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
parser.close();
|
||||
isIterating = false;
|
||||
}
|
||||
|
||||
public Iterator<RawRead> iterator() {
|
||||
if (isIterating) {
|
||||
throw new StingException("IlluminaTile is already iterating");
|
||||
}
|
||||
|
||||
isIterating = true;
|
||||
next = new RawRead(parser.next());
|
||||
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,74 @@
|
|||
package org.broadinstitute.sting.playground.piecemealannotator;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class MergeAlignedAndSecondarySAMs extends CommandLineProgram {
|
||||
public static MergeAlignedAndSecondarySAMs instance = null;
|
||||
|
||||
@Argument(fullName="unaligned_sam", shortName="US", doc="unaligned SAM with secondary bases") public File USAM;
|
||||
@Argument(fullName="aligned_sam", shortName="AS", doc="aligned SAM without secondary bases") public File ASAM;
|
||||
@Argument(fullName="sam_out", shortName="SO", doc="output SAM file") public File OSAM;
|
||||
|
||||
public static void main(String[] argv) {
|
||||
instance = new MergeAlignedAndSecondarySAMs();
|
||||
start(instance, argv);
|
||||
}
|
||||
|
||||
protected int execute() {
|
||||
// Hash unaligned file
|
||||
HashMap<String, SAMRecord> usamhash = new HashMap<String, SAMRecord>(500000);
|
||||
|
||||
SAMFileReader usamr = new SAMFileReader(USAM);
|
||||
for (SAMRecord usr : usamr) {
|
||||
String key = String.format("%s:%b", usr.getReadName(), usr.getFirstOfPairFlag());
|
||||
|
||||
usamhash.put(key, usr);
|
||||
}
|
||||
usamr.close();
|
||||
|
||||
// Annotate aligned file
|
||||
SAMFileReader asamr = new SAMFileReader(ASAM);
|
||||
|
||||
SAMFileHeader asamh = asamr.getFileHeader();
|
||||
asamh.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
SAMFileWriter osamw = new SAMFileWriterFactory().makeSAMOrBAMWriter(asamh, true, OSAM);
|
||||
|
||||
for (SAMRecord asr : asamr) {
|
||||
String key = String.format("%s:%b", asr.getReadName(), asr.getFirstOfPairFlag());
|
||||
|
||||
if (usamhash.containsKey(key)) {
|
||||
String abases = asr.getReadString();
|
||||
String ubases1 = usamhash.get(key).getReadString();
|
||||
String ubases2 = (String) usamhash.get(key).getAttribute("SB");
|
||||
|
||||
if (asr.getReadNegativeStrandFlag()) {
|
||||
ubases1 = BaseUtils.simpleReverseComplement(ubases1);
|
||||
ubases2 = BaseUtils.simpleReverseComplement(ubases2);
|
||||
}
|
||||
|
||||
byte[] sqbases = new byte[abases.length()];
|
||||
for (int cycle = 0; cycle < abases.length(); cycle++) {
|
||||
char sqbase = (abases.charAt(cycle) == ubases1.charAt(cycle)) ? ubases2.charAt(cycle) : ubases1.charAt(cycle);
|
||||
int sqBaseIndex = BaseUtils.simpleBaseToBaseIndex(sqbase);
|
||||
sqbases[cycle] = QualityUtils.baseAndProbDiffToCompressedQuality(sqBaseIndex, 0.0);
|
||||
}
|
||||
|
||||
asr.setAttribute("SQ", sqbases);
|
||||
|
||||
osamw.addAlignment(asr);
|
||||
}
|
||||
}
|
||||
|
||||
osamw.close();
|
||||
asamr.close();
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
package org.broadinstitute.sting.playground.piecemealannotator;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMFileWriterFactory;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
public class MergeAlignedSAMs extends CommandLineProgram {
|
||||
public static MergeAlignedSAMs instance = null;
|
||||
|
||||
@Argument(fullName="sam_tile_prefix", shortName="STP", doc="SAM tile prefix") public String SAM_TILE_PREFIX;
|
||||
@Argument(fullName="sam_tile_suffix", shortName="STS", doc="SAM tile suffix") public String SAM_TILE_SUFFIX;
|
||||
@Argument(fullName="sam_out", shortName="SO", doc="SAM output file") public File SAM_OUT;
|
||||
|
||||
public static void main(String[] argv) {
|
||||
instance = new MergeAlignedSAMs();
|
||||
start(instance, argv);
|
||||
}
|
||||
|
||||
protected int execute() {
|
||||
SAMFileWriter swriter = null;
|
||||
|
||||
for (int tile = 1; tile <= 100; tile++) {
|
||||
File tileFile = new File(String.format("%s.%d.%s", SAM_TILE_PREFIX, tile, SAM_TILE_SUFFIX));
|
||||
|
||||
SAMFileReader sreader = new SAMFileReader(tileFile);
|
||||
|
||||
if (swriter == null) {
|
||||
swriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(sreader.getFileHeader(), true, SAM_OUT);
|
||||
}
|
||||
|
||||
System.out.println("Processing " + tileFile.getName() + " ...");
|
||||
|
||||
for (SAMRecord sr : sreader) {
|
||||
swriter.addAlignment(sr);
|
||||
}
|
||||
|
||||
sreader.close();
|
||||
}
|
||||
|
||||
if (swriter != null) {
|
||||
swriter.close();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
package org.broadinstitute.sting.playground.piecemealannotator;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMFileWriterFactory;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class MergePieces extends CommandLineProgram {
|
||||
public static MergePieces instance = null;
|
||||
|
||||
@Argument(fullName="unaligned_sam_prefix", shortName="USP", doc="Prefix for unaligned SAM files") public String USAM_PREFIX;
|
||||
@Argument(fullName="unaligned_sam_suffix", shortName="USS", doc="Suffix for unaligned SAM files") public String USAM_SUFFIX;
|
||||
@Argument(fullName="tile_start", shortName="TS", doc="Starting tile number") public int TILE_START;
|
||||
@Argument(fullName="tile_end", shortName="TE", doc="Ending tile number (inclusive)") public int TILE_END;
|
||||
@Argument(fullName="aligned_sam_in", shortName="ASI", doc="Aligned SAM file") public File ALIGNED_SAM;
|
||||
@Argument(fullName="annotated_sam_out", shortName="ASO", doc="Annotated SAM output file") public File ANNOTATED_SAM;
|
||||
|
||||
public static void main(String[] argv) {
|
||||
instance = new MergePieces();
|
||||
start(instance, argv);
|
||||
}
|
||||
|
||||
protected int execute() {
|
||||
HashMap<String, byte[]> samHash = new HashMap<String, byte[]>(30000000);
|
||||
|
||||
System.out.println("Hashing records...");
|
||||
for (int tile = TILE_START; tile <= TILE_END; tile++) {
|
||||
System.out.printf(" %s: Hashing SQ tags from tile %d...\n", (new Date()).toString(), tile);
|
||||
|
||||
File uTileFile = new File(String.format("%s.%d.%s", USAM_PREFIX, tile, USAM_SUFFIX));
|
||||
|
||||
SAMFileReader uTileReader = new SAMFileReader(uTileFile);
|
||||
|
||||
for (SAMRecord sr : uTileReader) {
|
||||
String key = String.format("%s:%b", sr.getReadName(), sr.getReadPairedFlag() && sr.getFirstOfPairFlag());
|
||||
|
||||
//System.out.printf("name=%s ispaired=%b negstrand=%b firstend=%b secondend=%b\n", sr.getReadName(), sr.getReadPairedFlag(), sr.getReadNegativeStrandFlag(), sr.getFirstOfPairFlag(), sr.getSecondOfPairFlag());
|
||||
|
||||
samHash.put(key, (byte[]) sr.getAttribute("SQ"));
|
||||
}
|
||||
|
||||
uTileReader.close();
|
||||
}
|
||||
|
||||
SAMFileReader aReader = new SAMFileReader(ALIGNED_SAM);
|
||||
SAMFileWriter aWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(aReader.getFileHeader(), true, ANNOTATED_SAM);
|
||||
|
||||
System.out.println("Annotating reads...");
|
||||
int annotatedReads = 0;
|
||||
for (SAMRecord sr : aReader) {
|
||||
String key = String.format("%s:%b", sr.getReadName(), sr.getReadPairedFlag() && sr.getFirstOfPairFlag());
|
||||
|
||||
if (samHash.containsKey(key)) {
|
||||
byte[] sqtag = samHash.get(key);
|
||||
|
||||
if (sr.getReadNegativeStrandFlag()) {
|
||||
sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag);
|
||||
}
|
||||
|
||||
sr.setAttribute("SQ", sqtag);
|
||||
|
||||
annotatedReads++;
|
||||
if (annotatedReads % 100000 == 0) {
|
||||
System.out.printf(" %s: Annotated %d reads...\n", (new Date()).toString(), annotatedReads);
|
||||
}
|
||||
|
||||
//aWriter.addAlignment(sr);
|
||||
}
|
||||
|
||||
aWriter.addAlignment(sr);
|
||||
}
|
||||
|
||||
aReader.close();
|
||||
aWriter.close();
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
package org.broadinstitute.sting.playground.piecemealannotator;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Date;
|
||||
import java.util.Vector;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class SplitAlignedSAMByTile extends CommandLineProgram {
|
||||
public static SplitAlignedSAMByTile instance = null;
|
||||
|
||||
@Argument(fullName="sam_in", shortName="SI", doc="SAM file to split") public File SAM_IN;
|
||||
@Argument(fullName="sam_out_prefix", shortName="SOP", doc="Output prefix for split SAMs") public String SAM_OUT_PREFIX;
|
||||
|
||||
public static void main(String[] argv) {
|
||||
instance = new SplitAlignedSAMByTile();
|
||||
start(instance, argv);
|
||||
}
|
||||
|
||||
protected int execute() {
|
||||
SAMFileReader sreader = new SAMFileReader(SAM_IN);
|
||||
|
||||
SAMFileHeader sheader = sreader.getFileHeader();
|
||||
sheader.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
|
||||
Vector<SAMFileWriter> swriters = new Vector<SAMFileWriter>();
|
||||
swriters.setSize(101);
|
||||
for (int tile = 1; tile <= 100; tile++) {
|
||||
File tileFile = new File(String.format("%s.%d.sam", SAM_OUT_PREFIX, tile));
|
||||
swriters.add(tile, new SAMFileWriterFactory().makeSAMOrBAMWriter(sheader, true, tileFile));
|
||||
}
|
||||
|
||||
int reads = 0;
|
||||
for (SAMRecord sr : sreader) {
|
||||
Pattern p = Pattern.compile(":\\d:(\\d+):\\d+:\\d+#");
|
||||
Matcher m = p.matcher(sr.getReadName());
|
||||
|
||||
if (m.find()) {
|
||||
int tile = Integer.valueOf(m.group(1));
|
||||
|
||||
swriters.get(tile).addAlignment(sr);
|
||||
|
||||
if (reads % 1000000 == 0) {
|
||||
System.out.printf("%s: processed %d reads ... \n", (new Date()).toString(), reads);
|
||||
}
|
||||
reads++;
|
||||
}
|
||||
}
|
||||
|
||||
sreader.close();
|
||||
for (SAMFileWriter sfw : swriters) {
|
||||
if (sfw != null) {
|
||||
sfw.close();
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,298 @@
|
|||
package org.broadinstitute.sting.playground.piecemealannotator;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.secondarybase.BasecallingReadModel;
|
||||
import org.broadinstitute.sting.secondarybase.BasecallingStats;
|
||||
import org.broadinstitute.sting.secondarybase.FourProbRead;
|
||||
import org.broadinstitute.sting.secondarybase.RawRead;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.Pair;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class TileAnnotator extends CommandLineProgram {
|
||||
public static TileAnnotator instance = null;
|
||||
|
||||
private String currentContig = "";
|
||||
private byte[] refbases;
|
||||
//private int lane;
|
||||
//private int tile;
|
||||
|
||||
@Argument(fullName="sam_tile_in", shortName="STI", doc="SAM tile file", required=false) public File SAM_TILE_IN;
|
||||
@Argument(fullName="sam_tile_out", shortName="STO", doc="Annotated SAM tile output file") public File SAM_TILE_OUT;
|
||||
@Argument(fullName="reference", shortName="R", doc="The fasta reference") public File REFERENCE;
|
||||
@Argument(fullName="bustard_dir", shortName="D", doc="The Bustard directory") public File BUSTARD_DIR;
|
||||
@Argument(fullName="training_limit", shortName="TL", doc="Number of reads to train from", required=false) public int TRAINING_LIMIT = 10000;
|
||||
@Argument(fullName="run_barcode", shortName="RB", doc="Illumina run barcode") public String RUN_BARCODE;
|
||||
@Argument(fullName="cycle_ranges", shortName="CR", doc="Cycle ranges for single-end or paired reads (i.e. '0-75,76-151') (0-based, inclusive)") public String CYCLE_RANGES;
|
||||
@Argument(fullName="lane", shortName="L", doc="The lane to process (if not specified, this will be read from the 'sam_tile_in' file)", required=false) public Integer lane;
|
||||
@Argument(fullName="tile", shortName="T", doc="The tile to process (if not specified, this will be read from the 'sam_tile_in' file)", required=false) public Integer tile;
|
||||
|
||||
public static void main(String[] argv) {
|
||||
instance = new TileAnnotator();
|
||||
start(instance, argv);
|
||||
}
|
||||
|
||||
protected int execute() {
|
||||
ArrayList<Pair<Integer, Integer>> cycleRanges = getCycleRanges(CYCLE_RANGES);
|
||||
|
||||
System.out.printf("%s: Loading training set...\n", (new Date()).toString());
|
||||
ArrayList<RawRead> trainingData = loadTrainingData();
|
||||
|
||||
System.out.printf("%s: Training model...\n", (new Date()).toString());
|
||||
BasecallingReadModel model = new BasecallingReadModel(trainingData);
|
||||
|
||||
System.out.printf("%s: Calling bases...\n", (new Date()).toString());
|
||||
//callBases(model, cycleRanges, trainingData);
|
||||
callBases(model, cycleRanges);
|
||||
|
||||
System.out.println("Done.");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//private void callBases(BasecallingReadModel model, ArrayList<Pair<Integer, Integer>> cycleRanges, ArrayList<RawRead> trainingData) {
|
||||
private void callBases(BasecallingReadModel model, ArrayList<Pair<Integer, Integer>> cycleRanges) {
|
||||
SAMFileHeader sheader = new SAMFileHeader();
|
||||
sheader.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
SAMFileWriter swriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(sheader, true, SAM_TILE_OUT);
|
||||
|
||||
IlluminaTile tileParser = new IlluminaTile(BUSTARD_DIR, lane, tile);
|
||||
|
||||
BasecallingStats bstats = new BasecallingStats();
|
||||
|
||||
//for (RawRead rr : trainingData) {
|
||||
for (RawRead rr : tileParser) {
|
||||
FourProbRead fpr = model.call(rr);
|
||||
|
||||
//System.out.println(rr.getSequenceAsString());
|
||||
//System.out.println(fpr.getPrimaryBaseSequence());
|
||||
|
||||
for (int rangeIndex = 0; rangeIndex < cycleRanges.size(); rangeIndex++) {
|
||||
FourProbRead fprEnd = fpr.getSubset(cycleRanges.get(rangeIndex).getFirst(), cycleRanges.get(rangeIndex).getSecond());
|
||||
RawRead rrEnd = rr.getSubset(cycleRanges.get(rangeIndex).getFirst(), cycleRanges.get(rangeIndex).getSecond());
|
||||
|
||||
//System.out.println(rangeIndex + " : " + cycleRanges.get(rangeIndex).getFirst() + " " + cycleRanges.get(rangeIndex).getSecond());
|
||||
//System.out.println(rrEnd.getSequenceAsString());
|
||||
//System.out.println(fprEnd.getPrimaryBaseSequence());
|
||||
|
||||
SAMRecord sr = constructSAMRecord(rrEnd, fprEnd, sheader, cycleRanges.size() > 1, rangeIndex == 1);
|
||||
|
||||
swriter.addAlignment(sr);
|
||||
}
|
||||
|
||||
//System.out.println();
|
||||
|
||||
bstats.update(rr, fpr);
|
||||
bstats.notifyOnInterval(10000);
|
||||
}
|
||||
|
||||
bstats.notifyNow();
|
||||
|
||||
tileParser.close();
|
||||
swriter.close();
|
||||
}
|
||||
|
||||
private ArrayList<Pair<Integer, Integer>> getCycleRanges(String cycleRangesString) {
|
||||
ArrayList< Pair<Integer, Integer> > cycleRanges = new ArrayList< Pair<Integer, Integer> >();
|
||||
|
||||
String[] pieces = cycleRangesString.split(",");
|
||||
|
||||
Pattern p = Pattern.compile("(\\d+)-(\\d+)");
|
||||
|
||||
for (String piece : pieces) {
|
||||
Matcher m = p.matcher(piece);
|
||||
|
||||
if (m.find()) {
|
||||
Integer cycleStart = new Integer(m.group(1));
|
||||
Integer cycleStop = new Integer(m.group(2));
|
||||
|
||||
cycleRanges.add(new Pair<Integer, Integer>(cycleStart, cycleStop));
|
||||
}
|
||||
}
|
||||
|
||||
if (cycleRanges.size() == 0) {
|
||||
throw new StingException("At least one cycle range must be specified.");
|
||||
}
|
||||
|
||||
if (cycleRanges.size() > 2) {
|
||||
throw new StingException(cycleRanges.size() + " specified, but we're unable to handle more than 2.");
|
||||
}
|
||||
|
||||
return cycleRanges;
|
||||
}
|
||||
|
||||
private SAMRecord constructSAMRecord(RawRead rr, FourProbRead fpr, SAMFileHeader sfh, boolean isPaired, boolean isSecondEndOfPair) {
|
||||
SAMRecord sr = new SAMRecord(sfh);
|
||||
|
||||
sr.setReadName(String.format("%s:%d:%d:%d:%d#0", RUN_BARCODE, lane, tile, rr.getXCoordinate(), rr.getYCoordinate()));
|
||||
sr.setReadUmappedFlag(true);
|
||||
sr.setReadString(rr.getSequenceAsString());
|
||||
sr.setBaseQualities(rr.getQuals());
|
||||
sr.setAttribute("SQ", fpr.getSQTag(rr));
|
||||
|
||||
sr.setReadPairedFlag(isPaired);
|
||||
if (isPaired) {
|
||||
sr.setMateUnmappedFlag(true);
|
||||
sr.setFirstOfPairFlag(!isSecondEndOfPair);
|
||||
sr.setSecondOfPairFlag(isSecondEndOfPair);
|
||||
}
|
||||
|
||||
return sr;
|
||||
}
|
||||
|
||||
private ArrayList<RawRead> loadTrainingData() {
|
||||
FastaSequenceFile2 ref = new FastaSequenceFile2(REFERENCE);
|
||||
HashMap<String, SAMRecord> srs = loadTileAlignments(ref);
|
||||
return loadGoodReads(srs, BUSTARD_DIR);
|
||||
}
|
||||
|
||||
private HashMap<String, SAMRecord> loadTileAlignments(FastaSequenceFile2 ref) {
|
||||
HashMap<String, SAMRecord> srs = new HashMap<String, SAMRecord>();
|
||||
HashSet<String> seenEnds = new HashSet<String>();
|
||||
|
||||
int numPerfect = 0;
|
||||
|
||||
if (SAM_TILE_IN != null && SAM_TILE_IN.exists()) {
|
||||
SAMFileReader sreader = new SAMFileReader(SAM_TILE_IN);
|
||||
|
||||
for (SAMRecord sr : sreader) {
|
||||
Pattern p = Pattern.compile(":(\\d+):(\\d+):(\\d+):(\\d+)#");
|
||||
Matcher m = p.matcher(sr.getReadName());
|
||||
|
||||
if (m.find()) {
|
||||
this.lane = Integer.valueOf(m.group(1));
|
||||
this.tile = Integer.valueOf(m.group(2));
|
||||
int x = Integer.valueOf(m.group(3));
|
||||
int y = Integer.valueOf(m.group(4));
|
||||
boolean end = sr.getReadPairedFlag() && sr.getSecondOfPairFlag();
|
||||
|
||||
String otherKey = String.format("%d:%d:%b", x, y, !end);
|
||||
String currentKey = String.format("%d:%d:%b", x, y, end);
|
||||
|
||||
seenEnds.add(currentKey);
|
||||
|
||||
if (isWellAligned(sr, ref)) {
|
||||
if (srs.containsKey(otherKey) || !seenEnds.contains(otherKey)) {
|
||||
srs.put(currentKey, sr);
|
||||
}
|
||||
|
||||
if (srs.containsKey(currentKey) && srs.containsKey(otherKey)) {
|
||||
numPerfect++;
|
||||
if (numPerfect % (TRAINING_LIMIT < 1000 ? TRAINING_LIMIT : 1000) == 0) {
|
||||
System.out.println(" " + numPerfect + " well-aligned reads");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (srs.containsKey(otherKey)) {
|
||||
srs.remove(otherKey);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numPerfect >= TRAINING_LIMIT) { break; }
|
||||
}
|
||||
|
||||
sreader.close();
|
||||
}
|
||||
|
||||
return srs;
|
||||
}
|
||||
|
||||
private boolean isWellAligned(SAMRecord sr, FastaSequenceFile2 ref) {
|
||||
boolean valid = false;
|
||||
int mismatches = 0;
|
||||
|
||||
if (!sr.getReadUnmappedFlag() && sr.getCigar().numCigarElements() == 1) {
|
||||
if (!currentContig.matches(sr.getReferenceName())) {
|
||||
ref.seekToContig(sr.getReferenceName());
|
||||
currentContig = sr.getReferenceName();
|
||||
|
||||
refbases = ref.nextSequence().getBases();
|
||||
}
|
||||
|
||||
byte[] readbases = sr.getReadBases();
|
||||
int offset = sr.getAlignmentStart();
|
||||
|
||||
if (offset + readbases.length < refbases.length) {
|
||||
valid = true;
|
||||
|
||||
//String refString = "";
|
||||
for (int i = offset, j = 0; i < offset + readbases.length; i++, j++) {
|
||||
//refString += (char) refbases[i - 1];
|
||||
|
||||
int refbase = BaseUtils.simpleBaseToBaseIndex((char) refbases[i - 1]);
|
||||
int readbase = BaseUtils.simpleBaseToBaseIndex((char) readbases[j]);
|
||||
|
||||
mismatches += (refbase >= 0 && readbase >= 0 && refbase != readbase) ? 1 : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (valid && mismatches == 0);
|
||||
}
|
||||
|
||||
private ArrayList<RawRead> loadGoodReads(HashMap<String, SAMRecord> srs, File bustardDir) {
|
||||
ArrayList<RawRead> trainingData = new ArrayList<RawRead>();
|
||||
BoundedScoringSet<RawRead> additionalData = new BoundedScoringSet<RawRead>(TRAINING_LIMIT);
|
||||
|
||||
IlluminaTile tileParser = new IlluminaTile(bustardDir, lane, tile);
|
||||
|
||||
int correlatedReads = 0;
|
||||
for (RawRead rr : tileParser) {
|
||||
String key1 = String.format("%d:%d:%b", rr.getXCoordinate(), rr.getYCoordinate(), false);
|
||||
String key2 = String.format("%d:%d:%b", rr.getXCoordinate(), rr.getYCoordinate(), true);
|
||||
|
||||
if (srs.containsKey(key1) && srs.containsKey(key2)) {
|
||||
byte[] quals = new byte[rr.getReadLength()];
|
||||
for (int cycle = 0; cycle < rr.getReadLength(); cycle++) {
|
||||
quals[cycle] = (byte) (BaseUtils.simpleBaseToBaseIndex((char) rr.getSequence()[cycle]) >= 0 ? 50 : 0);
|
||||
}
|
||||
rr.setQuals(quals);
|
||||
|
||||
trainingData.add(rr);
|
||||
|
||||
correlatedReads++;
|
||||
if (correlatedReads % (TRAINING_LIMIT < 1000 ? TRAINING_LIMIT : 1000) == 0) {
|
||||
System.out.println(" " + correlatedReads + " intensity-correlated reads");
|
||||
}
|
||||
} else {
|
||||
additionalData.add(rr);
|
||||
}
|
||||
|
||||
//trainingData.add(rr);
|
||||
|
||||
//if (trainingData.size() % 10000 == 0) {
|
||||
// System.out.printf(" loaded %d reads\n", trainingData.size());
|
||||
//}
|
||||
}
|
||||
|
||||
//System.out.printf(" loaded %d reads\n", trainingData.size());
|
||||
|
||||
//for (RawRead rr : additionalData.toArray(new RawRead[0])) {
|
||||
// System.out.println(rr.getSequenceAsString());
|
||||
//}
|
||||
|
||||
tileParser.close();
|
||||
|
||||
System.out.printf(" found %d perfect reads with an optional reservoir of %d good reads\n", trainingData.size(), additionalData.size());
|
||||
|
||||
RawRead[] qrs = additionalData.toArray(new RawRead[0]);
|
||||
int limit = (TRAINING_LIMIT - trainingData.size() < additionalData.size()) ? (TRAINING_LIMIT - trainingData.size()) : additionalData.size();
|
||||
for (int i = 0; i < limit; i++) {
|
||||
trainingData.add(qrs[i]);
|
||||
}
|
||||
|
||||
return trainingData;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
package org.broadinstitute.sting.playground.piecemealannotator;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
public class VerifySAM extends CommandLineProgram {
|
||||
public static VerifySAM instance = null;
|
||||
|
||||
@Argument(fullName="sam_in", shortName="SI", doc="SAM file to check") public File SAM_IN;
|
||||
|
||||
public static void main(String[] argv) {
|
||||
instance = new VerifySAM();
|
||||
start(instance, argv);
|
||||
}
|
||||
|
||||
protected int execute() {
|
||||
SAMFileReader sfr = new SAMFileReader(SAM_IN);
|
||||
|
||||
for ( SAMRecord sr : sfr ) {
|
||||
if (sr.getAttribute("SQ") == null) {
|
||||
throw new StingException(String.format("SAMRecord is missing an SQ tag\n%s", sr.format()));
|
||||
}
|
||||
}
|
||||
|
||||
sfr.close();
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue