Documentation.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@712 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
68c9455c0f
commit
5858f20902
|
|
@ -17,10 +17,6 @@ import java.io.File;
|
|||
import java.io.FilenameFilter;
|
||||
import java.util.*;
|
||||
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.secondarybase.FirecrestReadData;
|
||||
import org.broadinstitute.sting.secondarybase.FirecrestFilenameComparator;
|
||||
|
||||
/**
|
||||
* Abstract base class for implementing parsers for various versions of Firecrest output
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1,14 +1,12 @@
|
|||
package org.broadinstitute.sting.playground.fourbasecaller;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
|
||||
public class AddFourProbsToSAM extends CommandLineProgram {
|
||||
public static AddFourProbsToSAM Instance = null;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,18 +1,14 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.secondarybase.BasecallingReadModel;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.HashMap;
|
||||
import java.util.Vector;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMFileWriterFactory;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
public class AnnotateSecondaryBase extends CommandLineProgram {
|
||||
public static AnnotateSecondaryBase Instance = null;
|
||||
|
|
@ -58,10 +54,10 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);
|
||||
|
||||
IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END);
|
||||
RawRead rr;
|
||||
|
||||
int basesConsistent = 0, basesTotal = 0;
|
||||
|
||||
RawRead rr;
|
||||
while ((rr = iparser.next()) != null) {
|
||||
FourProbRead fpr = model.call(rr);
|
||||
|
||||
|
|
@ -87,10 +83,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
|||
}
|
||||
|
||||
iparser.close();
|
||||
|
||||
sfw.close();
|
||||
|
||||
System.out.println("Done.");
|
||||
System.out.println("\nDone.");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,14 +1,15 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import cern.colt.matrix.DoubleMatrix1D;
|
||||
import cern.colt.matrix.DoubleFactory1D;
|
||||
import cern.colt.matrix.DoubleMatrix2D;
|
||||
import cern.colt.matrix.DoubleFactory2D;
|
||||
import cern.colt.matrix.DoubleMatrix1D;
|
||||
import cern.colt.matrix.DoubleMatrix2D;
|
||||
import cern.colt.matrix.linalg.Algebra;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
|
||||
/**
|
||||
* BasecallingBaseModel is a class that represents the statistical
|
||||
|
|
|
|||
|
|
@ -2,8 +2,6 @@ package org.broadinstitute.sting.secondarybase;
|
|||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.secondarybase.BasecallingBaseModel;
|
||||
import org.broadinstitute.sting.secondarybase.FourProb;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
|
|
|
|||
|
|
@ -1,20 +1,24 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Vector;
|
||||
import java.util.ArrayList;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Vector;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* BasecallingTrainingSet holds a set of raw read sequences, their raw intensities, and quality scores.
|
||||
*
|
||||
* @author Kiran Garimella
|
||||
*/
|
||||
public class BasecallingTrainingSet {
|
||||
private File bustardDir;
|
||||
private int lane;
|
||||
|
|
@ -24,6 +28,15 @@ public class BasecallingTrainingSet {
|
|||
|
||||
private ArrayList<RawRead> trainingData;
|
||||
|
||||
/**
|
||||
* Constructor for BasecallingTrainingSet.
|
||||
*
|
||||
* @param bustardDir the Bustard directory for the sample
|
||||
* @param lane the lane for the sample
|
||||
* @param cycleBegin the start cycle for the beginning of the read (0-based, inclusive)
|
||||
* @param cycleEnd the stop cycle for the end of the read (0-based, inclusive)
|
||||
* @param trainingLimit the number of training reads to accept
|
||||
*/
|
||||
public BasecallingTrainingSet(File bustardDir, int lane, int cycleBegin, int cycleEnd, int trainingLimit) {
|
||||
this.bustardDir = bustardDir;
|
||||
this.lane = lane;
|
||||
|
|
@ -32,14 +45,27 @@ public class BasecallingTrainingSet {
|
|||
this.trainingLimit = trainingLimit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the training data array list
|
||||
*
|
||||
* @return the arraylist of raw training reads
|
||||
*/
|
||||
public ArrayList<RawRead> getTrainingData() {
|
||||
return this.trainingData;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the training data array list
|
||||
*
|
||||
* @param trainingData the arraylist of raw training reads
|
||||
*/
|
||||
public void setTrainingData(ArrayList<RawRead> trainingData) {
|
||||
this.trainingData = trainingData;
|
||||
}
|
||||
|
||||
/**
|
||||
* Take the first N reads that have no ambiguous bases and add them to the training set.
|
||||
*/
|
||||
public void loadFirstNUnambiguousReadsTrainingSet() {
|
||||
this.trainingData = new ArrayList<RawRead>(trainingLimit);
|
||||
|
||||
|
|
@ -65,12 +91,25 @@ public class BasecallingTrainingSet {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a training set from perfect reads in an already-aligned bam file
|
||||
*
|
||||
* @param samIn the SAM/BAM file to load the reads from
|
||||
* @param reference the reference to which the reads should be compared
|
||||
*/
|
||||
public void loadPreAlignedTrainingSet(File samIn, File reference) {
|
||||
Vector< HashMap<String, SAMRecord> > trainingReads = getPerfectAlignmentsByTile(samIn, reference);
|
||||
|
||||
trainingData = correlateReadsAndIntensities(trainingReads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find perfect reads and group them by tile.
|
||||
*
|
||||
* @param samIn the SAM/BAM file to load the reads from
|
||||
* @param reference the reference to which the reads should be compared
|
||||
* @return a vector of perfect reads, grouped by tile
|
||||
*/
|
||||
private Vector<HashMap<String, SAMRecord>> getPerfectAlignmentsByTile(File samIn, File reference) {
|
||||
FastaSequenceFile2 ref = new FastaSequenceFile2(reference);
|
||||
String currentContig = "none";
|
||||
|
|
@ -132,6 +171,12 @@ public class BasecallingTrainingSet {
|
|||
return trainingReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Correlate the perfect reads with their raw intensities. Sloooooooow.
|
||||
*
|
||||
* @param trainingReads the perfect reads, grouped by tile
|
||||
* @return a training set of raw sequence, intensities, and quality scores (all set to 40 for these perfect bases)
|
||||
*/
|
||||
private ArrayList<RawRead> correlateReadsAndIntensities(Vector<HashMap<String, SAMRecord>> trainingReads) {
|
||||
ArrayList<RawRead> newTrainingData = new ArrayList<RawRead>(trainingLimit);
|
||||
|
||||
|
|
|
|||
|
|
@ -1,16 +1,18 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.HashMap;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMFileWriterFactory;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class CombineSamAndFourProbs extends CommandLineProgram {
|
||||
public static CombineSamAndFourProbs Instance = null;
|
||||
|
|
|
|||
|
|
@ -9,15 +9,12 @@
|
|||
*/
|
||||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import edu.mit.broad.picard.util.PasteParser;
|
||||
import edu.mit.broad.picard.util.FormatUtil;
|
||||
import edu.mit.broad.picard.util.BasicTextFileParser;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.util.BasicTextFileParser;
|
||||
import edu.mit.broad.picard.util.FormatUtil;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import org.broadinstitute.sting.secondarybase.FirecrestReadData;
|
||||
|
||||
/**
|
||||
* Class to parse the data in an Illumina Firecrest directory and return an iterator over that data, in order
|
||||
* by tile.
|
||||
|
|
|
|||
|
|
@ -1,20 +1,17 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import edu.mit.broad.picard.illumina.BustardFileParser;
|
||||
import edu.mit.broad.picard.illumina.BustardReadData;
|
||||
import edu.mit.broad.picard.illumina.AbstractBustardFileParser;
|
||||
import edu.mit.broad.picard.illumina.BustardFileParser;
|
||||
import edu.mit.broad.picard.illumina.BustardFileParser_1_1;
|
||||
import edu.mit.broad.picard.illumina.BustardReadData;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMFileWriterFactory;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.secondarybase.FirecrestFileParser;
|
||||
import org.broadinstitute.sting.secondarybase.FirecrestReadData;
|
||||
import org.broadinstitute.sting.secondarybase.FourProb;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,5 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
public class FourIntensity {
|
||||
private float[] fIntensities;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
/**
|
||||
* FourProb represents four base hypotheses, their probabilities, and the ranking among one another.
|
||||
|
|
|
|||
|
|
@ -5,11 +5,25 @@ import org.broadinstitute.sting.utils.QualityUtils;
|
|||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* FourProbRead contains the four-prob information for each base in a read.
|
||||
*/
|
||||
public class FourProbRead extends ArrayList<FourProb> {
|
||||
/**
|
||||
* Initialize the container with the specified capacity
|
||||
*
|
||||
* @param initialCapacity the number of bases in the read
|
||||
*/
|
||||
public FourProbRead(int initialCapacity) {
|
||||
super(initialCapacity);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the read sequence at a specified rank
|
||||
*
|
||||
* @param rank the rank of the sequence to return (0=best, 1=second-best, 2=third-best, 3=fourth-best)
|
||||
* @return the read sequence at the specified rank
|
||||
*/
|
||||
public String getBaseSequenceAtGivenRank(int rank) {
|
||||
String pseq = "";
|
||||
|
||||
|
|
@ -22,14 +36,28 @@ public class FourProbRead extends ArrayList<FourProb> {
|
|||
return pseq;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the primary read sequence
|
||||
* @return the primary read sequence
|
||||
*/
|
||||
public String getPrimaryBaseSequence() {
|
||||
return getBaseSequenceAtGivenRank(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the secondary read sequence
|
||||
* @return the secondary read sequence
|
||||
*/
|
||||
public String getSecondaryBaseSequence() {
|
||||
return getBaseSequenceAtGivenRank(1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the SAM spec-conformant SQ tag that will be written to the SAM/BAM file.
|
||||
*
|
||||
* @param rr the raw read
|
||||
* @return the byte array for the SQ tag (first two bits: the base identity, the last six bits: -10*log10(p3/p2)).
|
||||
*/
|
||||
public byte[] getSQTag(RawRead rr) {
|
||||
byte[] sqtag = new byte[this.size()];
|
||||
|
||||
|
|
|
|||
|
|
@ -1,19 +1,17 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import edu.mit.broad.picard.util.BasicTextFileParser;
|
||||
import edu.mit.broad.picard.util.PasteParser;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.File;
|
||||
import java.io.FilenameFilter;
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
|
||||
import edu.mit.broad.picard.util.PasteParser;
|
||||
import edu.mit.broad.picard.util.BasicTextFileParser;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Closeable {
|
||||
private File bustardDir;
|
||||
|
|
|
|||
|
|
@ -1,15 +1,14 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import net.sf.samtools.SAMFileWriter;
|
||||
import net.sf.samtools.SAMFileWriterFactory;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.cmdLine.Argument;
|
||||
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
public class MatchSQTagToStrand extends CommandLineProgram {
|
||||
public static MatchSQTagToStrand Instance = null;
|
||||
|
|
|
|||
|
|
@ -1,7 +1,10 @@
|
|||
package org.broadinstitute.sting.secondarybase;
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
|
||||
/**
|
||||
* RawRead represents lane and tile coordinates, raw intensities, read bases, and quality scores
|
||||
*
|
||||
* @author Kiran Garimella
|
||||
*/
|
||||
public class RawRead {
|
||||
private byte lane;
|
||||
private short tile;
|
||||
|
|
@ -12,6 +15,14 @@ public class RawRead {
|
|||
private byte[] quals;
|
||||
private short[][] intensities;
|
||||
|
||||
/**
|
||||
* Construct a raw read from the output of a PasteParser (in the order of int, seq, prb).
|
||||
* Takes data within specified cycle ranges.
|
||||
*
|
||||
* @param pastedReadString the 3x(fragment length) output array from the PasteParser.
|
||||
* @param cycleBegin the start cycle for the read (0-based, inclusive)
|
||||
* @param cycleEnd the end cycle for the read (0-based, inclusive)
|
||||
*/
|
||||
public RawRead(String[][] pastedReadString, int cycleBegin, int cycleEnd) {
|
||||
lane = Byte.valueOf(pastedReadString[0][0]);
|
||||
tile = Short.valueOf(pastedReadString[0][1]);
|
||||
|
|
@ -43,25 +54,96 @@ public class RawRead {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get lane number of read
|
||||
*
|
||||
* @return lane number of read
|
||||
*/
|
||||
public byte getLane() { return lane; }
|
||||
|
||||
/**
|
||||
* Get tile number of read
|
||||
*
|
||||
* @return tile number of read
|
||||
*/
|
||||
public int getTile() { return tile; }
|
||||
|
||||
/**
|
||||
* Get x-coordinate of read
|
||||
*
|
||||
* @return x-coordinate of read
|
||||
*/
|
||||
public int getXCoordinate() { return x; }
|
||||
|
||||
/**
|
||||
* Get y-coordinate of read
|
||||
*
|
||||
* @return y-coordinate of read
|
||||
*/
|
||||
public int getYCoordinate() { return y; }
|
||||
|
||||
/**
|
||||
* Get read key (lane:tile:x:y)
|
||||
*
|
||||
* @return read key (lane:tile:x:y)
|
||||
*/
|
||||
public String getReadKey() { return String.format("%d:%d:%d:%d", lane, tile, x, y); }
|
||||
|
||||
/**
|
||||
* Get the read sequence between the cycles specified in the constructor as a byte array
|
||||
*
|
||||
* @return read sequence
|
||||
*/
|
||||
public byte[] getSequence() { return sequence; }
|
||||
|
||||
/**
|
||||
* Set the read sequence from a byte array
|
||||
*
|
||||
* @param sequence the read sequence in byte array form
|
||||
*/
|
||||
public void setSequence(byte[] sequence) { this.sequence = sequence; }
|
||||
|
||||
/**
|
||||
* Get the read sequence as a string
|
||||
*
|
||||
* @return the read sequence in string form
|
||||
*/
|
||||
public String getSequenceAsString() {
|
||||
return new String(getSequence());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the quals
|
||||
*
|
||||
* @return a byte array of quals
|
||||
*/
|
||||
public byte[] getQuals() { return quals; }
|
||||
|
||||
/**
|
||||
* Set the quals
|
||||
*
|
||||
* @param quals a byte array of quals
|
||||
*/
|
||||
public void setQuals(byte[] quals) { this.quals = quals; }
|
||||
|
||||
/**
|
||||
* Get the raw read intensities
|
||||
*
|
||||
* @return the (readLength)x(numChannels) array of raw intensities
|
||||
*/
|
||||
public short[][] getIntensities() { return intensities; }
|
||||
|
||||
/**
|
||||
* Set the raw intensities
|
||||
*
|
||||
* @param intensities the (readLength)x(numChannels) array of raw intensities
|
||||
*/
|
||||
public void setIntensities(short[][] intensities) { this.intensities = intensities; }
|
||||
|
||||
/**
|
||||
* Get the read length
|
||||
*
|
||||
* @return the read length
|
||||
*/
|
||||
public int getReadLength() { return sequence.length; }
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue