Documentation.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@712 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
kiran 2009-05-14 18:58:43 +00:00
parent 68c9455c0f
commit 5858f20902
15 changed files with 209 additions and 75 deletions

View File

@ -17,10 +17,6 @@ import java.io.File;
import java.io.FilenameFilter;
import java.util.*;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.secondarybase.FirecrestReadData;
import org.broadinstitute.sting.secondarybase.FirecrestFilenameComparator;
/**
* Abstract base class for implementing parsers for various versions of Firecrest output
*/

View File

@ -1,14 +1,12 @@
package org.broadinstitute.sting.playground.fourbasecaller;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
import java.util.HashMap;
import net.sf.samtools.*;
public class AddFourProbsToSAM extends CommandLineProgram {
public static AddFourProbsToSAM Instance = null;

View File

@ -1,18 +1,14 @@
package org.broadinstitute.sting.secondarybase;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.secondarybase.BasecallingReadModel;
import java.io.File;
import java.util.HashMap;
import java.util.Vector;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
public class AnnotateSecondaryBase extends CommandLineProgram {
public static AnnotateSecondaryBase Instance = null;
@ -58,10 +54,10 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, SAM_OUT);
IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE, CYCLE_BEGIN, CYCLE_END);
RawRead rr;
int basesConsistent = 0, basesTotal = 0;
RawRead rr;
while ((rr = iparser.next()) != null) {
FourProbRead fpr = model.call(rr);
@ -87,10 +83,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
}
iparser.close();
sfw.close();
System.out.println("Done.");
System.out.println("\nDone.");
return 0;
}

View File

@ -1,14 +1,15 @@
package org.broadinstitute.sting.secondarybase;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.DoubleFactory1D;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.DoubleFactory2D;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.linalg.Algebra;
import org.broadinstitute.sting.utils.BaseUtils;
import java.io.*;
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
/**
* BasecallingBaseModel is a class that represents the statistical

View File

@ -2,8 +2,6 @@ package org.broadinstitute.sting.secondarybase;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.secondarybase.BasecallingBaseModel;
import org.broadinstitute.sting.secondarybase.FourProb;
import java.io.File;
import java.util.ArrayList;

View File

@ -1,20 +1,24 @@
package org.broadinstitute.sting.secondarybase;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.CloseableIterator;
import java.util.HashMap;
import java.util.Vector;
import java.util.ArrayList;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.io.File;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.StingException;
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* BasecallingTrainingSet holds a set of raw read sequences, their raw intensities, and quality scores.
*
* @author Kiran Garimella
*/
public class BasecallingTrainingSet {
private File bustardDir;
private int lane;
@ -24,6 +28,15 @@ public class BasecallingTrainingSet {
private ArrayList<RawRead> trainingData;
/**
* Constructor for BasecallingTrainingSet.
*
* @param bustardDir the Bustard directory for the sample
* @param lane the lane for the sample
* @param cycleBegin the start cycle for the beginning of the read (0-based, inclusive)
* @param cycleEnd the stop cycle for the end of the read (0-based, inclusive)
* @param trainingLimit the number of training reads to accept
*/
public BasecallingTrainingSet(File bustardDir, int lane, int cycleBegin, int cycleEnd, int trainingLimit) {
this.bustardDir = bustardDir;
this.lane = lane;
@ -32,14 +45,27 @@ public class BasecallingTrainingSet {
this.trainingLimit = trainingLimit;
}
/**
 * Returns the raw reads currently held as training data.
 * The stored list itself is handed back, not a copy.
 *
 * @return the list of raw training reads
 */
public ArrayList<RawRead> getTrainingData() {
    return trainingData;
}
/**
 * Replaces the training data held by this set.
 * The supplied list is stored by reference; no copy is made.
 *
 * @param trainingData the list of raw training reads to use from now on
 */
public void setTrainingData(ArrayList<RawRead> trainingData)
{
    this.trainingData = trainingData;
}
/**
* Take the first N reads that have no ambiguous bases and add them to the training set.
*/
public void loadFirstNUnambiguousReadsTrainingSet() {
this.trainingData = new ArrayList<RawRead>(trainingLimit);
@ -65,12 +91,25 @@ public class BasecallingTrainingSet {
}
}
/**
 * Builds the training set from an already-aligned SAM/BAM file.
 * Perfect alignments are first collected per tile, then matched up with
 * their raw intensities; the result replaces the current training data.
 *
 * @param samIn     the SAM/BAM file to load the reads from
 * @param reference the reference to which the reads should be compared
 */
public void loadPreAlignedTrainingSet(File samIn, File reference) {
    Vector<HashMap<String, SAMRecord>> perfectReadsByTile = getPerfectAlignmentsByTile(samIn, reference);

    this.trainingData = correlateReadsAndIntensities(perfectReadsByTile);
}
/**
* Find perfect reads and group them by tile.
*
* @param samIn the SAM/BAM file to load the reads from
* @param reference the reference to which the reads should be compared
* @return a vector of perfect reads, grouped by tile
*/
private Vector<HashMap<String, SAMRecord>> getPerfectAlignmentsByTile(File samIn, File reference) {
FastaSequenceFile2 ref = new FastaSequenceFile2(reference);
String currentContig = "none";
@ -132,6 +171,12 @@ public class BasecallingTrainingSet {
return trainingReads;
}
/**
* Correlate the perfect reads with their raw intensities. Sloooooooow.
*
* @param trainingReads the perfect reads, grouped by tile
* @return a training set of raw sequence, intensities, and quality scores (all set to 40 for these perfect bases)
*/
private ArrayList<RawRead> correlateReadsAndIntensities(Vector<HashMap<String, SAMRecord>> trainingReads) {
ArrayList<RawRead> newTrainingData = new ArrayList<RawRead>(trainingLimit);

View File

@ -1,16 +1,18 @@
package org.broadinstitute.sting.secondarybase;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.QualityUtils;
import java.io.*;
import java.util.HashMap;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
public class CombineSamAndFourProbs extends CommandLineProgram {
public static CombineSamAndFourProbs Instance = null;

View File

@ -9,15 +9,12 @@
*/
package org.broadinstitute.sting.secondarybase;
import edu.mit.broad.picard.util.PasteParser;
import edu.mit.broad.picard.util.FormatUtil;
import edu.mit.broad.picard.util.BasicTextFileParser;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.util.BasicTextFileParser;
import edu.mit.broad.picard.util.FormatUtil;
import java.io.File;
import org.broadinstitute.sting.secondarybase.FirecrestReadData;
/**
* Class to parse the data in an Illumina Firecrest directory and return an iterator over that data, in order
* by tile.

View File

@ -1,20 +1,17 @@
package org.broadinstitute.sting.secondarybase;
import edu.mit.broad.picard.illumina.BustardFileParser;
import edu.mit.broad.picard.illumina.BustardReadData;
import edu.mit.broad.picard.illumina.AbstractBustardFileParser;
import edu.mit.broad.picard.illumina.BustardFileParser;
import edu.mit.broad.picard.illumina.BustardFileParser_1_1;
import edu.mit.broad.picard.illumina.BustardReadData;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.secondarybase.FirecrestFileParser;
import org.broadinstitute.sting.secondarybase.FirecrestReadData;
import org.broadinstitute.sting.secondarybase.FourProb;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
import java.io.IOException;

View File

@ -1,7 +1,5 @@
package org.broadinstitute.sting.secondarybase;
import java.util.StringTokenizer;
public class FourIntensity {
private float[] fIntensities;

View File

@ -1,7 +1,7 @@
package org.broadinstitute.sting.secondarybase;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
/**
* FourProb represents four base hypotheses, their probabilities, and the ranking among one another.

View File

@ -5,11 +5,25 @@ import org.broadinstitute.sting.utils.QualityUtils;
import java.util.ArrayList;
/**
* FourProbRead contains the four-prob information for each base in a read.
*/
public class FourProbRead extends ArrayList<FourProb> {
/**
 * Creates an empty container whose backing list is pre-sized to the given capacity.
 *
 * @param initialCapacity the number of bases in the read
 */
public FourProbRead(int initialCapacity)
{
    super(initialCapacity);
}
/**
* Get the read sequence at a specified rank
*
* @param rank the rank of the sequence to return (0=best, 1=second-best, 2=third-best, 3=fourth-best)
* @return the read sequence at the specified rank
*/
public String getBaseSequenceAtGivenRank(int rank) {
String pseq = "";
@ -22,14 +36,28 @@ public class FourProbRead extends ArrayList<FourProb> {
return pseq;
}
/**
 * Get the primary read sequence, i.e. the sequence at rank 0.
 *
 * @return the primary read sequence
 */
public String getPrimaryBaseSequence() {
    final int primaryRank = 0;
    return getBaseSequenceAtGivenRank(primaryRank);
}
/**
 * Get the secondary read sequence, i.e. the sequence at rank 1.
 *
 * @return the secondary read sequence
 */
public String getSecondaryBaseSequence() {
    final int secondaryRank = 1;
    return getBaseSequenceAtGivenRank(secondaryRank);
}
/**
* Get the SAM spec-conformant SQ tag that will be written to the SAM/BAM file.
*
* @param rr the raw read
* @return the byte array for the SQ tag (first two bits: the base identity; last six bits: -10*log10(p3/p2))
*/
public byte[] getSQTag(RawRead rr) {
byte[] sqtag = new byte[this.size()];

View File

@ -1,19 +1,17 @@
package org.broadinstitute.sting.secondarybase;
import edu.mit.broad.picard.util.BasicTextFileParser;
import edu.mit.broad.picard.util.PasteParser;
import org.broadinstitute.sting.utils.StingException;
import java.io.Closeable;
import java.io.File;
import java.io.FilenameFilter;
import java.io.Closeable;
import java.io.IOException;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import edu.mit.broad.picard.util.PasteParser;
import edu.mit.broad.picard.util.BasicTextFileParser;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Closeable {
private File bustardDir;

View File

@ -1,15 +1,14 @@
package org.broadinstitute.sting.secondarybase;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.QualityUtils;
import java.io.File;
import net.sf.samtools.SAMFileReader;
import net.sf.samtools.SAMFileWriter;
import net.sf.samtools.SAMFileWriterFactory;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.cmdLine.Argument;
import org.broadinstitute.sting.utils.cmdLine.CommandLineProgram;
import java.io.File;
public class MatchSQTagToStrand extends CommandLineProgram {
public static MatchSQTagToStrand Instance = null;

View File

@ -1,7 +1,10 @@
package org.broadinstitute.sting.secondarybase;
import org.broadinstitute.sting.utils.BaseUtils;
/**
* RawRead represents lane and tile coordinates, raw intensities, read bases, and quality scores
*
* @author Kiran Garimella
*/
public class RawRead {
private byte lane;
private short tile;
@ -12,6 +15,14 @@ public class RawRead {
private byte[] quals;
private short[][] intensities;
/**
* Construct a raw read from the output of a PasteParser (in the order of int, seq, prb).
* Takes data within specified cycle ranges.
*
* @param pastedReadString the 3x(fragment length) output array from the PasteParser.
* @param cycleBegin the start cycle for the read (0-based, inclusive)
* @param cycleEnd the end cycle for the read (0-based, inclusive)
*/
public RawRead(String[][] pastedReadString, int cycleBegin, int cycleEnd) {
lane = Byte.valueOf(pastedReadString[0][0]);
tile = Short.valueOf(pastedReadString[0][1]);
@ -43,25 +54,96 @@ public class RawRead {
}
}
/**
 * Returns the lane this read came from.
 *
 * @return the lane number of the read
 */
public byte getLane() {
    return lane;
}
/**
 * Returns the tile this read came from. The value is stored as a short
 * and widened to an int on return.
 *
 * @return the tile number of the read
 */
public int getTile() {
    return tile;
}
/**
 * Returns the x-coordinate recorded for this read.
 *
 * @return the x-coordinate of the read
 */
public int getXCoordinate() {
    return x;
}
/**
 * Returns the y-coordinate recorded for this read.
 *
 * @return the y-coordinate of the read
 */
public int getYCoordinate() {
    return y;
}
/**
 * Builds a colon-separated key for this read from its lane, tile,
 * x-coordinate and y-coordinate, in that order.
 *
 * @return the read key, formatted as lane:tile:x:y
 */
public String getReadKey() {
    String key = String.format("%d:%d:%d:%d", lane, tile, x, y);
    return key;
}
/**
 * Returns the read sequence for the cycle range given to the constructor.
 * The internal array is returned directly, not a copy.
 *
 * @return the read sequence as a byte array
 */
public byte[] getSequence() {
    return sequence;
}
/**
 * Replaces the read sequence. The array is stored by reference; no copy is made.
 *
 * @param sequence the read sequence in byte array form
 */
public void setSequence(byte[] sequence) {
    this.sequence = sequence;
}
/**
 * Returns the read sequence as a string. The bytes from {@link #getSequence()}
 * are decoded with the platform's default charset.
 *
 * @return the read sequence in string form
 */
public String getSequenceAsString() {
    byte[] bases = getSequence();
    return new String(bases);
}
/**
 * Returns the quality scores of this read.
 * The internal array is returned directly, not a copy.
 *
 * @return a byte array of quals
 */
public byte[] getQuals() {
    return quals;
}
/**
 * Replaces the quality scores of this read. The array is stored by reference; no copy is made.
 *
 * @param quals a byte array of quals
 */
public void setQuals(byte[] quals) {
    this.quals = quals;
}
/**
 * Returns the raw intensities of this read.
 * The internal array is returned directly, not a copy.
 *
 * @return the (readLength)x(numChannels) array of raw intensities
 */
public short[][] getIntensities() {
    return intensities;
}
/**
 * Replaces the raw intensities of this read. The array is stored by reference; no copy is made.
 *
 * @param intensities the (readLength)x(numChannels) array of raw intensities
 */
public void setIntensities(short[][] intensities) {
    this.intensities = intensities;
}
/**
 * Returns the read length, taken as the length of the stored sequence array.
 *
 * @return the read length
 */
public int getReadLength() {
    return sequence.length;
}
}