Added the ability to specify the sorted, unaligned BAM and/or the sorted, aligned BAM so that broken computations can be restarted.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@805 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
454a6d1df7
commit
02c0afdb85
|
|
@ -49,6 +49,8 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
||||||
@Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
|
@Argument(fullName="sam_in", shortName="SI", doc="The file to use for training and annotation", required=false) public File SAM_IN;
|
||||||
@Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
|
@Argument(fullName="training_limit", shortName="T", doc="Number of reads to use for parameter initialization", required=false) public int TRAINING_LIMIT = 100000;
|
||||||
@Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
|
@Argument(fullName="calling_limit", shortName="C", doc="Number of reads to basecall", required=false) public int CALLING_LIMIT = Integer.MAX_VALUE;
|
||||||
|
@Argument(fullName="unaligned_sam", shortName="US", doc="Unaligned sam file, so we can skip making it", required=false) public File USAM;
|
||||||
|
@Argument(fullName="aligned_sam", shortName="AS", doc="Aligned, queryname-sorted sam file, so we can skip resorting it", required=false) public File ASAM;
|
||||||
|
|
||||||
public static void main(String[] argv) {
|
public static void main(String[] argv) {
|
||||||
Instance = new AnnotateSecondaryBase();
|
Instance = new AnnotateSecondaryBase();
|
||||||
|
|
@ -57,61 +59,71 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
||||||
|
|
||||||
protected int execute() {
|
protected int execute() {
|
||||||
ArrayList<Pair<Integer, Integer>> cycleRanges = getCycleRanges(CYCLE_RANGES);
|
ArrayList<Pair<Integer, Integer>> cycleRanges = getCycleRanges(CYCLE_RANGES);
|
||||||
|
File unalignedSam;
|
||||||
|
|
||||||
BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT);
|
if (USAM == null || !USAM.exists()) {
|
||||||
|
BasecallingTrainer trainer = new BasecallingTrainer(BUSTARD_DIR, LANE, TRAINING_LIMIT);
|
||||||
|
|
||||||
// Iterate through raw Firecrest data and store the first unambiguous N reads up to TRAINING_LIMIT
|
// Iterate through raw Firecrest data and store the first N reasonable reads up to TRAINING_LIMIT
|
||||||
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " unambiguous reads in the raw data...");
|
System.out.println("Loading training set from the first " + TRAINING_LIMIT + " reasonable reads in the raw data...");
|
||||||
trainer.loadFirstNUnambiguousReadsTrainingSet();
|
trainer.loadFirstNReasonableReadsTrainingSet();
|
||||||
|
|
||||||
// Iterate through the stored training data and add the info to the BasecallingReadModel
|
// Iterate through the stored training data and add the info to the BasecallingReadModel
|
||||||
System.out.println("Applying training set...");
|
System.out.println("Applying training set...");
|
||||||
BasecallingReadModel model = new BasecallingReadModel(trainer.getTrainingData());
|
BasecallingReadModel model = new BasecallingReadModel(trainer.getTrainingData());
|
||||||
|
|
||||||
// Call bases and write results
|
// Call bases and write results
|
||||||
System.out.println("Calling bases...");
|
System.out.println("Calling bases...");
|
||||||
|
|
||||||
SAMFileHeader sfh = new SAMFileHeader();
|
SAMFileHeader sfh = new SAMFileHeader();
|
||||||
sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
|
sfh.setSortOrder(SAMFileHeader.SortOrder.queryname);
|
||||||
|
|
||||||
File unalignedSam = (canAnnotate(SAM_IN)) ? getTempSAMFile("unaligned") : SAM_OUT;
|
|
||||||
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam);
|
|
||||||
|
|
||||||
IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE);
|
unalignedSam = (canAnnotate(SAM_IN)) ? getTempSAMFile("unaligned") : SAM_OUT;
|
||||||
|
SAMFileWriter sfw = new SAMFileWriterFactory().makeSAMOrBAMWriter(sfh, false, unalignedSam);
|
||||||
|
|
||||||
BasecallingStats bstats = new BasecallingStats();
|
IlluminaParser iparser = new IlluminaParser(BUSTARD_DIR, LANE);
|
||||||
|
|
||||||
|
BasecallingStats bstats = new BasecallingStats();
|
||||||
|
|
||||||
while (bstats.getReadsTotal() < CALLING_LIMIT && iparser.next()) {
|
while (bstats.getReadsTotal() < CALLING_LIMIT && iparser.next()) {
|
||||||
RawRead rr = iparser.getRawRead();
|
RawRead rr = iparser.getRawRead();
|
||||||
FourProbRead fpr = model.call(rr);
|
FourProbRead fpr = model.call(rr);
|
||||||
|
|
||||||
for (int cycleRangeIndex = 0; cycleRangeIndex < cycleRanges.size(); cycleRangeIndex++) {
|
for (int cycleRangeIndex = 0; cycleRangeIndex < cycleRanges.size(); cycleRangeIndex++) {
|
||||||
Pair<Integer, Integer> cycleRange = cycleRanges.get(cycleRangeIndex);
|
Pair<Integer, Integer> cycleRange = cycleRanges.get(cycleRangeIndex);
|
||||||
|
|
||||||
RawRead rrEnd = iparser.getSubset(cycleRange.getFirst(), cycleRange.getSecond());
|
RawRead rrEnd = iparser.getSubset(cycleRange.getFirst(), cycleRange.getSecond());
|
||||||
FourProbRead fprEnd = fpr.getSubset(cycleRange.getFirst(), cycleRange.getSecond());
|
FourProbRead fprEnd = fpr.getSubset(cycleRange.getFirst(), cycleRange.getSecond());
|
||||||
|
|
||||||
sfw.addAlignment(constructSAMRecord(rrEnd, fprEnd, sfh, RUN_BARCODE, cycleRanges.size() == 2, cycleRangeIndex == 1));
|
sfw.addAlignment(constructSAMRecord(rrEnd, fprEnd, sfh, RUN_BARCODE, cycleRanges.size() == 2, cycleRangeIndex == 1));
|
||||||
|
|
||||||
if (cycleRangeIndex == 0) {
|
if (cycleRangeIndex == 0) {
|
||||||
bstats.update(rrEnd, fprEnd);
|
bstats.update(rrEnd, fprEnd);
|
||||||
bstats.notifyOnInterval(5000);
|
bstats.notifyOnInterval(5000);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bstats.notifyNow();
|
||||||
|
|
||||||
|
iparser.close();
|
||||||
|
sfw.close();
|
||||||
|
} else {
|
||||||
|
unalignedSam = USAM;
|
||||||
}
|
}
|
||||||
|
|
||||||
bstats.notifyNow();
|
|
||||||
|
|
||||||
iparser.close();
|
|
||||||
sfw.close();
|
|
||||||
|
|
||||||
if (canAnnotate(SAM_IN)) {
|
if (canAnnotate(SAM_IN)) {
|
||||||
// If we're in annotation mode, annotate the aligned BAM file with the SQ tag
|
// If we're in annotation mode, annotate the aligned BAM file with the SQ tag
|
||||||
System.out.println("Annotating aligned SAM file...");
|
System.out.println("Annotating aligned SAM file...");
|
||||||
|
|
||||||
System.out.println(" sorting aligned SAM file by read name...");
|
File alignedSam;
|
||||||
File alignedSam = getTempSAMFile("aligned");
|
if (ASAM == null || !ASAM.exists()) {
|
||||||
sortBAMByReadName(SAM_IN, alignedSam);
|
System.out.println(" sorting aligned SAM file by read name...");
|
||||||
|
alignedSam = getTempSAMFile("aligned");
|
||||||
|
sortBAMByReadName(SAM_IN, alignedSam);
|
||||||
|
} else {
|
||||||
|
alignedSam = ASAM;
|
||||||
|
}
|
||||||
|
|
||||||
System.out.println(" merging unaligned and aligned SAM files...");
|
System.out.println(" merging unaligned and aligned SAM files...");
|
||||||
File mergedSam = SAM_OUT;
|
File mergedSam = SAM_OUT;
|
||||||
|
|
@ -266,21 +278,30 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
||||||
SAMRecord usr = usamIt.next();
|
SAMRecord usr = usamIt.next();
|
||||||
SAMRecord asr = asamIt.next();
|
SAMRecord asr = asamIt.next();
|
||||||
|
|
||||||
|
int annotatedRecords = 0;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
// Pull a record from the unaligned file and advance the aligned file until we find the matching record. We
|
// Pull a record from the unaligned file and advance the aligned file until we find the matching record. We
|
||||||
// don't have to advance the unaligned file until we find our record because we assume every record we generate
|
// don't have to advance the unaligned file until we find our record because we assume every record we generate
|
||||||
// will be in the aligned file (which also contains unaligned reads).
|
// will be in the aligned file (which also contains unaligned reads).
|
||||||
//
|
//
|
||||||
// If Picard ever stops storing the unaligned reads, this logic will need to be rewritten.
|
// If Picard ever stops storing the unaligned reads, this logic will need to be rewritten.
|
||||||
|
System.out.println(asr.getReadString());
|
||||||
|
System.out.println(BaseUtils.simpleReverseComplement(asr.getReadString()));
|
||||||
|
System.out.println();
|
||||||
|
|
||||||
if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) {
|
if (usr.getReadName().equals(asr.getReadName()) && usr.getFirstOfPairFlag() == asr.getFirstOfPairFlag()) {
|
||||||
byte[] sqtag = (byte[]) usr.getAttribute("SQ");
|
byte[] sqtag = (byte[]) usr.getAttribute("SQ");
|
||||||
String usrread = usr.getReadString();
|
String usrread = usr.getReadString();
|
||||||
String asrread = asr.getReadString();
|
String asrread = asr.getReadString();
|
||||||
|
|
||||||
|
System.out.println(asrread);
|
||||||
|
|
||||||
if (asr.getReadNegativeStrandFlag()) {
|
if (asr.getReadNegativeStrandFlag()) {
|
||||||
sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag);
|
sqtag = QualityUtils.reverseComplementCompressedQualityArray(sqtag);
|
||||||
asrread = BaseUtils.simpleReverseComplement(asrread);
|
asrread = BaseUtils.simpleReverseComplement(asrread);
|
||||||
|
|
||||||
|
System.out.println(asrread);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (usrread != null && asrread != null && !usrread.equals(asrread)) {
|
if (usrread != null && asrread != null && !usrread.equals(asrread)) {
|
||||||
|
|
@ -291,6 +312,9 @@ public class AnnotateSecondaryBase extends CommandLineProgram {
|
||||||
}
|
}
|
||||||
|
|
||||||
asr.setAttribute("SQ", sqtag);
|
asr.setAttribute("SQ", sqtag);
|
||||||
|
annotatedRecords++;
|
||||||
|
|
||||||
|
System.out.println("Annotated " + annotatedRecords + " records.");
|
||||||
|
|
||||||
usr = usamIt.next();
|
usr = usamIt.next();
|
||||||
} else {
|
} else {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue