Improvements to make this work with uncompressed fastq files. Pulled the fastq parser out into its own SAMFileReader-like entity.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1520 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
499b3536a4
commit
c3aaca1262
|
|
@ -6,106 +6,30 @@ import net.sf.picard.cmdline.Option;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.util.zip.GZIPInputStream;
|
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
import org.broadinstitute.sting.utils.StingException;
|
import org.broadinstitute.sting.utils.fastq.FastqReader;
|
||||||
|
import org.broadinstitute.sting.utils.fastq.FastqRecord;
|
||||||
class FastqRecord {
|
|
||||||
private String seqHeader;
|
|
||||||
private String seqLine;
|
|
||||||
private String qualHeader;
|
|
||||||
private String qualLine;
|
|
||||||
|
|
||||||
private String accessionName;
|
|
||||||
private String readName;
|
|
||||||
private String runName;
|
|
||||||
|
|
||||||
public FastqRecord(BufferedReader in) {
|
|
||||||
try {
|
|
||||||
if (in.ready()) {
|
|
||||||
seqHeader = in.readLine();
|
|
||||||
seqLine = in.readLine();
|
|
||||||
qualHeader = in.readLine();
|
|
||||||
qualLine = in.readLine();
|
|
||||||
|
|
||||||
String[] seqHeaderPieces = seqHeader.split("\\s+");
|
|
||||||
accessionName = seqHeaderPieces[0];
|
|
||||||
readName = seqHeaderPieces[1];
|
|
||||||
|
|
||||||
String[] readNamePieces = readName.split(":");
|
|
||||||
runName = readNamePieces[0];
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new StingException("Could not read from fastq file.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getReadName() { return readName; }
|
|
||||||
public String getRunName() { return runName; }
|
|
||||||
public String getReadString() { return seqLine; }
|
|
||||||
public String getBaseQualityString() { return qualLine; }
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return String.format("%s %s : %s : %s", accessionName, readName, seqLine, qualLine);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class FastqReader implements Iterator<FastqRecord>, Iterable<FastqRecord> {
|
|
||||||
private BufferedReader in;
|
|
||||||
private FastqRecord nextRecord;
|
|
||||||
private String runName;
|
|
||||||
|
|
||||||
public FastqReader(File file) {
|
|
||||||
try {
|
|
||||||
in = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
|
|
||||||
|
|
||||||
nextRecord = new FastqRecord(in);
|
|
||||||
runName = nextRecord.getRunName();
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new StingException("IO problem");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getRunName() { return runName; }
|
|
||||||
|
|
||||||
public boolean hasNext() { return nextRecord != null; }
|
|
||||||
|
|
||||||
public FastqRecord next() {
|
|
||||||
FastqRecord rec = nextRecord;
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (in.ready()) {
|
|
||||||
nextRecord = new FastqRecord(in);
|
|
||||||
} else {
|
|
||||||
nextRecord = null;
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new StingException("IO problem");
|
|
||||||
}
|
|
||||||
|
|
||||||
return rec;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void remove() { throw new UnsupportedOperationException("Unsupported operation"); }
|
|
||||||
|
|
||||||
public Iterator<FastqRecord> iterator() { return this; }
|
|
||||||
}
|
|
||||||
|
|
||||||
public class FastqToBam extends CommandLineProgram {
|
public class FastqToBam extends CommandLineProgram {
|
||||||
@Usage(programVersion="1.0") public String USAGE = "Extracts read sequences and qualities from the input fastq file and writes them into the output file in unaligned BAM format.";
|
@Usage(programVersion="1.0") public String USAGE = "Extracts read sequences and qualities from the input fastq file and writes them into the output file in unaligned BAM format.";
|
||||||
|
|
||||||
@Option(shortName="I1", doc="Input file (fastq.gz) to extract reads from (single-end fastq or, if paired, first end of the pair fastq).", optional=false) public File IN1 = null;
|
@Option(shortName="I1", doc="Input file (fastq.gz) to extract reads from (single-end fastq or, if paired, first end of the pair fastq).", optional=false) public File IN1 = null;
|
||||||
@Option(shortName="I2", doc="Input file (fastq.gz) to extract reads from (if paired, second end of the pair fastq).", optional=true) public File IN2 = null;
|
@Option(shortName="I2", doc="Input file (fastq.gz) to extract reads from (if paired, second end of the pair fastq).", optional=true) public File IN2 = null;
|
||||||
@Option(shortName="O", doc="Output file (bam).", optional=false) public File OUT = null;
|
@Option(shortName="O", doc="Output file (bam).", optional=false) public File OUT = null;
|
||||||
|
@Option(shortName="RB", doc="Run barcode", optional=false) public String RUN_BARCODE;
|
||||||
@Option(shortName="RG", doc="Read group name", optional=false) public String READ_GROUP_NAME;
|
@Option(shortName="RG", doc="Read group name", optional=false) public String READ_GROUP_NAME;
|
||||||
@Option(shortName="SM", doc="Sample name", optional=false) public String SAMPLE_NAME;
|
@Option(shortName="SM", doc="Sample name", optional=false) public String SAMPLE_NAME;
|
||||||
@Option(shortName="V", doc="Verbose mode", optional=true) public Boolean VERBOSE = false;
|
@Option(shortName="V", doc="Verbose mode", optional=true) public Boolean VERBOSE = false;
|
||||||
|
|
||||||
public static void main(final String[] argv) {
|
public static void main(final String[] argv) {
|
||||||
System.exit(new FastqToBam().instanceMain(argv));
|
System.exit(new FastqToBam().instanceMain(argv));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String getReadName(String fqrHeader) {
|
||||||
|
String[] headerPieces = fqrHeader.split("\\s+");
|
||||||
|
return headerPieces[0];
|
||||||
|
}
|
||||||
|
|
||||||
protected int doWork() {
|
protected int doWork() {
|
||||||
FastqReader end1 = new FastqReader(IN1);
|
FastqReader end1 = new FastqReader(IN1);
|
||||||
FastqReader end2 = (IN2 == null) ? null : new FastqReader(IN2);
|
FastqReader end2 = (IN2 == null) ? null : new FastqReader(IN2);
|
||||||
|
|
@ -124,12 +48,14 @@ public class FastqToBam extends CommandLineProgram {
|
||||||
FastqRecord fqr1 = end1.next();
|
FastqRecord fqr1 = end1.next();
|
||||||
FastqRecord fqr2 = (end2 == null) ? null : end2.next();
|
FastqRecord fqr2 = (end2 == null) ? null : end2.next();
|
||||||
|
|
||||||
if (fqr2 != null && !fqr1.getReadName().equalsIgnoreCase(fqr2.getReadName())) {
|
String fqr1Name = getReadName(fqr1.getReadHeader());
|
||||||
|
|
||||||
|
//if (fqr2 != null && !fqr1Name.equalsIgnoreCase(fqr2Name)) {
|
||||||
//throw new StingException(String.format("In paired mode, but end 1 read name (%s) does not match end 2 read name (%s)", fqr1.getReadName(), fqr2.getReadName()));
|
//throw new StingException(String.format("In paired mode, but end 1 read name (%s) does not match end 2 read name (%s)", fqr1.getReadName(), fqr2.getReadName()));
|
||||||
}
|
//}
|
||||||
|
|
||||||
SAMRecord sr1 = new SAMRecord(sfh);
|
SAMRecord sr1 = new SAMRecord(sfh);
|
||||||
sr1.setReadName(fqr1.getReadName());
|
sr1.setReadName(RUN_BARCODE + ":" + fqr1Name);
|
||||||
sr1.setReadString(fqr1.getReadString());
|
sr1.setReadString(fqr1.getReadString());
|
||||||
sr1.setBaseQualityString(fqr1.getBaseQualityString());
|
sr1.setBaseQualityString(fqr1.getBaseQualityString());
|
||||||
sr1.setReadUmappedFlag(true);
|
sr1.setReadUmappedFlag(true);
|
||||||
|
|
@ -144,17 +70,17 @@ public class FastqToBam extends CommandLineProgram {
|
||||||
sr1.setSecondOfPairFlag(false);
|
sr1.setSecondOfPairFlag(false);
|
||||||
sr1.setMateUnmappedFlag(true);
|
sr1.setMateUnmappedFlag(true);
|
||||||
|
|
||||||
|
String fqr2Name = getReadName(fqr2.getReadHeader());
|
||||||
sr2 = new SAMRecord(sfh);
|
sr2 = new SAMRecord(sfh);
|
||||||
|
sr2.setReadName(RUN_BARCODE + ":" + fqr2Name);
|
||||||
sr2.setReadName(fqr2.getReadName());
|
|
||||||
sr2.setReadString(fqr2.getReadString());
|
sr2.setReadString(fqr2.getReadString());
|
||||||
sr2.setBaseQualityString(fqr2.getBaseQualityString());
|
sr2.setBaseQualityString(fqr2.getBaseQualityString());
|
||||||
sr2.setReadUmappedFlag(true);
|
sr2.setReadUmappedFlag(true);
|
||||||
sr2.setReadPairedFlag(true);
|
sr2.setReadPairedFlag(true);
|
||||||
|
sr2.setAttribute("RG", READ_GROUP_NAME);
|
||||||
sr2.setFirstOfPairFlag(false);
|
sr2.setFirstOfPairFlag(false);
|
||||||
sr2.setSecondOfPairFlag(false);
|
sr2.setSecondOfPairFlag(false);
|
||||||
sr2.setMateUnmappedFlag(true);
|
sr2.setMateUnmappedFlag(true);
|
||||||
sr2.setAttribute("RG", READ_GROUP_NAME);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sfw.addAlignment(sr1);
|
sfw.addAlignment(sr1);
|
||||||
|
|
@ -164,8 +90,6 @@ public class FastqToBam extends CommandLineProgram {
|
||||||
readsSeen++;
|
readsSeen++;
|
||||||
|
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
//System.out.println(fqr1);
|
|
||||||
//if (fqr2 != null) { System.out.println(fqr2); }
|
|
||||||
System.out.println(sr1.format());
|
System.out.println(sr1.format());
|
||||||
if (fqr2 != null) { System.out.println(sr2.format()); }
|
if (fqr2 != null) { System.out.println(sr2.format()); }
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,72 @@
|
||||||
|
package org.broadinstitute.sting.utils.fastq;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.zip.GZIPInputStream;
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
public class FastqReader implements Iterator<FastqRecord>, Iterable<FastqRecord>, Closeable {
|
||||||
|
private File fastqFile;
|
||||||
|
private BufferedReader in;
|
||||||
|
private FastqRecord nextRecord;
|
||||||
|
|
||||||
|
public FastqReader(File file) {
|
||||||
|
fastqFile = file;
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (fastqFile.getName().endsWith(".gz")) {
|
||||||
|
in = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(fastqFile))));
|
||||||
|
} else {
|
||||||
|
in = new BufferedReader(new InputStreamReader(new FileInputStream(fastqFile)));
|
||||||
|
}
|
||||||
|
|
||||||
|
nextRecord = readNextRecord();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new StingException(String.format("Error opening '%s'", fastqFile.getAbsolutePath()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private FastqRecord readNextRecord() {
|
||||||
|
try {
|
||||||
|
String seqHeader = in.readLine();
|
||||||
|
String seqLine = in.readLine();
|
||||||
|
String qualHeader = in.readLine();
|
||||||
|
String qualLine = in.readLine();
|
||||||
|
|
||||||
|
return new FastqRecord(seqHeader, seqLine, qualHeader, qualLine);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new StingException(String.format("Error reading '%s'", fastqFile.getAbsolutePath()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasNext() { return nextRecord != null; }
|
||||||
|
|
||||||
|
public FastqRecord next() {
|
||||||
|
FastqRecord rec = nextRecord;
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (in.ready()) {
|
||||||
|
nextRecord = readNextRecord();
|
||||||
|
} else {
|
||||||
|
nextRecord = null;
|
||||||
|
}
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new StingException("IO problem");
|
||||||
|
}
|
||||||
|
|
||||||
|
return rec;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void remove() { throw new UnsupportedOperationException("Unsupported operation"); }
|
||||||
|
|
||||||
|
public Iterator<FastqRecord> iterator() { return this; }
|
||||||
|
|
||||||
|
public void close() {
|
||||||
|
try {
|
||||||
|
in.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,45 @@
|
||||||
|
package org.broadinstitute.sting.utils.fastq;
|
||||||
|
|
||||||
|
import org.broadinstitute.sting.utils.StingException;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
 * One four-line FASTQ record: a sequence header, the read bases, a quality
 * header and the base-quality string.
 *
 * The headers are stored without their leading marker characters ('@' for the
 * sequence header, '+' for the quality header); {@link #format()} adds them back.
 */
public class FastqRecord {
    private String seqHeader;
    private String seqLine;
    private String qualHeader;
    private String qualLine;

    /**
     * Creates a record from the four lines of a FASTQ entry.
     *
     * @param seqHeader  sequence header line, with or without the leading '@'
     * @param seqLine    read bases
     * @param qualHeader quality header line, with or without the leading '+'
     * @param qualLine   base-quality string
     * @throws NullPointerException if either header is null
     */
    public FastqRecord(String seqHeader, String seqLine, String qualHeader, String qualLine) {
        setReadHeader(seqHeader);
        setReadString(seqLine);
        setBaseQualityHeader(qualHeader);
        setBaseQualityString(qualLine);
    }

    /**
     * Stores the sequence header, dropping a single leading '@' if present.
     * Any '@' elsewhere in the header is part of the name and is kept.
     *
     * @param seqHeader sequence header line
     * @throws NullPointerException if seqHeader is null
     */
    public void setReadHeader(String seqHeader) {
        this.seqHeader = stripLeadingMarker(seqHeader, '@', "seqHeader");
    }

    /** @param seqLine read bases, stored as given */
    public void setReadString(String seqLine) { this.seqLine = seqLine; }

    /**
     * Stores the quality header, dropping a single leading '+' if present.
     * Any '+' elsewhere in the header is kept.
     *
     * @param qualHeader quality header line
     * @throws NullPointerException if qualHeader is null
     */
    public void setBaseQualityHeader(String qualHeader) {
        this.qualHeader = stripLeadingMarker(qualHeader, '+', "qualHeader");
    }

    /** @param qualLine base-quality string, stored as given */
    public void setBaseQualityString(String qualLine) { this.qualLine = qualLine; }

    /** @return the sequence header without its leading '@' */
    public String getReadHeader() { return seqHeader; }

    /** @return the read bases */
    public String getReadString() { return seqLine; }

    /** @return the quality header without its leading '+' */
    public String getBaseQualityHeader() { return qualHeader; }

    /** @return the base-quality string */
    public String getBaseQualityString() { return qualLine; }

    /**
     * @return the record as four newline-separated FASTQ lines, with the '@' and '+'
     *         markers restored and no trailing newline
     */
    public String format() {
        return String.format("@%s\n%s\n+%s\n%s", seqHeader, seqLine, qualHeader, qualLine);
    }

    /** @return a one-line summary: header, bases and qualities */
    public String toString() {
        return String.format("%s : %s %s", seqHeader, seqLine, qualLine);
    }

    /**
     * Removes the marker only when it is the first character of the line, so that
     * the same character occurring later in the text is never touched.
     */
    private static String stripLeadingMarker(String line, char marker, String paramName) {
        if (line == null) {
            throw new NullPointerException(paramName + " must not be null");
        }

        if (line.length() > 0 && line.charAt(0) == marker) {
            return line.substring(1);
        }

        return line;
    }
}
|
||||||
Loading…
Reference in New Issue