From c3aaca1262104836181519d25ac330af99b85ba5 Mon Sep 17 00:00:00 2001 From: kiran Date: Thu, 3 Sep 2009 17:20:16 +0000 Subject: [PATCH] Improvements to make this work with uncompressed fastq files. Pulled the fastq parser out into its own SAMFileReader-like entity. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1520 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/playground/tools/FastqToBam.java | 112 +++--------------- .../sting/utils/fastq/FastqReader.java | 72 +++++++++++ .../sting/utils/fastq/FastqRecord.java | 45 +++++++ 3 files changed, 135 insertions(+), 94 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/utils/fastq/FastqReader.java create mode 100755 java/src/org/broadinstitute/sting/utils/fastq/FastqRecord.java diff --git a/java/src/org/broadinstitute/sting/playground/tools/FastqToBam.java b/java/src/org/broadinstitute/sting/playground/tools/FastqToBam.java index e4d06cad5..509ec887b 100755 --- a/java/src/org/broadinstitute/sting/playground/tools/FastqToBam.java +++ b/java/src/org/broadinstitute/sting/playground/tools/FastqToBam.java @@ -6,106 +6,30 @@ import net.sf.picard.cmdline.Option; import net.sf.samtools.*; import java.io.*; -import java.util.zip.GZIPInputStream; -import java.util.Iterator; -import org.broadinstitute.sting.utils.StingException; - -class FastqRecord { - private String seqHeader; - private String seqLine; - private String qualHeader; - private String qualLine; - - private String accessionName; - private String readName; - private String runName; - - public FastqRecord(BufferedReader in) { - try { - if (in.ready()) { - seqHeader = in.readLine(); - seqLine = in.readLine(); - qualHeader = in.readLine(); - qualLine = in.readLine(); - - String[] seqHeaderPieces = seqHeader.split("\\s+"); - accessionName = seqHeaderPieces[0]; - readName = seqHeaderPieces[1]; - - String[] readNamePieces = readName.split(":"); - runName = readNamePieces[0]; - } - } catch (IOException e) { - throw new
StingException("Could not read from fastq file."); - } - } - - public String getReadName() { return readName; } - public String getRunName() { return runName; } - public String getReadString() { return seqLine; } - public String getBaseQualityString() { return qualLine; } - - public String toString() { - return String.format("%s %s : %s : %s", accessionName, readName, seqLine, qualLine); - } -} - -class FastqReader implements Iterator, Iterable { - private BufferedReader in; - private FastqRecord nextRecord; - private String runName; - - public FastqReader(File file) { - try { - in = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)))); - - nextRecord = new FastqRecord(in); - runName = nextRecord.getRunName(); - } catch (IOException e) { - throw new StingException("IO problem"); - } - } - - public String getRunName() { return runName; } - - public boolean hasNext() { return nextRecord != null; } - - public FastqRecord next() { - FastqRecord rec = nextRecord; - - try { - if (in.ready()) { - nextRecord = new FastqRecord(in); - } else { - nextRecord = null; - } - } catch (IOException e) { - throw new StingException("IO problem"); - } - - return rec; - } - - public void remove() { throw new UnsupportedOperationException("Unsupported operation"); } - - public Iterator iterator() { return this; } -} +import org.broadinstitute.sting.utils.fastq.FastqReader; +import org.broadinstitute.sting.utils.fastq.FastqRecord; public class FastqToBam extends CommandLineProgram { @Usage(programVersion="1.0") public String USAGE = "Extracts read sequences and qualities from the input fastq file and writes them into the output file in unaligned BAM format."; @Option(shortName="I1", doc="Input file (fastq.gz) to extract reads from (single-end fastq or, if paired, first end of the pair fastq).", optional=false) public File IN1 = null; @Option(shortName="I2", doc="Input file (fastq.gz) to extract reads from (if paired, second end of the pair fastq).", 
optional=true) public File IN2 = null; - @Option(shortName="O", doc="Output file (bam).", optional=false) public File OUT = null; + @Option(shortName="O", doc="Output file (bam).", optional=false) public File OUT = null; + @Option(shortName="RB", doc="Run barcode", optional=false) public String RUN_BARCODE; @Option(shortName="RG", doc="Read group name", optional=false) public String READ_GROUP_NAME; @Option(shortName="SM", doc="Sample name", optional=false) public String SAMPLE_NAME; - @Option(shortName="V", doc="Verbose mode", optional=true) public Boolean VERBOSE = false; + @Option(shortName="V", doc="Verbose mode", optional=true) public Boolean VERBOSE = false; public static void main(final String[] argv) { System.exit(new FastqToBam().instanceMain(argv)); } + private String getReadName(String fqrHeader) { + String[] headerPieces = fqrHeader.split("\\s+"); + return headerPieces[0]; + } + protected int doWork() { FastqReader end1 = new FastqReader(IN1); FastqReader end2 = (IN2 == null) ? null : new FastqReader(IN2); @@ -124,12 +48,14 @@ public class FastqToBam extends CommandLineProgram { FastqRecord fqr1 = end1.next(); FastqRecord fqr2 = (end2 == null) ? 
null : end2.next(); - if (fqr2 != null && !fqr1.getReadName().equalsIgnoreCase(fqr2.getReadName())) { + String fqr1Name = getReadName(fqr1.getReadHeader()); + + //if (fqr2 != null && !fqr1Name.equalsIgnoreCase(fqr2Name)) { //throw new StingException(String.format("In paired mode, but end 1 read name (%s) does not match end 2 read name (%s)", fqr1.getReadName(), fqr2.getReadName())); - } + //} SAMRecord sr1 = new SAMRecord(sfh); - sr1.setReadName(fqr1.getReadName()); + sr1.setReadName(RUN_BARCODE + ":" + fqr1Name); sr1.setReadString(fqr1.getReadString()); sr1.setBaseQualityString(fqr1.getBaseQualityString()); sr1.setReadUmappedFlag(true); @@ -144,17 +70,17 @@ public class FastqToBam extends CommandLineProgram { sr1.setSecondOfPairFlag(false); sr1.setMateUnmappedFlag(true); + String fqr2Name = getReadName(fqr2.getReadHeader()); sr2 = new SAMRecord(sfh); - - sr2.setReadName(fqr2.getReadName()); + sr2.setReadName(RUN_BARCODE + ":" + fqr2Name); sr2.setReadString(fqr2.getReadString()); sr2.setBaseQualityString(fqr2.getBaseQualityString()); sr2.setReadUmappedFlag(true); sr2.setReadPairedFlag(true); + sr2.setAttribute("RG", READ_GROUP_NAME); sr2.setFirstOfPairFlag(false); sr2.setSecondOfPairFlag(false); sr2.setMateUnmappedFlag(true); - sr2.setAttribute("RG", READ_GROUP_NAME); } sfw.addAlignment(sr1); @@ -164,8 +90,6 @@ public class FastqToBam extends CommandLineProgram { readsSeen++; if (VERBOSE) { - //System.out.println(fqr1); - //if (fqr2 != null) { System.out.println(fqr2); } System.out.println(sr1.format()); if (fqr2 != null) { System.out.println(sr2.format()); } } diff --git a/java/src/org/broadinstitute/sting/utils/fastq/FastqReader.java b/java/src/org/broadinstitute/sting/utils/fastq/FastqReader.java new file mode 100755 index 000000000..2dabbdbe8 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/fastq/FastqReader.java @@ -0,0 +1,72 @@ +package org.broadinstitute.sting.utils.fastq; + +import org.broadinstitute.sting.utils.StingException; + +import 
java.util.Iterator; +import java.util.zip.GZIPInputStream; +import java.io.*; + +public class FastqReader implements Iterator, Iterable, Closeable { + private File fastqFile; + private BufferedReader in; + private FastqRecord nextRecord; + + public FastqReader(File file) { + fastqFile = file; + + try { + if (fastqFile.getName().endsWith(".gz")) { + in = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(fastqFile)))); + } else { + in = new BufferedReader(new InputStreamReader(new FileInputStream(fastqFile))); + } + + nextRecord = readNextRecord(); + } catch (IOException e) { + throw new StingException(String.format("Error opening '%s'", fastqFile.getAbsolutePath())); + } + } + + private FastqRecord readNextRecord() { + try { + String seqHeader = in.readLine(); + String seqLine = in.readLine(); + String qualHeader = in.readLine(); + String qualLine = in.readLine(); + + return new FastqRecord(seqHeader, seqLine, qualHeader, qualLine); + } catch (IOException e) { + throw new StingException(String.format("Error reading '%s'", fastqFile.getAbsolutePath())); + } + } + + public boolean hasNext() { return nextRecord != null; } + + public FastqRecord next() { + FastqRecord rec = nextRecord; + + try { + if (in.ready()) { + nextRecord = readNextRecord(); + } else { + nextRecord = null; + } + } catch (IOException e) { + throw new StingException("IO problem"); + } + + return rec; + } + + public void remove() { throw new UnsupportedOperationException("Unsupported operation"); } + + public Iterator iterator() { return this; } + + public void close() { + try { + in.close(); + } catch (IOException e) { + + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/fastq/FastqRecord.java b/java/src/org/broadinstitute/sting/utils/fastq/FastqRecord.java new file mode 100755 index 000000000..ebf3396e7 --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/fastq/FastqRecord.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.fastq; + 
+import org.broadinstitute.sting.utils.StingException; + +import java.io.BufferedReader; +import java.io.IOException; + +public class FastqRecord { + private String seqHeader; + private String seqLine; + private String qualHeader; + private String qualLine; + + public FastqRecord(String seqHeader, String seqLine, String qualHeader, String qualLine) { + setReadHeader(seqHeader); + setReadString(seqLine); + setBaseQualityHeader(qualHeader); + setBaseQualityString(qualLine); + } + + public void setReadHeader(String seqHeader) { + this.seqHeader = seqHeader.replaceFirst("@", ""); + } + + public void setReadString(String seqLine) { this.seqLine = seqLine; } + + public void setBaseQualityHeader(String qualHeader) { + this.qualHeader = qualHeader.replaceFirst("\\+", ""); + } + + public void setBaseQualityString(String qualLine) { this.qualLine = qualLine; } + + public String getReadHeader() { return seqHeader; } + public String getReadString() { return seqLine; } + public String getBaseQualityHeader() { return qualHeader; } + public String getBaseQualityString() { return qualLine; } + + public String format() { + return String.format("@%s\n%s\n+%s\n%s", seqHeader, seqLine, qualHeader, qualLine); + } + + public String toString() { + return String.format("%s : %s %s", seqHeader, seqLine, qualLine); + } +}