/*
 * Provenance: reconstructed from the patch that introduced this file.
 *   Commit:  fdd123fe162eb8e4948379ab92b087bf5f0a7e8e
 *   Author:  kiran
 *   Date:    Thu, 14 May 2009 16:53:07 +0000
 *   Subject: A parser for the raw Illumina data. Allows one to arbitrarily jump
 *            from one tile to another.
 *   git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@703
 *               348d0f76-0448-11de-a6fe-93d51630548a
 *   Path:    java/src/org/broadinstitute/sting/secondarybase/IlluminaParser.java
 */
package org.broadinstitute.sting.secondarybase;

import org.broadinstitute.sting.utils.StingException;

import java.io.File;
import java.io.FilenameFilter;
import java.io.Closeable;
import java.io.IOException;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;

import edu.mit.broad.picard.util.PasteParser;
import edu.mit.broad.picard.util.BasicTextFileParser;

/**
 * A parser for raw Illumina data that walks the per-tile {@code int}, {@code seq} and
 * {@code prb} text files of one lane and allows jumping from one tile to another.
 *
 * <p>For every tile the three matching files are read side by side through a
 * {@link PasteParser}, and each pasted row is wrapped in a {@link RawRead}.
 *
 * <p>Tiles are addressed by their <em>1-based</em> position in the tile-number-sorted file
 * lists (see {@link #seekToTile(int)}). The object is its own iterator, so it supports a
 * single pass only. It is not designed for concurrent use.
 */
public class IlluminaParser implements Iterator<RawRead>, Iterable<RawRead>, Closeable {
    private final File bustardDir;
    private final File firecrestDir;
    private final int lane;
    private final int cycleBegin;
    private final int cycleEnd;

    // Per-tile files, each sorted by tile number so that index i refers to the same tile in all three.
    private final File[] intfiles;
    private final File[] seqfiles;
    private final File[] prbfiles;

    /** Zero-based index (into the sorted file arrays) of the tile currently open; -1 if none. */
    private int currentTileIndex = -1;

    /** Parser over the currently open tile; null when there are no tiles or after {@link #close()}. */
    private PasteParser currentTileParser;

    private boolean iterating = false;

    /**
     * Creates a parser that looks for the {@code int} files in the parent directory of
     * {@code bustardDir}.
     *
     * @param bustardDir directory containing the {@code seq} and {@code prb} files
     * @param lane       lane number used to select files named {@code s_<lane>_<tile>_...}
     * @param cycleBegin passed through unchanged to every {@link RawRead}
     * @param cycleEnd   passed through unchanged to every {@link RawRead}
     * @throws StingException if a directory cannot be listed or the file lists are inconsistent
     */
    public IlluminaParser(File bustardDir, int lane, int cycleBegin, int cycleEnd) {
        this(bustardDir, bustardDir.getParentFile(), lane, cycleBegin, cycleEnd);
    }

    /**
     * Creates a parser with explicitly specified directories and opens the first tile, if any.
     *
     * @param bustardDir   directory containing the {@code seq} and {@code prb} files
     * @param firecrestDir directory containing the {@code int} files
     * @param lane         lane number used to select files named {@code s_<lane>_<tile>_...}
     * @param cycleBegin   passed through unchanged to every {@link RawRead}
     * @param cycleEnd     passed through unchanged to every {@link RawRead}
     * @throws StingException if a directory is null or cannot be listed, if the three file
     *                        lists differ in length, or if a file name carries no tile number
     */
    public IlluminaParser(File bustardDir, File firecrestDir, int lane, int cycleBegin, int cycleEnd) {
        if (bustardDir == null || firecrestDir == null) {
            throw new StingException(
                    String.format("Bustard directory (%s) and Firecrest directory (%s) must both be specified",
                                  bustardDir, firecrestDir));
        }

        this.bustardDir = bustardDir;
        this.firecrestDir = firecrestDir;
        this.lane = lane;
        this.cycleBegin = cycleBegin;
        this.cycleEnd = cycleEnd;

        this.intfiles = listTileFiles(firecrestDir, "int");
        this.seqfiles = listTileFiles(bustardDir, "seq");
        this.prbfiles = listTileFiles(bustardDir, "prb");

        if (intfiles.length != seqfiles.length || intfiles.length != prbfiles.length) {
            throw new StingException(
                    String.format("File list lengths are unequal (int:%d, seq:%d, prb:%d)",
                                  intfiles.length,
                                  seqfiles.length,
                                  prbfiles.length)
            );
        }

        Comparator<File> byTile = getTileSortingComparator();
        Arrays.sort(intfiles, byTile);
        Arrays.sort(seqfiles, byTile);
        Arrays.sort(prbfiles, byTile);

        // Position on the first tile right away so that next()/hasNext() work without an explicit
        // call to iterator(). The private helper is used (rather than iterator()) so that the
        // "already iterating" flag stays clear for the caller's own iterator() call.
        if (intfiles.length > 0) {
            openTile(0);
        }

        // Todo: put some more consistency checks here
    }

    /**
     * Lists the files in {@code dir} that belong to this lane and carry the given suffix.
     *
     * @throws StingException if {@code dir} cannot be listed (File.listFiles returned null,
     *                        e.g. the path does not exist or is not a directory)
     */
    private File[] listTileFiles(File dir, String suffix) {
        File[] files = dir.listFiles(getFilenameFilter(suffix));
        if (files == null) {
            throw new StingException(
                    String.format("Unable to list '%s' files in directory '%s'", suffix, dir.getAbsolutePath()));
        }
        return files;
    }

    /**
     * Builds a filter accepting names that contain {@code s_<lane>_<digits>_<suffix>.txt},
     * optionally followed by {@code .gz}.
     */
    private FilenameFilter getFilenameFilter(final String suffix) {
        // Compiled once per filter instead of once per candidate file.
        // NOTE(review): the match is unanchored (find()), and the "old" look-ahead only rejects
        // names with at least one character on both sides of "old" after ".txt" -- confirm this
        // is the intended exclusion rule.
        final Pattern pseq = Pattern.compile(String.format("s_%d_\\d+_%s\\.txt(?!.+old.+)(\\.gz)?", lane, suffix));

        return new FilenameFilter() {
            public boolean accept(File file, String s) {
                return pseq.matcher(s).find();
            }
        };
    }

    /**
     * Builds a comparator ordering files by the numeric tile field of {@code s_<lane>_<tile>_}.
     * The comparator throws {@link StingException} if either file name lacks that field.
     */
    private Comparator<File> getTileSortingComparator() {
        // Compiled once per comparator instead of once per comparison.
        final Pattern ptile = Pattern.compile(String.format("s_%d_(\\d+)_", lane));

        return new Comparator<File>() {
            public int compare(File file1, File file2) {
                Matcher mtile1 = ptile.matcher(file1.getName());
                Matcher mtile2 = ptile.matcher(file2.getName());

                if (mtile1.find() && mtile2.find()) {
                    int tile1 = Integer.parseInt(mtile1.group(1));
                    int tile2 = Integer.parseInt(mtile2.group(1));

                    if (tile1 < tile2) { return -1; }
                    if (tile1 > tile2) { return 1; }
                    return 0;
                }

                throw new StingException("Tile filenames ('" + file1.getName() + "' or '" + file2.getName() + "') did not match against regexp pattern ('" + ptile.pattern() + "')");
            }
        };
    }

    /**
     * @return the number of tiles found for this lane
     */
    public int numTiles() { return intfiles.length; }

    /**
     * Jumps to the start of the given tile.
     *
     * @param tile 1-based position of the tile in tile-number order; valid values are
     *             {@code 1..numTiles()}
     * @return true if the tile was opened; false if {@code tile} is out of range, in which
     *         case the current position is left untouched
     */
    public boolean seekToTile(int tile) {
        if (tile < 1 || tile > intfiles.length) {
            return false;
        }

        openTile(tile - 1);
        return true;
    }

    /**
     * Closes the parser of the current tile (if any) and opens the tile at the given
     * zero-based index. The caller guarantees that the index is in range.
     */
    private void openTile(int tileIndex) {
        // Release the previous tile's parser before moving on, so that handles are not
        // accumulated across the tiles of a lane.
        close();

        currentTileIndex = tileIndex;

        BasicTextFileParser intparser = new BasicTextFileParser(true, intfiles[tileIndex]);
        BasicTextFileParser seqparser = new BasicTextFileParser(true, seqfiles[tileIndex]);
        BasicTextFileParser prbparser = new BasicTextFileParser(true, prbfiles[tileIndex]);

        currentTileParser = new PasteParser(intparser, seqparser, prbparser);
    }

    /**
     * Reports whether another read is available, transparently advancing to the following
     * tile(s) when the current one is exhausted.
     *
     * @return true if {@link #next()} will return a read; false at the end of the last tile,
     *         when there are no tiles, or after {@link #close()}
     */
    public boolean hasNext() {
        if (currentTileParser == null) {
            return false;
        }

        // Loop rather than test once: a freshly opened tile may itself be empty.
        while (!currentTileParser.hasNext()) {
            // currentTileIndex is zero-based while seekToTile() is one-based, so the tile
            // after the current one is number (currentTileIndex + 2).
            if (!seekToTile(currentTileIndex + 2)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Returns the next read, built from the next pasted row of the current tile.
     *
     * @return the next read, or null when no reads remain (kept for backward compatibility
     *         with callers that test for null instead of calling {@link #hasNext()})
     */
    public RawRead next() {
        if (hasNext()) {
            return new RawRead(currentTileParser.next(), cycleBegin, cycleEnd);
        }

        return null;
    }

    /**
     * Not supported.
     *
     * @throws UnsupportedOperationException always
     */
    public void remove() {
        throw new UnsupportedOperationException("IlluminaParser.remove() method is not supported.");
    }

    /**
     * Rewinds to the first tile and returns this object as the iterator.
     *
     * @return this parser
     * @throws IllegalStateException if called more than once
     */
    public Iterator<RawRead> iterator() {
        if (iterating) {
            throw new IllegalStateException("IlluminaParser.iterator() method can only be called once, before the first call to IlluminaParser.hasNext()");
        }

        seekToTile(1);
        iterating = true;

        return this;
    }

    /**
     * Closes the parser of the currently open tile, if there is one. Safe to call repeatedly.
     * After closing, {@link #hasNext()} returns false until {@link #seekToTile(int)} succeeds.
     */
    public void close() {
        if (currentTileParser != null) {
            currentTileParser.close();
            currentTileParser = null;
        }
    }
}