diff --git a/java/build.xml b/java/build.xml index bb02df9b3..826208bbd 100644 --- a/java/build.xml +++ b/java/build.xml @@ -3,10 +3,10 @@ simple build file - - + + - + @@ -18,16 +18,31 @@ - + + + + + + + - + - + + + + + + + + + + = seqALength || seqAEnd > seqALength || seqAStart >= seqAEnd) { - throw new IllegalArgumentException("Invalid alignment: " + text); - } - if (seqBStart < 0 || seqBEnd <= 0 || seqBLength <= 0 || - seqBStart >= seqBLength || seqBEnd > seqBLength || seqBStart >= seqBEnd) { - throw new IllegalArgumentException("Invalid alignment: " + text); - } - if (orientation < 0 || orientation > 1) { - throw new IllegalArgumentException("Invalid alignment: " + text); - } - if (fields.length != (11 + 3*blockCount)) { - throw new IllegalArgumentException("Invalid alignment: " + text); - } - - int[] alignmentBlocks = new int[3*blockCount]; - for (int i = 0; i < 3*blockCount; i++) { - alignmentBlocks[i] = parseIntField(fields[11 + i]); - } - - Alignment alignment = new Alignment(); - alignment.setASequenceId(seqAId); - alignment.setASequenceLength(seqALength); - alignment.setAStart(seqAStart+1); - alignment.setAEnd(seqAEnd); - alignment.setBSequenceId(seqBId); - alignment.setBSequenceLength(seqBLength); - alignment.setBStart(seqBStart+1); - alignment.setBEnd(seqBEnd); - alignment.setOrientation((orientation == 0) ? '+' : '-'); - alignment.setAlignmentBlocks(alignmentBlocks); - return alignment; - } - - private static int parseIntField(String text) { - try { - return Integer.parseInt(text); - } catch (NumberFormatException exc) { - throw new IllegalArgumentException("Illegal alignment field: " + text); - } - } - - public String arachneFormat() { - StringBuilder builder = new StringBuilder(); - builder.append("QUERY"); - builder.append(TAB); - builder.append(mASequenceId); - builder.append(TAB); - builder.append(mAStart-1); // zero based - builder.append(TAB); - builder.append(mAEnd); - builder.append(TAB); - builder.append(mASequenceLength); - builder.append(TAB); - builder.append(mOrientation == '+' ? 0 : 1); - builder.append(TAB); - builder.append(mBSequenceId); - builder.append(TAB); - builder.append(mBStart-1); // zero based - builder.append(TAB); - builder.append(mBEnd); - builder.append(TAB); - builder.append(mBSequenceLength); - builder.append(TAB); - builder.append(mAlignmentBlocks.length / 3); - for (int i = 0; i < mAlignmentBlocks.length; i++) { - builder.append(TAB); - builder.append(mAlignmentBlocks[i]); - } - return builder.toString(); - } - - public String format() { - StringBuilder builder = new StringBuilder(); - builder.append("Alignment"); - builder.append(' '); - builder.append(mASequenceId); - builder.append(' '); - builder.append(mAStart); - builder.append(' '); - builder.append(mAEnd); - builder.append(' '); - builder.append(mOrientation); - builder.append(' '); - builder.append(mBSequenceId); - builder.append(' '); - builder.append(mBStart); - builder.append(' '); - builder.append(mBEnd); - builder.append(' '); - builder.append(mAlignmentBlocks.length / 3); - for (int i = 0; i < mAlignmentBlocks.length; i++) { - builder.append(' '); - builder.append(mAlignmentBlocks[i]); - } - return builder.toString(); - } -} diff --git a/java/lib/edu/mit/broad/arachne/Fastb2Fasta.java b/java/lib/edu/mit/broad/arachne/Fastb2Fasta.java deleted file mode 100644 index 964e054ef..000000000 --- a/java/lib/edu/mit/broad/arachne/Fastb2Fasta.java +++ /dev/null @@ -1,132 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.arachne; - -import java.io.*; - -/** - * Utility to convert fastb to fasta files. - * More importantly, can be used to extract a subset of the reads. - */ -public class Fastb2Fasta { - - private boolean mVerbose = false; - private boolean mDebug = false; - private String mInputPath = null; - private String mIdListFilePath = null; - - - public static void main(String[] args) - throws Exception { - new Fastb2Fasta().run(args); - } - - private void usage() { - System.out.println("Usage: Fastb2Fasta ... "); - System.out.println(" -idlist "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-idlist") && argsleft > 1) { - argpos++; - mIdListFilePath = args[argpos++]; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 1) { - usage(); - return false; - } - - mInputPath = args[argpos]; - return true; - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - - FastbReader fastbReader = new FastbReader(new File(mInputPath)); - try { - if (mIdListFilePath != null) { - LineNumberReader reader = new LineNumberReader(new FileReader(mIdListFilePath)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - Integer id = parseReadId(line); - if (id == null) { - continue; - } - if (id < 0 || id >= fastbReader.getSequenceCount()) { - System.out.println("ERROR: Illegal sequence id: " + id); - System.exit(1); - } - String sequence = fastbReader.readSequence(id); - System.out.println(">" + id); - System.out.println(sequence); - } - } else { - int id = 0; - while (fastbReader.hasNext()) { - String sequence = fastbReader.next(); - System.out.println(">" + id); - System.out.println(sequence); - id++; - } - } - } finally { - fastbReader.close(); - } - } - - private Integer parseReadId(String line) { - String text = line.trim(); - if (text.length() == 0 || text.charAt(0) == '#') { - return null; - } - String token = text.split("\\s+")[0]; - Integer id = null; - try { - id = new Integer(token); - } catch (NumberFormatException exc) { - System.out.println("ERROR: Invalid sequence id: " + token); - System.exit(1); - } - return id; - } -} diff --git a/java/lib/edu/mit/broad/arachne/FastbReader.java b/java/lib/edu/mit/broad/arachne/FastbReader.java deleted file mode 100755 index 0d6cd3dd5..000000000 --- a/java/lib/edu/mit/broad/arachne/FastbReader.java +++ /dev/null @@ -1,220 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.arachne; - - -import edu.mit.broad.sam.util.CloseableIterator; - -import java.io.*; - - -/** - * Reader for arachne Fastb files. - */ -public class FastbReader - implements CloseableIterator { - - // Notes on fastb file format - // - // Fastb files contain the serialized contents of an arachne vecbasevector, - // which is a typedef for mastervec. - // The serialization of mastervec objects starts with a 24 byte mv_file_control_block, - // followed by N variable length segments (one for each element of the mastervec vector), - // followed by an offset table containing N 8-byte file offsets to the N variable length - // segments, followed by N fixed length data segments, one for each vector element. - // Thus, reading a single element of the mastervec vector requires reading from three - // separate places in the file (the offset table, the variable length section and the - // fixed length section). - // - // The mastervec file header is 24 bytes arranged as follows: - // n 4-byte signed(?) integer (number of entries) - // c1 1-byte unsigned bit mask (see below) - // reserved 1-byte unused - // sizeX 1-byte unsigned, sizeof first template parameter (16 for fastb files) - // sizeA 1-byte unsigned, sizeof second template parameter (4 for fastb files) - // offsets_start 8-byte signed(?) integer, file offset of offset table - // static_start 8-byte signed(?) integer, file offset of static data (fixed size section) - // - // For fastb files, the fixed size section contains 4 bytes for each object, which is the - // unsigned(?) count of the number of bases in this entry. - // For fastb files, the variable length section contains a bit vector with two bits per base. - // The bases are encoded as follows: A = 0, C = 1, G = 2, T = 3. - // - // For fastb files, in the file header N is the number of entries in the fastb file. - // c1 is unused/unimplemented except that the two low-order bits should be 0x01, indicating - // that we are using the single-file representation. There is also apparently a three-file - // representation that looks the same except that the offset table and static (fixed length) - // table are in separate files named .offsets and .static. - // The sizeX should be 16 for fastb files and sizeA should be 4. - // - // Note that in fastb files, the sequences are not identified by name or id, only by index - // (zero based) into the mastervec object. There is no representation for bases other than - // ACGT (i.e. Ns cannot be encoded). - - private static final char[] BASES = { 'A', 'C', 'G', 'T' }; - - private File mFile; - private RandomAccessFile mRandomFile; - private int mEntryCount; - private long mOffsetTableOffset; - private long mLengthTableOffset; - private int mCurrentPosition; - private byte[] mIOBuffer = new byte[8]; - - - public FastbReader(File file) - throws IOException { - mFile = file; - mRandomFile = new RandomAccessFile(mFile, "r"); - readHeader(); - } - - public int getSequenceCount() { - return mEntryCount; - } - - public boolean hasNext() { - return (mCurrentPosition < mEntryCount); - } - - public String next() { - if (!hasNext()) { - throw new IllegalStateException("Iterator exhausted"); - } - try { - return readSequence(mCurrentPosition); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - public void close() { - if (mRandomFile != null) { - mEntryCount = 0; - mCurrentPosition = 0; - try { - mRandomFile.close(); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } finally { - mRandomFile = null; - } - } - } - - public String readSequence(int n) - throws IOException { - if (mRandomFile == null) { - throw new IllegalStateException("Reader is closed"); - } - if (n < 0 || n >= mEntryCount) { - throw new IndexOutOfBoundsException("Illegal index: " + n); - } - long offset = getEntryOffset(n); - int length = getEntryBaseCount(n); - String result = readBases(offset, length); - mCurrentPosition = n+1; - return result; - } - - private void readHeader() - throws IOException { - - byte[] fileControlBlock = new byte[24]; - mRandomFile.readFully(fileControlBlock, 0, 24); - - int word2 = deserializeInt(fileControlBlock, 4); - int nFiles = word2 & 0x3; - int sizeX = (word2 >> 16) & 0xFF; - int sizeA = (word2 >> 24) & 0xFF; - if (nFiles != 1) { - throw new RuntimeException(mFile + ": Invalid file header: nFiles = " + nFiles); - } - if (sizeX != 16) { - throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeX); - } - if (sizeA != 4) { - throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeA); - } - mEntryCount = deserializeInt(fileControlBlock, 0); - mOffsetTableOffset = deserializeLong(fileControlBlock, 8); - mLengthTableOffset = deserializeLong(fileControlBlock, 16); - } - - private long getEntryOffset(int n) - throws IOException { - mRandomFile.seek(mOffsetTableOffset + 8 * n); - mRandomFile.readFully(mIOBuffer, 0, 8); - return deserializeLong(mIOBuffer, 0); - } - - private int getEntryBaseCount(int n) - throws IOException { - mRandomFile.seek(mLengthTableOffset + 4 * n); - mRandomFile.readFully(mIOBuffer, 0, 4); - return deserializeInt(mIOBuffer, 0); - } - - private String readBases(long fileOffset, int baseCount) - throws IOException { - - - int byteCount = (baseCount + 3) / 4; - byte[] data = new byte[byteCount]; - mRandomFile.seek(fileOffset); - mRandomFile.readFully(data, 0, byteCount); - - int baseIndex = 0; - int dataIndex = 0; - char[] baseBuffer = new char[baseCount]; - while (baseIndex < baseCount) { - int b = data[dataIndex++]; - int count = Math.min(4, baseCount - baseIndex); - for (int i = 0; i < count; i++) { - baseBuffer[baseIndex++] = BASES[b & 0x3]; - b = b >> 2; - } - } - return new String(baseBuffer); - } - - private int deserializeInt(byte[] buffer, int offset) { - int byte1 = buffer[offset] & 0xFF; - int byte2 = buffer[offset+1] & 0xFF; - int byte3 = buffer[offset+2] & 0xFF; - int byte4 = buffer[offset+3] & 0xFF; - return (byte1 | (byte2 << 8) | (byte3 << 16) | (byte4 << 24)); - } - - private long deserializeLong(byte[] buffer, int offset) { - long int1 = deserializeInt(buffer, offset) & 0xFFFFFFFFL; - long int2 = deserializeInt(buffer, offset+4) & 0xFFFFFFFFL; - return (int1 | (int2 << 32)); - } - - // Stub for interactive use (see also Fastb2Fasta) - public static void main(String[] args) - throws Exception { - FastbReader reader = new FastbReader(new File(args[0])); - int readId = 0; - while (reader.hasNext()) { - System.out.println(">" + readId); - System.out.println(reader.next()); - readId++; - } - reader.close(); - } -} - diff --git a/java/lib/edu/mit/broad/arachne/GenomeMask.java b/java/lib/edu/mit/broad/arachne/GenomeMask.java deleted file mode 100644 index 7e7ebdcb0..000000000 --- a/java/lib/edu/mit/broad/arachne/GenomeMask.java +++ /dev/null @@ -1,83 +0,0 @@ -package edu.mit.broad.arachne; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.BitSet; -import java.util.SortedMap; -import java.util.TreeMap; - -/** - * Utility class to read in a set of contig-based genomic intervals in zero-based end inclusive - * and store them efficiently in memory as a 1-based bit-mask - */ -public class GenomeMask { - - // if memory usage becomes a problem... this could be changed to a SparseBitSet - // http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html - private SortedMap data = new TreeMap(); - - - public GenomeMask(File maskFile) throws IOException { - BufferedReader baitReader = null; - try { - baitReader = new BufferedReader(new FileReader(maskFile)); - String line; - while ((line = baitReader.readLine()) != null) { - String[] arr = line.split(" "); - int contig = Integer.parseInt(arr[0]); - - // covert the coordinates from 0-based, end inclusive to - // 1-based end inclusive - int startPos = Integer.parseInt(arr[1]) + 1; - int endPos = Integer.parseInt(arr[2]) + 1; - - BitSet bits = data.get(contig); - if (bits == null) { - bits = new BitSet(endPos); - data.put(contig,bits); - } - - bits.set(startPos, endPos + 1); // set method is end exclusive - } - } finally { - if (baitReader != null) { baitReader.close(); } - } - } - - /** - * This ctor is useful if initializing a GenomeMask externally. - */ - public GenomeMask() { - } - - public boolean get(int contig, int position) { - BitSet bits = data.get(contig); - return (bits != null) && bits.get(position); - } - - public BitSet get(int contig) { - return data.get(contig); - } - - /** - * Get an existing BitSet for the given contig, or create one if not already present. This is - * useful when initializing a GenomeMask from an external source. - * @param contig which BitSet - * @param numBits if there was not already a BitSet for this contig, one is created and initialized to this size. - * @return the BitSet for the given contig, creating one if necessary - */ - public BitSet getOrCreate(int contig, int numBits) { - BitSet ret = data.get(contig); - if (ret == null) { - ret = new BitSet(numBits); - data.put(contig, ret); - } - return ret; - } - - public int getMaxContig() { - return data.lastKey(); - } -} diff --git a/java/lib/edu/mit/broad/arachne/LookAlignReader.java b/java/lib/edu/mit/broad/arachne/LookAlignReader.java deleted file mode 100755 index a00efcb7c..000000000 --- a/java/lib/edu/mit/broad/arachne/LookAlignReader.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.arachne; - - -import edu.mit.broad.sam.util.CloseableIterator; - -import java.io.*; - - -/** - * Reader for arachne LookAlign text format alignment files. - * Supports filtering of the input by genomic locus. - */ -public class LookAlignReader - implements CloseableIterator { - - private LineNumberReader mReader = null; - private Alignment mNextAlignment = null; - private int mBSequenceId = -1; - private int mBStart = 0; - private int mBEnd = 0; - - - public LookAlignReader(File file) - throws IOException { - this(new FileReader(file)); - } - - public LookAlignReader(Reader reader) { - if (reader instanceof LineNumberReader) { - mReader = (LineNumberReader) reader; - } else { - mReader = new LineNumberReader(reader); - } - } - - public void setBSequenceId(int value) { - mBSequenceId = value; - } - - public void setBStart(int value) { - mBStart = value; - } - - public void setBEnd(int value) { - mBEnd = value; - } - - public boolean hasNext() { - if (mNextAlignment != null) { - return true; - } - try { - mNextAlignment = nextAlignment(); - return (mNextAlignment != null); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - public Alignment next() { - if (!hasNext()) { - throw new IllegalStateException("Iterator exhausted"); - } - try { - Alignment result = mNextAlignment; - mNextAlignment = nextAlignment(); - return result; - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - public void close() { - if (mReader != null) { - try { - mReader.close(); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - mReader = null; - } - } - - private Alignment nextAlignment() - throws IOException { - if (mReader == null) { - return null; - } - while (true) { - String line = mReader.readLine(); - if (line == null) { - close(); - break; - } - if (!line.startsWith("QUERY")) { - continue; - } - Alignment alignment = Alignment.parse(line); - if (matchesFilters(alignment)) { - return alignment; - } - } - return null; - } - - private boolean matchesFilters(Alignment alignment) { - if (mBSequenceId < 0) { - return true; - } - if (alignment.getBSequenceId() != mBSequenceId) { - return false; - } - if (mBStart > 0 && alignment.getBEnd() < mBStart) { - return false; - } - if (mBEnd > 0 && alignment.getBStart() > mBEnd) { - return false; - } - return true; - } -} - diff --git a/java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java b/java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java deleted file mode 100755 index 07e9b79de..000000000 --- a/java/lib/edu/mit/broad/cnv/AnalyzeCnvs.java +++ /dev/null @@ -1,437 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import edu.mit.broad.arachne.Alignment; -import edu.mit.broad.arachne.LookAlignReader; - -import java.io.*; -import java.util.*; - - -/** - * Utility class to do data reduction on CNV data. - */ -public class AnalyzeCnvs { - - public static void main(String[] args) - throws Exception { - new AnalyzeCnvs().run(args); - } - - private void usage() { - System.out.println("Usage: AnalyzeCnvs ..."); - System.out.println(" -action "); - System.out.println(" -alignments or -"); - System.out.println(" -alignmentList "); - System.out.println(" -chromosome "); - System.out.println(" -start "); - System.out.println(" -end "); - System.out.println(" -bestAlignments"); - System.out.println(" -mismatchThreshold "); - System.out.println(" -binsize "); - System.out.println(" -output "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-action") && argsleft > 1) { - argpos++; - mAction = args[argpos++]; - } else if (arg.equals("-alignments") && argsleft > 1) { - argpos++; - mAlignmentFilePath = args[argpos++]; - } else if (arg.equals("-alignmentList") && argsleft > 1) { - argpos++; - mAlignmentListFilePath = args[argpos++]; - } else if (arg.equals("-chromosome") && argsleft > 1) { - argpos++; - mChromosome = args[argpos++]; - } else if (arg.equals("-start") && argsleft > 1) { - argpos++; - mStartPosition = new Integer(args[argpos++]); - } else if (arg.equals("-end") && argsleft > 1) { - argpos++; - mEndPosition = new Integer(args[argpos++]); - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-mismatchThreshold") && argsleft > 1) { - argpos++; - mMismatchThreshold = new Integer(args[argpos++]); - } else if (arg.equals("-bestAlignments")) { - argpos++; - mReturnBestHits = true; - } else if (arg.equals("-binsize") && argsleft > 1) { - argpos++; - mBinSize = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-output") && argsleft > 1) { - argpos++; - mOutputColumns = args[argpos++]; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - - if (mAction == null) { - mAction = "alignmentCoverage"; - } - - if (mAction.equals("alignmentCoverage")) { - mainAlignmentCoverage(); - } else { - System.out.println("Unknown action: " + mAction); - usage(); - System.exit(1); - } - } - - private void mainAlignmentCoverage() - throws IOException { - - if (mStartPosition == null || mEndPosition == null) { - usage(); - System.exit(1); - } else if (mStartPosition <= 0 || mEndPosition <= 0 || mStartPosition > mEndPosition) { - System.out.println("Invalid start/end positions: " + mStartPosition + " " + mEndPosition); - usage(); - System.exit(1); - } - - mSequenceId = chromosomeToSequenceId(mChromosome); - if (mSequenceId < 0) { - System.out.println("Invalid chromosome: " + mChromosome); - usage(); - System.exit(1); - } - - if (mBinSize <= 0) { - System.out.println("Invalid bin size: " + mBinSize); - usage(); - System.exit(1); - } - - runAlignmentCoverage(); - } - - private void runAlignmentCoverage() - throws IOException { - - int length = (mEndPosition - mStartPosition + 1); - if (length <= 0) { - throw new RuntimeException("Invalid start/end positions"); - } - - int binSize = mBinSize; - int binCount = (length + binSize - 1) / binSize; - int[] readStarts = new int[binCount]; - int[] readDepths = new int[binCount]; - List alignmentFiles = getAlignmentFiles(); - for (String path : alignmentFiles) { - processAlignmentFile(path, readStarts, readDepths); - } - printStats(readStarts, readDepths); - } - - private List getAlignmentFiles() - throws IOException { - List fileList = new ArrayList(); - if (mAlignmentListFilePath != null) { - LineNumberReader reader = new LineNumberReader(new FileReader(mAlignmentListFilePath)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - String path = line.trim(); - if (path.length() == 0 || path.startsWith("#")) { - continue; - } - fileList.add(path); - } - } else if (mAlignmentFilePath != null) { - fileList.add(mAlignmentFilePath); - } - return fileList; - } - - private void processAlignmentFile(String path, int[] readStarts, int[] readDepths) - throws IOException { - - LookAlignReader reader = null; - if (path == null || path.equals("-")) { - reader = new LookAlignReader(new InputStreamReader(System.in)); - } else { - reader = new LookAlignReader(new File(path)); - } - - while (true) { - Alignment alignment = getNextAlignment(reader); - if (alignment == null) { - reader.close(); - break; - } - processAlignment(alignment, readStarts, readDepths); - } - } - - private void processAlignment(Alignment alignment, - int[] readStarts, - int[] readDepths) { - - if (readStarts != null) { - int baseOffset = alignment.getBStart() - mStartPosition; - int binIndex = baseOffset / mBinSize; - if (binIndex >= 0 && binIndex < readStarts.length) { - readStarts[binIndex]++; - } - } - - if (readDepths != null) { - int baseOffset = alignment.getBStart() - mStartPosition; - int[] alignmentBlocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < alignmentBlocks.length; i += 3) { - int gap = alignmentBlocks[i]; - int duration = alignmentBlocks[i+1]; - if (gap > 0) { - // Gap in B sequence (genome) - // Negative gaps are gaps in A sequence (read) - baseOffset += gap; - } - for (int j = 0; j < duration; j++) { - int binIndex = baseOffset / mBinSize; - if (binIndex >= 0 && binIndex < readDepths.length) { - readDepths[binIndex]++; - } - baseOffset++; - } - } - } - } - - private Alignment getNextAlignment(LookAlignReader reader) - throws IOException { - - if (!mReturnBestHits) { - while (reader.hasNext()) { - Alignment alignment = reader.next(); - if (passesAlignmentFilters(alignment)) { - return alignment; - } - } - return null; - } - - while (true) { - Alignment seed = mPendingAlignment; - mPendingAlignment = null; - if (seed == null && reader.hasNext()) { - seed = reader.next(); - } - if (seed == null) { - return null; - } - List secondaryHits = null; - while (reader.hasNext()) { - Alignment alignment = reader.next(); - if (alignment.getASequenceId() != seed.getASequenceId()) { - if (alignment.getASequenceId() < seed.getASequenceId()) { - throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format()); - } - mPendingAlignment = alignment; - break; - } - if (secondaryHits == null) { - secondaryHits = new ArrayList(); - } - secondaryHits.add(alignment); - } - if (secondaryHits == null) { - if (!passesAlignmentFilters(seed)) { - continue; - } - return seed; - } - secondaryHits.add(seed); - Alignment result = getUniqueBestAlignment(secondaryHits); - if (result != null && passesAlignmentFilters(result)) { - return result; - } - } - } - - private Alignment getUniqueBestAlignment(List alignments) { - int bestMismatches = 0; - List best = new ArrayList(); - for (Alignment a : alignments) { - int mismatches = getAlignmentMismatches(a); - if (best.isEmpty()) { - best.add(a); - bestMismatches = mismatches; - } - if (mismatches == bestMismatches) { - best.add(a); - } else if (mismatches < bestMismatches) { - best.clear(); - best.add(a); - bestMismatches = mismatches; - } - } - if (best.size() != 1) { - return null; - } - return best.get(0); - } - - private boolean passesAlignmentFilters(Alignment alignment) { - - if (mMismatchThreshold != null) { - if (getAlignmentMismatches(alignment) > mMismatchThreshold) { - return false; - } - } - - if (mSequenceId != null) { - if (alignment.getBSequenceId() != mSequenceId) { - return false; - } - } - - if (mStartPosition != null) { - if (alignment.getBEnd() < mStartPosition) { - return false; - } - } - - if (mEndPosition != null) { - if (alignment.getBStart() > mEndPosition) { - return false; - } - } - - return true; - } - - private int getAlignmentMismatches(Alignment alignment) { - int mismatches = 0; - int[] blocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < blocks.length; i += 3) { - int gap = blocks[i]; - int duration = blocks[i+1]; - int mm = blocks[i+2]; - if (mm > duration) { - throw new RuntimeException("Invalid alignment? : " + alignment.format()); - } - mismatches += Math.abs(gap); - mismatches += mm; - } - return mismatches; - } - - private void printStats(int[] readStarts, int[] readDepths) { - if (mOutputColumns != null && mOutputColumns.equals("coverage")) { - // No headers, just coverage - for (int i = 0; i < readDepths.length; i++) { - String line = ""; - if (mBinSize == 1) { - line += readDepths[i]; - } else { - line += (readDepths[i] / (double) mBinSize); - } - System.out.println(line); - } - } else { - System.out.println("Position" + "\t" + "Starts" + "\t" + "Coverage"); - for (int i = 0; i < readDepths.length; i++) { - String line = ""; - int position = mStartPosition + i*mBinSize; - line += position + "\t" + readStarts[i] + "\t"; - if (mBinSize == 1) { - line += readDepths[i]; - } else { - line += (readDepths[i] / (double) mBinSize); - } - System.out.println(line); - } - } - } - - private int chromosomeToSequenceId(String text) { - if (text == null || text.length() == 0) { - return -1; - } - if (text.matches("\\d+")) { - return Integer.parseInt(text); - } - if (text.startsWith("chr") && text.length() > 3) { - text = text.substring(3); - } - if (text.matches("\\d+") && !text.startsWith("0")) { - return Integer.parseInt(text); - } - if (text.equals("M")) { - return 0; - } else if (text.equals("X")) { - return 23; - } else if (text.equals("Y")) { - return 24; - } else { - return -1; - } - } - - private boolean mDebug = false; - private boolean mVerbose = false; - - private String mAction = null; - private String mAlignmentFilePath = null; - private String mAlignmentListFilePath = null; - private String mChromosome = null; - private Integer mStartPosition = null; - private Integer mEndPosition = null; - private Integer mSequenceId = null; - private boolean mReturnBestHits = false; - private Integer mMismatchThreshold = null; - private int mBinSize = 1; - private String mOutputColumns = null; - private Alignment mPendingAlignment = null; -} diff --git a/java/lib/edu/mit/broad/cnv/CountAlignments.java b/java/lib/edu/mit/broad/cnv/CountAlignments.java deleted file mode 100644 index e0d60255d..000000000 --- a/java/lib/edu/mit/broad/cnv/CountAlignments.java +++ /dev/null @@ -1,283 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import edu.mit.broad.arachne.Alignment; -import edu.mit.broad.arachne.LookAlignReader; - -import java.io.*; -import java.util.*; - -/** - * Utility to count alignments (rather than gathering). - */ -public class CountAlignments { - - public static void main(String[] args) - throws Exception { - new CountAlignments().run(args); - } - - private void usage() { - System.out.println("Usage: CountAlignments ..."); - System.out.println(" -alignments (- for stdin)"); - System.out.println(" -chromosome "); - System.out.println(" -start "); - System.out.println(" -end "); - System.out.println(" -bestAlignments"); - System.out.println(" -mismatchThreshold "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-alignments") && argsleft > 1) { - argpos++; - mAlignmentFilePath = args[argpos++]; - } else if (arg.equals("-mismatchThreshold") && argsleft > 1) { - argpos++; - mMismatchThreshold = new Integer(args[argpos++]); - } else if (arg.equals("-bestAlignments")) { - argpos++; - mReturnBestHits = true; - } else if (arg.equals("-chromosome") && argsleft > 1) { - argpos++; - String chromosome = args[argpos++]; - mSequenceId = chromosomeToSequenceId(chromosome); - if (mSequenceId < 0) { - System.out.println("Invalid chromosome: " + chromosome); - return false; - } - } else if (arg.equals("-start") && argsleft > 1) { - argpos++; - mStartPosition = new Integer(args[argpos++]); - } else if (arg.equals("-end") && argsleft > 1) { - argpos++; - mEndPosition = new Integer(args[argpos++]); - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - - long[] counts = countAlignments(mAlignmentFilePath); - String line = counts[0] + " " + counts[1]; - if (mAlignmentFilePath != null) { - line = mAlignmentFilePath + " " + line; - } - System.out.println(line); - } - - private long[] countAlignments(String path) - throws IOException { - long alignmentCount = 0; - long baseCount = 0; - LookAlignReader reader = null; - if (path == null || path.equals("-")) { - reader = new LookAlignReader(new InputStreamReader(System.in)); - } else { - reader = new LookAlignReader(new File(path)); - } - while (true) { - Alignment alignment = getNextAlignment(reader); - if (alignment == null) { - reader.close(); - break; - } - if (mMismatchThreshold != null) { - if (getAlignmentMismatches(alignment) > mMismatchThreshold) { - continue; - } - } - if (mSequenceId != null) { - if (alignment.getBSequenceId() != mSequenceId) { - continue; - } - } - if (mStartPosition != null) { - if (alignment.getBEnd() < mStartPosition) { - continue; - } - } - if (mEndPosition != null) { - if (alignment.getBStart() > mEndPosition) { - continue; - } - } - alignmentCount++; - baseCount += getBaseCount(alignment); - } - long[] result = { alignmentCount, baseCount }; - return result; - } - - private Alignment getNextAlignment(LookAlignReader reader) - throws IOException { - if (!mReturnBestHits) { - if (!reader.hasNext()) { - return null; - } - return reader.next(); - } - while (true) { - Alignment seed = mPendingAlignment; - mPendingAlignment = null; - if (seed == null && reader.hasNext()) { - seed = reader.next(); - } - if (seed == null) { - return null; - } - List secondaryHits = null; - while (reader.hasNext()) { - Alignment alignment = reader.next(); - if (alignment.getASequenceId() != seed.getASequenceId()) { - if (alignment.getASequenceId() < seed.getASequenceId()) { - throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format()); - } - mPendingAlignment = alignment; - break; - } - if (secondaryHits == null) { - secondaryHits = new ArrayList(); - } - secondaryHits.add(alignment); - } - if (secondaryHits == null) { - return seed; - } - secondaryHits.add(seed); - Alignment result = getUniqueBestAlignment(secondaryHits); - if (result != null) { - return result; - } - } - } - - private Alignment getUniqueBestAlignment(List alignments) { - int bestMismatches = 0; - List best = new ArrayList(); - for (Alignment a : alignments) { - int mismatches = getAlignmentMismatches(a); - if (best.isEmpty()) { - best.add(a); - bestMismatches = mismatches; - } - if (mismatches == bestMismatches) { - best.add(a); - } else if (mismatches < bestMismatches) { - best.clear(); - best.add(a); - bestMismatches = mismatches; - } - } - if (best.size() != 1) { - return null; - } - return best.get(0); - } - - private int getAlignmentMismatches(Alignment alignment) { - int mismatches = 0; - int[] blocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < blocks.length; i += 3) { - int gap = blocks[i]; - int duration = blocks[i+1]; - int mm = blocks[i+2]; - if (mm > duration) { - throw new RuntimeException("Invalid alignment? : " + alignment.format()); - } - mismatches += Math.abs(gap); - mismatches += mm; - } - return mismatches; - } - - // Return the number of reference bases covered by this alignment. - private int getBaseCount(Alignment alignment) { - int count = 0; - int[] blocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < blocks.length; i += 3) { - // int gap = blocks[i]; - int duration = blocks[i+1]; - // int mm = blocks[i+2]; - count += duration; - } - return count; - } - - private int chromosomeToSequenceId(String text) { - if (text == null || text.length() == 0) { - return -1; - } - if (text.matches("\\d+")) { - return Integer.parseInt(text); - } - if (text.startsWith("chr") && text.length() > 3) { - text = text.substring(3); - } - if (text.matches("\\d+") && !text.startsWith("0")) { - return Integer.parseInt(text); - } - if (text.equals("M")) { - return 0; - } else if (text.equals("X")) { - return 23; - } else if (text.equals("Y")) { - return 24; - } else { - return -1; - } - } - - - private boolean mDebug = false; - private boolean mVerbose = false; - - private String mAlignmentFilePath = null; - private boolean mReturnBestHits = false; - private Integer mMismatchThreshold = null; - private Integer mSequenceId = null; - private Integer mStartPosition = null; - private Integer mEndPosition = null; - private Alignment mPendingAlignment = null; -} diff --git a/java/lib/edu/mit/broad/cnv/CountKMers.java b/java/lib/edu/mit/broad/cnv/CountKMers.java deleted file mode 100644 index 0fa159615..000000000 --- a/java/lib/edu/mit/broad/cnv/CountKMers.java +++ /dev/null @@ -1,1301 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import java.io.*; -import java.util.*; - - -/** - * Tool for counting unique kmers. - */ -public class CountKMers -{ - private static final int NONUNIQUE_MARKER = -1; - private static boolean mUseOldFormat = false; - - private String mAction = null; - private static int mK = 0; - private int mBatchSize = 0; - private List mInputFiles = null; - private File mInputDirectory = null; - private File mOutputDirectory = null; - private boolean mVerbose = false; - private boolean mDebug = false; - - private List mSequenceList = null; - private List mSequenceOffsetList = null; - private List mSpillFileList = null; - private double mSpillFactor = 0.9; - - private long mKMerCount = 0; - private long mUniquePriorCount = 0; - private long mUniqueNewCount = 0; - private long mPriorMapUniqueCount = 0; - - private InputStream mPriorMapStream = null; - private int mPriorMapPosition = -1; - private int mPriorMapValue = 0; - private int mInputFileIndex = 0; - private LineNumberReader mCurrentReader = null; - private String mNextSequence = null; - private char[] mKMerBuffer = null; - private int mKMerBufferedCount = 0; - private String mLineBuffer = null; - private int mLineBufferIndex = 0; - private int mBaseIndex = -1; - private byte[] mIOBuffer = null; - - /* Design - Inputs: - - One or more fasta files to search (currently one). - - Output directory for the result files. - - Optionally an input k-1-mer file (output from previous pass). - Outputs: - - Unique kmer file: (sorted by kmer) - This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not). - - Per chromosome bit map: pos (implicit) new-bit cum-bit - New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not. - Cum-bit is 1 if Kmer starting at pos is unique for some L <= K. - - Statistics - Plan: - - Reducing memory footprint is crucial. - - Sequential pass over the input sequences to generate kmers. - - BatchSize kmers are cached in memory, then sorted and uniqified. - - As batch array fills, batches are spilled to disk. - - Batches are reloaded from disk and merged (N-finger algorithm) - - and streamed to a merge file. - - Merge file is read from disk and processed as final results. - */ - - public static void main(String[] args) - throws Exception { - new CountKMers().run(args); - } - - private void usage() { - System.out.println("Usage: CountKMers ..."); - System.out.println(" -action "); - System.out.println(" -genome "); - System.out.println(" -k "); - System.out.println(" -batchSize "); - System.out.println(" -inputDir "); - System.out.println(" -outputDir "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-action") && argsleft > 1) { - argpos++; - mAction = args[argpos++]; - } else if (arg.equals("-genome") && argsleft > 1) { - argpos++; - if (mInputFiles == null) { - mInputFiles = new ArrayList(); - } - mInputFiles.add(new File(args[argpos++])); - } else if (arg.equals("-k") && argsleft > 1) { - argpos++; - mK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-batchSize") && argsleft > 1) { - argpos++; - mBatchSize = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-inputDir") && argsleft > 1) { - argpos++; - mInputDirectory = new File(args[argpos++]); - } else if (arg.equals("-outputDir") && argsleft > 1) { - argpos++; - mOutputDirectory = new File(args[argpos++]); - } else if (arg.equals("-oldFormat")) { - argpos++; - mUseOldFormat = true; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - if (!parseArguments(args)) { - System.exit(1); - } - if (mAction == null || mAction.equals("mapKMers")) { - mapKMers(); - } else if (mAction.equals("mapGaps")) { - mapGaps(); - } - } - - // Can be used to scan genome for sequence names/lengths. - private void scanKMers() - throws IOException { - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - openPriorMap(priorMapFile); - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Scanning " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - if (isUniqueInPriorMap(mBaseIndex)) { - continue; - } - } - } - closePriorMap(); - } - - private void mapGaps() - throws IOException { - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - int pos = 0; - int gapStart = 0; - while (true) { - char base = getNextBase(); - if (base == 0) { - break; - } - pos++; - if (base == 'N') { - if (gapStart == 0) { - gapStart = pos; - } - } else { - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - - private void mapKMers() - throws IOException { - - File textKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.txt"); - File binaryKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.bin"); - File exceptionFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.extra"); - File mapFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin"); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - File statsFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt"); - - if (mBatchSize == 0) { - throw new RuntimeException("Batch size not specified"); - } - - int kmerCount = 0; - int batchSize = mBatchSize; - KMerPosition[] kmerArray = new KMerPosition[batchSize]; - List exceptionList = new ArrayList(); - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))]; - - openPriorMap(priorMapFile); - - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Processing " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - int baseIndex = mBaseIndex; - if (isUniqueInPriorMap(baseIndex)) { - mUniquePriorCount++; - continue; - } - KMerPosition kmp = encodeKMer(kmerChars, baseIndex); - if (kmp == null) { - String kmer = new String(kmerChars); - exceptionList.add(new StringKMerPosition(kmer, baseIndex)); - continue; - } - kmerArray[kmerCount++] = kmp; - if (kmerCount == batchSize) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (kmerCount > mSpillFactor * batchSize) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - } - } - if (kmerCount > 0) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (mSpillFileList != null) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - - closePriorMap(); - - // Write out the exception kmers (text file). - compactKMers(exceptionList); - writeExceptionFile(exceptionList, exceptionFile); - - // Write out the binary file of unique encoded kmers. - if (mSpillFileList == null) { - kmerCount = removeNonUnique(kmerArray, kmerCount); - writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile); - mUniqueNewCount = kmerCount; - } else { - mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile); - } - mUniqueNewCount += countUniqueKMers(exceptionList); - - // Write out the text file of (all) unique kmers. - writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile); - - // Create map file from prior map plus the new unique kmers. - int mapSize = ((mBaseIndex >> 2) & 0x3FFFFFFF) + 1; - createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile); - - // Write summary statistics file. - writeSummaryStatistics(statsFile); - } - - private int compactKMers(KMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " kmers at index " + - Integer.toHexString(mBaseIndex) + " ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - KMerPosition current = kmerArray[0]; - for (int i = 1; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (current.compareTo(kmp) == 0) { - current.setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - current = kmp; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " string kmers ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - String kmerString = kmerArray[0].getKMer(); - for (int i = 1; i < kmerCount; i++) { - StringKMerPosition kmp = kmerArray[i]; - String ks = kmp.getKMer(); - if (ks.equals(kmerString)) { - kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - kmerString = ks; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private void compactKMers(List kmerList) { - int kmerCount = kmerList.size(); - if (kmerCount <= 1) { - return; - } - StringKMerPosition[] kmerArray = - kmerList.toArray(new StringKMerPosition[kmerCount]); - kmerCount = compactKMers(kmerArray, kmerCount); - kmerList.clear(); - for (int i = 0; i < kmerCount; i++) { - kmerList.add(kmerArray[i]); - } - } - - private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) { - int uniqueCount = 0; - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - kmerArray[uniqueCount++] = kmp; - } - } - return uniqueCount; - } - - private int countUniqueKMers(List kmerList) { - int uniqueCount = 0; - for (StringKMerPosition kmp : kmerList) { - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - } - } - return uniqueCount; - } - - private void spillKMers(KMerPosition[] kmerArray, int kmerCount) - throws IOException { - if (mSpillFileList == null) { - mSpillFileList = new ArrayList(); - } - int fileNumber = mSpillFileList.size() + 1; - log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ..."); - File spillFile = new File(mOutputDirectory, - "spill_" + mK + "_" + fileNumber + ".tmp"); - mSpillFileList.add(spillFile); - writeKMerBinaryFile(kmerArray, kmerCount, spillFile); - log("Spill file written"); - } - - private void writeKMerBinaryFile(KMerPosition[] kmerArray, - int kmerCount, - File outputFile) - throws IOException { - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - writeKMerPosition(outputStream, kmerArray[i]); - } - outputStream.flush(); - outputStream.close(); - } - - private void writeExceptionFile(List kmerList, - File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - for (StringKMerPosition kmer : kmerList) { - writeUniqueKMer(kmer, writer); - } - writer.flush(); - writer.close(); - } - - private KMerPosition readKMerPosition(InputStream stream) - throws IOException { - if (mUseOldFormat) { - return readKMerPositionOldFormat(stream); - } - byte[] buffer = mIOBuffer; - int encodingLength = (mK + 7)/8; - int fileLength = 4 + 2*encodingLength; - int count = readFully(stream, buffer, 0, fileLength); - if (count <= 0) { - return null; - } else if (count != fileLength) { - throw new RuntimeException("Unexpected end of file"); - } - char[] encoding = new char[encodingLength]; - int baseIndex = ((buffer[0] & 0xFF) | - (buffer[1] & 0xFF) << 8 | - (buffer[2] & 0xFF) << 16 | - (buffer[3] & 0xFF) << 24); - for (int i = 0; i < encodingLength; i++) { - encoding[i] = (char) ((buffer[2*i+4] & 0xFF) | - ((buffer[2*i+5] & 0xFF) << 8)); - } - return new KMerPositionN(encoding, baseIndex); - } - - private KMerPosition readKMerPositionOldFormat(InputStream stream) - throws IOException { - byte[] buffer = mIOBuffer; - int length = (mK >= 32 ? 20 : 12); - int count = readFully(stream, buffer, 0, length); - if (count <= 0) { - return null; - } else if (count != length) { - throw new RuntimeException("Unexpected end of file"); - } - long encoding = (((long)(buffer[0] & 0xFF)) | - ((long)(buffer[1] & 0xFF)) << 8 | - ((long)(buffer[2] & 0xFF)) << 16 | - ((long)(buffer[3] & 0xFF)) << 24 | - ((long)(buffer[4] & 0xFF)) << 32 | - ((long)(buffer[5] & 0xFF)) << 40 | - ((long)(buffer[6] & 0xFF)) << 48 | - ((long)(buffer[7] & 0xFF)) << 56); - int baseIndex = ((buffer[length-4] & 0xFF) | - (buffer[length-3] & 0xFF) << 8 | - (buffer[length-2] & 0xFF) << 16 | - (buffer[length-1] & 0xFF) << 24); - if (length == 12) { - return new KMerPosition1(encoding, baseIndex); - } else { - long encoding2 = (((long)(buffer[8] & 0xFF)) | - ((long)(buffer[9] & 0xFF)) << 8 | - ((long)(buffer[10] & 0xFF)) << 16 | - ((long)(buffer[11] & 0xFF)) << 24 | - ((long)(buffer[12] & 0xFF)) << 32 | - ((long)(buffer[13] & 0xFF)) << 40 | - ((long)(buffer[14] & 0xFF)) << 48 | - ((long)(buffer[15] & 0xFF)) << 56); - return new KMerPosition2(encoding, encoding2, baseIndex); - } - } - - private int readFully(InputStream stream, byte[] buffer, int offset, int count) - throws IOException { - int readCount = 0; - while (readCount < count) { - int read = stream.read(buffer, offset, count-readCount); - if (read <= 0) { - break; - } - offset += read; - readCount += read; - } - return readCount; - } - - private void writeKMerPosition(OutputStream stream, KMerPosition kmer) - throws IOException { - if (mUseOldFormat) { - writeKMerPositionOldFormat(stream, kmer); - return; - } - byte[] buffer = mIOBuffer; - int baseIndex = kmer.getBaseIndex(); - char[] encoding = kmer.getKMerEncoding(); - int offset = 0; - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - for (int i = 0; i < encoding.length; i++) { - buffer[offset++] = (byte) ((encoding[i]) & 0xFF); - buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF); - } - stream.write(buffer, 0, offset); - } - - private void writeKMerPositionOldFormat(OutputStream stream, KMerPosition kmer) - throws IOException { - byte[] buffer = mIOBuffer; - long encoding1 = kmer.getKMerEncoding1(); - long encoding2 = kmer.getKMerEncoding2(); - int baseIndex = kmer.getBaseIndex(); - int offset = 0; - buffer[offset++] = (byte) ((encoding1) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 8) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 16) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 24) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 32) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 40) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 48) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 56) & 0xFF); - if (mK >= 32) { - buffer[offset++] = (byte) ((encoding2) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 8) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 16) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 24) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 32) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 40) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 48) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 56) & 0xFF); - } - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - stream.write(buffer, 0, offset); - } - - private long mergeSpillFiles(List spillFiles, File outputFile) - throws IOException { - - if (spillFiles == null) { - return 0; - } - - log("Merging spill files ..."); - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - long uniqueCount = 0; - int fileCount = spillFiles.size(); - InputStream[] inputStreams = new InputStream[fileCount]; - KMerPosition[] kmers = new KMerPosition[fileCount]; - for (int i = 0; i < fileCount; i++) { - inputStreams[i] = - new BufferedInputStream(new FileInputStream(spillFiles.get(i))); - } - while (true) { - for (int i = 0; i < fileCount; i++) { - if (kmers[i] == null && inputStreams[i] != null) { - kmers[i] = readKMerPosition(inputStreams[i]); - if (kmers[i] == null) { - inputStreams[i].close(); - inputStreams[i] = null; - } - } - } - int count = 0; - KMerPosition kmer = null; - for (int i = 0; i < fileCount; i++) { - KMerPosition kmp = kmers[i]; - if (kmp == null) { - continue; - } else if (kmer == null) { - kmer = kmp; - count = 1; - } else { - int cmp = kmp.compareTo(kmer); - if (cmp == 0) { - count++; - } else if (cmp < 0) { - kmer = kmp; - count = 1; - } - } - } - if (kmer == null) { - break; - } - for (int i = 0; i < fileCount; i++) { - if (kmers[i] == kmer) { - kmers[i] = null; - } - } - if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - writeKMerPosition(outputStream, kmer); - } - } - outputStream.flush(); - outputStream.close(); - for (int i = 0; i < fileCount; i++) { - // spillFiles.get(i).delete(); - } - log("Spill files merged, unique count is " + uniqueCount); - return uniqueCount; - } - - private void writeKMerTextFile(File inputFile, - List exceptionList, - File outputFile) - throws IOException { - - log("Writing kmer file " + outputFile + " ..."); - int exceptionIndex = 0; - StringKMerPosition excKMer = null; - Iterator excIter = null; - if (!exceptionList.isEmpty()) { - excIter = exceptionList.iterator(); - excKMer = excIter.next(); - } - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(inputFile)); - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - KMerPosition kmer = readKMerPosition(inputStream); - while (kmer != null || excKMer != null) { - if (excKMer == null) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else if (kmer == null) { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } - } - inputStream.close(); - writer.flush(); - writer.close(); - log("Wrote kmer file: " + outputFile); - } - - private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeKMer(String kmer, int baseIndex, PrintWriter writer) { - String chr = getBaseIndexSequenceName(baseIndex); - int pos = getBaseIndexCoordinate(baseIndex); - writer.println(kmer + "\t" + chr + "\t" + pos); - } - - private void createMapFile(int mapSize, - File kmerFile, - List exceptionList, - File priorMapFile, - File mapFile) - throws IOException { - byte[] map = null; - long uniquePriorCount = 0; - if (priorMapFile.exists()) { - map = readMapFile(priorMapFile); - if (map.length != mapSize) { - throw new RuntimeException("Prior map is wrong size"); - } - // Clear the new bits from prior map. - // Also count the prior unique positions while we are at it. - // Note that this is a count of positions, not kmers. - for (int i = 0; i < mapSize; i++) { - int cumBits = map[i] & 0x55; - uniquePriorCount += Integer.bitCount(cumBits); - map[i] = (byte) cumBits; - } - } else { - map = new byte[mapSize]; - } - for (StringKMerPosition kmp : exceptionList) { - addToMap(kmp, map); - } - mPriorMapUniqueCount = uniquePriorCount; - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(kmerFile)); - while (true) { - KMerPosition kmp = readKMerPosition(inputStream); - if (kmp == null) { - inputStream.close(); - break; - } - addToMap(kmp, map); - } - - long testCum = 0; - for (int i = 0; i < map.length; i++) { - testCum += Integer.bitCount(map[i] & 0x55); - } - - writeMapFile(map, mapFile); - } - - private void addToMap(KMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(StringKMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(int baseIndex, byte[] map) { - int mod = baseIndex & 0x3; - int offset = (baseIndex >> 2) & 0x3FFFFFFF; - if (((map[offset] >> (2*mod)) & 0x3) != 0) { - throw new RuntimeException("Map entry already set: " + baseIndex); - } - map[offset] |= (0x3 << (2*mod)); - } - - private void writeSummaryStatistics(File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL; - long uniqueCount = mUniquePriorCount + mUniqueNewCount; - long nonUniqueCount = mKMerCount - uniqueCount; - writer.println("K: " + mK); - writer.println("Sequences: " + mSequenceList.size()); - writer.println("Bases: " + baseCount); - writer.println("KMers: " + mKMerCount); - writer.println("Prior map count: " + mPriorMapUniqueCount); - writer.println("Unique prior: " + mUniquePriorCount + - " (" + formatPercent(mUniquePriorCount, mKMerCount) + ")"); - writer.println("Unique new: " + mUniqueNewCount + - " (" + formatPercent(mUniqueNewCount, mKMerCount) + ")"); - writer.println("Unique cumulative: " + uniqueCount + - " (" + formatPercent(uniqueCount, mKMerCount) + ")"); - writer.println("Nonunique: " + nonUniqueCount + - " (" + formatPercent(nonUniqueCount, mKMerCount) + ")"); - writer.flush(); - writer.close(); - } - - private String formatPercent(long numerator, long denominator) { - double fraction = 0.0; - if (denominator != 0) { - fraction = numerator / (double) denominator; - } - return String.format("%1.1f%%", fraction * 100.0); - } - - private void openPriorMap(File mapFile) - throws IOException { - if (mapFile.exists()) { - mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile)); - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - } - - private void closePriorMap() - throws IOException { - if (mPriorMapStream != null) { - mPriorMapStream.close(); - } - mPriorMapStream = null; - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - - private byte[] readMapFile(File file) - throws IOException { - long fileLength = file.length(); - if (fileLength > 1000000000) { - throw new RuntimeException("Prior map too large: " + file); - } - int length = (int) fileLength; - byte[] map = new byte[length]; - FileInputStream stream = new FileInputStream(file); - int count = readFully(stream, map, 0, length); - if (count != length) { - throw new RuntimeException("Failed to read map: " + file); - } - stream.close(); - return map; - } - - private void writeMapFile(byte[] map, File file) - throws IOException { - FileOutputStream stream = new FileOutputStream(file); - stream.write(map); - stream.flush(); - stream.close(); - } - - private boolean isUniqueInPriorMap(int baseIndex) - throws IOException { - if (mPriorMapStream == null) { - return false; - } - int byteOffset = (baseIndex >> 2) & 0x3FFFFFFF; - if (byteOffset != mPriorMapPosition) { - int delta = byteOffset - mPriorMapPosition; - if (delta < 0) { - throw new RuntimeException("Attempt to seek backwards in prior map"); - } - if (delta > 1) { - skipFully(mPriorMapStream, delta-1); - } - mPriorMapValue = mPriorMapStream.read(); - if (mPriorMapValue < 0) { - throw new RuntimeException("Unexpected end of file in prior map"); - } - mPriorMapPosition += delta; - } - int mod = baseIndex & 0x3; - return (((mPriorMapValue >> (2*mod)) & 1) != 0); - } - - private void skipFully(InputStream stream, long amount) - throws IOException { - while (amount > 0) { - long skip = stream.skip(amount); - if (skip <= 0 || skip > amount) { - throw new RuntimeException("Skip failed"); - } - amount -= skip; - } - } - - private String getBaseIndexSequenceName(int baseIndex) { - int sequenceCount = mSequenceList.size(); - for (int i = 0; i < sequenceCount-1; i++) { - int nextOffset = mSequenceOffsetList.get(i+1); - if (compareBaseIndex(nextOffset, baseIndex) > 0) { - return mSequenceList.get(i); - } - } - return mSequenceList.get(sequenceCount-1); - } - - private int getBaseIndexCoordinate(int baseIndex) { - Integer sequenceOffset = null; - for (Integer offset : mSequenceOffsetList) { - if (compareBaseIndex(offset, baseIndex) > 0) { - break; - } - sequenceOffset = offset; - } - if (sequenceOffset == null) { - return 0; - } - int coordinate = baseIndex - sequenceOffset + 1; - if (coordinate <= 0) { - dumpSequenceList(); - System.out.println("coordinate: " + coordinate); - System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset)); - System.out.println("baseIndex: " + Integer.toHexString(baseIndex)); - throw new RuntimeException("Internal error: illegal coordinate " + - coordinate + " for base index " + baseIndex); - } - return coordinate; - } - - private void dumpSequenceList() { - System.out.println("# Sequences:"); - int count = mSequenceList.size(); - for (int i = 0; i < count; i++) { - String seqName = mSequenceList.get(i); - int offset = mSequenceOffsetList.get(i); - System.out.println("# " + seqName + - "\t" + offset + - "\t" + Integer.toHexString(offset)); - } - } - - private int compareBaseIndex(int baseIndex1, int baseIndex2) { - // Implements unsigned comparison, a la compareTo - if (baseIndex1 < 0 ^ baseIndex2 < 0) { - return ((baseIndex1 < 0) ? 1 : -1); - } else { - return (baseIndex1 - baseIndex2); - } - } - - private String getNextSequence() - throws IOException { - - while (mNextSequence == null) { - if (mCurrentReader == null) { - mCurrentReader = getNextReader(); - if (mCurrentReader == null) { - return null; - } - } - String line = mCurrentReader.readLine(); - if (line == null) { - mCurrentReader.close(); - mCurrentReader = null; - continue; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - } - } - String result = mNextSequence; - mNextSequence = null; - return result; - } - - private LineNumberReader getNextReader() - throws IOException { - if (mInputFileIndex >= mInputFiles.size()) { - return null; - } - File file = mInputFiles.get(mInputFileIndex++); - return new LineNumberReader(new FileReader(file)); - } - - private char[] getNextKMer() - throws IOException { - - if (mKMerBuffer == null) { - mKMerBuffer = new char[mK]; - } - System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1); - if (mKMerBufferedCount > 0) { - mKMerBufferedCount--; - } - - while (mKMerBufferedCount < mK) { - char base = getNextBase(); - if (base == 0) { - incrementBaseIndex(mKMerBufferedCount); - mKMerBufferedCount = 0; - return null; - } else if (base == 'N') { - incrementBaseIndex(mKMerBufferedCount+1); - mKMerBufferedCount = 0; - } else { - mKMerBuffer[mKMerBufferedCount++] = base; - } - } - incrementBaseIndex(1); - return mKMerBuffer; - } - - private char getNextBase() - throws IOException { - - if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { - if (mCurrentReader == null) { - return 0; - } - String line = mCurrentReader.readLine(); - if (line == null) { - mLineBuffer = null; - mLineBufferIndex = 0; - mCurrentReader.close(); - mCurrentReader = null; - return 0; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - mLineBuffer = null; - mLineBufferIndex = 0; - return 0; - } - mLineBuffer = line.toUpperCase(); - mLineBufferIndex = 0; - } - return mLineBuffer.charAt(mLineBufferIndex++); - } - - private void incrementBaseIndex(int amount) { - if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { - throw new RuntimeException("Base index: 32-bit overflow"); - } - mBaseIndex += amount; - } - - private void log(String text) { - if (mVerbose) { - System.out.println("# " + new Date() + " " + text); - } - } - - private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) { - if (mUseOldFormat) { - return encodeKMerOldFormat(kmerChars, baseIndex); - } - if (kmerChars == null) { - return null; - } - int kmerLength = kmerChars.length; - int encodingLength = (kmerLength + 7) / 8; - char[] encoding = new char[encodingLength]; - int offset = kmerLength % 8; - offset = (offset == 0) ? 8 : offset; - int bits = encodeKMerBits(kmerChars, 0, offset); - if (bits < 0) { - return null; - } - encoding[0] = (char) bits; - for (int i = 1; i < encodingLength; i++) { - bits = encodeKMerBits(kmerChars, offset, 8); - if (bits < 0) { - return null; - } - encoding[i] = (char) bits; - offset += 8; - } - return new KMerPositionN(encoding, baseIndex); - } - - private static KMerPosition encodeKMerOldFormat(char[] kmerChars, int baseIndex) { - if (kmerChars == null) { - return null; - } - int length = kmerChars.length; - if (length <= 31) { - long bits = encodeKMerBitsLong(kmerChars, 0, length); - if (bits == -1) { - return null; - } - return new KMerPosition1(bits, baseIndex); - } else if (length <= 62) { - long bits1 = encodeKMerBitsLong(kmerChars, 0, 31); - long bits2 = encodeKMerBitsLong(kmerChars, 31, length - 31); - if (bits1 == -1 || bits2 == -1) { - return null; - } - return new KMerPosition2(bits1, bits2, baseIndex); - } else { - return null; - } - } - - private static int encodeKMerBits(char[] kmerChars, int offset, int length) { - int bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= baseBits << (2*(length-i-1)); - } - return bits; - } - - private static long encodeKMerBitsLong(char[] kmerChars, int offset, int length) { - long bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= ((long)baseBits) << (2*(length-i-1)); - } - return bits; - } - - private static String decodeKMer1(long bits) { - int length = mK; - char[] buffer = new char[length]; - decodeKMerBits(bits, buffer, 0, length); - return new String(buffer); - } - - private static String decodeKMer2(long bits1, long bits2) { - int length = mK; - char[] buffer = new char[length]; - decodeKMerBits(bits1, buffer, 0, 31); - decodeKMerBits(bits2, buffer, 31, length-31); - return new String(buffer); - } - - private static String decodeKMerN(char[] encoding) { - int length = mK; - char[] buffer = new char[length]; - int offset = length % 8; - offset = (offset == 0) ? 8 : offset; - decodeKMerBits(encoding[0], buffer, 0, offset); - for (int i = 1; i < encoding.length; i++) { - decodeKMerBits(encoding[i], buffer, offset, 8); - offset += 8; - } - return new String(buffer); - } - - private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - static class KMerPosition - implements Comparable { - - private int mBaseIndex; - - KMerPosition(int baseIndex) { - mBaseIndex = baseIndex; - } - - public String getKMer() { - return null; - } - - public long getKMerEncoding1() { - return -1; - } - - public long getKMerEncoding2() { - return -1; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public char[] getKMerEncoding() { - return null; - } - - public int compareTo(KMerPosition kmp) { - char[] encoding1 = getKMerEncoding(); - char[] encoding2 = kmp.getKMerEncoding(); - int length = Math.max(encoding1.length, encoding2.length); - for (int i = 0; i < length; i++) { - int result = encoding1[i] - encoding2[i]; - if (result != 0) { - return result; - } - } - return 0; - } - } - - static class KMerPosition1 - extends KMerPosition { - - private long mKMerEncoding1; - - KMerPosition1(long kmer, int baseIndex) { - super(baseIndex); - mKMerEncoding1 = kmer; - } - - public String getKMer() { - return decodeKMer1(getKMerEncoding1()); - } - - public final long getKMerEncoding1() { - return mKMerEncoding1; - } - - public int compareTo(KMerPosition kmp) { - int result = Long.signum(getKMerEncoding1() - kmp.getKMerEncoding1()); - if (result == 0) { - result = Long.signum(getKMerEncoding2() - kmp.getKMerEncoding2()); - } - return result; - } - } - - static class KMerPosition2 - extends KMerPosition1 { - - private long mKMerEncoding2; - - KMerPosition2(long encoding1, long encoding2, int baseIndex) { - super(encoding1, baseIndex); - mKMerEncoding2 = encoding2; - } - - public String getKMer() { - return decodeKMer2(getKMerEncoding1(), getKMerEncoding2()); - } - - public final long getKMerEncoding2() { - return mKMerEncoding2; - } - } - - static class KMerPositionN - extends KMerPosition { - - private char[] mKMerEncoding; - - KMerPositionN(char[] encoding, int baseIndex) { - super(baseIndex); - mKMerEncoding = encoding; - } - - public String getKMer() { - return decodeKMerN(mKMerEncoding); - } - - public final char[] getKMerEncoding() { - return mKMerEncoding; - } - } - - static class StringKMerPosition - implements Comparable { - - private String mKMerString = null; - private int mBaseIndex; - - StringKMerPosition(String kmer, int baseIndex) { - mKMerString = kmer; - mBaseIndex = baseIndex; - } - - public final String getKMer() { - return mKMerString; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public int compareTo(StringKMerPosition kmp) { - return mKMerString.compareTo(kmp.mKMerString); - } - } -} diff --git a/java/lib/edu/mit/broad/cnv/CountKMers3.java b/java/lib/edu/mit/broad/cnv/CountKMers3.java deleted file mode 100644 index 81ddb1745..000000000 --- a/java/lib/edu/mit/broad/cnv/CountKMers3.java +++ /dev/null @@ -1,1426 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import java.io.*; -import java.util.*; - - -/** - * Tool for counting unique kmers. - */ -public class CountKMers3 -{ - private static final int NONUNIQUE_MARKER = -1; - private static boolean mUseOldFormat = false; - - private String mAction = null; - private static int mK = 0; - private int mBatchSize = 0; - private List mInputFiles = null; - private File mInputDirectory = null; - private File mOutputDirectory = null; - private boolean mVerbose = false; - private boolean mDebug = false; - - private List mSequenceList = null; - private List mSequenceOffsetList = null; - private List mSpillFileList = null; - private double mSpillFactor = 0.9; - - private long mKMerCount = 0; - private long mUniquePriorCount = 0; - private long mUniqueNewCount = 0; - private long mPriorMapUniqueCount = 0; - - private InputStream mPriorMapStream = null; - private int mPriorMapPosition = -1; - private int mPriorMapValue = 0; - private int mInputFileIndex = 0; - private LineNumberReader mCurrentReader = null; - private String mNextSequence = null; - private char[] mKMerBuffer = null; - private int mKMerBufferedCount = 0; - private String mLineBuffer = null; - private int mLineBufferIndex = 0; - private int mBaseIndex = -1; - private byte[] mIOBuffer = null; - - /* Design - Inputs: - - One or more fasta files to search (currently one). - - Output directory for the result files. - - Optionally an input k-1-mer file (output from previous pass). - Outputs: - - Unique kmer file: (sorted by kmer) - This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not). - - Per chromosome bit map: pos (implicit) new-bit cum-bit - New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not. - Cum-bit is 1 if Kmer starting at pos is unique for some L <= K. - - Statistics - Plan: - - Reducing memory footprint is crucial. - - Sequential pass over the input sequences to generate kmers. - - BatchSize kmers are cached in memory, then sorted and uniqified. - - As batch array fills, batches are spilled to disk. - - Batches are reloaded from disk and merged (N-finger algorithm) - - and streamed to a merge file. - - Merge file is read from disk and processed as final results. - */ - - public static void main(String[] args) - throws Exception { - new CountKMers3().run(args); - } - - private void usage() { - System.out.println("Usage: CountKMers ..."); - System.out.println(" -action "); - System.out.println(" -genome "); - System.out.println(" -k "); - System.out.println(" -batchSize "); - System.out.println(" -inputDir "); - System.out.println(" -outputDir "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-action") && argsleft > 1) { - argpos++; - mAction = args[argpos++]; - } else if (arg.equals("-genome") && argsleft > 1) { - argpos++; - if (mInputFiles == null) { - mInputFiles = new ArrayList(); - } - mInputFiles.add(new File(args[argpos++])); - } else if (arg.equals("-k") && argsleft > 1) { - argpos++; - mK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-batchSize") && argsleft > 1) { - argpos++; - mBatchSize = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-inputDir") && argsleft > 1) { - argpos++; - mInputDirectory = new File(args[argpos++]); - } else if (arg.equals("-outputDir") && argsleft > 1) { - argpos++; - mOutputDirectory = new File(args[argpos++]); - } else if (arg.equals("-oldFormat")) { - argpos++; - mUseOldFormat = true; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - if (!parseArguments(args)) { - System.exit(1); - } - if (mAction == null || mAction.equals("mapKMers")) { - mapKMers(); - } else if (mAction.equals("mapGaps")) { - mapGaps(); - } - } - - // Can be used to scan genome for sequence names/lengths. - private void scanKMers() - throws IOException { - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - openPriorMap(priorMapFile); - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Scanning " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - if (isUniqueInPriorMap(mBaseIndex)) { - continue; - } - } - } - closePriorMap(); - } - - private void mapGaps() - throws IOException { - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - int pos = 0; - int gapStart = 0; - while (true) { - char base = getNextBase(); - if (base == 0) { - break; - } - pos++; - if (base == 'N') { - if (gapStart == 0) { - gapStart = pos; - } - } else { - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - - private void mapKMers() - throws IOException { - - File textKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.txt"); - File binaryKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.bin"); - File exceptionFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.extra"); - File mapFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin"); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - File statsFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt"); - - if (mBatchSize == 0) { - throw new RuntimeException("Batch size not specified"); - } - - int kmerCount = 0; - int batchSize = mBatchSize; - KMerPosition[] kmerArray = new KMerPosition[batchSize]; - List exceptionList = new ArrayList(); - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))]; - - openPriorMap(priorMapFile); - - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Processing " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - int baseIndex = mBaseIndex; - if (isUniqueInPriorMap(baseIndex)) { - mUniquePriorCount++; - continue; - } - - KMerPosition kmp = encodeKMer(kmerChars, baseIndex); - if (kmp == null) { - // Note: We currently do not handle the reverse - // complement of exception characters correctly. - // For hg18, however, this doesn't matter as - // none of the kmers containing non-ACGT characters - // are present on the reverse strand. - String kmer = new String(kmerChars); - exceptionList.add(new StringKMerPosition(kmer, baseIndex)); - continue; - } - kmerArray[kmerCount++] = kmp; - if (kmerCount == batchSize) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (kmerCount > mSpillFactor * batchSize) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - } - } - if (kmerCount > 0) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (mSpillFileList != null) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - - closePriorMap(); - - // Write out the exception kmers (text file). - compactKMers(exceptionList); - writeExceptionFile(exceptionList, exceptionFile); - - // Write out the binary file of unique encoded kmers. - if (mSpillFileList == null) { - kmerCount = removeNonUnique(kmerArray, kmerCount); - writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile); - mUniqueNewCount = kmerCount; - } else { - mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile); - } - mUniqueNewCount += countUniqueKMers(exceptionList); - - // Write out the text file of (all) unique kmers. - writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile); - - // Create map file from prior map plus the new unique kmers. - int mapSize = ((mBaseIndex >> 2) & 0x3FFFFFFF) + 1; - createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile); - - // Write summary statistics file. - writeSummaryStatistics(statsFile); - } - - private int compactKMers(KMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " kmers at index " + - Integer.toHexString(mBaseIndex) + " ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - KMerPosition current = kmerArray[0]; - for (int i = 1; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (current.compareTo(kmp) == 0) { - current.setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - current = kmp; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " string kmers ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - String kmerString = kmerArray[0].getKMer(); - for (int i = 1; i < kmerCount; i++) { - StringKMerPosition kmp = kmerArray[i]; - String ks = kmp.getKMer(); - if (ks.equals(kmerString)) { - kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - kmerString = ks; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private void compactKMers(List kmerList) { - int kmerCount = kmerList.size(); - if (kmerCount <= 1) { - return; - } - StringKMerPosition[] kmerArray = - kmerList.toArray(new StringKMerPosition[kmerCount]); - kmerCount = compactKMers(kmerArray, kmerCount); - kmerList.clear(); - for (int i = 0; i < kmerCount; i++) { - kmerList.add(kmerArray[i]); - } - } - - private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) { - int uniqueCount = 0; - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - kmerArray[uniqueCount++] = kmp; - } - } - return uniqueCount; - } - - private int countUniqueKMers(List kmerList) { - int uniqueCount = 0; - for (StringKMerPosition kmp : kmerList) { - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - } - } - return uniqueCount; - } - - private void spillKMers(KMerPosition[] kmerArray, int kmerCount) - throws IOException { - if (mSpillFileList == null) { - mSpillFileList = new ArrayList(); - } - int fileNumber = mSpillFileList.size() + 1; - log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ..."); - File spillFile = new File(mOutputDirectory, - "spill_" + mK + "_" + fileNumber + ".tmp"); - mSpillFileList.add(spillFile); - writeKMerBinaryFile(kmerArray, kmerCount, spillFile); - log("Spill file written"); - } - - private void writeKMerBinaryFile(KMerPosition[] kmerArray, - int kmerCount, - File outputFile) - throws IOException { - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - writeKMerPosition(outputStream, kmerArray[i]); - } - outputStream.flush(); - outputStream.close(); - } - - private void writeExceptionFile(List kmerList, - File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - for (StringKMerPosition kmer : kmerList) { - writeUniqueKMer(kmer, writer); - } - writer.flush(); - writer.close(); - } - - private KMerPosition readKMerPosition(InputStream stream) - throws IOException { - if (mUseOldFormat) { - return readKMerPositionOldFormat(stream); - } - byte[] buffer = mIOBuffer; - int encodingLength = (mK + 7)/8; - int fileLength = 4 + 2*encodingLength; - int count = readFully(stream, buffer, 0, fileLength); - if (count <= 0) { - return null; - } else if (count != fileLength) { - throw new RuntimeException("Unexpected end of file"); - } - char[] encoding = new char[encodingLength]; - int baseIndex = ((buffer[0] & 0xFF) | - (buffer[1] & 0xFF) << 8 | - (buffer[2] & 0xFF) << 16 | - (buffer[3] & 0xFF) << 24); - for (int i = 0; i < encodingLength; i++) { - encoding[i] = (char) ((buffer[2*i+4] & 0xFF) | - ((buffer[2*i+5] & 0xFF) << 8)); - } - return new KMerPositionN(encoding, baseIndex); - } - - private KMerPosition readKMerPositionOldFormat(InputStream stream) - throws IOException { - byte[] buffer = mIOBuffer; - int length = (mK >= 32 ? 20 : 12); - int count = readFully(stream, buffer, 0, length); - if (count <= 0) { - return null; - } else if (count != length) { - throw new RuntimeException("Unexpected end of file"); - } - long encoding = (((long)(buffer[0] & 0xFF)) | - ((long)(buffer[1] & 0xFF)) << 8 | - ((long)(buffer[2] & 0xFF)) << 16 | - ((long)(buffer[3] & 0xFF)) << 24 | - ((long)(buffer[4] & 0xFF)) << 32 | - ((long)(buffer[5] & 0xFF)) << 40 | - ((long)(buffer[6] & 0xFF)) << 48 | - ((long)(buffer[7] & 0xFF)) << 56); - int baseIndex = ((buffer[length-4] & 0xFF) | - (buffer[length-3] & 0xFF) << 8 | - (buffer[length-2] & 0xFF) << 16 | - (buffer[length-1] & 0xFF) << 24); - if (length == 12) { - return new KMerPosition1(encoding, baseIndex); - } else { - long encoding2 = (((long)(buffer[8] & 0xFF)) | - ((long)(buffer[9] & 0xFF)) << 8 | - ((long)(buffer[10] & 0xFF)) << 16 | - ((long)(buffer[11] & 0xFF)) << 24 | - ((long)(buffer[12] & 0xFF)) << 32 | - ((long)(buffer[13] & 0xFF)) << 40 | - ((long)(buffer[14] & 0xFF)) << 48 | - ((long)(buffer[15] & 0xFF)) << 56); - return new KMerPosition2(encoding, encoding2, baseIndex); - } - } - - private int readFully(InputStream stream, byte[] buffer, int offset, int count) - throws IOException { - int readCount = 0; - while (readCount < count) { - int read = stream.read(buffer, offset, count-readCount); - if (read <= 0) { - break; - } - offset += read; - readCount += read; - } - return readCount; - } - - private void writeKMerPosition(OutputStream stream, KMerPosition kmer) - throws IOException { - if (mUseOldFormat) { - writeKMerPositionOldFormat(stream, kmer); - return; - } - byte[] buffer = mIOBuffer; - int baseIndex = kmer.getBaseIndex(); - char[] encoding = kmer.getKMerEncoding(); - int offset = 0; - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - for (int i = 0; i < encoding.length; i++) { - buffer[offset++] = (byte) ((encoding[i]) & 0xFF); - buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF); - } - stream.write(buffer, 0, offset); - } - - private void writeKMerPositionOldFormat(OutputStream stream, KMerPosition kmer) - throws IOException { - byte[] buffer = mIOBuffer; - long encoding1 = kmer.getKMerEncoding1(); - long encoding2 = kmer.getKMerEncoding2(); - int baseIndex = kmer.getBaseIndex(); - int offset = 0; - buffer[offset++] = (byte) ((encoding1) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 8) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 16) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 24) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 32) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 40) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 48) & 0xFF); - buffer[offset++] = (byte) ((encoding1 >> 56) & 0xFF); - if (mK >= 32) { - buffer[offset++] = (byte) ((encoding2) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 8) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 16) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 24) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 32) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 40) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 48) & 0xFF); - buffer[offset++] = (byte) ((encoding2 >> 56) & 0xFF); - } - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - stream.write(buffer, 0, offset); - } - - private long mergeSpillFiles(List spillFiles, File outputFile) - throws IOException { - - if (spillFiles == null) { - return 0; - } - - log("Merging spill files ..."); - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - long uniqueCount = 0; - int fileCount = spillFiles.size(); - InputStream[] inputStreams = new InputStream[fileCount]; - KMerPosition[] kmers = new KMerPosition[fileCount]; - for (int i = 0; i < fileCount; i++) { - inputStreams[i] = - new BufferedInputStream(new FileInputStream(spillFiles.get(i))); - } - while (true) { - for (int i = 0; i < fileCount; i++) { - if (kmers[i] == null && inputStreams[i] != null) { - kmers[i] = readKMerPosition(inputStreams[i]); - if (kmers[i] == null) { - inputStreams[i].close(); - inputStreams[i] = null; - } - } - } - int count = 0; - KMerPosition kmer = null; - for (int i = 0; i < fileCount; i++) { - KMerPosition kmp = kmers[i]; - if (kmp == null) { - continue; - } else if (kmer == null) { - kmer = kmp; - count = 1; - } else { - int cmp = kmp.compareTo(kmer); - if (cmp == 0) { - count++; - } else if (cmp < 0) { - kmer = kmp; - count = 1; - } - } - } - if (kmer == null) { - break; - } - for (int i = 0; i < fileCount; i++) { - if (kmers[i] != null && kmer.compareTo(kmers[i]) == 0) { - kmers[i] = null; - } - } - if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - writeKMerPosition(outputStream, kmer); - } - - } - outputStream.flush(); - outputStream.close(); - for (int i = 0; i < fileCount; i++) { - // spillFiles.get(i).delete(); - } - log("Spill files merged, unique count is " + uniqueCount); - return uniqueCount; - } - - private void writeKMerTextFile(File inputFile, - List exceptionList, - File outputFile) - throws IOException { - - log("Writing kmer file " + outputFile + " ..."); - int exceptionIndex = 0; - StringKMerPosition excKMer = null; - Iterator excIter = null; - if (!exceptionList.isEmpty()) { - excIter = exceptionList.iterator(); - excKMer = excIter.next(); - } - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(inputFile)); - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - KMerPosition kmer = readKMerPosition(inputStream); - while (kmer != null || excKMer != null) { - if (excKMer == null) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else if (kmer == null) { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } - } - inputStream.close(); - writer.flush(); - writer.close(); - log("Wrote kmer file: " + outputFile); - } - - private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeKMer(String kmer, int baseIndex, PrintWriter writer) { - String chr = getBaseIndexSequenceName(baseIndex); - int pos = getBaseIndexCoordinate(baseIndex); - writer.println(kmer + "\t" + chr + "\t" + pos); - } - - private void createMapFile(int mapSize, - File kmerFile, - List exceptionList, - File priorMapFile, - File mapFile) - throws IOException { - byte[] map = null; - long uniquePriorCount = 0; - if (priorMapFile.exists()) { - map = readMapFile(priorMapFile); - if (map.length != mapSize) { - throw new RuntimeException("Prior map is wrong size"); - } - // Clear the new bits from prior map. - // Also count the prior unique positions while we are at it. - // Note that this is a count of positions, not kmers. - for (int i = 0; i < mapSize; i++) { - int cumBits = map[i] & 0x55; - uniquePriorCount += Integer.bitCount(cumBits); - map[i] = (byte) cumBits; - } - } else { - map = new byte[mapSize]; - } - for (StringKMerPosition kmp : exceptionList) { - addToMap(kmp, map); - } - mPriorMapUniqueCount = uniquePriorCount; - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(kmerFile)); - while (true) { - KMerPosition kmp = readKMerPosition(inputStream); - if (kmp == null) { - inputStream.close(); - break; - } - addToMap(kmp, map); - } - - long testCum = 0; - for (int i = 0; i < map.length; i++) { - testCum += Integer.bitCount(map[i] & 0x55); - } - - writeMapFile(map, mapFile); - } - - private void addToMap(KMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(StringKMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(int baseIndex, byte[] map) { - int mod = baseIndex & 0x3; - int offset = (baseIndex >> 2) & 0x3FFFFFFF; - if (((map[offset] >> (2*mod)) & 0x3) != 0) { - throw new RuntimeException("Map entry already set: " + baseIndex); - } - map[offset] |= (0x3 << (2*mod)); - } - - private void writeSummaryStatistics(File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL; - long uniqueCount = mUniquePriorCount + mUniqueNewCount; - long nonUniqueCount = mKMerCount - uniqueCount; - writer.println("K: " + mK); - writer.println("Sequences: " + mSequenceList.size()); - writer.println("Bases: " + baseCount); - writer.println("KMers: " + mKMerCount); - writer.println("Prior map count: " + mPriorMapUniqueCount); - writer.println("Unique prior: " + mUniquePriorCount + - " (" + formatPercent(mUniquePriorCount, mKMerCount) + ")"); - writer.println("Unique new: " + mUniqueNewCount + - " (" + formatPercent(mUniqueNewCount, mKMerCount) + ")"); - writer.println("Unique cumulative: " + uniqueCount + - " (" + formatPercent(uniqueCount, mKMerCount) + ")"); - writer.println("Nonunique: " + nonUniqueCount + - " (" + formatPercent(nonUniqueCount, mKMerCount) + ")"); - writer.flush(); - writer.close(); - } - - private String formatPercent(long numerator, long denominator) { - double fraction = 0.0; - if (denominator != 0) { - fraction = numerator / (double) denominator; - } - return String.format("%1.1f%%", fraction * 100.0); - } - - private void openPriorMap(File mapFile) - throws IOException { - if (mapFile.exists()) { - mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile)); - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - } - - private void closePriorMap() - throws IOException { - if (mPriorMapStream != null) { - mPriorMapStream.close(); - } - mPriorMapStream = null; - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - - private byte[] readMapFile(File file) - throws IOException { - long fileLength = file.length(); - if (fileLength > 1000000000) { - throw new RuntimeException("Prior map too large: " + file); - } - int length = (int) fileLength; - byte[] map = new byte[length]; - FileInputStream stream = new FileInputStream(file); - int count = readFully(stream, map, 0, length); - if (count != length) { - throw new RuntimeException("Failed to read map: " + file); - } - stream.close(); - return map; - } - - private void writeMapFile(byte[] map, File file) - throws IOException { - FileOutputStream stream = new FileOutputStream(file); - stream.write(map); - stream.flush(); - stream.close(); - } - - private boolean isUniqueInPriorMap(int baseIndex) - throws IOException { - if (mPriorMapStream == null) { - return false; - } - int byteOffset = (baseIndex >> 2) & 0x3FFFFFFF; - if (byteOffset != mPriorMapPosition) { - int delta = byteOffset - mPriorMapPosition; - if (delta < 0) { - throw new RuntimeException("Attempt to seek backwards in prior map"); - } - if (delta > 1) { - skipFully(mPriorMapStream, delta-1); - } - mPriorMapValue = mPriorMapStream.read(); - if (mPriorMapValue < 0) { - throw new RuntimeException("Unexpected end of file in prior map"); - } - mPriorMapPosition += delta; - } - int mod = baseIndex & 0x3; - return (((mPriorMapValue >> (2*mod)) & 1) != 0); - } - - private void skipFully(InputStream stream, long amount) - throws IOException { - while (amount > 0) { - long skip = stream.skip(amount); - if (skip <= 0 || skip > amount) { - throw new RuntimeException("Skip failed"); - } - amount -= skip; - } - } - - private String getBaseIndexSequenceName(int baseIndex) { - int sequenceCount = mSequenceList.size(); - for (int i = 0; i < sequenceCount-1; i++) { - int nextOffset = mSequenceOffsetList.get(i+1); - if (compareBaseIndex(nextOffset, baseIndex) > 0) { - return mSequenceList.get(i); - } - } - return mSequenceList.get(sequenceCount-1); - } - - private int getBaseIndexCoordinate(int baseIndex) { - Integer sequenceOffset = null; - for (Integer offset : mSequenceOffsetList) { - if (compareBaseIndex(offset, baseIndex) > 0) { - break; - } - sequenceOffset = offset; - } - if (sequenceOffset == null) { - return 0; - } - int coordinate = baseIndex - sequenceOffset + 1; - if (coordinate <= 0) { - dumpSequenceList(); - System.out.println("coordinate: " + coordinate); - System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset)); - System.out.println("baseIndex: " + Integer.toHexString(baseIndex)); - throw new RuntimeException("Internal error: illegal coordinate " + - coordinate + " for base index " + baseIndex); - } - return coordinate; - } - - private void dumpSequenceList() { - System.out.println("# Sequences:"); - int count = mSequenceList.size(); - for (int i = 0; i < count; i++) { - String seqName = mSequenceList.get(i); - int offset = mSequenceOffsetList.get(i); - System.out.println("# " + seqName + - "\t" + offset + - "\t" + Integer.toHexString(offset)); - } - } - - private int compareBaseIndex(int baseIndex1, int baseIndex2) { - // Implements unsigned comparison, a la compareTo - if (baseIndex1 < 0 ^ baseIndex2 < 0) { - return ((baseIndex1 < 0) ? 1 : -1); - } else { - return (baseIndex1 - baseIndex2); - } - } - - private String getNextSequence() - throws IOException { - - while (mNextSequence == null) { - if (mCurrentReader == null) { - mCurrentReader = getNextReader(); - if (mCurrentReader == null) { - return null; - } - } - String line = mCurrentReader.readLine(); - if (line == null) { - mCurrentReader.close(); - mCurrentReader = null; - continue; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - } - } - String result = mNextSequence; - mNextSequence = null; - return result; - } - - private LineNumberReader getNextReader() - throws IOException { - if (mInputFileIndex >= mInputFiles.size()) { - return null; - } - File file = mInputFiles.get(mInputFileIndex++); - return new LineNumberReader(new FileReader(file)); - } - - private char[] getNextKMer() - throws IOException { - - if (mKMerBuffer == null) { - mKMerBuffer = new char[mK]; - } - System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1); - if (mKMerBufferedCount > 0) { - mKMerBufferedCount--; - } - - while (mKMerBufferedCount < mK) { - char base = getNextBase(); - if (base == 0) { - incrementBaseIndex(mKMerBufferedCount); - mKMerBufferedCount = 0; - return null; - } else if (base == 'N') { - incrementBaseIndex(mKMerBufferedCount+1); - mKMerBufferedCount = 0; - } else { - mKMerBuffer[mKMerBufferedCount++] = base; - } - } - incrementBaseIndex(1); - return mKMerBuffer; - } - - private char getNextBase() - throws IOException { - - if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { - if (mCurrentReader == null) { - return 0; - } - String line = mCurrentReader.readLine(); - if (line == null) { - mLineBuffer = null; - mLineBufferIndex = 0; - mCurrentReader.close(); - mCurrentReader = null; - return 0; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - mLineBuffer = null; - mLineBufferIndex = 0; - return 0; - } - mLineBuffer = line.toUpperCase(); - mLineBufferIndex = 0; - } - return mLineBuffer.charAt(mLineBufferIndex++); - } - - private void incrementBaseIndex(int amount) { - if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { - throw new RuntimeException("Base index: 32-bit overflow"); - } - mBaseIndex += amount; - } - - private void log(String text) { - if (mVerbose) { - System.out.println("# " + new Date() + " " + text); - } - } - - private static void dbg(String text) { - System.out.println("#DBG: " + text); - } - - private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) { - if (mUseOldFormat) { - return encodeKMerOldFormat(kmerChars, baseIndex); - } - char[] encoding = encodeKMerChars(kmerChars); - if (encoding == null) { - return null; - } - char[] reverseEncoding = encodeKMerChars(reverseComplement(kmerChars)); - if (compareEncodings(encoding, reverseEncoding) <= 0) { - return new KMerPositionN(encoding, baseIndex); - } else { - KMerPositionN kmp = new KMerPositionN(reverseEncoding, baseIndex); - kmp.setIsReversed(true); - return kmp; - } - } - - private static char[] encodeKMerChars(char[] kmerChars) { - if (kmerChars == null) { - return null; - } - - int kmerLength = kmerChars.length; - int encodingLength = (kmerLength + 7) / 8; - char[] encoding = new char[encodingLength]; - int offset = kmerLength % 8; - offset = (offset == 0) ? 8 : offset; - int bits = encodeKMerBits(kmerChars, 0, offset); - if (bits < 0) { - return null; - } - encoding[0] = (char) bits; - for (int i = 1; i < encodingLength; i++) { - bits = encodeKMerBits(kmerChars, offset, 8); - if (bits < 0) { - return null; - } - encoding[i] = (char) bits; - offset += 8; - } - return encoding; - } - - private static int compareEncodings(char[] encoding1, char[] encoding2) { - int length = Math.max(encoding1.length, encoding2.length); - for (int i = 0; i < length; i++) { - int result = encoding1[i] - encoding2[i]; - if (result != 0) { - return result; - } - } - return 0; - } - - private static KMerPosition encodeKMerOldFormat(char[] kmerChars, int baseIndex) { - if (kmerChars == null) { - return null; - } - int length = kmerChars.length; - if (length <= 31) { - long bits = encodeKMerBitsLong(kmerChars, 0, length); - if (bits == -1) { - return null; - } - return new KMerPosition1(bits, baseIndex); - } else if (length <= 62) { - long bits1 = encodeKMerBitsLong(kmerChars, 0, 31); - long bits2 = encodeKMerBitsLong(kmerChars, 31, length - 31); - if (bits1 == -1 || bits2 == -1) { - return null; - } - return new KMerPosition2(bits1, bits2, baseIndex); - } else { - return null; - } - } - - private static int encodeKMerBits(char[] kmerChars, int offset, int length) { - int bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= baseBits << (2*(length-i-1)); - } - return bits; - } - - private static long encodeKMerBitsLong(char[] kmerChars, int offset, int length) { - long bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= ((long)baseBits) << (2*(length-i-1)); - } - return bits; - } - - private static String decodeKMer1(long bits) { - int length = mK; - char[] buffer = new char[length]; - decodeKMerBits(bits, buffer, 0, length); - return new String(buffer); - } - - private static String decodeKMer2(long bits1, long bits2) { - int length = mK; - char[] buffer = new char[length]; - decodeKMerBits(bits1, buffer, 0, 31); - decodeKMerBits(bits2, buffer, 31, length-31); - return new String(buffer); - } - - private static String decodeKMerN(char[] encoding, boolean reverse) { - int length = mK; - char[] buffer = new char[length]; - int offset = length % 8; - offset = (offset == 0) ? 8 : offset; - decodeKMerBits(encoding[0], buffer, 0, offset); - for (int i = 1; i < encoding.length; i++) { - decodeKMerBits(encoding[i], buffer, offset, 8); - offset += 8; - } - if (reverse) { - reverseComplementInPlace(buffer); - } - return new String(buffer); - } - - private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static char[] reverseComplement(char[] buffer) { - int length = buffer.length; - char[] result = new char[length]; - System.arraycopy(buffer, 0, result, 0, length); - reverseComplementInPlace(result); - return result; - } - - private static void reverseComplementInPlace(char[] buffer) { - int length = buffer.length; - int limit = (length + 1)/2; - for (int i = 0; i < limit; i++) { - char ch1 = reverseComplement(buffer[i]); - char ch2 = reverseComplement(buffer[length-i-1]); - buffer[i] = ch2; - buffer[length-i-1] = ch1; - } - } - - private static char reverseComplement(char base) { - switch (base) { - case 'A': - return 'T'; - case 'C': - return 'G'; - case 'G': - return 'C'; - case 'T': - return 'A'; - } - return base; - } - - private static String formatEncoding(char[] encoding) { - if (encoding == null) { - return null; - } - StringBuilder builder = new StringBuilder(); - builder.append('['); - for (int i = 0; i < encoding.length; i++) { - String hex = Integer.toHexString(encoding[i]); - int length = hex.length(); - while (length < 4) { - builder.append('0'); - length++; - } - builder.append(hex); - } - builder.append(']'); - return builder.toString(); - } - - static class KMerPosition - implements Comparable { - - private int mBaseIndex; - - KMerPosition(int baseIndex) { - mBaseIndex = baseIndex; - } - - public String getKMer() { - return null; - } - - public long getKMerEncoding1() { - return -1; - } - - public long getKMerEncoding2() { - return -1; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public char[] getKMerEncoding() { - return null; - } - - public int compareTo(KMerPosition kmp) { - return compareEncodings(getKMerEncoding(), kmp.getKMerEncoding()); - } - - public boolean equals(Object object) { - if (!(object instanceof KMerPosition)) { - return false; - } - KMerPosition kmp = (KMerPosition) object; - return (getBaseIndex() == kmp.getBaseIndex() && - this.compareTo(kmp) == 0); - } - - public String format() { - return(getKMer() + - " " + formatEncoding(getKMerEncoding()) + - " " + Integer.toHexString(mBaseIndex)); - } - } - - static class KMerPosition1 - extends KMerPosition { - - private long mKMerEncoding1; - - KMerPosition1(long kmer, int baseIndex) { - super(baseIndex); - mKMerEncoding1 = kmer; - } - - public String getKMer() { - return decodeKMer1(getKMerEncoding1()); - } - - public final long getKMerEncoding1() { - return mKMerEncoding1; - } - - public int compareTo(KMerPosition kmp) { - int result = Long.signum(getKMerEncoding1() - kmp.getKMerEncoding1()); - if (result == 0) { - result = Long.signum(getKMerEncoding2() - kmp.getKMerEncoding2()); - } - return result; - } - } - - static class KMerPosition2 - extends KMerPosition1 { - - private long mKMerEncoding2; - - KMerPosition2(long encoding1, long encoding2, int baseIndex) { - super(encoding1, baseIndex); - mKMerEncoding2 = encoding2; - } - - public String getKMer() { - return decodeKMer2(getKMerEncoding1(), getKMerEncoding2()); - } - - public final long getKMerEncoding2() { - return mKMerEncoding2; - } - } - - static class KMerPositionN - extends KMerPosition { - - private boolean mReversed; - private char[] mKMerEncoding; - - KMerPositionN(char[] encoding, int baseIndex) { - super(baseIndex); - mReversed = false; - mKMerEncoding = encoding; - } - - public boolean getIsReversed() { - return mReversed; - } - - public void setIsReversed(boolean value) { - mReversed = value; - } - - public String getKMer() { - return decodeKMerN(mKMerEncoding, mReversed); - } - - public final char[] getKMerEncoding() { - return mKMerEncoding; - } - - public String format() { - return(getKMer() + - " " + formatEncoding(getKMerEncoding()) + - " " + (mReversed ? 'R' : 'F') + - " " + Integer.toHexString(getBaseIndex())); - } - } - - static class StringKMerPosition - implements Comparable { - - private String mKMerString = null; - private int mBaseIndex; - - StringKMerPosition(String kmer, int baseIndex) { - mKMerString = kmer; - mBaseIndex = baseIndex; - } - - public final String getKMer() { - return mKMerString; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public int compareTo(StringKMerPosition kmp) { - return mKMerString.compareTo(kmp.mKMerString); - } - - public boolean equals(Object object) { - if (!(object instanceof StringKMerPosition)) { - return false; - } - StringKMerPosition kmp = (StringKMerPosition) object; - return (mBaseIndex == kmp.mBaseIndex && - mKMerString.equals(kmp.mKMerString)); - } - } -} diff --git a/java/lib/edu/mit/broad/cnv/GatherAlignments.java b/java/lib/edu/mit/broad/cnv/GatherAlignments.java deleted file mode 100644 index b0dc2d5af..000000000 --- a/java/lib/edu/mit/broad/cnv/GatherAlignments.java +++ /dev/null @@ -1,399 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv; - -import edu.mit.broad.arachne.Alignment; -import edu.mit.broad.arachne.LookAlignReader; - -import java.io.*; -import java.util.*; - -/** - * Utility program to gather CNV alignments from LookAlign files in an I/O efficient manner. - */ -public class GatherAlignments { - - public static void main(String[] args) - throws Exception { - new GatherAlignments().run(args); - } - - private void usage() { - System.out.println("Usage: GatherAlignments ..."); - System.out.println(" -cnpList "); - System.out.println(" -sampleId "); - System.out.println(" -inputFileList "); - System.out.println(" -outputDirectory "); - System.out.println(" -padding "); - System.out.println(" -bestAlignments"); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-cnpList") && argsleft > 1) { - argpos++; - mCnpListPath = args[argpos++]; - } else if (arg.equals("-sampleId") && argsleft > 1) { - argpos++; - mSampleId = args[argpos++]; - } else if (arg.equals("-inputFileList") && argsleft > 1) { - argpos++; - mInputFileListPath = args[argpos++]; - } else if (arg.equals("-outputDirectory") && argsleft > 1) { - argpos++; - mOutputDirectory = args[argpos++]; - } else if (arg.equals("-padding") && argsleft > 1) { - argpos++; - mCnpRegionPadding = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-bestAlignments")) { - argpos++; - mReturnBestHits = true; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - - List mInputFileList = parseInputFiles(mInputFileListPath); - Map> mCnpMap = parseCnpFile(mCnpListPath); - for (File inputFile : mInputFileList) { - scanInputFile(inputFile, mCnpMap); - } - } - - private List parseInputFiles(String path) - throws IOException { - List fileList = new ArrayList(); - LineNumberReader reader = new LineNumberReader(new FileReader(path)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - line = line.trim(); - if (line.length() == 0 || line.startsWith("#")) { - continue; - } - String[] fields = line.split("\\s+"); - fileList.add(new File(fields[0])); - } - return fileList; - } - - private Map> parseCnpFile(String path) - throws IOException { - Map> cnpMap = new HashMap>(); - LineNumberReader reader = new LineNumberReader(new FileReader(path)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - line = line.trim(); - if (line.length() == 0 || line.startsWith("#")) { - continue; - } - String[] fields = line.split("\\s+"); - if (fields.length != 4) { - throw new RuntimeException("Invalid CNP line: " + line); - } - if (fields[0].equalsIgnoreCase("CNPID")) { - continue; - } - String cnpId = fields[0]; - String chromosome = fields[1]; - int start = Integer.parseInt(fields[2].replaceAll(",", "")); - int end = Integer.parseInt(fields[3].replaceAll(",", "")); - int sequenceId = chromosomeToSequenceId(chromosome); - if (sequenceId < 0) { - throw new RuntimeException("Unrecognized chromosome: " + chromosome); - } - if (mCnpRegionPadding > 0) { - start = Math.max(1, start - mCnpRegionPadding); - end = end + mCnpRegionPadding; - } - CnpRegion cnp = new CnpRegion(cnpId, sequenceId, start, end); - List cnpList = cnpMap.get(sequenceId); - if (cnpList == null) { - cnpList = new ArrayList(); - cnpMap.put(sequenceId, cnpList); - } - cnpList.add(cnp); - } - return cnpMap; - } - - private int chromosomeToSequenceId(String text) { - if (text == null || text.length() == 0) { - return -1; - } - if (text.matches("\\d+")) { - return Integer.parseInt(text); - } - if (text.startsWith("chr") && text.length() > 3) { - text = text.substring(3); - } - if (text.matches("\\d+") && !text.startsWith("0")) { - return Integer.parseInt(text); - } - if (text.equals("M")) { - return 0; - } else if (text.equals("X")) { - return 23; - } else if (text.equals("Y")) { - return 24; - } else { - return -1; - } - } - - private void scanInputFile(File inputFile, - Map> cnpMap) - throws IOException { - LookAlignReader reader = new LookAlignReader(inputFile); - while (true) { - Alignment alignment = getNextAlignment(reader); - if (alignment == null) { - reader.close(); - break; - } - List cnpList = cnpMap.get(alignment.getBSequenceId()); - if (cnpList == null) { - continue; - } - for (CnpRegion cnp : cnpList) { - if (overlaps(cnp, alignment)) { - saveCnpAlignment(cnp, alignment, inputFile); - } - } - } - flushCnpAlignments(inputFile); - } - - private Alignment getNextAlignment(LookAlignReader reader) - throws IOException { - if (!mReturnBestHits) { - if (reader.hasNext()) { - return reader.next(); - } else { - return null; - } - } - while (true) { - Alignment seed = mPendingAlignment; - mPendingAlignment = null; - if (seed == null && reader.hasNext()) { - seed = reader.next(); - } - if (seed == null) { - return null; - } - List secondaryHits = null; - while (reader.hasNext()) { - Alignment alignment = reader.next(); - if (alignment.getASequenceId() != seed.getASequenceId()) { - if (alignment.getASequenceId() < seed.getASequenceId()) { - throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format()); - } - mPendingAlignment = alignment; - break; - } - if (secondaryHits == null) { - secondaryHits = new ArrayList(); - } - secondaryHits.add(alignment); - } - if (secondaryHits == null) { - return seed; - } - secondaryHits.add(seed); - Alignment result = getUniqueBestAlignment(secondaryHits); - if (result != null) { - return result; - } - } - } - - private Alignment getUniqueBestAlignment(List alignments) { - int bestMismatches = 0; - List best = new ArrayList(); - for (Alignment a : alignments) { - int mismatches = getAlignmentMismatches(a); - if (best.isEmpty()) { - best.add(a); - bestMismatches = mismatches; - } - if (mismatches == bestMismatches) { - best.add(a); - } else if (mismatches < bestMismatches) { - best.clear(); - best.add(a); - bestMismatches = mismatches; - } - } - if (best.size() != 1) { - return null; - } - return best.get(0); - } - - private int getAlignmentMismatches(Alignment alignment) { - int mismatches = 0; - int[] blocks = alignment.getAlignmentBlocks(); - for (int i = 0; i < blocks.length; i += 3) { - int gap = blocks[i]; - int duration = blocks[i+1]; - int mm = blocks[i+2]; - if (mm > duration) { - throw new RuntimeException("Invalid alignment? : " + alignment.format()); - } - mismatches += Math.abs(gap); - mismatches += mm; - } - return mismatches; - } - - private boolean overlaps(CnpRegion cnp, Alignment alignment) { - return (cnp.getSequenceId() == alignment.getBSequenceId() && - cnp.getStart() <= alignment.getBEnd() && - cnp.getEnd() >= alignment.getBStart()); - } - - private void saveCnpAlignment(CnpRegion cnp, Alignment alignment, File inputFile) - throws IOException { - if (mCnpAlignmentCount > mCnpAlignmentLimit) { - flushCnpAlignments(inputFile); - } - String cnpId = cnp.getCnpId(); - List alignmentList = mCnpAlignmentMap.get(cnpId); - if (alignmentList == null) { - alignmentList = new ArrayList(); - mCnpAlignmentMap.put(cnpId, alignmentList); - } - alignmentList.add(alignment); - mCnpAlignmentCount++; - } - - private void flushCnpAlignments(File inputFile) - throws IOException { - while (!mCnpAlignmentMap.isEmpty()) { - String cnpId = mCnpAlignmentMap.keySet().iterator().next(); - List alignmentList = mCnpAlignmentMap.get(cnpId); - writeAlignments(cnpId, mSampleId, alignmentList, inputFile); - mCnpAlignmentMap.remove(cnpId); - mCnpAlignmentCount -= alignmentList.size(); - } - if (mCnpAlignmentCount != 0) { - throw new RuntimeException("Unsynchronized alignment count"); - } - } - - private void writeAlignments(String cnpId, String sampleId, List alignmentList, File inputFile) - throws IOException { - File outputDir = new File("."); - if (mOutputDirectory != null) { - outputDir = new File(mOutputDirectory); - } - String cnpSample = cnpId; - if (sampleId != null) { - cnpSample = cnpSample + "_" + sampleId; - } - File cnpSampleDir = new File(outputDir, cnpSample); - if (!cnpSampleDir.exists()) { - if (!cnpSampleDir.mkdir()) { - throw new RuntimeException("Failed to create directory " + cnpSampleDir); - } - } - String fileName = inputFile.getName(); - File alignmentFile = new File(cnpSampleDir, fileName); - PrintWriter writer = new PrintWriter(new FileWriter(alignmentFile, true)); - for (Alignment alignment : alignmentList) { - writer.println(alignment.arachneFormat()); - } - writer.flush(); - writer.close(); - } - - private GatherAlignments() { - } - - private static class CnpRegion { - - private CnpRegion(String cnpId, int sequenceId, int start, int end) { - mCnpId = cnpId; - mSequenceId = sequenceId; - mStart = start; - mEnd = end; - } - - public String getCnpId() { return mCnpId; }; - public int getSequenceId() { return mSequenceId; }; - public int getStart() { return mStart; }; - public int getEnd() { return mEnd; }; - - private String mCnpId; - private int mSequenceId; - private int mStart; - private int mEnd; - } - - private boolean mDebug = false; - private boolean mVerbose = false; - - private boolean mReturnBestHits = false; - private String mCnpListPath = null; - private String mSampleId = null; - private String mInputFileListPath = null; - private String mOutputDirectory = null; - private int mCnpRegionPadding = 0; - - private Alignment mPendingAlignment = null; - private int mCnpAlignmentCount = 0; - private int mCnpAlignmentLimit = 1000000; - private Map> mCnpAlignmentMap = new LinkedHashMap>(); -} - - - diff --git a/java/lib/edu/mit/broad/cnv/kmer/CountKMers.java b/java/lib/edu/mit/broad/cnv/kmer/CountKMers.java deleted file mode 100644 index 23b9d6af4..000000000 --- a/java/lib/edu/mit/broad/cnv/kmer/CountKMers.java +++ /dev/null @@ -1,1494 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.kmer; - - -import edu.mit.broad.cnv.util.SequenceIterator; - -import java.io.*; -import java.util.*; - - -/** - * Tool for counting unique kmers. - */ -public class CountKMers -{ - private static final int NONUNIQUE_MARKER = -1; - - private String mAction = null; - private static int mK = 0; - private int mMinimumK = 0; - private int mMaximumK = 0; - private int mBatchSize = 0; - private List mInputFiles = null; - private File mSearchFile = null; - private String mSequenceName = null; - private File mInputDirectory = null; - private File mOutputDirectory = null; - private boolean mRunDistributed = false; - private int mDistributedWorkerCount = 0; - private boolean mVerbose = false; - private boolean mDebug = false; - - private List mSequenceList = null; - private List mSequenceOffsetList = null; - private List mSpillFileList = null; - private double mSpillFactor = 0.9; - - private long mKMerCount = 0; - private long mUniquePriorCount = 0; - private long mUniqueNewCount = 0; - private long mPriorMapUniqueCount = 0; - - private InputStream mPriorMapStream = null; - private int mPriorMapPosition = -1; - private int mPriorMapValue = 0; - private int mInputFileIndex = 0; - private LineNumberReader mCurrentReader = null; - private String mNextSequence = null; - private char[] mKMerBuffer = null; - private int mKMerBufferedCount = 0; - private String mLineBuffer = null; - private int mLineBufferIndex = 0; - private int mBaseIndex = -1; - private byte[] mIOBuffer = null; - - /* Design - Inputs: - - One or more fasta files to search (currently one). - - Output directory for the result files. - - Optionally an input k-1-mer file (output from previous pass). - Outputs: - - Unique kmer file: (sorted by kmer) - This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not). - - Per chromosome bit map: pos (implicit) new-bit cum-bit - New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not. - Cum-bit is 1 if Kmer starting at pos is unique for some L <= K. - - Statistics - Plan: - - Reducing memory footprint is crucial. - - Sequential pass over the input sequences to generate kmers. - - BatchSize kmers are cached in memory, then sorted and uniqified. - - As batch array fills, batches are spilled to disk. - - Batches are reloaded from disk and merged (N-finger algorithm) - - and streamed to a merge file. - - Merge file is read from disk and processed as final results. - */ - - public static void main(String[] args) - throws Exception { - new CountKMers().run(args); - } - - private void usage() { - System.out.println("Usage: CountKMers ..."); - System.out.println(" -action "); - System.out.println(" -genome "); - System.out.println(" -chromosome "); - System.out.println(" -k "); - System.out.println(" -minK "); - System.out.println(" -maxK "); - System.out.println(" -batchSize "); - System.out.println(" -inputDir "); - System.out.println(" -outputDir "); - System.out.println(" -distributed"); - System.out.println(" -workers "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-action") && argsleft > 1) { - argpos++; - mAction = args[argpos++]; - } else if (arg.equals("-genome") && argsleft > 1) { - argpos++; - if (mInputFiles == null) { - mInputFiles = new ArrayList(); - } - mInputFiles.add(new File(args[argpos++])); - } else if (arg.equals("-chromosome") && argsleft > 1) { - argpos++; - mSequenceName = args[argpos++]; - } else if (arg.equals("-k") && argsleft > 1) { - argpos++; - mK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-minK") && argsleft > 1) { - argpos++; - mMinimumK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-maxK") && argsleft > 1) { - argpos++; - mMaximumK = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-batchSize") && argsleft > 1) { - argpos++; - mBatchSize = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-inputDir") && argsleft > 1) { - argpos++; - mInputDirectory = new File(args[argpos++]); - } else if (arg.equals("-outputDir") && argsleft > 1) { - argpos++; - mOutputDirectory = new File(args[argpos++]); - } else if (arg.equals("-searchFile") && argsleft > 1) { - argpos++; - mSearchFile = new File(args[argpos++]); - } else if (arg.equals("-distributed")) { - argpos++; - mRunDistributed = true; - } else if (arg.equals("-workers") && argsleft > 1) { - argpos++; - mDistributedWorkerCount = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void run(String[] args) - throws Exception { - if (!parseArguments(args)) { - System.exit(1); - } - if (mAction == null || mAction.equals("mapKMers")) { - if (mRunDistributed) { - mapKMersDistributed(); - } else { - mapKMers(); - } - } else if (mAction.equals("mapGaps")) { - mapGaps(); - } else if (mAction.equals("rollUp")) { - rollUp(); - } else if (mAction.equals("search")) { - search(); - } - } - - private void search() - throws IOException { - char[][] searchStrings = loadSearchFile(mSearchFile); - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - int position = 0; - log("Scanning " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - position++; - for (int i = 0; i < searchStrings.length; i++) { - if (Arrays.equals(searchStrings[i], kmerChars)) { - String kmer = new String(searchStrings[i]); - String strand = ((i % 2) == 0) ? "F" : "R"; - System.out.println(kmer + "\t" + seqName + "\t" + position + "\t" + strand); - } - } - } - } - } - - private char[][] loadSearchFile(File file) - throws IOException { - List list = new ArrayList(); - LineNumberReader reader = new LineNumberReader(new FileReader(file)); - while (true) { - String line = reader.readLine(); - if (line == null) { - reader.close(); - break; - } - String text = line.trim(); - if (text.length() == 0 || text.startsWith("#")) { - continue; - } - String[] fields = text.split("\\s+"); - char[] kmer = fields[0].toUpperCase().toCharArray(); - list.add(kmer); - list.add(reverseComplement(kmer)); - } - return list.toArray(new char[0][0]); - } - - // Can be used to scan genome for sequence names/lengths. - private void scanKMers() - throws IOException { - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - openPriorMap(priorMapFile); - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Scanning " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - if (isUniqueInPriorMap(mBaseIndex)) { - continue; - } - } - } - closePriorMap(); - } - - private void mapGaps() - throws IOException { - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - int pos = 0; - int gapStart = 0; - while (true) { - char base = getNextBase(); - if (base == 0) { - break; - } - pos++; - if (base == 'N') { - if (gapStart == 0) { - gapStart = pos; - } - } else { - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - if (gapStart > 0) { - System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1)); - gapStart = 0; - } - } - } - - private void rollUp() - throws IOException { - // Roll up based on the middle of the reads. - File[] mapFiles = getAllMapFiles(); - if (mapFiles.length > 127) { - throw new RuntimeException("K to large for byte sized counts"); - } - SequenceIterator seqIterator = new SequenceIterator(mInputFiles); - while (true) { - String seqName = seqIterator.getNextSequence(); - if (seqName == null) { - break; - } - if (mSequenceName != null && !mSequenceName.equals(seqName)) { - continue; - } - log("Rolling up sequence " + seqName + " ..."); - int seqBaseIndex = seqIterator.getBaseIndex() + 1; - char[] seqChars = loadSequence(seqIterator); - int seqLength = seqChars.length; - int seqMapOffset = (seqBaseIndex >> 3) & 0x1FFFFFFF; - int seqMapModulus = (seqBaseIndex & 0x7); - int seqMapLength = (seqMapModulus + seqLength + 7)/8; - // log(" seqLength = " + seqLength); - // log(" baseIndex = " + Integer.toHexString(seqBaseIndex) - // + " (" + (((long)seqBaseIndex) & 0xFFFFFFFFL) + ")"); - // log(" seqMapOffset = " + seqMapOffset); - // log(" seqMapLength = " + seqMapLength); - byte[] counts = new byte[seqLength]; - for (int pos = 1; pos <= seqLength; pos++) { - if (seqChars[pos-1] == 'N') { - counts[pos-1] = -1; - } - } - for (int k = 1; k <= mapFiles.length; k++) { - if (mapFiles[k-1] == null) { - continue; - } - log("Processing map file " + mapFiles[k-1] + " ..."); - byte[] kmerMap = readMapFileRegion(mapFiles[k-1], seqMapOffset, seqMapLength); - for (int pos = 1; pos <= seqLength; pos++) { - if (counts[pos-1] != 0) { - continue; - } else if (isNearContigBoundary(pos, seqChars, k)) { - counts[pos-1] = -1; - } else { - int baseOffset = pos - (k+1)/2; - int mapIndex = seqMapModulus + baseOffset; - if (isUniqueInMap(kmerMap, mapIndex)) { - counts[pos-1] = (byte) k; - } - } - } - } - File outputFile = - new File(mOutputDirectory, "rollup_" + seqName + ".bin"); - writeRollUpFile(outputFile, counts); - } - } - - private boolean isNearContigBoundary(int pos, char[] seqChars, int k) { - int windowStart = pos - (k-1)/2; - int windowEnd = pos + k/2; - if (windowStart < 1 || windowEnd > seqChars.length) { - return true; - } - for (int i = windowStart-1; i < windowEnd; i++) { - if (seqChars[i] == 'N') { - return true; - } - } - return false; - } - - private void writeRollUpFile(File file, byte[] counts) - throws IOException { - FileOutputStream stream = new FileOutputStream(file); - stream.write(counts); - stream.flush(); - stream.close(); - if (mDebug) { - PrintWriter writer = new PrintWriter(file + ".dbg"); - for (int i = 0; i < counts.length; i++) { - writer.println(counts[i]); - } - writer.flush(); - writer.close(); - } - } - - /** - * Returns an array of files, indexed by K, - * where the array index = K-1 (i.e. K=1 is the first file). - * If there is no file for index K, then the array element is null. - */ - private File[] getAllMapFiles() { - int maxK = mMaximumK; - if (maxK == 0) { - // Safe upper bound - maxK = 1000; - } - List fileList = new ArrayList(); - for (int k = 1; k <= maxK; k++) { - if (mMinimumK > 0 && k < mMinimumK) { - continue; - } - File mapFile = - new File(mInputDirectory, "unique_" + k + "_mers_map.bin"); - if (mapFile.exists()) { - while (fileList.size() < k-1) { - fileList.add(null); - } - fileList.add(mapFile); - } else { - if (mMaximumK == 0 && !fileList.isEmpty()) { - break; - } - } - } - File[] result = new File[fileList.size()]; - result = fileList.toArray(result); - if (mDebug) { - for (int i = 0; i < result.length; i++) { - debug("mapFiles[k=" + (i+1) + "] = " + result[i]); - } - } - return result; - } - - private char[] loadSequence(SequenceIterator seqIterator) - throws IOException { - StringBuilder builder = new StringBuilder(); - while (true) { - char ch = seqIterator.getNextBase(); - if (ch == 0) { - break; - } - builder.append(ch); - } - char[] result = new char[builder.length()]; - builder.getChars(0, builder.length(), result, 0); - return result; - } - - private void mapKMersDistributed() - throws Exception { - DistributedKMerCounter algorithm = new DistributedKMerCounter(); - algorithm.setDebug(mDebug); - algorithm.setVerbose(mVerbose); - algorithm.setInputFiles(mInputFiles); - algorithm.setK(mK); - algorithm.setMaximumWorkerCount(mDistributedWorkerCount); - // algorithm.setLsfQueue(mLsfQueue); - // algorithm.setLsfLogDirectory(mLsfLogDirectory); - // algorithm.setEnableGcLogging(mEnableGcLogging); - algorithm.run(); - } - - private void mapKMers() - throws IOException { - - File textKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.txt"); - File binaryKMerFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.bin"); - File exceptionFile = - new File(mOutputDirectory, "unique_" + mK + "_mers.extra"); - File mapFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin"); - File priorMapFile = - new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin"); - File statsFile = - new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt"); - - if (mBatchSize == 0) { - throw new RuntimeException("Batch size not specified"); - } - - int kmerCount = 0; - int batchSize = mBatchSize; - KMerPosition[] kmerArray = new KMerPosition[batchSize]; - List exceptionList = new ArrayList(); - mSequenceList = new ArrayList(); - mSequenceOffsetList = new ArrayList(); - mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))]; - - openPriorMap(priorMapFile); - - while (true) { - String seqName = getNextSequence(); - if (seqName == null) { - break; - } - mSequenceList.add(seqName); - mSequenceOffsetList.add(mBaseIndex+1); - log("Processing " + seqName + " ..."); - while (true) { - char[] kmerChars = getNextKMer(); - if (kmerChars == null) { - break; - } - mKMerCount++; - int baseIndex = mBaseIndex; - if (isUniqueInPriorMap(baseIndex)) { - mUniquePriorCount++; - continue; - } - KMerPosition kmp = encodeKMer(kmerChars, baseIndex); - if (kmp == null) { - // Note: We currently do not handle the reverse - // complement of exception characters correctly. - // For hg18, however, this doesn't matter as - // none of the kmers containing non-ACGT characters - // are present on the reverse strand. - String kmer = new String(kmerChars); - exceptionList.add(new StringKMerPosition(kmer, baseIndex)); - continue; - } - kmerArray[kmerCount++] = kmp; - if (kmerCount == batchSize) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (kmerCount > mSpillFactor * batchSize) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - } - } - if (kmerCount > 0) { - kmerCount = compactKMers(kmerArray, kmerCount); - if (mSpillFileList != null) { - spillKMers(kmerArray, kmerCount); - kmerCount = 0; - } - } - - closePriorMap(); - - // Write out the exception kmers (text file). - compactKMers(exceptionList); - writeExceptionFile(exceptionList, exceptionFile); - - // Write out the binary file of unique encoded kmers. - if (mSpillFileList == null) { - kmerCount = removeNonUnique(kmerArray, kmerCount); - writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile); - mUniqueNewCount = kmerCount; - } else { - mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile); - } - mUniqueNewCount += countUniqueKMers(exceptionList); - - // Write out the text file of (all) unique kmers. - writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile); - - // Create map file from prior map plus the new unique kmers. - long mapSize = (mBaseIndex + 1) & 0xFFFFFFFFL; - createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile); - - // Write summary statistics file. - writeSummaryStatistics(statsFile); - } - - private int compactKMers(KMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " kmers at index " + - Integer.toHexString(mBaseIndex) + " ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - KMerPosition current = kmerArray[0]; - for (int i = 1; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (current.compareTo(kmp) == 0) { - current.setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - current = kmp; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) { - if (kmerCount == 0) { - return 0; - } - log("Compacting " + kmerCount + " string kmers ..."); - Arrays.sort(kmerArray, 0, kmerCount); - int newCount = 1; - String kmerString = kmerArray[0].getKMer(); - for (int i = 1; i < kmerCount; i++) { - StringKMerPosition kmp = kmerArray[i]; - String ks = kmp.getKMer(); - if (ks.equals(kmerString)) { - kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER); - } else { - kmerArray[newCount++] = kmp; - kmerString = ks; - } - } - log("Compaction finished, new count is " + newCount); - return newCount; - } - - private void compactKMers(List kmerList) { - int kmerCount = kmerList.size(); - if (kmerCount <= 1) { - return; - } - StringKMerPosition[] kmerArray = - kmerList.toArray(new StringKMerPosition[kmerCount]); - kmerCount = compactKMers(kmerArray, kmerCount); - kmerList.clear(); - for (int i = 0; i < kmerCount; i++) { - kmerList.add(kmerArray[i]); - } - } - - private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) { - int uniqueCount = 0; - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - kmerArray[uniqueCount++] = kmp; - } - } - return uniqueCount; - } - - private int countUniqueKMers(List kmerList) { - int uniqueCount = 0; - for (StringKMerPosition kmp : kmerList) { - if (kmp.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - } - } - return uniqueCount; - } - - private void spillKMers(KMerPosition[] kmerArray, int kmerCount) - throws IOException { - if (mSpillFileList == null) { - mSpillFileList = new ArrayList(); - } - int fileNumber = mSpillFileList.size() + 1; - log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ..."); - File spillFile = new File(mOutputDirectory, - "spill_" + mK + "_" + fileNumber + ".tmp"); - mSpillFileList.add(spillFile); - writeKMerBinaryFile(kmerArray, kmerCount, spillFile); - log("Spill file written"); - } - - private void writeKMerBinaryFile(KMerPosition[] kmerArray, - int kmerCount, - File outputFile) - throws IOException { - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - for (int i = 0; i < kmerCount; i++) { - KMerPosition kmp = kmerArray[i]; - writeKMerPosition(outputStream, kmerArray[i]); - } - outputStream.flush(); - outputStream.close(); - } - - private void writeExceptionFile(List kmerList, - File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - for (StringKMerPosition kmer : kmerList) { - writeUniqueKMer(kmer, writer); - } - writer.flush(); - writer.close(); - } - - private KMerPosition readKMerPosition(InputStream stream) - throws IOException { - byte[] buffer = mIOBuffer; - int encodingLength = (mK + 7)/8; - int fileLength = 4 + 2*encodingLength; - int count = readFully(stream, buffer, 0, fileLength); - if (count <= 0) { - return null; - } else if (count != fileLength) { - throw new RuntimeException("Unexpected end of file"); - } - char[] encoding = new char[encodingLength]; - int baseIndex = ((buffer[0] & 0xFF) | - (buffer[1] & 0xFF) << 8 | - (buffer[2] & 0xFF) << 16 | - (buffer[3] & 0xFF) << 24); - for (int i = 0; i < encodingLength; i++) { - encoding[i] = (char) ((buffer[2*i+4] & 0xFF) | - ((buffer[2*i+5] & 0xFF) << 8)); - } - return new KMerPosition(encoding, baseIndex); - } - - private int readFully(InputStream stream, byte[] buffer, int offset, int count) - throws IOException { - int readCount = 0; - while (readCount < count) { - int read = stream.read(buffer, offset, count-readCount); - if (read <= 0) { - break; - } - offset += read; - readCount += read; - } - return readCount; - } - - private void skipBytes(InputStream stream, int count) - throws IOException { - - long longCount = count; - long skipCount = 0; - while (skipCount < longCount) { - long skipped = stream.skip(longCount - skipCount); - if (skipped <= 0) { - throw new RuntimeException("Skip failed"); - } - skipCount += skipped; - } - } - - private void writeKMerPosition(OutputStream stream, KMerPosition kmer) - throws IOException { - byte[] buffer = mIOBuffer; - int baseIndex = kmer.getBaseIndex(); - char[] encoding = kmer.getKMerEncoding(); - int offset = 0; - buffer[offset++] = (byte) ((baseIndex) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF); - buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF); - for (int i = 0; i < encoding.length; i++) { - buffer[offset++] = (byte) ((encoding[i]) & 0xFF); - buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF); - } - stream.write(buffer, 0, offset); - } - - private long mergeSpillFiles(List spillFiles, File outputFile) - throws IOException { - - if (spillFiles == null) { - return 0; - } - - log("Merging spill files ..."); - OutputStream outputStream = - new BufferedOutputStream(new FileOutputStream(outputFile)); - long uniqueCount = 0; - int fileCount = spillFiles.size(); - InputStream[] inputStreams = new InputStream[fileCount]; - KMerPosition[] kmers = new KMerPosition[fileCount]; - for (int i = 0; i < fileCount; i++) { - inputStreams[i] = - new BufferedInputStream(new FileInputStream(spillFiles.get(i))); - } - while (true) { - for (int i = 0; i < fileCount; i++) { - if (kmers[i] == null && inputStreams[i] != null) { - kmers[i] = readKMerPosition(inputStreams[i]); - if (kmers[i] == null) { - inputStreams[i].close(); - inputStreams[i] = null; - } - } - } - int count = 0; - KMerPosition kmer = null; - for (int i = 0; i < fileCount; i++) { - KMerPosition kmp = kmers[i]; - if (kmp == null) { - continue; - } else if (kmer == null) { - kmer = kmp; - count = 1; - } else { - int cmp = kmp.compareTo(kmer); - if (cmp == 0) { - count++; - } else if (cmp < 0) { - kmer = kmp; - count = 1; - } - } - } - if (kmer == null) { - break; - } - for (int i = 0; i < fileCount; i++) { - if (kmers[i] != null && kmer.compareTo(kmers[i]) == 0) { - kmers[i] = null; - } - } - if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) { - uniqueCount++; - writeKMerPosition(outputStream, kmer); - } - } - outputStream.flush(); - outputStream.close(); - for (int i = 0; i < fileCount; i++) { - // spillFiles.get(i).delete(); - } - log("Spill files merged, unique count is " + uniqueCount); - return uniqueCount; - } - - private void writeKMerTextFile(File inputFile, - List exceptionList, - File outputFile) - throws IOException { - - log("Writing kmer file " + outputFile + " ..."); - int exceptionIndex = 0; - StringKMerPosition excKMer = null; - Iterator excIter = null; - if (!exceptionList.isEmpty()) { - excIter = exceptionList.iterator(); - excKMer = excIter.next(); - } - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(inputFile)); - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - KMerPosition kmer = readKMerPosition(inputStream); - while (kmer != null || excKMer != null) { - if (excKMer == null) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else if (kmer == null) { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) { - writeUniqueKMer(kmer, writer); - kmer = readKMerPosition(inputStream); - } else { - writeUniqueKMer(excKMer, writer); - excKMer = excIter.hasNext() ? excIter.next() : null; - } - } - inputStream.close(); - writer.flush(); - writer.close(); - log("Wrote kmer file: " + outputFile); - } - - private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) { - if (kmer.getBaseIndex() != NONUNIQUE_MARKER) { - writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer); - } - } - - private void writeKMer(String kmer, int baseIndex, PrintWriter writer) { - String chr = getBaseIndexSequenceName(baseIndex); - int pos = getBaseIndexCoordinate(baseIndex); - writer.println(kmer + "\t" + chr + "\t" + pos); - } - - private void createMapFile(long mapSize, - File kmerFile, - List exceptionList, - File priorMapFile, - File mapFile) - throws IOException { - byte[] map = null; - long uniquePriorCount = 0; - long byteSize = (mapSize + 7)/8; - int mapByteSize = (int) byteSize; - if (mapByteSize != byteSize) { - throw new RuntimeException("Map too large: " + mapSize); - } - if (priorMapFile.exists()) { - map = readMapFile(priorMapFile); - if (map.length != mapByteSize) { - throw new RuntimeException("Prior map is wrong size"); - } - // Count the prior unique positions - for (int i = 0; i < mapByteSize; i++) { - uniquePriorCount += Integer.bitCount(map[i] & 0xFF); - } - } else { - map = new byte[mapByteSize]; - } - for (StringKMerPosition kmp : exceptionList) { - addToMap(kmp, map); - } - mPriorMapUniqueCount = uniquePriorCount; - - InputStream inputStream = - new BufferedInputStream(new FileInputStream(kmerFile)); - while (true) { - KMerPosition kmp = readKMerPosition(inputStream); - if (kmp == null) { - inputStream.close(); - break; - } - addToMap(kmp, map); - } - - writeMapFile(map, mapFile); - } - - private void addToMap(KMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(StringKMerPosition kmp, byte[] map) { - int baseIndex = kmp.getBaseIndex(); - if (baseIndex != NONUNIQUE_MARKER) { - addToMap(baseIndex, map); - } - } - - private void addToMap(int baseIndex, byte[] map) { - int mod = baseIndex & 0x7; - int offset = (baseIndex >> 3) & 0x1FFFFFFF; - if ((map[offset] & (1 << mod)) != 0) { - throw new RuntimeException("Map entry already set: " + baseIndex); - } - map[offset] |= (1 << mod); - } - - private boolean isUniqueInMap(byte[] map, int baseIndex) { - int mod = baseIndex & 0x7; - int offset = (baseIndex >> 3) & 0x1FFFFFFF; - return ((map[offset] & (1 << mod)) != 0); - } - - private void writeSummaryStatistics(File outputFile) - throws IOException { - PrintWriter writer = - new PrintWriter(new BufferedWriter(new FileWriter(outputFile))); - long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL; - long uniqueCount = mUniquePriorCount + mUniqueNewCount; - long nonUniqueCount = mKMerCount - uniqueCount; - writer.println("K: " + mK); - writer.println("Sequences: " + mSequenceList.size()); - writer.println("Bases: " + baseCount); - writer.println("KMers: " + mKMerCount); - writer.println("Prior map count: " + mPriorMapUniqueCount); - writer.println("Unique prior: " + mUniquePriorCount + - " (" + formatPercent(mUniquePriorCount, mKMerCount) + ")"); - writer.println("Unique new: " + mUniqueNewCount + - " (" + formatPercent(mUniqueNewCount, mKMerCount) + ")"); - writer.println("Unique cumulative: " + uniqueCount + - " (" + formatPercent(uniqueCount, mKMerCount) + ")"); - writer.println("Nonunique: " + nonUniqueCount + - " (" + formatPercent(nonUniqueCount, mKMerCount) + ")"); - writer.flush(); - writer.close(); - } - - private String formatPercent(long numerator, long denominator) { - double fraction = 0.0; - if (denominator != 0) { - fraction = numerator / (double) denominator; - } - return String.format("%1.1f%%", fraction * 100.0); - } - - private void openPriorMap(File mapFile) - throws IOException { - if (mapFile.exists()) { - mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile)); - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - } - - private void closePriorMap() - throws IOException { - if (mPriorMapStream != null) { - mPriorMapStream.close(); - } - mPriorMapStream = null; - mPriorMapPosition = -1; - mPriorMapValue = 0; - } - - private byte[] readMapFile(File file) - throws IOException { - long fileLength = file.length(); - if (fileLength > 1000000000) { - throw new RuntimeException("Prior map too large: " + file); - } - int length = (int) fileLength; - byte[] map = new byte[length]; - FileInputStream stream = new FileInputStream(file); - int count = readFully(stream, map, 0, length); - if (count != length) { - throw new RuntimeException("Failed to read map: " + file); - } - stream.close(); - return map; - } - - /** - * Read just a subset of a map file. - */ - private byte[] readMapFileRegion(File file, int offset, int length) - throws IOException { - byte[] map = new byte[length]; - FileInputStream stream = new FileInputStream(file); - skipBytes(stream, offset); - int count = readFully(stream, map, 0, length); - if (count != length) { - throw new RuntimeException("Failed to read map: " + file); - } - stream.close(); - return map; - } - - private void writeMapFile(byte[] map, File file) - throws IOException { - FileOutputStream stream = new FileOutputStream(file); - stream.write(map); - stream.flush(); - stream.close(); - } - - private boolean isUniqueInPriorMap(int baseIndex) - throws IOException { - if (mPriorMapStream == null) { - return false; - } - int byteOffset = (baseIndex >> 3) & 0x1FFFFFFF; - if (byteOffset != mPriorMapPosition) { - int delta = byteOffset - mPriorMapPosition; - if (delta < 0) { - throw new RuntimeException("Attempt to seek backwards in prior map"); - } - if (delta > 1) { - skipFully(mPriorMapStream, delta-1); - } - mPriorMapValue = mPriorMapStream.read(); - if (mPriorMapValue < 0) { - throw new RuntimeException("Unexpected end of file in prior map"); - } - mPriorMapPosition += delta; - } - int mod = baseIndex & 0x7; - return (((1 << mod) & mPriorMapValue) != 0); - } - - private void skipFully(InputStream stream, long amount) - throws IOException { - while (amount > 0) { - long skip = stream.skip(amount); - if (skip <= 0 || skip > amount) { - throw new RuntimeException("Skip failed"); - } - amount -= skip; - } - } - - private String getBaseIndexSequenceName(int baseIndex) { - int sequenceCount = mSequenceList.size(); - for (int i = 0; i < sequenceCount-1; i++) { - int nextOffset = mSequenceOffsetList.get(i+1); - if (compareBaseIndex(nextOffset, baseIndex) > 0) { - return mSequenceList.get(i); - } - } - return mSequenceList.get(sequenceCount-1); - } - - private int getBaseIndexCoordinate(int baseIndex) { - Integer sequenceOffset = null; - for (Integer offset : mSequenceOffsetList) { - if (compareBaseIndex(offset, baseIndex) > 0) { - break; - } - sequenceOffset = offset; - } - if (sequenceOffset == null) { - return 0; - } - int coordinate = baseIndex - sequenceOffset + 1; - if (coordinate <= 0) { - dumpSequenceList(); - System.out.println("coordinate: " + coordinate); - System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset)); - System.out.println("baseIndex: " + Integer.toHexString(baseIndex)); - throw new RuntimeException("Internal error: illegal coordinate " + - coordinate + " for base index " + baseIndex); - } - return coordinate; - } - - private void dumpSequenceList() { - System.out.println("# Sequences:"); - int count = mSequenceList.size(); - for (int i = 0; i < count; i++) { - String seqName = mSequenceList.get(i); - int offset = mSequenceOffsetList.get(i); - System.out.println("# " + seqName + - "\t" + offset + - "\t" + Integer.toHexString(offset)); - } - } - - private int compareBaseIndex(int baseIndex1, int baseIndex2) { - // Implements unsigned comparison, a la compareTo - if (baseIndex1 < 0 ^ baseIndex2 < 0) { - return ((baseIndex1 < 0) ? 1 : -1); - } else { - return (baseIndex1 - baseIndex2); - } - } - - private String getNextSequence() - throws IOException { - - while (mNextSequence == null) { - if (mCurrentReader == null) { - mCurrentReader = getNextReader(); - if (mCurrentReader == null) { - return null; - } - } - String line = mCurrentReader.readLine(); - if (line == null) { - mCurrentReader.close(); - mCurrentReader = null; - continue; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - } - } - String result = mNextSequence; - mNextSequence = null; - return result; - } - - private LineNumberReader getNextReader() - throws IOException { - if (mInputFileIndex >= mInputFiles.size()) { - return null; - } - File file = mInputFiles.get(mInputFileIndex++); - return new LineNumberReader(new FileReader(file)); - } - - private char[] getNextKMer() - throws IOException { - - if (mKMerBuffer == null) { - mKMerBuffer = new char[mK]; - } - System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1); - if (mKMerBufferedCount > 0) { - mKMerBufferedCount--; - } - - while (mKMerBufferedCount < mK) { - char base = getNextBase(); - if (base == 0) { - incrementBaseIndex(mKMerBufferedCount); - mKMerBufferedCount = 0; - return null; - } else if (base == 'N') { - incrementBaseIndex(mKMerBufferedCount+1); - mKMerBufferedCount = 0; - } else { - mKMerBuffer[mKMerBufferedCount++] = base; - } - } - incrementBaseIndex(1); - return mKMerBuffer; - } - - private char getNextBase() - throws IOException { - - if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { - if (mCurrentReader == null) { - return 0; - } - String line = mCurrentReader.readLine(); - if (line == null) { - mLineBuffer = null; - mLineBufferIndex = 0; - mCurrentReader.close(); - mCurrentReader = null; - return 0; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - mLineBuffer = null; - mLineBufferIndex = 0; - return 0; - } - mLineBuffer = line.toUpperCase(); - mLineBufferIndex = 0; - } - return mLineBuffer.charAt(mLineBufferIndex++); - } - - private void incrementBaseIndex(int amount) { - if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { - throw new RuntimeException("Base index: 32-bit overflow"); - } - mBaseIndex += amount; - } - - private void log(String text) { - if (mVerbose) { - System.out.println("# " + new Date() + " " + text); - } - } - - private void debug(String text) { - if (mDebug) { - System.out.println("# " + new Date() + " " + text); - } - } - - private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) { - char[] encoding = encodeKMerChars(kmerChars); - if (encoding == null) { - return null; - } - char[] reverseEncoding = encodeKMerChars(reverseComplement(kmerChars)); - if (compareEncodings(encoding, reverseEncoding) <= 0) { - return new KMerPosition(encoding, baseIndex); - } else { - KMerPosition kmp = new KMerPosition(reverseEncoding, baseIndex); - kmp.setIsReversed(true); - return kmp; - } - } - - private static char[] encodeKMerChars(char[] kmerChars) { - if (kmerChars == null) { - return null; - } - - int kmerLength = kmerChars.length; - int encodingLength = (kmerLength + 7) / 8; - char[] encoding = new char[encodingLength]; - int offset = kmerLength % 8; - offset = (offset == 0) ? 8 : offset; - int bits = encodeKMerBits(kmerChars, 0, offset); - if (bits < 0) { - return null; - } - encoding[0] = (char) bits; - for (int i = 1; i < encodingLength; i++) { - bits = encodeKMerBits(kmerChars, offset, 8); - if (bits < 0) { - return null; - } - encoding[i] = (char) bits; - offset += 8; - } - return encoding; - } - - private static int compareEncodings(char[] encoding1, char[] encoding2) { - int length = Math.max(encoding1.length, encoding2.length); - for (int i = 0; i < length; i++) { - int result = encoding1[i] - encoding2[i]; - if (result != 0) { - return result; - } - } - return 0; - } - - private static int encodeKMerBits(char[] kmerChars, int offset, int length) { - int bits = 0; - for (int i = 0; i < length; i++) { - char base = kmerChars[offset + i]; - int baseBits = "ACGT".indexOf(base); - if (baseBits < 0) { - return -1; - } - bits |= baseBits << (2*(length-i-1)); - } - return bits; - } - - private static String decodeKMer(char[] encoding, boolean reverse) { - int length = mK; - char[] buffer = new char[length]; - int offset = length % 8; - offset = (offset == 0) ? 8 : offset; - decodeKMerBits(encoding[0], buffer, 0, offset); - for (int i = 1; i < encoding.length; i++) { - decodeKMerBits(encoding[i], buffer, offset, 8); - offset += 8; - } - if (reverse) { - reverseComplementInPlace(buffer); - } - return new String(buffer); - } - - private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) { - for (int i = 0; i < length; i++) { - int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3); - buffer[offset + i] = "ACGT".charAt(baseBits); - } - } - - private static char[] reverseComplement(char[] buffer) { - int length = buffer.length; - char[] result = new char[length]; - System.arraycopy(buffer, 0, result, 0, length); - reverseComplementInPlace(result); - return result; - } - - private static void reverseComplementInPlace(char[] buffer) { - int length = buffer.length; - int limit = (length + 1)/2; - for (int i = 0; i < limit; i++) { - char ch1 = reverseComplement(buffer[i]); - char ch2 = reverseComplement(buffer[length-i-1]); - buffer[i] = ch2; - buffer[length-i-1] = ch1; - } - } - - private static char reverseComplement(char base) { - switch (base) { - case 'A': - return 'T'; - case 'C': - return 'G'; - case 'G': - return 'C'; - case 'T': - return 'A'; - } - return base; - } - - private static String formatEncoding(char[] encoding) { - if (encoding == null) { - return null; - } - StringBuilder builder = new StringBuilder(); - builder.append('['); - for (int i = 0; i < encoding.length; i++) { - String hex = Integer.toHexString(encoding[i]); - int length = hex.length(); - while (length < 4) { - builder.append('0'); - length++; - } - builder.append(hex); - } - builder.append(']'); - return builder.toString(); - } - - static class KMerPosition - implements Comparable { - - private int mBaseIndex; - private boolean mReversed; - private char[] mKMerEncoding; - - KMerPosition(char[] encoding, int baseIndex) { - mBaseIndex = baseIndex; - mReversed = false; - mKMerEncoding = encoding; - } - - public final String getKMer() { - return decodeKMer(mKMerEncoding, mReversed); - } - - public final boolean getIsReversed() { - return mReversed; - } - - public final void setIsReversed(boolean value) { - mReversed = value; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public final char[] getKMerEncoding() { - return mKMerEncoding; - } - - public int compareTo(KMerPosition kmp) { - return compareEncodings(getKMerEncoding(), kmp.getKMerEncoding()); - } - - public boolean equals(Object object) { - if (!(object instanceof KMerPosition)) { - return false; - } - KMerPosition kmp = (KMerPosition) object; - return (getBaseIndex() == kmp.getBaseIndex() && - this.compareTo(kmp) == 0); - } - - public String format() { - return(getKMer() + - " " + formatEncoding(getKMerEncoding()) + - " " + (mReversed ? 'R' : 'F') + - " " + Integer.toHexString(mBaseIndex)); - } - } - - static class StringKMerPosition - implements Comparable { - - private String mKMerString = null; - private int mBaseIndex; - - StringKMerPosition(String kmer, int baseIndex) { - mKMerString = kmer; - mBaseIndex = baseIndex; - } - - public final String getKMer() { - return mKMerString; - } - - public final int getBaseIndex() { - return mBaseIndex; - } - - public final void setBaseIndex(int baseIndex) { - mBaseIndex = baseIndex; - } - - public int compareTo(StringKMerPosition kmp) { - return mKMerString.compareTo(kmp.mKMerString); - } - - public boolean equals(Object object) { - if (!(object instanceof StringKMerPosition)) { - return false; - } - StringKMerPosition kmp = (StringKMerPosition) object; - return (mBaseIndex == kmp.mBaseIndex && - mKMerString.equals(kmp.mKMerString)); - } - } -} diff --git a/java/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java b/java/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java deleted file mode 100644 index 90b26d0b1..000000000 --- a/java/lib/edu/mit/broad/cnv/kmer/DistributedKMerCounter.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.kmer; - - -import edu.mit.broad.dcp.DistributedAlgorithm; -import edu.mit.broad.cnv.util.SequenceIterator; - -import java.io.*; -import java.util.*; - - -/** - * Distributed algorithm for counting unique kmers. - */ -public class DistributedKMerCounter - extends DistributedAlgorithm -{ - private boolean mDebug = false; - private boolean mVerbose = false; - private int mK = 0; - private List mInputFiles = null; - private List mSequenceList = null; - private List mSequenceOffsetList = null; - - - public DistributedKMerCounter() { - } - - public boolean getDebug() { - return mDebug; - } - - public void setDebug(boolean value) { - mDebug = value; - } - - public boolean getVerbose() { - return mVerbose; - } - - public void setVerbose(boolean value) { - mVerbose = value; - } - - public int getK() { - return mK; - } - - public void setK(int value) { - mK = value; - } - - public List getInputFiles() { - return mInputFiles; - } - - public void setInputFiles(List value) { - mInputFiles = value; - } - - public void run() - throws Exception { - super.run(); - finish(); - } - - protected void init() - throws Exception { - if (getWorkerId() == MASTER) { - initMaster(); - } else { - initWorker(); - } - } - - private void initMaster() - throws IOException { - // Tasks to be amortized - report("Scanning sequences ..."); - scanSequences(); - report("Scan complete."); - } - - private void initWorker() { - // Tasks to be amortized - } - - protected void start() { - // scan genome, divide into chromosomes and optionally segments, distribute calls - } - - private void finish() { - // merge individual files, write out final results - } - - private void scanSequences() - throws IOException { - List sequenceList = new ArrayList(); - List sequenceOffsetList = new ArrayList(); - SequenceIterator seqIterator = new SequenceIterator(getInputFiles()); - while (true) { - String seqName = seqIterator.getNextSequence(); - if (seqName == null) { - break; - } - int baseIndex = seqIterator.getBaseIndex() + 1; - sequenceList.add(seqName); - sequenceOffsetList.add(baseIndex); - } - mSequenceList = sequenceList; - mSequenceOffsetList = sequenceOffsetList; - } - - // Currently not used - private void loadGenomeOffsets(File file) - throws IOException { - List sequenceList = new ArrayList(); - List sequenceOffsetList = new ArrayList(); - int baseIndex = 0; - LineNumberReader reader = new LineNumberReader(new FileReader(file)); - while (true) { - String line = reader.readLine(); - if (line == null) { - break; - } - String text = line.trim(); - if (text.length() == 0 || text.startsWith("#")) { - continue; - } - String[] fields = text.split("\\s+"); - if (fields.length != 2) { - throw new RuntimeException("Invalid input line: " + line); - } - int length = Integer.parseInt(fields[1]); - sequenceList.add(fields[0]); - sequenceOffsetList.add(baseIndex); - baseIndex += length; - } - mSequenceList = sequenceList; - mSequenceOffsetList = sequenceOffsetList; - } -} diff --git a/java/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java b/java/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java deleted file mode 100644 index 7ed22faf3..000000000 --- a/java/lib/edu/mit/broad/cnv/util/GenomeBaseIndex.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.util; - - -import java.io.*; -import java.util.*; - - -/** - * Utility class for transforming between a linear base index - * and a chromsome + position coordinate system. - */ -public class GenomeBaseIndex { - - private List mSequenceNames = null; - private int[] mLengths = null; - private long[] mOffsets = null; - - private GenomeBaseIndex() { - } - - public static GenomeBaseIndex read(File file) - throws IOException { - Reader reader = new BufferedReader(new FileReader(file)); - try { - return read(reader); - } finally { - reader.close(); - } - } - - // The input is just a list of space-delimited sequence name and length. - public static GenomeBaseIndex read(Reader reader) - throws IOException { - List sequenceNames = new ArrayList(); - List sequenceLengths = new ArrayList(); - BufferedReader bufferedReader = new BufferedReader(reader); - while (true) { - String line = bufferedReader.readLine(); - if (line == null) { - break; - } - String text = line.trim(); - if (text.length() == 0 || text.startsWith("#")) { - continue; - } - String[] fields = text.split("\\s+"); - if (fields.length < 2) { - throw new RuntimeException("Invalid input line: " + line); - } - int length = Integer.parseInt(fields[1]); - if (length <= 0) { - throw new RuntimeException("Invalid sequence length: " + length); - } - sequenceNames.add(fields[0]); - sequenceLengths.add(length); - } - int count = sequenceLengths.size(); - int[] lengths = new int[count]; - long[] offsets = new long[count]; - long offset = 0; - for (int i = 0; i < count; i++) { - lengths[i] = sequenceLengths.get(i); - offsets[i] = offset; - offset += lengths[i]; - } - GenomeBaseIndex result = new GenomeBaseIndex(); - result.mSequenceNames = sequenceNames; - result.mLengths = lengths; - result.mOffsets = offsets; - return result; - } - - public List getSequenceNames() { - return mSequenceNames; - } - - public boolean contains(String seqName) { - return (getSequenceIndex(seqName) >= 0); - } - - public long getFirstIndex(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - return mOffsets[index]; - } - - public long getLastIndex(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - return (mOffsets[index] + mLengths[index] - 1); - } - - public int getSequenceLength(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return 0; - } - return mLengths[index]; - } - - public long getBaseIndex(String seqName, int position) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - if (position > mLengths[index]) { - return -1; - } - if (position < 1) { - // Zero or negative position means last base index - position = mLengths[index]; - } - return (mOffsets[index] + position - 1); - } - - public String getSequenceName(long baseIndex) { - int index = getSequenceIndex(baseIndex); - if (index < 0) { - return null; - } - return mSequenceNames.get(index); - } - - public int getPosition(long baseIndex) { - if (baseIndex < 0) { - // Catch common sign-extension error when packing indexes as ints. - throw new IllegalArgumentException("Invalid base index: " + baseIndex); - } - int index = getSequenceIndex(baseIndex); - if (index < 0) { - return 0; - } - long offset = mOffsets[index]; - long result = baseIndex - offset + 1; - return (int) result; - } - - // Same as getSequenceName, but treat the argument as an unsigned int. - // This is useful for manipulating/storing indexes for the human - // genome as 4-byte unsigned ints. - public String getSequenceNameUnsigned(int baseIndex) { - return getSequenceName(baseIndex & 0xFFFFFFFFL); - } - - // Same as getPosition, but treat the argument as an unsigned int. - // This is useful for manipulating/storing indexes for the human - // genome as 4-byte unsigned ints. - public int getPositionUnsigned(int baseIndex) { - return getPosition(baseIndex & 0xFFFFFFFFL); - } - - private int getSequenceIndex(String seqName) { - return mSequenceNames.indexOf(seqName); - } - - private int getSequenceIndex(long baseIndex) { - long offset = 0; - if (baseIndex < 0) { - return -1; - } - for (int i = 0; i < mLengths.length; i++) { - int length = mLengths[i]; - if (offset + length > baseIndex) { - return i; - } - offset += length; - } - return -1; - } -} diff --git a/java/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java b/java/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java deleted file mode 100644 index 2d1a96f61..000000000 --- a/java/lib/edu/mit/broad/cnv/util/GenomeBinIndex.java +++ /dev/null @@ -1,167 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.util; - - -import java.io.*; -import java.util.*; - - -/** - * Utility class for transforming between a chromsome + position - * coordinate system and a binned coordinate system where each - * chromosome (separately) is divided into fixed sized bins, - * ragged on the right/upper end. - */ -public class GenomeBinIndex { - - private int mBinSize; - private List mSequenceNames; - private int[] mSequenceLengths; - private int[] mBinOffsets; - - public GenomeBinIndex(GenomeBaseIndex gbi, int binSize) { - if (binSize <= 0) { - throw new IllegalArgumentException("Illegal bin size: " + binSize); - } - mBinSize = binSize; - mSequenceNames = new ArrayList(gbi.getSequenceNames()); - int count = mSequenceNames.size(); - mSequenceLengths = new int[count]; - mBinOffsets = new int[count]; - long binOffset = 0; // long to detect overflow - for (int i = 0; i < count; i++) { - int length = gbi.getSequenceLength(mSequenceNames.get(i)); - int binCount = (length + binSize - 1) / binSize; - mSequenceLengths[i] = length; - mBinOffsets[i] = (int) binOffset; - binOffset += binCount; - } - if (binOffset > Integer.MAX_VALUE) { - // Check for integer overflow. - // This will happen, e.g., with the human genome and a bin size of 1. - throw new RuntimeException("Binsize too small: " + binSize); - } - } - - public int getBinSize() { - return mBinSize; - } - - public int getBinIndex(String seqName, int position) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - if (position > mSequenceLengths[index]) { - return -1; - } - if (position < 1) { - position = mSequenceLengths[index]; - } - int bin = (position - 1) / mBinSize; - return (mBinOffsets[index] + bin); - } - - public String getSequenceName(int binIndex) { - int index = getSequenceIndex(binIndex); - if (index < 0) { - return null; - } - return mSequenceNames.get(index); - } - - public int getStartPosition(int binIndex) { - int index = getSequenceIndex(binIndex); - if (index < 0) { - return -1; - } - int bin = binIndex - mBinOffsets[index]; - return (bin * mBinSize + 1); - } - - public int getEndPosition(int binIndex) { - int index = getSequenceIndex(binIndex); - if (index < 0) { - return -1; - } - int bin = binIndex - mBinOffsets[index]; - int position = (bin+1) * mBinSize; - position = Math.min(position, mSequenceLengths[index]); - return position; - } - - public List getSequenceNames() { - return mSequenceNames; - } - - public int getFirstBin(String seqName) { - return getBinIndex(seqName, 1); - } - - public int getLastBin(String seqName) { - return getBinIndex(seqName, 0); - } - - public int getBinCount() { - if (mBinOffsets.length == 0) { - return 0; - } - int lastIndex = mBinOffsets.length - 1; - int count = mBinOffsets[lastIndex]; - count += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize; - return count; - } - - public int getBinCount(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return -1; - } - return ((mSequenceLengths[index] + mBinSize - 1) / mBinSize); - } - - public int getSequenceLength(String seqName) { - int index = getSequenceIndex(seqName); - if (index < 0) { - return 0; - } - return mSequenceLengths[index]; - } - - private int getSequenceIndex(String seqName) { - for (int i = 0; i < mSequenceNames.size(); i++) { - if (mSequenceNames.get(i).equals(seqName)) { - return i; - } - } - return -1; - } - - private int getSequenceIndex(int binIndex) { - if (binIndex < 0) { - return -1; - } - for (int i = 1; i < mBinOffsets.length; i++) { - if (mBinOffsets[i] > binIndex) { - return i-1; - } - } - int lastIndex = mBinOffsets.length-1; - int lastBinIndex = mBinOffsets[lastIndex]; - lastBinIndex += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize; - if (binIndex <= lastBinIndex) { - return lastIndex; - } - return -1; - } -} - diff --git a/java/lib/edu/mit/broad/cnv/util/SequenceIterator.java b/java/lib/edu/mit/broad/cnv/util/SequenceIterator.java deleted file mode 100644 index 57bbae7a5..000000000 --- a/java/lib/edu/mit/broad/cnv/util/SequenceIterator.java +++ /dev/null @@ -1,145 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.cnv.util; - - -import java.io.*; -import java.util.*; - - -/** - * Utility class for iterating over fasta files. - * Also maintains an unsigned base index over the file set. - */ -public class SequenceIterator -{ - private List mInputFiles = null; - private int mInputFileIndex = 0; - private int mBaseIndex = -1; - private LineNumberReader mCurrentReader = null; - private String mNextSequence = null; - private String mLineBuffer = null; - private int mLineBufferIndex = 0; - - public SequenceIterator(File inputFile) { - mInputFiles = new ArrayList(); - mInputFiles.add(inputFile); - } - - public SequenceIterator(List inputFiles) { - mInputFiles = inputFiles; - } - - public void close() { - if (mCurrentReader != null) { - try { - mCurrentReader.close(); - } catch (IOException exc) { - throw new RuntimeException("Error closing reader: " + exc.getMessage(), - exc); - } - } - mCurrentReader = null; - mInputFiles = null; - mInputFileIndex = 0; - mBaseIndex = -1; - mNextSequence = null; - mLineBuffer = null; - mLineBufferIndex = 0; - } - - public String getNextSequence() - throws IOException { - - while (mNextSequence == null) { - if (mLineBuffer != null) { - incrementBaseIndex(mLineBuffer.length() - mLineBufferIndex); - mLineBuffer = null; - mLineBufferIndex = 0; - } - if (mCurrentReader == null) { - mCurrentReader = getNextReader(); - if (mCurrentReader == null) { - return null; - } - } - String line = mCurrentReader.readLine(); - if (line == null) { - mCurrentReader.close(); - mCurrentReader = null; - continue; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - } else { - incrementBaseIndex(line.length()); - } - } - String result = mNextSequence; - mNextSequence = null; - return result; - } - - public char getNextBase() - throws IOException { - - if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) { - if (mCurrentReader == null) { - return 0; - } - if (mNextSequence != null) { - return 0; - } - String line = mCurrentReader.readLine(); - if (line == null) { - mLineBuffer = null; - mLineBufferIndex = 0; - mCurrentReader.close(); - mCurrentReader = null; - return 0; - } - if (line.startsWith(">")) { - String[] tokens = line.substring(1).trim().split("\\s+"); - mNextSequence = tokens[0]; - mLineBuffer = null; - mLineBufferIndex = 0; - return 0; - } - mLineBuffer = line.toUpperCase(); - mLineBufferIndex = 0; - } - char result = mLineBuffer.charAt(mLineBufferIndex++); - incrementBaseIndex(1); - return result; - } - - public int getBaseIndex() { - return mBaseIndex; - } - - private LineNumberReader getNextReader() - throws IOException { - if (mInputFileIndex >= mInputFiles.size()) { - return null; - } - File file = mInputFiles.get(mInputFileIndex++); - return new LineNumberReader(new FileReader(file)); - } - - private void incrementBaseIndex(int amount) { - if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) { - throw new RuntimeException("Base index: 32-bit overflow"); - } - mBaseIndex += amount; - } -} - diff --git a/java/lib/edu/mit/broad/dcp/CallStatus.java b/java/lib/edu/mit/broad/dcp/CallStatus.java deleted file mode 100644 index e431b27df..000000000 --- a/java/lib/edu/mit/broad/dcp/CallStatus.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -public enum CallStatus -{ - PENDING, - PROCESSING -} - - diff --git a/java/lib/edu/mit/broad/dcp/CommandRunner.java b/java/lib/edu/mit/broad/dcp/CommandRunner.java deleted file mode 100644 index b93b310dd..000000000 --- a/java/lib/edu/mit/broad/dcp/CommandRunner.java +++ /dev/null @@ -1,309 +0,0 @@ -/** - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2006 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -import java.io.*; - - -/** - * Utility class to run system commands synchronously and return the output. - * - * The interface supports the typical case where you want to return a modest - * amount of information from the command's standard output or standard error - * as a string. The caller can override this behavior, however, and provide - * alternative output destinations if necessary. - * - * If setMergeOutput() is true, then this class will attempt to interleave - * the standard output and standard error streams of the command into one - * stream (standard output). This may not produce exactly the same results - * as having the operating system interleave the output, but works well for - * simple executables that do not heavily intermix stdout and stderr. - * - * A typical invocation is: - *
- *  CommandRunner runner = new CommandRunner();
- *  int status = runner.runCommand("ls");
- *  if (status == 0) {
- *      System.out.print(runner.getStandardOutput());
- *  }
- * 
- * - * @author Bob Handsaker - */ -public class CommandRunner { - - private boolean mMergeOutput = false; - private Writer mStandardOutputDestination = null; - private Writer mStandardErrorDestination = null; - private String mStandardOutputString = null; - private String mStandardErrorString = null; - - - /** - * Default constructor. - */ - public CommandRunner() { - } - - /** - * Get the standard output from the last command as a string. - * - * If no command has been run or an explicit output destination - * was set, then this method returns null. - */ - public String getStandardOutputString() { - return mStandardOutputString; - } - - /** - * Get the standard error from the last command as a string. - * - * If no command has been run or an explicit output destination - * was set, then this method returns null. - */ - public String getStandardErrorString() { - return mStandardErrorString; - } - - /** - * If true, the command's standard error stream will be interleaved - * with the command's standard output stream. The standard error - * stream destination will not be used. - */ - public boolean getMergeOutput() { - return mMergeOutput; - } - - /** - * If true, the command's standard error stream will be interleaved - * with the command's standard output stream. - */ - public void setMergeOutput(boolean value) { - mMergeOutput = value; - } - - /** - * The destination for the command's standard output stream. - * If null, the standard output will be captured in a string. - */ - public Writer getStandardOutputDestination() { - return mStandardOutputDestination; - } - - /** - * The destination for the command's standard output stream. - * If set to null, the standard output will be captured in a string. - */ - public void setStandardOutputDestination(Writer writer) { - mStandardOutputDestination = writer; - } - - /** - * The destination for the command's standard error stream. - * If null, the standard error will be captured in a string. - */ - public Writer getStandardErrorDestination() { - return mStandardErrorDestination; - } - - /** - * The destination for the command's standard error stream. - * If set to null, the standard error will be captured in a string. - */ - public void setStandardErrorDestination(Writer writer) { - mStandardErrorDestination = writer; - } - - /** - * Run a command string as a system command. - * - * Returns the exit status of the command. - * - * When this method is called, the standard output string - * and standard error string are updated if no alternative output - * destinations have been set. - * - * This method throws a RuntimeException if running the command fails - * (for example, if there are not enough system resources to spawn - * the process). - * - * @param commmand The command string to run. - * @return Command exit status. - * @throws RuntimeException If command execution fails. - */ - public int runCommand(String command) - throws RuntimeException { - return runCommand(command.split(" "), null, null); - } - - /** - * Run a command string as a system command. - * - * Returns the exit status of the command. - * - * When this method is called, the standard output string - * and standard error string are updated if no alternative output - * destinations have been set. - * - * This method throws a RuntimeException if running the command fails - * (for example, if there are not enough system resources to spawn - * the process). - * - * @param commmand The command string to run. - * @param environment The command environment (or null to inherit). - * @param workingDirectory The working directory (or null to inherit). - * @return Command exit status. - * @throws RuntimeException If command execution fails. - */ - public int runCommand(String command, String[] environment, File workingDirectory) - throws RuntimeException { - return runCommand(command.split(" "), environment, workingDirectory); - } - - /** - * Run a command string as a system command. - * - * Returns the exit status of the command. - * - * When this method is called, the standard output string - * and standard error string are updated if no alternative output - * destinations have been set. - * - * This method throws a RuntimeException if running the command fails - * (for example, if there are not enough system resources to spawn - * the process). - * - * @param commmand The command to run (as a array of arguments). - * @param environment The command environment (or null to inherit). - * @param workingDirectory The working directory (or null to inherit). - * @return Command exit status. - * @throws RuntimeException If command execution fails. - */ - public int runCommand(String[] command, String[] environment, File workingDirectory) - throws RuntimeException { - - Writer stdout = mStandardOutputDestination; - Writer stderr = mStandardErrorDestination; - if (stdout == null) { - stdout = new StringWriter(); - } - if (mMergeOutput) { - stderr = stdout; - } else if (stderr == null) { - stderr = new StringWriter(); - } - - mStandardOutputString = null; - mStandardErrorString = null; - - int commandStatus = 0; - try { - Process process = - Runtime.getRuntime().exec(command, environment, workingDirectory); - StreamHandler stdoutHandler = - new StreamHandler(process.getInputStream(), stdout); - StreamHandler stderrHandler = - new StreamHandler(process.getErrorStream(), stderr); - - commandStatus = process.waitFor(); - - // Wait for the streams to drain. - stdoutHandler.join(); - stderrHandler.join(); - } catch (Exception exc) { - throw new RuntimeException("Command execution failed: " + - exc.getMessage(), - exc); - } - - if (mStandardOutputDestination == null) { - mStandardOutputString = stdout.toString(); - } - if (mStandardErrorDestination == null && !mMergeOutput) { - mStandardErrorString = stderr.toString(); - } - - return commandStatus; - } - - - /** - * Internal class to asynchronously read from the standard output - * and standard error streams of the command being executed. - * - * If you do not handle command output asynchronously, then execution - * of a command may block in some environments if the program produces - * too much output. In this case, the call to run the process will - * never complete. - */ - private static class StreamHandler extends Thread { - - /** - * Constructor. - * Create an instance of this class, which is an asynchronous - * thread that will consume input from the given input stream - * and send the output to the given output destination. - * - * @param input The input stream to read. - * @param output The output destination. - */ - StreamHandler(InputStream input, Writer output) { - m_input = input; - m_output = output; - start(); - } - - - /** - * Standard thread run method. - * Pipe input from the input source to the output destination - * until there is no more input left. - * - * If an IOException occurs, the thread will make sure all - * available output has been flushed to the destination and - * then terminate. The IOException is not propagated. - */ - public void run() { - - char[] buffer = new char[4096]; - Reader reader = - new InputStreamReader(new BufferedInputStream(m_input)); - Writer writer = m_output; - - try { - while (true) { - int count = reader.read(buffer); - if (count <= 0) { - break; - } - if (writer != null) { - synchronized (writer) { - writer.write(buffer, 0, count); - } - } - } - } catch (IOException ignore) { - // Ignore IO exceptions - } finally { - try { - reader.close(); - } catch (Exception ignore) { - } - try { - m_output.flush(); - } catch (Exception ignore) { - } - } - } - - private InputStream m_input; - private Writer m_output; - } -} diff --git a/java/lib/edu/mit/broad/dcp/DistributedAlgorithm.java b/java/lib/edu/mit/broad/dcp/DistributedAlgorithm.java deleted file mode 100644 index a223c0326..000000000 --- a/java/lib/edu/mit/broad/dcp/DistributedAlgorithm.java +++ /dev/null @@ -1,618 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -import edu.mit.broad.dcp.message.*; - -import java.io.*; -import java.util.*; -import java.lang.reflect.Method; -import java.net.InetAddress; -import java.net.ServerSocket; -import java.rmi.registry.*; - -/** - * Experimental. - */ -public abstract class DistributedAlgorithm - implements Serializable -{ - public static final Integer ANY = 0; - public static final Integer MASTER = 1; - - public DistributedAlgorithm() { - } - - public String getServerHost() { - return mServerHost; - } - - public void setServerHost(String value) { - mServerHost = value; - } - - public int getServerPort() { - return mServerPort; - } - - public void setServerPort(int value) { - mServerPort = value; - } - - public String getAlgorithmName() { - if (mAlgorithmName != null) { - return mAlgorithmName; - } else { - return getClassName(); - } - } - - public void setAlgorithmName(String value) { - mAlgorithmName = value; - } - - public int getMaximumWorkerCount() { - return mMaximumWorkerCount; - } - - public void setMaximumWorkerCount(int value) { - mMaximumWorkerCount = value; - } - - /** - * Name of LSF queue to use for workers. - */ - public String getLsfQueue() { - return mLsfQueue; - } - - public void setLsfQueue(String value) { - mLsfQueue = value; - } - - /** - * Directory to hold lsf log files. - */ - public String getLsfLogDirectory() { - return mLsfLogDirectory; - } - - public void setLsfLogDirectory(String value) { - mLsfLogDirectory = value; - } - - public boolean getEnableGcLogging() { - return mEnableGcLogging; - } - - public void setEnableGcLogging(boolean value) { - mEnableGcLogging = value; - } - - public Integer getWorkerId() { - return mWorkerId; - } - - public Integer getProcessId() { - return mProcessId; - } - - protected void init() - throws Exception { - } - - protected abstract void start() - throws Exception; - - public void run() - throws Exception { - - if (mIsRunning) { - throw new IllegalStateException("Algorithm is already running"); - } - - mIsRunning = true; - mWorkerId = MASTER; - mProcessId = MASTER; - - try { - startDistributedServer(); - init(); - startWorkerThread(); - startWorkers(); - start(); - waitForCompletion(); - } finally { - // TBD: More cleanup (shutdown threads, etc.) - stopDistributedServer(); - mIsRunning = false; - } - } - - void runWorker(int workerId, int processId) - throws Exception { - - if (mIsRunning) { - throw new IllegalStateException("Algorithm is already running"); - } - - mIsRunning = true; - mWorkerId = workerId; - mProcessId = processId; - - try { - if (openDistributedServer() == null) { - report("Server " + mServerHost + ":" + mServerPort + " not responding"); - return; - } - init(); - startWorkerThread(); - mWorkerThread.join(); - } finally { - closeDistributedServer(); - mIsRunning = false; - } - } - - private void startWorkers() { - int workerCount = getMaximumWorkerCount(); - if (workerCount <= 0) { - // Use single process execution for testing/debugging. - new InProcessWorker().start(); - return; - } - if (workerCount > 1000) { - throw new RuntimeException("Excessive worker count: " + workerCount); - } - for (int i = 0; i < workerCount; i++) { - Integer workerId = (MASTER + i + 1); - Integer processId = workerId; // for now - startWorker(workerId, processId); - } - } - - private void startDistributedServer() { - try { - // Create a server socket to allocate a unique port. - // There is a window of vulnerability where the port - // can get reused, but in practice this works ok. - String serverHost = getCurrentHost(); - ServerSocket socket = new ServerSocket(0); - int serverPort = socket.getLocalPort(); - socket.close(); - Registry registry = LocateRegistry.createRegistry(serverPort); - DistributedCallServer server = new DistributedCallServer(); - server.setAlgorithm(this); - registry.bind("DistributedCallService", server); - mServerHost = serverHost; - mServerPort = serverPort; - mDistributedCallServer = server; - mDistributedCallService = server; - } catch (Exception exc) { - throw wrapException(exc); - } - } - - private void stopDistributedServer() { - if (mDistributedCallServer != null) { - try { - Registry registry = LocateRegistry.getRegistry(mServerPort); - registry.unbind("DistributedCallService"); - mDistributedCallServer.stop(); - } catch (Exception exc) { - throw wrapException(exc); - } - } - mDistributedCallService = null; - mDistributedCallServer = null; - } - - private DistributedCallService openDistributedServer() { - mDistributedCallService = null; - try { - String url = "rmi://" + getServerHost() + ":" + getServerPort() + "/DistributedCallService"; - DistributedCallService server = - (DistributedCallService) java.rmi.Naming.lookup(url); - mDistributedCallService = server; - } catch (java.rmi.NotBoundException exc) { - // Server has exited - } catch (Exception exc) { - throw wrapException(exc); - } - return mDistributedCallService; - } - - private void closeDistributedServer() { - mDistributedCallService = null; - } - - private void startWorker(Integer workerId, Integer processId) { - - String logFile = "worker_" + processId + "_%J.bsub"; - if (mLsfLogDirectory != null) { - logFile = mLsfLogDirectory + "/" + logFile; - } - - List command = new ArrayList(); - command.add("bsub"); - command.add("-o"); - command.add(logFile); - if (mLsfQueue != null) { - command.add("-q"); - command.add(mLsfQueue); - } - command.add("runDistributedWorker"); - command.add("-serverHost"); - command.add(getServerHost()); - command.add("-serverPort"); - command.add(Integer.toString(getServerPort())); - command.add("-workerId"); - command.add(Integer.toString(workerId)); - command.add("-processId"); - command.add(Integer.toString(processId)); - - // Pass our -Xmx setting along to all workers. - Map environment = - new LinkedHashMap(System.getenv()); - long maxMemory = Runtime.getRuntime().maxMemory(); - long maxKbytes = maxMemory / 1024; - String memJavaOpt = "-Xmx" + maxKbytes + "K"; - - // Enable GC logging if requested - String gcJavaOpt = null; - if (mEnableGcLogging) { - String gcLogFile = "worker_" + processId + ".gc.log"; - if (mLsfLogDirectory != null) { - gcLogFile = mLsfLogDirectory + "/" + gcLogFile; - } - gcJavaOpt = "-Xloggc:" + gcLogFile; - } - - String javaOpts = environment.get("JAVAOPTS"); - if (javaOpts == null) { - javaOpts = memJavaOpt; - if (gcJavaOpt != null) { - javaOpts = javaOpts + " " + gcJavaOpt; - } - environment.put("JAVAOPTS", javaOpts); - } - - // Log output ourselves (rather than waiting for bsub). - String workerLogFile = "worker_" + processId + ".log"; - if (mLsfLogDirectory != null) { - workerLogFile = mLsfLogDirectory + "/" + workerLogFile; - } - environment.put("DA_LOG_FILE", workerLogFile); - - CommandRunner runner = new CommandRunner(); - Writer output = new LsfOutputFilter(); - runner.setStandardOutputDestination(output); - runner.setStandardErrorDestination(output); - String[] commandArray = command.toArray(new String[command.size()]); - String[] environmentArray = createEnvironmentArray(environment); - int status = runner.runCommand(commandArray, environmentArray, null); - if (status != 0) { - throw new RuntimeException("Error starting worker: " + status); - } - } - - private String[] createEnvironmentArray(Map map) { - if (map == null) { - return null; - } - int index = 0; - String[] array = new String[map.size()]; - for (Map.Entry entry : map.entrySet()) { - array[index++] = entry.getKey() + "=" + entry.getValue(); - } - return array; - } - - private String getCurrentHost() { - try { - return InetAddress.getLocalHost().getCanonicalHostName(); - } catch (Exception exc) { - throw wrapException(exc); - } - } - - private void waitForCompletion() { - DistributedCallServer server = mDistributedCallServer; - while (true) { - if (server.isQueueEmpty()) { - break; - } - try { - Thread.sleep(1000); - } catch (InterruptedException exc) { - // ignore - } - } - } - - protected void callDistributed(String methodName, Object... methodArgs) { - callDistributed(null, methodName, methodArgs); - } - - protected void callDistributed(Integer workerId, String methodName, Object... methodArgs) { - if (workerId == null) { - workerId = ANY; - } - try { - DistributedCallMessage message = new DistributedCallMessage(); - message.setSenderWorkerId(getWorkerId()); - message.setSenderProcessId(getProcessId()); - message.setReceiverWorkerId(workerId); - message.setMethodName(methodName); - message.setMethodArgs(methodArgs); - mDistributedCallService.writeMessage(message); - } catch (Throwable exc) { - throw wrapException(exc); - } - } - - private void callMethod(String methodName, Object[] methodArgs) { - try { - Object target = this; - Class targetClass = target.getClass(); - Method targetMethod = findMethod(targetClass, methodName); - if (targetMethod == null) { - throw new RuntimeException("Cannot find target method: " + methodName); - } - targetMethod.invoke(target, methodArgs); - } catch (Throwable exc) { - throw wrapException(exc); - } - } - - private Method findMethod(Class clazz, String methodName) throws Exception { - Method result = null; - Method[] methods = clazz.getDeclaredMethods(); - for (int i = 0; i < methods.length; i++) { - if (methods[i].getName().equals(methodName)) { - if (result != null) { - throw new RuntimeException("Duplicate method name: " + methodName); - } - result = methods[i]; - } - } - return result; - } - - private RuntimeException wrapException(Throwable exception) { - if (exception instanceof RuntimeException) { - return (RuntimeException) exception; - } else { - return new RuntimeException(exception.getMessage(), exception); - } - } - - private void startWorkerThread() { - if (mWorkerThread != null) { - throw new IllegalStateException("WorkerThread is running"); - } - mWorkerThread = new WorkerThread(); - mWorkerThread.start(); - } - - private void stopWorkerThread() { - if (mWorkerThread == null) { - throw new IllegalStateException("WorkerThread is running"); - } - mWorkerThread.stopThread(); - } - - private class WorkerThread extends Thread { - - WorkerThread() { - setDaemon(true); - } - - public void run() { - try { - DistributedCallService service = mDistributedCallService; - while (true) { - if (isInterrupted()) { - System.out.println("#DBG: Worker isInterrupted"); - throw new InterruptedException(); - } - DistributedCallMessage message = - service.acceptMessage(getWorkerId(), getProcessId()); - if (message == null) { - Thread.sleep(1000); - } else { - processMessage(message); - } - } - } catch (InterruptedException exc) { - // Interruption terminates this thread. - // System.out.println("#DBG: Worker caught InterruptedException"); - } catch (Throwable exc) { - if (isDisconnectException(exc)) { - report("Server disconnected"); - } else { - reportError("Exception in WorkerThread: " + exc.getMessage(), exc); - System.exit(1); - } - } - report("WorkerThread terminated"); - } - - void stopThread() { - // System.out.println("#DBG: About to interrupt worker..."); - interrupt(); - // System.out.println("#DBG: Joining worker..."); - try { - join(); - } catch (InterruptedException exc) { - // ignore - } - } - - private boolean isDisconnectException(Throwable exc) { - if (exc instanceof java.rmi.ConnectException) { - return true; - } else if (exc instanceof java.rmi.NoSuchObjectException) { - return true; - } else if (exc instanceof java.rmi.UnmarshalException && - exc.getCause() != null && - exc.getCause() instanceof EOFException) { - return true; - } else { - return false; - } - } - } - - private void processMessage(DistributedCallMessage message) { - try { - Integer workerId = message.getReceiverWorkerId(); - if (workerId == null || !workerId.equals(getWorkerId())) { - reportError("Invalid worker ID in message: " + message); - return; - } - callMethod(message.getMethodName(), message.getMethodArgs()); - } catch (Throwable exc) { - reportError("Exception running message: " + message, exc); - } finally { - completeMessage(message); - } - } - - private void completeMessage(DistributedCallMessage message) { - try { - DistributedCallService service = mDistributedCallService; - service.completeMessage(getWorkerId(), getProcessId(), message.getCallId()); - } catch (Throwable exc) { - reportError("Exception completing message: " + message, exc); - } - } - - protected void report(String message) { - String identity = - getAlgorithmName() + " " + - getWorkerId() + "/" + getProcessId(); - System.out.println("# " + identity + " : " + message); - } - - protected void reportError(String message) { - reportError(message, null); - } - - protected void reportError(String message, Throwable exception) { - String identity = - getAlgorithmName() + " " + - getWorkerId() + "/" + getProcessId(); - System.out.println("Error" + - " [" + identity + "]" + - ": " + message); - if (exception != null) { - System.out.println(" with exception: " + exception.getMessage()); - exception.printStackTrace(System.out); - } - } - - private String getClassName() { - String name = getClass().getName(); - return name.substring(name.lastIndexOf('.')+1); - } - - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append("DistributedAlgorithm"); - builder.append("("); - builder.append("" + getAlgorithmName()); - builder.append(","); - builder.append("" + getWorkerId()); - builder.append(","); - builder.append("" + getProcessId()); - builder.append(","); - builder.append("" + getMaximumWorkerCount()); - builder.append(","); - builder.append("" + getLsfQueue()); - builder.append(","); - builder.append("" + mIsRunning); - builder.append(")"); - return builder.toString(); - } - - // This class is used only during in-process execution/testing/debugging. - private class InProcessWorker extends Thread { - - InProcessWorker() { - setDaemon(true); - } - - public void run() { - report("InProcessWorker starting"); - try { - String serverAddress = getServerHost() + ":" + getServerPort(); - String url = "rmi://" + serverAddress + "/DistributedCallService"; - DistributedCallService server = - (DistributedCallService) java.rmi.Naming.lookup(url); - DistributedAlgorithm algorithm = server.getAlgorithm(); - algorithm.setServerHost(getServerHost()); - algorithm.setServerPort(getServerPort()); - algorithm.runWorker(2, 1); - } catch (Throwable exc) { - reportError("Exception in InProcessWorker: " + exc.getMessage(), exc); - System.exit(1); - } - report("InProcessWorker terminated"); - } - } - - private static class LsfOutputFilter - extends FilterWriter { - - LsfOutputFilter() { - super(new PrintWriter(System.out, true)); - } - - public void write(int ch) - throws IOException { - if (mAtLineStart) { - out.write("# "); - mAtLineStart = false; - } - out.write(ch); - mAtLineStart = (ch == '\n'); - } - - public void write(String s, int off, int len) - throws IOException { - write(s.toCharArray(), off, len); - } - - public void write(char[] a, int off, int len) - throws IOException { - for (int i = 0; i < len; i++) { - write(a[off+i]); - } - } - - private boolean mAtLineStart = true; - } - - - private transient int mMaximumWorkerCount = 0; - private transient String mLsfQueue = null; - private transient String mLsfLogDirectory = null; - private transient boolean mEnableGcLogging = false; - private transient boolean mIsRunning = false; - private transient int mWorkerId = 0; - private transient int mProcessId = 0; - private transient WorkerThread mWorkerThread = null; - private transient String mAlgorithmName = null; - private transient String mServerHost = null; - private transient int mServerPort = 0; - private transient DistributedCallService mDistributedCallService = null; - private transient DistributedCallServer mDistributedCallServer = null; -} diff --git a/java/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java b/java/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java deleted file mode 100644 index dcee13eb8..000000000 --- a/java/lib/edu/mit/broad/dcp/DistributedAlgorithmWorker.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -import java.util.*; - -/** - * Command line driver for distributed worker invocation. - */ -public class DistributedAlgorithmWorker -{ - public static void main(String[] args) - throws Exception { - new DistributedAlgorithmWorker().run(args); - } - - private void run(String[] args) - throws Exception { - - if (!parseArguments(args)) { - System.exit(1); - } - System.out.println("# DistributedAlgorithmWorker"); - System.out.println("# Started at " + new Date()); - runDistributedWorker(); - System.out.println("# Ended at " + new Date()); - } - - private boolean parseArguments(String[] args) { - - int argpos = 0; - int argsleft = 0; - - while (argpos < args.length) { - argsleft = args.length - argpos; - String arg = args[argpos]; - if (arg.equals("-serverHost") && argsleft > 1) { - argpos++; - mServerHost = args[argpos++]; - } else if (arg.equals("-serverPort") && argsleft > 1) { - argpos++; - mServerPort = Integer.parseInt(args[argpos++]); - } else if (arg.equals("-workerId") && argsleft > 1) { - argpos++; - mWorkerId = new Integer(args[argpos++]); - } else if (arg.equals("-processId") && argsleft > 1) { - argpos++; - mProcessId = new Integer(args[argpos++]); - } else if (arg.equals("-debug")) { - argpos++; - mDebug = true; - continue; - } else if (arg.equals("-verbose")) { - argpos++; - mVerbose = true; - continue; - } else if (arg.startsWith("-")) { - usage(); - return false; - } else { - break; - } - } - - argsleft = args.length - argpos; - if (argsleft != 0) { - usage(); - return false; - } - - return true; - } - - private void usage() { - System.out.println("Usage: DistributedWorkerMain ..."); - System.out.println(" -serverHost "); - System.out.println(" -serverPort "); - System.out.println(" -workerId "); - System.out.println(" -processId "); - System.out.println(" -verbose"); - System.out.println(" -debug"); - } - - private void runDistributedWorker() - throws Exception { - - DistributedAlgorithm algorithm = null; - String serverAddress = getServerHost() + ":" + getServerPort(); - try { - String url = "rmi://" + serverAddress + "/DistributedCallService"; - DistributedCallService server = - (DistributedCallService) java.rmi.Naming.lookup(url); - algorithm = server.getAlgorithm(); - } catch (java.rmi.ConnectException exc) { - System.out.println("# Server " + serverAddress + " not responding."); - return; - } - - algorithm.setServerHost(getServerHost()); - algorithm.setServerPort(getServerPort()); - algorithm.runWorker(getWorkerId(), getProcessId()); - } - - private Integer getWorkerId() { - return mWorkerId; - } - - private Integer getProcessId() { - return mProcessId; - } - - private String getServerHost() { - return mServerHost; - } - - private int getServerPort() { - return mServerPort; - } - - - private boolean mDebug = false; - private boolean mVerbose = false; - private String mServerHost = null; - private int mServerPort = 0; - private Integer mWorkerId = null; - private Integer mProcessId = null; -} diff --git a/java/lib/edu/mit/broad/dcp/DistributedCallServer.java b/java/lib/edu/mit/broad/dcp/DistributedCallServer.java deleted file mode 100644 index 995eff571..000000000 --- a/java/lib/edu/mit/broad/dcp/DistributedCallServer.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - - -import edu.mit.broad.dcp.message.*; - -import java.rmi.server.UnicastRemoteObject; -import java.util.*; - -public class DistributedCallServer - extends UnicastRemoteObject - implements DistributedCallService -{ - public DistributedCallServer() - throws java.rmi.RemoteException { - } - - public void setAlgorithm(DistributedAlgorithm algorithm) { - mAlgorithm = algorithm; - } - - public DistributedAlgorithm getAlgorithm() { - return mAlgorithm; - } - - public long writeMessage(DistributedCallMessage message) { - message.setCallStatus(CallStatus.PENDING); - message.setCallId(generateCallId()); - if (message.getReceiverWorkerId().equals(0)) { - synchronized (mMessageQueue) { - mMessageQueue.addLast(message); - } - } else { - synchronized (mMessageQueue) { - mMessageQueue.addFirst(message); - } - } - return message.getCallId(); - } - - public DistributedCallMessage acceptMessage(int workerId, int processId) { - if (workerId <= 0) { - throw new IllegalArgumentException("Invalid worker ID: " + workerId); - } - if (processId <= 0) { - throw new IllegalArgumentException("Invalid process ID: " + processId); - } - synchronized (mMessageQueue) { - Iterator iterator = mMessageQueue.iterator(); - while (iterator.hasNext()) { - DistributedCallMessage message = iterator.next(); - if (message.getCallStatus() != CallStatus.PENDING) { - continue; - } - int receiverId = message.getReceiverWorkerId(); - if (receiverId == workerId || - (receiverId == 0 && workerId > 1)) { - message.setCallStatus(CallStatus.PROCESSING); - message.setReceiverWorkerId(workerId); - message.setReceiverProcessId(processId); - return message; - } - } - } - - return null; - } - - public void completeMessage(int workerId, int processId, long callId) { - if (workerId <= 0) { - throw new IllegalArgumentException("Invalid worker ID: " + workerId); - } - if (processId <= 0) { - throw new IllegalArgumentException("Invalid process ID: " + processId); - } - if (callId <= 0) { - throw new IllegalArgumentException("Invalid call ID: " + callId); - } - synchronized (mMessageQueue) { - Iterator iterator = mMessageQueue.iterator(); - while (iterator.hasNext()) { - DistributedCallMessage message = iterator.next(); - if (message.getCallId().longValue() == callId) { - if (message.getCallStatus() != CallStatus.PROCESSING) { - throw new IllegalStateException("Call #" + callId + " not in state PROCESSING"); - } - if (!message.getReceiverWorkerId().equals(workerId)) { - throw new IllegalStateException("Call #" + callId + " assigned to worker " + message.getReceiverWorkerId() + " not worker " + workerId); - } - if (!message.getReceiverProcessId().equals(processId)) { - throw new IllegalStateException("Call #" + callId + " assigned to process " + message.getReceiverProcessId() + " not process " + processId); - } - iterator.remove(); - return; - } - } - } - - throw new IllegalArgumentException("Unrecognized call ID " + callId); - } - - public boolean isQueueEmpty() { - synchronized (mMessageQueue) { - return mMessageQueue.isEmpty(); - } - } - - public void stop() { - try { - UnicastRemoteObject.unexportObject(this, false); - } catch (java.rmi.NoSuchObjectException exc) { - throw new RuntimeException("Exception unexporting object: " + exc.getMessage(), - exc); - } - } - - private synchronized long generateCallId() { - return ++mCallIdGenerator; - } - - private long mCallIdGenerator = 0; - private DistributedAlgorithm mAlgorithm = null; - private LinkedList mMessageQueue = - new LinkedList(); -} diff --git a/java/lib/edu/mit/broad/dcp/DistributedCallService.java b/java/lib/edu/mit/broad/dcp/DistributedCallService.java deleted file mode 100644 index 202b25f42..000000000 --- a/java/lib/edu/mit/broad/dcp/DistributedCallService.java +++ /dev/null @@ -1,25 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp; - -import edu.mit.broad.dcp.message.*; - -public interface DistributedCallService - extends java.rmi.Remote -{ - public DistributedAlgorithm getAlgorithm() - throws java.rmi.RemoteException; - public long writeMessage(DistributedCallMessage message) - throws java.rmi.RemoteException; - public DistributedCallMessage acceptMessage(int workerId, int processId) - throws java.rmi.RemoteException; - public void completeMessage(int workerId, int processId, long callId) - throws java.rmi.RemoteException; -} diff --git a/java/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java b/java/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java deleted file mode 100644 index 1b0fa0a4d..000000000 --- a/java/lib/edu/mit/broad/dcp/message/DistributedCallMessage.java +++ /dev/null @@ -1,90 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp.message; - -import edu.mit.broad.dcp.CallStatus; - -public class DistributedCallMessage - extends DistributedMessage -{ - public DistributedCallMessage() { - } - - public Long getCallId() { - return mCallId; - } - - public void setCallId(Long value) { - mCallId = value; - } - - public CallStatus getCallStatus() { - return mCallStatus; - } - - public void setCallStatus(CallStatus value) { - mCallStatus = value; - } - - public String getMethodName() { - return mMethodName; - } - - public void setMethodName(String value) { - mMethodName = value; - } - - public Object[] getMethodArgs() { - return mMethodArgs; - } - - public void setMethodArgs(Object[] value) { - mMethodArgs = value; - } - - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append("DistributedCallMessage"); - builder.append("("); - builder.append("" + getSenderWorkerId()); - builder.append(","); - builder.append("" + getSenderProcessId()); - builder.append(","); - builder.append("" + getReceiverWorkerId()); - builder.append(","); - builder.append("" + getReceiverProcessId()); - builder.append(","); - builder.append("" + mCallId); - builder.append(","); - builder.append("" + mCallStatus); - builder.append(","); - builder.append("" + mMethodName); - builder.append(","); - if (mMethodArgs == null) { - builder.append("" + mMethodArgs); - } else { - builder.append("["); - for (int i = 0; i < mMethodArgs.length; i++) { - if (i > 0) { - builder.append(","); - } - builder.append("" + mMethodArgs[i]); - } - builder.append("]"); - } - builder.append(")"); - return builder.toString(); - } - - public Long mCallId; - public CallStatus mCallStatus; - public String mMethodName; - public Object[] mMethodArgs; -} diff --git a/java/lib/edu/mit/broad/dcp/message/DistributedMessage.java b/java/lib/edu/mit/broad/dcp/message/DistributedMessage.java deleted file mode 100644 index a5e837a69..000000000 --- a/java/lib/edu/mit/broad/dcp/message/DistributedMessage.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2007 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.dcp.message; - - -public class DistributedMessage -{ - public DistributedMessage() { - } - - public Integer getSenderWorkerId() { - return mSenderWorkerId; - } - - public void setSenderWorkerId(Integer value) { - mSenderWorkerId = value; - } - - public Integer getSenderProcessId() { - return mSenderProcessId; - } - - public void setSenderProcessId(Integer value) { - mSenderProcessId = value; - } - - public Integer getReceiverWorkerId() { - return mReceiverWorkerId; - } - - public void setReceiverWorkerId(Integer value) { - mReceiverWorkerId = value; - } - - public Integer getReceiverProcessId() { - return mReceiverProcessId; - } - - public void setReceiverProcessId(Integer value) { - mReceiverProcessId = value; - } - - public Integer mSenderWorkerId; - public Integer mSenderProcessId; - public Integer mReceiverWorkerId; - public Integer mReceiverProcessId; -} diff --git a/java/lib/edu/mit/broad/picard/PicardException.java b/java/lib/edu/mit/broad/picard/PicardException.java deleted file mode 100644 index 4e36ba648..000000000 --- a/java/lib/edu/mit/broad/picard/PicardException.java +++ /dev/null @@ -1,27 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard; - -/** - * Basic Picard runtime exception that, for now, does nothing much - * - * @author Kathleen Tibbetts - */ -public class PicardException extends RuntimeException -{ - public PicardException(String message) { - super(message); - } - - public PicardException(String message, Throwable throwable) { - super(message, throwable); - } - -} diff --git a/java/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java b/java/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java deleted file mode 100644 index 54f0ab9aa..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/AbstractBaseAligner.java +++ /dev/null @@ -1,97 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner; - -import edu.mit.broad.picard.io.IoUtil; - -import java.io.File; -import java.io.IOException; -import java.util.Map; -import java.util.List; - -/** - * Abstract base class for use by Aligner implementations. Provides a constructor and - * accessors for common inputs and outputs. - * - * @author Kathleen Tibbetts - */ -public abstract class AbstractBaseAligner implements Aligner { - - private final Stringency stringency; // The stringency of the alignment - private final File readsBamFile; // The BAM file containing the read data - private final String outputPrefix; // The directory and file name prefix for outputs - private final String referenceFileDir; // The directory where the reference file can be found - private final int clipPoints[]; // The clip points to use - private final Integer expectedInsertSize; // Expected insert size; null for non-paired-end lanes - private final Integer readsToAlign; // The number of reads to align (all if null) - private final boolean pairedReads; // Whether this is a paired-end run - private final int readLength; - // Parameters specific to the Aligner implementation being used - private final Map customParametersMap; - - /** - * Constructor that sets every parameter. - * - * @param stringency the stringency of the alignment - * @param readsBamFile the BAM file containing the reads - * @param outputPrefix the directory and filename prefix for output - * @param referenceFileDir the directory where the reference file is located - * @param clipPoints the clip points - * @param expectedInsertSize the expected insert size (null for non-PE lanes) - * @param readsToAlign the number of reads to align - * @param customParametersMap parameters specific to the Aligner implementation - */ - public AbstractBaseAligner(Stringency stringency, File readsBamFile, String outputPrefix, - String referenceFileDir, int clipPoints[], Integer expectedInsertSize, - Integer readsToAlign, Map customParametersMap, - boolean pairedReads, int readLength) { - - // First, a little validation - if (clipPoints != null && clipPoints.length != 4) { - throw new IllegalArgumentException("Length of clipPoints array argument must be 4."); - } - IoUtil.assertFileIsReadable(readsBamFile); - - this.stringency = stringency; - this.readsBamFile = readsBamFile; - this.outputPrefix = outputPrefix; - this.referenceFileDir = referenceFileDir; - this.clipPoints = clipPoints != null ? clipPoints : new int[4]; - this.expectedInsertSize = expectedInsertSize; - this.readsToAlign = readsToAlign; - this.customParametersMap = customParametersMap; - this.pairedReads = pairedReads; - this.readLength = readLength; - } - - /** - * Utility method for deleting a list of files, to be used by the - * cleanup method of sub-classes - * - * @param files the list of files to delete - */ - protected final void deleteFiles(List files) { - for (File f : files) { - f.delete(); - } - } - - // Accessors - protected final Stringency getStringency() { return stringency; } - protected final File getReadsBamFile() { return readsBamFile; } - protected final String getOutputPrefix() { return outputPrefix; } - protected final String getReferenceFileDir() { return referenceFileDir; } - protected final int[] getClipPoints() { return clipPoints; } - protected final Integer getExpectedInsertSize() { return expectedInsertSize; } - protected final Integer getReadsToAlign() { return readsToAlign; } - protected final Map getCustomParametersMap() { return customParametersMap; } - protected final boolean isPairedReads() { return pairedReads; } - protected final int getReadLength() { return readLength; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/Aligner.java b/java/lib/edu/mit/broad/picard/aligner/Aligner.java deleted file mode 100644 index d0fdf47de..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/Aligner.java +++ /dev/null @@ -1,45 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner; - -/** - * API for aligners. Clients must call these methods in order, as each depends on - * the previous one, but they may call them multiple times and need not call them all. - * This allows steps to be rerun and also lets the caller review intermediate files - * when troubleshooting. - * - * @author Kathleen Tibbetts - */ -public interface Aligner { - - public static enum Stringency{ low, high }; - - /** - * Prepares all the necessary inputs for the alignment process from a BAM file of read data. - */ - public void prepareInputs(); - - /** - * Does the alignment and produces output in the underlying form of the aligner. - */ - public void align(); - - /** - * Converts the output of the aligner to BAM format - */ - public void prepareOutput(); - - /** - * Cleans up intermediate files (the files created in by and for the underlying aligner by the - * prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file. - */ - public void cleanup(); - -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java b/java/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java deleted file mode 100644 index 1f3cd55ac..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/BamToBfqWriter.java +++ /dev/null @@ -1,319 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.filter.*; -import edu.mit.broad.picard.util.PeekableIterator; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.picard.sam.ReservedTagConstants; - -import java.io.File; -import java.util.List; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.Arrays; - -/** - * Class to take unmapped reads in BAM file format and create Maq binary fastq format file(s) -- - * one or two of them, depending on whether it's a paired-end read. This relies on the unmapped - * BAM file having all paired reads together in order. - */ -public class BamToBfqWriter { - - private final File bamFile; - private final String outputPrefix; - private boolean pairedReads = false; - private int wrote = 0; - private int increment = 1; - private int chunk = 0; - private BinaryCodec codec1; - private BinaryCodec codec2; - private final Log log = Log.getInstance(BamToBfqWriter.class); - - /** - * Constructor - * - * @param bamFile the BAM file to read from - * @param outputPrefix the directory and file prefix for the binary fastq files - * @param total the total number of records that should be written, drawn evenly - * from throughout the file (null for all). - * @param chunk the maximum number of records taht should be written to any one file - * @param pairedReads whether these reads are from a paired-end run - */ - public BamToBfqWriter(File bamFile, String outputPrefix, Integer total, Integer chunk, boolean pairedReads) { - this.bamFile = bamFile; - this.outputPrefix = outputPrefix; - this.pairedReads = pairedReads; - if (total != null) { - double writeable = (double)countWritableRecords(); - this.increment = (int)Math.floor(writeable/total.doubleValue()); - } - if (chunk != null) { - this.chunk = chunk; - } - } - - /** - * Constructor - * - * @param bamFile the BAM file to read from - * @param outputPrefix the directory and file prefix for the binary fastq files - * @param pairedReads whether these reads are from a paired-end run - */ - public BamToBfqWriter(File bamFile, String outputPrefix, boolean pairedReads) { - this(bamFile, outputPrefix, null, null, pairedReads); - } - - /** - * Writes the binary fastq file(s) to the output directory - */ - public void writeBfqFiles() { - - Iterator iterator = (new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator(); - - // Filter out noise reads and reads that fail the quality filter - TagFilter tagFilter = new TagFilter(ReservedTagConstants.XN, 1); - FailsVendorReadQualityFilter qualityFilter = new FailsVendorReadQualityFilter(); - - if (!pairedReads) { - writeSingleEndBfqs(iterator, Arrays.asList(tagFilter, qualityFilter)); - codec1.close(); - } - else { - writePairedEndBfqs(iterator, tagFilter, qualityFilter); - codec1.close(); - codec2.close(); - } - log.info("Wrote " + wrote + " bfq records."); - - } - - /** - * Path for writing bfqs for paired-end reads - * - * @param iterator the iterator witht he SAM Records to write - * @param tagFilter the filter for noise reads - * @param qualityFilter the filter for PF reads - */ - private void writePairedEndBfqs(Iterator iterator, TagFilter tagFilter, - FailsVendorReadQualityFilter qualityFilter) { - // Open the codecs for writing - int fileIndex = 0; - initializeNextBfqFiles(fileIndex++); - - int records = 0; - - while (iterator.hasNext()) { - SAMRecord first = iterator.next(); - if (!iterator.hasNext()) { - throw new PicardException("Mismatched number of records in " + this.bamFile.getAbsolutePath()); - } - SAMRecord second = iterator.next(); - if (!second.getReadName().equals(first.getReadName()) || - first.getFirstOfPairFlag() == second.getFirstOfPairFlag()) { - throw new PicardException("Unmatched read pairs in " + this.bamFile.getAbsolutePath() + - ": " + first.getReadName() + ", " + second.getReadName() + "."); - } - - // If both are noise reads, filter them out - if (tagFilter.filterOut(first) && tagFilter.filterOut(second)) { - // skip it - } - // If either fails to pass filter, then exclude them as well - else if (qualityFilter.filterOut(first) || qualityFilter.filterOut(second)) { - // skip it - } - // Otherwise, write them out - else { - records++; - if (records % increment == 0) { - first.setReadName(first.getReadName() + "#0/1"); - writeFastqRecord(first.getFirstOfPairFlag() ? codec1 : codec2, first); - second.setReadName(second.getReadName() + "#0/2"); - writeFastqRecord(second.getFirstOfPairFlag() ? codec1 : codec2, second); - wrote++; - if (wrote % 1000000 == 0) { - log.info(wrote + " records written."); - } - if (chunk > 0 && wrote % chunk == 0) { - initializeNextBfqFiles(fileIndex++); - } - } - } - } - } - - /** - * Path for writing bfqs for single-end reads - * - * @param iterator the iterator witht he SAM Records to write - * @param filters the list of filters to be applied - */ - private void writeSingleEndBfqs(Iterator iterator, List filters) { - - // Open the codecs for writing - int fileIndex = 0; - initializeNextBfqFiles(fileIndex++); - - int records = 0; - - FilteringIterator it = new FilteringIterator(iterator, new AggregateFilter(filters)); - while (it.hasNext()) { - SAMRecord record = it.next(); - records++; - if (records % increment == 0) { - - writeFastqRecord(codec1, record); - wrote++; - if (wrote % 1000000 == 0) { - log.info(wrote + " records processed."); - } - if (chunk > 0 && wrote % chunk == 0) { - initializeNextBfqFiles(fileIndex++); - } - } - } - } - - /** - * Closes any the open bfq file(s), if any, and opens the new one(s) - * - * @param fileIndex the index (counter) of the files to write - */ - private void initializeNextBfqFiles(int fileIndex) { - // Close the codecs if they were writing before - if (codec1 != null) { - codec1.close(); - if (pairedReads) { - codec2.close(); - } - } - - // Open new file, using the fileIndex. - File bfq1 = getOutputFile(this.outputPrefix , 1, fileIndex); - codec1 = new BinaryCodec(IoUtil.openFileForWriting(bfq1)); - log.info("Now writing to file " + bfq1.getAbsolutePath()); - if (pairedReads) { - File bfq2 = getOutputFile(this.outputPrefix , 2, fileIndex); - codec2 = new BinaryCodec(IoUtil.openFileForWriting(bfq2)); - log.info("Now writing to file " + bfq2.getAbsolutePath()); - } - } - - /** - * Writes out a SAMRecord in Maq fastq format - * - * @param codec the code to write to - * @param rec the SAMRecord to write - */ - private void writeFastqRecord(BinaryCodec codec, SAMRecord rec) { - - // Writes the length of the read name and then the name (null-terminated) - codec.writeString(rec.getReadName(), true, true); - - char seqs[] = rec.getReadString().toCharArray(); - char quals[] = rec.getBaseQualityString().toCharArray(); - - // Write the length of the sequence - codec.writeInt(seqs.length); - - // Calculate and write the sequence and qualities - byte seqsAndQuals[] = new byte[seqs.length]; - - for (int i = 0; i < seqs.length; i++) { - int quality = Math.min(quals[i]-33, 63); - int base; - switch(seqs[i]) { - case 'A': - case 'a': - base = 0; - break; - case 'C': - case 'c': - base = 1; - break; - case 'G': - case 'g': - base = 2; - break; - case 'T': - case 't': - base = 3; - break; - case 'N': - case 'n': - case '.': - base = 0; - quality = 0; - break; - default: - throw new PicardException("Unknown base when writing bfq file: " + seqs[i]); - } - seqsAndQuals[i] = (byte) (base << 6 | quality); - } - codec.writeBytes(seqsAndQuals); - } - - private int countWritableRecords() { - int count = 0; - PeekableIterator it = new PeekableIterator((new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator()); - if (!this.pairedReads) { - // Filter out noise reads and reads that fail the quality filter - List filters = new ArrayList(); - filters.add(new TagFilter(ReservedTagConstants.XN, 1)); - filters.add(new FailsVendorReadQualityFilter()); - FilteringIterator itr = new FilteringIterator(it, new AggregateFilter(filters)); - while (itr.hasNext()) { - itr.next(); - count++; - } - } - else { - while (it.hasNext()) { - SAMRecord first = it.next(); - SAMRecord second = it.next(); - // If both are noise reads, filter them out - if (first.getAttribute(ReservedTagConstants.XN) != null && - second.getAttribute(ReservedTagConstants.XN) != null) { - // skip it - } - // If either fails to pass filter, then exclude them as well - else if (first.getReadFailsVendorQualityCheckFlag() || second.getReadFailsVendorQualityCheckFlag() ) { - // skip it - } - // Otherwise, write them out - else { - count++; - } - } - } - it.close(); - return count; - } - - /** - * Constructs the name for the output file and returns the file - * - * @param outputPrefix the directory and file prefix for the output bfq file - * @param read whether this is the file for the first or second read - * @return a new File object for the bfq file. - */ - private File getOutputFile(String outputPrefix, int read, int index) { - File result = new File(outputPrefix + "." + index + "." + read + ".bfq"); - IoUtil.assertFileIsWritable(result); - return result; - } - -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java b/java/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java deleted file mode 100644 index af5574185..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/MapFileIterator.java +++ /dev/null @@ -1,357 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.sam.*; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.StringUtil; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.util.SamPairUtil; - -import java.io.File; -import java.io.BufferedInputStream; -import java.util.*; - -/** - * Reads a Maq map file and returns an an iterator of SAMRecords and a populated header - * - * IMPORTANT! Even though the reads in the map file are in coordinate order, this iterator - * will not necessarily return them in that order. For paired reads, both will be - * returned only after *both* records have been seen. - * - * @author Kathleen Tibbetts - */ -public class MapFileIterator implements CloseableIterator { - - public static final int MATE_UNMAPPED_FLAG = 64; - public static final int READ_UNMAPPED_FLAG = 192; - - private static final int READ_NAME_LENGTH = 36; - private static final int MAP_FORMAT = -1; - private static final int MAX_READ_LENGTH = 128; - - private static final byte ACGT[] = {'A', 'C', 'G', 'T'}; - - public static final String PROGRAM_RECORD = "0"; - - private long recordCount = 0L; - private int recordsRead = 0; - private BinaryCodec mapCodec; - private final SAMFileHeader header; - private final boolean pairedReads; - private final boolean jumpingLibrary; - private final List next = new ArrayList(); - private final Map pending = new HashMap(); - private final List mapFiles = new LinkedList(); - - /** - * Constructor. Opens the map file, reads the record count and header from it, - * creates the SAMFileHeader, and queues up the first read - * - * @param mapFile The Maq map file to read - * @param commandLine The command line used to invoke Maq (for the header) - * @param pairedReads Whether this is a paired-end run - */ - public MapFileIterator(String commandLine, boolean pairedReads, boolean jumpingLibrary, File... mapFile) { - if (mapFile.length == 0) { - throw new IllegalArgumentException("At least one map file must be provided."); - } - mapFiles.addAll(Arrays.asList(mapFile)); - - this.pairedReads = pairedReads; - this.jumpingLibrary = jumpingLibrary; - - header = new SAMFileHeader(); - header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - SAMProgramRecord program = new SAMProgramRecord(PROGRAM_RECORD); - program.setProgramVersion(MaqConstants.getProgramVersion()); - program.setCommandLine(commandLine); - header.addProgramRecord(program); - - queueNextMapFile(); - } - - /** - * Queues up the next map file - * - * @return true if there's another map file to iterate over - */ - private boolean queueNextMapFile() { - - // Close the old file - if (mapCodec != null) { - mapCodec.close(); - } - - // If there are no more map files, return fales - if (mapFiles.size() == 0) { - return false; - } - - // Otherwise, open the next file and reset the recordsRead count - mapCodec = new BinaryCodec(new BufferedInputStream(IoUtil.openFileForReading(mapFiles.remove(0)))); - int format = mapCodec.readInt(); - if (format != MAP_FORMAT) { - mapCodec.close(); - throw new PicardException("Unrecognized Maq map file format: " + format); - } - recordsRead = 0; - - - // Read the sequences out of the map file and set them on the header - int sequenceCount = mapCodec.readInt(); - List sequences = new ArrayList(); - for (int i = 0; i < sequenceCount; i++) { - int length = mapCodec.readInt(); - // Write the sequence name, trimming off the null terminator - sequences.add(new SAMSequenceRecord(mapCodec.readString(length).substring(0, length-1))); - } - if (header.getSequences() == null || header.getSequences().size() == 0) { - header.setSequences(sequences); - } - else { - // TODO: Check that the sequences match and throw and exception if they don't - } - recordCount = mapCodec.readLong(); - - readNext(); - return true; - } - - /** - * Closes the BinaryCodec reading the map file - */ - public void close() { - mapCodec.close(); - } - - /** - * @return true if the iteration has more elements - */ - public boolean hasNext() { - return next.size() > 0; - } - - /** - * @return the next SAMRecord in the iteration - * @throws NoSuchElementException if this is called when hasNext() returns false - */ - public SAMRecord next() { - if (!hasNext()) { - throw new NoSuchElementException("No more elements in this iteration"); - } - SAMRecord result = next.remove(0); - readNext(); - return result; - } - - /** - * Reads the next element from the map file. If we are done with it, we put it in the next - * list; if we are waiting to see its mate, we put it in the pending map. Calls itself - * repeatedly until there is at least one element in next. - */ - private void readNext() { - - // If there's already a record queued up, just return - if (next.size() > 0) { - return; - } - - // If we've read all there is, then any remaining records in the pending map should be returned. - // If this is not a PE run, then the pending map will be empty and we're done. - if (recordsRead == recordCount) { - if (pending.size() > 0) { - StringBuffer sb = new StringBuffer(); - for (String item : pending.keySet()) { - sb.append(item).append("\n"); - } - throw new PicardException("MapFileIterator pending map should have been empty but contained " + - "the following records: " + sb.toString()); - } - queueNextMapFile(); - return; - } - - // Otherwise, we read until there is at least one record in the next list - readMapRecord(); - if (next.size() == 0) { - readNext(); - } - } - - /** - * Reads one record from the map file and throws it onto the pending map or the next list, - * depending on whether we have already seen its mate - */ - private void readMapRecord() { - - // Now that we've got all the data from the binary file, write a SAMRecord and add it to - // the new BAM file - SAMRecord record = new SAMRecord(); - record.setAttribute(SAMTag.PG.toString(), PROGRAM_RECORD); - record.setReadPairedFlag(this.pairedReads); - - // the last base is the single-end mapping quality. - byte seqsAndQuals[] = new byte[MAX_READ_LENGTH-1]; - mapCodec.readBytes(seqsAndQuals); - - byte singleEndMappingQualityOrIndelLength = mapCodec.readByte(); - - // the length of the read - int readLength = mapCodec.readUByte(); - setSeqsAndQuals(seqsAndQuals, readLength, record); - - // the final mapping quality (unless flag below is 130, then it is the - // position of the indel (or 0 if no indel) - int mappingQuality = mapCodec.readUByte(); - - // mismatches in the 28bp (higher 4 bits) and mismatches (lower 4 bits) - mapCodec.readUByte(); - // sum of errors of the best hit - mapCodec.readUByte(); - // counts of all 0- and 1-mismatch hits on the reference - mapCodec.readUByte(); - mapCodec.readUByte(); - - // A bitwise flag. See the Maq docs for its full meaning - int flag = mapCodec.readUByte(); - - // the lower mapQ of the two ends (equals map_qual if unpaired); if flag is 130: mapQ of its mate - int altQual = mapCodec.readUByte(); - - // Index of the sequence for this read - record.setReferenceIndex((int)mapCodec.readUInt(), getHeader()); - - // Start position and strand - long pos = mapCodec.readUInt(); - int startPos = ((int)((pos>>1)& 0x7FFFFFFF)) + 1; - record.setAlignmentStart(startPos); - record.setReadNegativeStrandFlag((pos&1) == 1); - - // offset of the mate (zero if unpaired, or two ends mapped to different chr) - mapCodec.readInt(); - - // The read name - byte nameBytes[] = new byte[READ_NAME_LENGTH]; - mapCodec.readBytes(nameBytes); - String name = StringUtil.bytesToString(nameBytes).trim(); - if (this.pairedReads) { - if (name.endsWith("/1")) { - record.setFirstOfPairFlag(true); - record.setSecondOfPairFlag(false); - } - else if (name.endsWith("/2")) { - record.setFirstOfPairFlag(false); - record.setSecondOfPairFlag(true); - } - else { - throw new PicardException("Unrecognized ending for paired read name: " + name); - } - name = name.substring(0, name.length()-2); - } - record.setReadName(name); - - - if (flag != 130 || singleEndMappingQualityOrIndelLength == 0) { // No indel - record.setCigarString(readLength + "M"); - record.setMappingQuality(mappingQuality); - } - else { // Indel - int indelPos = mappingQuality; - String cigar = indelPos + "M" + Math.abs(singleEndMappingQualityOrIndelLength); - int remaining = readLength - indelPos; - if (singleEndMappingQualityOrIndelLength > 0) { - cigar += "I" + (remaining - singleEndMappingQualityOrIndelLength) + "M"; - } - else { - cigar += "D" + remaining + "M"; - } - record.setCigarString(cigar); - // In the docs, it look like there is a mapping quality for the mate, do we use that? - record.setMappingQuality(altQual); - } - - if (!pairedReads) { - record.setProperPairFlag(false); - next.add(record); - } - else { - record.setMateUnmappedFlag(flag == MATE_UNMAPPED_FLAG); - SAMRecord mate = pending.remove(record.getReadName()); - - if (mate != null) { - boolean proper = SamPairUtil.isProperPair(record, mate, jumpingLibrary); - record.setProperPairFlag(proper); - mate.setProperPairFlag(proper); - - SamPairUtil.setMateInfo(record, mate); - - int insertSize = SamPairUtil.computeInsertSize(record, mate); - record.setInferredInsertSize(insertSize); - mate.setInferredInsertSize(insertSize); - - if (!mate.getMateUnmappedFlag()) { - next.add(record); - } - if (!record.getMateUnmappedFlag()) { - next.add(mate); - } - } - else { - pending.put(record.getReadName(), record); - } - } - - // TODO: Figure out what do do about noise reads long-term - // Note that it is possible that we have lost a "Noise read" annotation at this point. Since - // we try to map a pair if only one of the reads is classified as "noise", then for any paired - // reads where one was a noise read and one was not, we will lose the noise annotation on the - // one noisy read. We have discussed either re-doing the noise evaluation here, modifying the - // read name to carry the noise flag through Maq, or changing what reads we give to Maq. - - recordsRead++; - - } - - /** - * Decodes the sequence and the qualities and sets them on the SAMrecords - * - * @param seqsAndQuals the list of seqs and quals - * @param readLength the length of the read - * @param sam the SAMRecord to populate - */ - private void setSeqsAndQuals(byte seqsAndQuals[], int readLength, SAMRecord sam) { - byte sequence[] = new byte[readLength]; - byte qualities[] = new byte[readLength]; - for (int i = 0; i < readLength; i++) { - byte b = seqsAndQuals[i]; - qualities[i] = (byte)(b & 0x3F); - if (b == 0) { - sequence[i] = 'N'; - } - else { - sequence[i] = ACGT[(seqsAndQuals[i] >> 6) & 3]; - } - } - sam.setReadBases(sequence); - sam.setBaseQualities(qualities); - } - - /** - * @throws UnsupportedOperationException -- not implemented - */ - public void remove() { - throw new UnsupportedOperationException("remove() not supported in MapFileIterator"); - } - - public SAMFileHeader getHeader() { return header; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java b/java/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java deleted file mode 100644 index 6c1890818..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/MaqAligner.java +++ /dev/null @@ -1,211 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.picard.aligner.Aligner; -import edu.mit.broad.picard.aligner.AbstractBaseAligner; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.util.Log; - -import java.io.File; -import java.io.FilenameFilter; -import java.util.*; - -/** - * Maq implementation of the Aligner interface - */ -public class MaqAligner extends AbstractBaseAligner implements Aligner { - - // Constants related to Maq output files - public static final String MAQ_MAP_SUFFIX = ".out.aln.map"; - public static final String MAQ_LOG_SUFFIX = ".out.map.log"; - - // Internal constant for multi-plexing lane data - private static final int READ_CHUNK_SIZE = 2000000; - - public static final String REFERENCE_FILE_SUFFIX = ".bfa"; - - private final Log log = Log.getInstance(MaqAligner.class); - - private String commandLine = null; - - - /** - * Constructor that sets every parameter. All other constructors delegate to this one. - * - * @param stringency the stringency of the alignment - * @param readsBamFile the BAM file containing the reads - * @param outputPrefix the directory and filename prefix for output - * @param referenceFileDir the directory where the reference file is located - * @param clipPoints the clip points - * @param expectedInsertSize the expected insert size (null for non-PE lanes) - * @param readsToAlign the number of reads to align - * @param customParametersMap parameters specific to the Aligner implementation - */ - public MaqAligner(Stringency stringency, File readsBamFile, String outputPrefix, - String referenceFileDir, int clipPoints[], Integer expectedInsertSize, - Integer readsToAlign, Map customParametersMap, - boolean pairedReads, int readLength) { - - super(stringency, readsBamFile, outputPrefix, referenceFileDir, clipPoints, - expectedInsertSize, readsToAlign, customParametersMap, pairedReads, readLength); - } - - /** - * Prepares all the necessary inputs for the alignment process from a BAM file of read data. - */ - public void prepareInputs() { - log.info("Preparing Maq inputs."); - BamToBfqWriter writer = new BamToBfqWriter(this.getReadsBamFile(), this.getOutputPrefix(), - this.getReadsToAlign(), READ_CHUNK_SIZE, isPairedReads()); - writer.writeBfqFiles(); - } - - /** - * Does the alignment and produces output in the underlying form of the aligner. - */ - public void align() { - log.info("Running Maq alignment."); - - // Temporary hack until we get the multi-tasking code from Seva - List mapFileNames = new ArrayList(); // All map files that we will merge together at the end - - String maqParams = MaqConstants.SWITCH_RANDOM_SEED + " " + MaqConstants.DEFAULT_RANDOM_SEED; - - if (this.getStringency() == Stringency.high) { - maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " + Math.round( - this.getExpectedInsertSize() * MaqConstants.HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER); - maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " + - MaqConstants.HIGH_STRINGENCY_SUM_MISMATCHES; - } - else { - maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " + - MaqConstants.LOW_STRINGENCY_MAX_OUTER_DISTANCE; - // For low stringency, get at least 30 bases and then let half of what's remaining mismatch - int maxMisMatches = (this.getReadLength() - 30)/2; - maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " + - (maxMisMatches * MaqConstants.LOW_STRINGENCY_QUALITY_FOR_MISMATCHES); - } - - String referenceFile = new File(this.getReferenceFileDir()).listFiles(new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.endsWith(REFERENCE_FILE_SUFFIX); - } - })[0].getAbsolutePath(); - - ProcessBuilder builder; - - // Map the bfq files, individually or in pairs - SortedSet bfqs = new TreeSet(this.getBfqFiles()); - for (Iterator it = bfqs.iterator(); it.hasNext();) { - - String read1bfq = it.next().getAbsolutePath(); - String read2bfq = (this.isPairedReads()) ? it.next().getAbsolutePath() : ""; - - String outputFileBase = read1bfq.substring(0, read1bfq.lastIndexOf('.')-2); - String mapFile = outputFileBase + MAQ_MAP_SUFFIX; - String logFile = outputFileBase + MAQ_LOG_SUFFIX; - - String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " + MaqConstants.MAP_COMMAND + - " " + maqParams + " " + mapFile + " " + referenceFile + " " + read1bfq + " " + read2bfq + - " 2> " + logFile; - setCommandLine(getCommandLine() == null ? command : getCommandLine() + ";" + command); - log.info("Executing command: " + command); - try { - builder = new ProcessBuilder(command.split(" ")); - Process p = builder.start(); - p.waitFor(); - } - catch (Exception e) { - throw new PicardException("Error starting Maq process", e); - } - - mapFileNames.add(mapFile); - } - - // If there's more than one map file, then merge them. - String finalFileName = this.getOutputPrefix() + "." + this.getStringency() + MAQ_MAP_SUFFIX; - if (mapFileNames.size() > 1) { - String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " + - MaqConstants.MERGE_COMMAND + " " + finalFileName; - for (String name : mapFileNames) { - command += " " + name; - } - setCommandLine(getCommandLine() == null ? command : getCommandLine() + ";" + command); - log.info("Executing command: " + command); - - try { - builder = new ProcessBuilder(command.split(" ")); - Process p = builder.start(); - p.waitFor(); - } - catch (Exception e) { - throw new PicardException("Error starting Maq process", e); - } - } - else { // Otherwise rename the single map file so we can find it later - File f = new File(mapFileNames.get(0)); - if (!f.renameTo(new File(finalFileName))) { - throw new PicardException("Error renaming " + f.getAbsolutePath() + " to " + finalFileName); - } - } - } - - /** - * Converts the output of the aligner to BAM format - */ - public void prepareOutput() { - log.info("Preparing output from Maq alignment."); - // TODO: MaqToBam - } - - /** - * Cleans up intermediate files (the files created in by and for the underlying aligner by the - * prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file. - */ - public void cleanup() { - log.info("Cleaning up Maq intermediate files."); - this.deleteFiles(getBfqFiles()); -// this.deleteFiles(getMaqAlignmentFiles()); - } - - /** - * Returns a list of zero to two BFQ files, depending on whether they are there - * and whether it was a paired-end run or not - * - * @return a list of BFQ files - */ - private List getBfqFiles() { - File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/"))); - return Arrays.asList(dir.listFiles(new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.endsWith(".bfq"); - } - })); - } - - /** - * Returns the Maq map files - * - * @return a list of Maq .map files - */ - private List getMaqAlignmentFiles() { - File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/"))); - return Arrays.asList(dir.listFiles(new FilenameFilter() { - public boolean accept(File dir, String name) { - // TODO: Add the text files if we do not read the binary map files - return name.endsWith(MAQ_MAP_SUFFIX) || name.endsWith(MAQ_LOG_SUFFIX); - } - })); - } - - public String getCommandLine() { return commandLine; } - public void setCommandLine(String commandLine) { this.commandLine = commandLine; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java b/java/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java deleted file mode 100644 index b5e4b9b59..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/MaqConstants.java +++ /dev/null @@ -1,39 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -/** - * Utility class to hold Maq-related constants (program name, location, switches, etc) - */ -public class MaqConstants { - // General Maq constants - public static final String PROGRAM_NAME = "Maq"; - public static final String PROGRAM_VERSION = "0.7.1"; - public static final String MAQ_HOME = "/seq/dirseq/maq-0.7.1/"; - - // Command-related constants - public static final String MAQ_COMMAND = "maq"; - public static final String MAP_COMMAND = "map"; - public static final String MERGE_COMMAND = "mapmerge"; - - // Constants related to Maq map switches - public static final String SWITCH_SUM_MISMATCHES = "-e"; - public static final int HIGH_STRINGENCY_SUM_MISMATCHES = 100; - public static final int LOW_STRINGENCY_QUALITY_FOR_MISMATCHES = 30; - - public static final String SWITCH_MAX_OUTER_DISTANCE = "-a"; - public static final int LOW_STRINGENCY_MAX_OUTER_DISTANCE = 1500; - public static final double HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER = 1.5d; - - public static final String SWITCH_RANDOM_SEED = "-s"; - public static final int DEFAULT_RANDOM_SEED = 0; - - public static String getProgramVersion() { return PROGRAM_VERSION; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java b/java/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java deleted file mode 100644 index 3b82cc106..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/MaqMapMerger.java +++ /dev/null @@ -1,125 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.util.StringSortingCollectionFactory; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.sam.util.SortingCollection; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.*; - -import java.io.File; -import java.io.BufferedInputStream; -import java.util.List; -import java.util.ArrayList; -import java.util.Iterator; -import java.nio.ByteBuffer; - -/** - * Class to write a BAM file that includes the results from a Maq .map file along with the unaligned - * reads from the original BAM file. - * - * Information on the meaning of the elements of the map file is drawn from the Maq documentation - * on this page: http://maq.sourceforge.net/maqmap_format.shtml - */ -public class MaqMapMerger { - - private final File mapFile; - private final File sourceBamFile; - private final File targetBamFile; - private final boolean pairedReads; - private final Log log = Log.getInstance(MaqMapMerger.class); - private String commandLine = null; - private List sequences = new ArrayList(); - - - /** - * Constructor - * - * @param mapFile The Maq map file to parse - * @param sourceBamFile The BAM file that was used as the input to the Maq aligner, which will - * include info on all the reads that did not map - * @param targetBamFile The file to which to write the merged - */ - public MaqMapMerger(File mapFile, File sourceBamFile, File targetBamFile, boolean pairedReads) { - IoUtil.assertFileIsReadable(mapFile); - IoUtil.assertFileIsReadable(sourceBamFile); - IoUtil.assertFileIsWritable(targetBamFile); - this.mapFile = mapFile; - this.sourceBamFile = sourceBamFile; - this.targetBamFile = targetBamFile; - this.pairedReads = pairedReads; - } - - /** - * Merges the alignment from the map file with the remaining records from the source BAM file. - */ - public void mergeAlignment() { - log.info("Processing map file: " + mapFile.getAbsolutePath()); - // Write the header - MapFileIterator it = new MapFileIterator(getCommandLine(), this.pairedReads, false, this.mapFile); - SAMFileHeader header = it.getHeader(); - SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, false, targetBamFile); - - // Write the alignments - SortingCollection readNames = writeAlignments(it, writer); - - // We're done with the map file, so close it - it.close(); - writeUnalignedReads(writer, readNames.iterator()); - - // Now close the writer - writer.close(); - } - - - private void writeUnalignedReads(SAMFileWriter writer, CloseableIterator nameIterator) { - - int skipCount = 0; - SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(this.sourceBamFile)); - CloseableIterator bamRecords = reader.iterator(); - - String readName = nameIterator.hasNext() ? nameIterator.next() : null; - while(bamRecords.hasNext()) { - SAMRecord rec = bamRecords.next(); - if (rec.getReadName().equals(readName)) { - // skip it and pull the next name off the name iterator - readName = nameIterator.hasNext() ? nameIterator.next() : null; - skipCount++; - } - else { - writer.addAlignment(rec); - } - } -System.out.println("Skipped " + skipCount + " already-aligned records."); - bamRecords.close(); - nameIterator.close(); - } - - private SortingCollection writeAlignments(MapFileIterator iterator, SAMFileWriter writer) { - -int wrote = 0; - SortingCollection readNames = StringSortingCollectionFactory.newCollection(); - while (iterator.hasNext()) { - SAMRecord record = iterator.next(); - readNames.add(record.getReadName()); - writer.addAlignment(record); -wrote++; - } -System.out.println("Wrote " + wrote + " alignment records."); - return readNames; - } - - public void setCommandLine(String commandLine) { this.commandLine = commandLine; } - public String getCommandLine() { return this.commandLine; } -} diff --git a/java/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java b/java/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java deleted file mode 100644 index bc3741b02..000000000 --- a/java/lib/edu/mit/broad/picard/aligner/maq/RunMaq.java +++ /dev/null @@ -1,133 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.aligner.maq; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.aligner.Aligner; - -import java.io.File; -import java.util.Map; -import java.util.List; -import java.util.HashMap; -import java.util.ArrayList; - -/** - * CommandLineProgram to generate to invoke BustardToBamWriter - * - * @author Kathleen Tibbetts - */ -public class RunMaq extends CommandLineProgram { - private static final String PROGRAM_VERSION = "1.0"; - - // The following attributes define the command-line arguments - @Usage - public String USAGE = - "Usage: " + getClass().getName() + " [options]\n\n" + - "Invoke the Maq aligner.\n" + - "Version: " + PROGRAM_VERSION +"\n"; - - @Option(shortName="I", doc="The BAM file to parse.", optional=true) - public File INPUT; - @Option(shortName="O", doc="The directory and file prefix for all output.", optional=false) - public String OUTPUT; - @Option(shortName="L", doc="The read length.", optional=false) - public Integer READ_LENGTH; - @Option(shortName="S", doc="Stringency of the alignment.", optional=true) - public Aligner.Stringency STRINGENCY; - @Option(shortName="R", doc="Directory where the reference file is located.", optional=true) - public String REFERENCE; - @Option(shortName="C", doc="Clip points for the alignment.", optional=true, minElements=0, maxElements=4) - public List CLIP_POINT = new ArrayList(); - @Option(shortName="E", doc="Expected insert size.", optional=true) - public Integer EXPECTED_INSERT_SIZE; - @Option(doc="Whether this is a paired-end run.", optional=false) - public Boolean PE; - @Option(shortName="NUM", doc="Number of reads to align (null = all).", optional=true) - public Integer READS_TO_ALIGN; - @Option(shortName="CUSTOM", doc="Custom parameter in the form name=value.", optional=true) - public List CUSTOM_PARAMETER = new ArrayList(); - @Option(shortName="PREP", doc="Whether to prepare inputs for the alignement.", optional=true) - public Boolean PREPARE = true; - @Option(doc="Whether to do the alignement.", optional=true) - public Boolean ALIGN = true; - @Option(shortName="BAM", doc="Whether to generate a BAM file from the alignment output.", optional=true) - public Boolean BAM_OUTPUT = true; - @Option(doc="Whether to clean up intermediate input and output.", optional=true) - public Boolean CLEANUP = true; - - protected int doWork() { - int clipPoints[] = null; - if (CLIP_POINT != null) { - clipPoints = new int[4]; - int index=0; - for (Integer i : CLIP_POINT) { - clipPoints[index++] = i; - } - } - Map params = null; - if (CUSTOM_PARAMETER != null) { - params = new HashMap(); - for (String param : CUSTOM_PARAMETER) { - String nameAndVal[] = param.split("="); - params.put(nameAndVal[0], nameAndVal[1]); - } - } - Aligner aligner = new MaqAligner(STRINGENCY, INPUT, OUTPUT, REFERENCE, clipPoints, - EXPECTED_INSERT_SIZE, READS_TO_ALIGN, params, PE, READ_LENGTH); - if (PREPARE) { - aligner.prepareInputs(); - } - if (ALIGN) { - aligner.align(); - } - if (BAM_OUTPUT) { - aligner.prepareOutput(); - } - if (CLEANUP) { - aligner.cleanup(); - } - return 0; - } - - /** - * This is kind of a mess. Almost everything is optional, since you don't have to do all of the steps in the - * alignement. - * @return - */ - protected boolean customCommandLineValidation() { - if (PREPARE) { - if( INPUT == null) { - System.err.println("ERROR: INPUT must be specified when preparing inputs for the alignment."); - return false; - } - if (CLIP_POINT.size() != 0 && CLIP_POINT.size() != 4) { - System.err.println("ERROR: You must supply either 0 or 4 values for CLIP_POINT: " + CLIP_POINT.size()); - return false; - } - } - if (ALIGN) { - if (STRINGENCY == null) { - System.err.println("ERROR: STRINGENCY must be specified when doing an alignment."); - return false; - } - if (REFERENCE == null) { - System.err.println("ERROR: REFERENCE must be specified when doing an alignment."); - return false; - } - } - return true; - } - - public static void main(String[] argv) { - System.exit(new RunMaq().instanceMain(argv)); - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java deleted file mode 100644 index cfe74bbcc..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParseException.java +++ /dev/null @@ -1,27 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -public class CommandLineParseException extends RuntimeException{ - public CommandLineParseException() { - } - - public CommandLineParseException(String s) { - super(s); - } - - public CommandLineParseException(String s, Throwable throwable) { - super(s, throwable); - } - - public CommandLineParseException(Throwable throwable) { - super(throwable); - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java deleted file mode 100644 index 69b681abb..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParser.java +++ /dev/null @@ -1,638 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.io.*; -import java.lang.reflect.Constructor; -import java.lang.reflect.Field; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.ParameterizedType; -import java.lang.reflect.Type; -import java.util.*; - -import edu.mit.broad.picard.util.StringUtil; -import edu.mit.broad.picard.PicardException; - -/** - * Annotation-driven utility for parsing command-line arguments, checking for errors, and producing usage message. - * - * This class supports options of the form KEY=VALUE, plus positional arguments. Positional arguments must not contain - * an equal sign lest they be mistaken for a KEY=VALUE pair. - * - * The caller must supply an object that both defines the command line and has the parsed options set into it. - * For each possible KEY=VALUE option, there must be a public data member annotated with @Option. The KEY name is - * the name of the data member. An abbreviated name may also be specified with the shortName attribute of @Option. - * If the data member is a List, then the option may be specified multiple times. The type of the data member, - * or the type of the List element must either have a ctor T(String), or must be an Enum. List options must - * be initialized by the caller with some kind of list. Any other option that is non-null is assumed to have the given - * value as a default. If an option has no default value, and does not have the optional attribute of @Option set, - * is required. For List options, minimum and maximum number of elements may be specified in the @Option annotation. - * - * A single List data member may be annotated with the @PositionalArguments. This behaves similarly to a Option - * with List data member: the caller must initialize the data member, the type must be constructable from String, and - * min and max number of elements may be specified. If no @PositionalArguments annotation appears in the object, - * then it is an error for the command line to contain positional arguments. - * - * A single String public data member may be annotated with @Usage. This string, if present, is used to - * construct the usage message. Details about the possible options are automatically appended to this string. - * If @Usage does not appear, a boilerplate usage message is used. - */ -public class CommandLineParser { - // For formatting option section of usage message. - private static final int OPTION_COLUMN_WIDTH = 30; - private static final int DESCRIPTION_COLUMN_WIDTH = 50; - - private static final Boolean[] TRUE_FALSE_VALUES = {Boolean.TRUE, Boolean.FALSE}; - - // Use these if no @Usage annotation - private static final String defaultUsagePreamble = "Usage: program [options...]\n"; - private static final String defaultUsagePreambleWithPositionalArguments = - "Usage: program [options...] [positional-arguments...]\n"; - private static final String OPTIONS_FILE = "OPTIONS_FILE"; - - /** - * A typical command line program will call this to get the beginning of the usage message, - * and then append a description of the program, like this: - * - * \@Usage(programVersion=PROGRAM_VERSION) - * public String USAGE = CommandLineParser.getStandardUsagePreamble(getClass()) + "Frobnicates the freebozzle." - */ - public static String getStandardUsagePreamble(Class mainClass) { - return "USAGE: " + mainClass.getName() + " [options]\n\n"; - } - - // This is the object that the caller has provided that contains annotations, - // and into which the values will be assigned. - private final Object callerOptions; - - private String usagePreamble; - // null if no @PositionalArguments annotation - private Field positionalArguments; - private int minPositionalArguments; - private int maxPositionalArguments; - - // List of all the data members with @Option annotation - private final List optionDefinitions = new ArrayList(); - - // Maps long name, and short name, if present, to an option definition that is - // also in the optionDefinitions list. - private final Map optionMap = new HashMap(); - - // For printing error messages when parsing command line. - private PrintStream messageStream; - - // In case implementation wants to get at arg for some reason. - private String[] argv; - - - /** - * This attribute is here just to facilitate printing usage for OPTIONS_FILE - */ - public File IGNORE_THIS_PROPERTY; - - /** - * Prepare for parsing command line arguments, by validating annotations. - * @param callerOptions This object contains annotations that define the acceptable command-line options, - * and ultimately will receive the settings when a command line is parsed. - */ - public CommandLineParser(final Object callerOptions) { - this.callerOptions = callerOptions; - - for (final Field field : this.callerOptions.getClass().getFields()) { - if (field.getAnnotation(PositionalArguments.class) != null) { - handlePositionalArgumentAnnotation(field); - } - if (field.getAnnotation(Usage.class) != null) { - handleUsageAnnotation(field); - } - if (field.getAnnotation(Option.class) != null) { - handleOptionAnnotation(field); - } - } - - if (usagePreamble == null) { - if (positionalArguments == null) { - usagePreamble = defaultUsagePreamble; - } else { - usagePreamble = defaultUsagePreambleWithPositionalArguments; - } - } - } - - /** - * Print a usage message based on the options object passed to the ctor. - * @param stream Where to write the usage message. - */ - public void usage(final PrintStream stream) { - stream.print(usagePreamble); - if (!optionDefinitions.isEmpty()) { - stream.println("\nOptions:\n"); - for (final OptionDefinition optionDefinition : optionDefinitions) { - printOptionUsage(stream, optionDefinition); - } - } - final Field fileField; - try { - fileField = getClass().getField("IGNORE_THIS_PROPERTY"); - } catch (NoSuchFieldException e) { - throw new PicardException("Should never happen", e); - } - final OptionDefinition optionsFileOptionDefinition = - new OptionDefinition(fileField, OPTIONS_FILE, "", - "File of OPTION_NAME=value pairs. No positional parameters allowed. Unlike command-line options, " + - "unrecognized options are ignored. " + "A single-valued option set in an options file may be overridden " + - "by a subsequent command-line option. " + - "A line starting with '#' is considered a comment.", false, true, 0, Integer.MAX_VALUE, null, new String[0]); - printOptionUsage(stream, optionsFileOptionDefinition); - } - - /** - * Parse command-line options, and store values in callerOptions object passed to ctor. - * @param messageStream Where to write error messages. - * @param args Command line tokens. - * @return true if command line is valid. - */ - public boolean parseOptions(final PrintStream messageStream, final String[] args) { - this.argv = args; - this.messageStream = messageStream; - for (final String arg: args) { - if (arg.equals("-h") || arg.equals("--help")) { - usage(messageStream); - return false; - } - final String[] pair = arg.split("=", 2); - if (pair.length == 2) { - if (pair[0].equals(OPTIONS_FILE)) { - if (!parseOptionsFile(pair[1])) { - messageStream.println(); - usage(messageStream); - return false; - } - } else { - if (!parseOption(pair[0], pair[1], false)) { - messageStream.println(); - usage(messageStream); - return false; - } - } - } else if (!parsePositionalArgument(arg)) { - messageStream.println(); - usage(messageStream); - return false; - } - } - if (!checkNumArguments()) { - messageStream.println(); - usage(messageStream); - return false; - } - return true; - } - - /** - * After command line has been parsed, make sure that all required options have values, and that - * lists with minimum # of elements have sufficient. - * @return true if valid - */ - private boolean checkNumArguments() { - try { - for (final OptionDefinition optionDefinition : optionDefinitions) { - StringBuilder mutextOptionNames = new StringBuilder(); - for (String mutexOption : optionDefinition.mutuallyExclusive) { - OptionDefinition mutextOptionDef = optionMap.get(mutexOption); - if (mutextOptionDef != null && mutextOptionDef.hasBeenSet) { - mutextOptionNames.append(" ").append(mutextOptionDef.name); - } - } - if (optionDefinition.hasBeenSet && mutextOptionNames.length() > 0) { - messageStream.println("ERROR: Option '" + optionDefinition.name + - "' cannot be used in conjunction with option(s)" + - mutextOptionNames.toString()); - return false; - } - if (optionDefinition.isCollection) { - final Collection c = (Collection)optionDefinition.field.get(callerOptions); - if (c.size() < optionDefinition.minElements) { - messageStream.println("ERROR: Option '" + optionDefinition.name + "' must be specified at least " + - optionDefinition.minElements + " times."); - return false; - } - } else if (!optionDefinition.optional && !optionDefinition.hasBeenSet && mutextOptionNames.length() == 0) { - messageStream.print("ERROR: Option '" + optionDefinition.name + "' is required"); - if (optionDefinition.mutuallyExclusive.isEmpty()) { - messageStream.println("."); - } else { - messageStream.println(" unless any of " + optionDefinition.mutuallyExclusive + " are specified."); - } - return false; - } - } - if (positionalArguments != null) { - final Collection c = (Collection)positionalArguments.get(callerOptions); - if (c.size() < minPositionalArguments) { - messageStream.println("ERROR: At least " + minPositionalArguments + - " positional arguments must be specified."); - return false; - } - } - return true; - } catch (IllegalAccessException e) { - // Should never happen because lack of publicness has already been checked. - throw new RuntimeException(e); - } - } - - private boolean parsePositionalArgument(final String stringValue) { - if (positionalArguments == null) { - messageStream.println("ERROR: Invalid argument '" + stringValue + "'."); - return false; - } - final Object value; - try { - value = constructFromString(getUnderlyingType(positionalArguments), stringValue); - } catch (CommandLineParseException e) { - messageStream.println("ERROR: " + e.getMessage()); - return false; - } - final Collection c; - try { - c = (Collection)positionalArguments.get(callerOptions); - } catch (IllegalAccessException e) { - throw new RuntimeException(e); - } - if (c.size() >= maxPositionalArguments) { - messageStream.println("ERROR: No more than " + maxPositionalArguments + - " positional arguments may be specified on the command line."); - return false; - } - c.add(value); - return true; - } - - private boolean parseOption(String key, final String stringValue, final boolean optionsFile) { - key = key.toUpperCase(); - final OptionDefinition optionDefinition = optionMap.get(key); - if (optionDefinition == null) { - if (optionsFile) { - // Silently ignore unrecognized option from options file - return true; - } - messageStream.println("ERROR: Unrecognized option: " + key); - return false; - } - if (!optionDefinition.isCollection) { - if (optionDefinition.hasBeenSet && !optionDefinition.hasBeenSetFromOptionsFile) { - messageStream.println("ERROR: Option '" + key + "' cannot be specified more than once."); - return false; - } - } - final Object value; - try { - value = constructFromString(getUnderlyingType(optionDefinition.field), stringValue); - } catch (CommandLineParseException e) { - messageStream.println("ERROR: " + e.getMessage()); - return false; - } - try { - if (optionDefinition.isCollection) { - final Collection c = (Collection)optionDefinition.field.get(callerOptions); - if (c.size() >= optionDefinition.maxElements) { - messageStream.println("ERROR: Option '" + key + "' cannot be used more than " + - optionDefinition.maxElements + " times."); - return false; - } - c.add(value); - } else { - optionDefinition.field.set(callerOptions, value); - optionDefinition.hasBeenSet = true; - optionDefinition.hasBeenSetFromOptionsFile = optionsFile; - } - } catch (IllegalAccessException e) { - // Should never happen because we only iterate through public fields. - throw new RuntimeException(e); - } - return true; - } - - /** - * Parsing of options from file is looser than normal. Any unrecognized options are - * ignored, and a single-valued option that is set in a file may be overridden by a - * subsequent appearance of that option. - * A line that starts with '#' is ignored. - * @param optionsFile - * @return false if a fatal error occurred - */ - private boolean parseOptionsFile(final String optionsFile) { - try { - final BufferedReader reader = new BufferedReader(new FileReader(optionsFile)); - String line; - while ((line = reader.readLine()) != null) { - if (line.startsWith("#")) { - continue; - } - final String[] pair = line.split("=", 2); - if (pair.length == 2) { - if (!parseOption(pair[0], pair[1], true)) { - messageStream.println(); - usage(messageStream); - return false; - } - } else { - messageStream.println("Strange line in OPTIONS_FILE " + optionsFile + ": " + line); - usage(messageStream); - return false; - } - } - reader.close(); - return true; - - } catch (IOException e) { - throw new PicardException("I/O error loading OPTIONS_FILE=" + optionsFile, e); - } - } - - private void printOptionUsage(final PrintStream stream, final OptionDefinition optionDefinition) { - final String type = getUnderlyingType(optionDefinition.field).getSimpleName(); - String optionLabel = optionDefinition.name + "=" + type; - stream.print(optionLabel); - if (optionDefinition.shortName.length() > 0) { - stream.println(); - } - if (optionDefinition.shortName.length() > 0) { - optionLabel = optionDefinition.shortName + "=" + type; - stream.print(optionLabel); - } - int numSpaces = OPTION_COLUMN_WIDTH - optionLabel.length(); - if (optionLabel.length() > OPTION_COLUMN_WIDTH) { - stream.println(); - numSpaces = OPTION_COLUMN_WIDTH; - } - printSpaces(stream, numSpaces); - final StringBuilder sb = new StringBuilder(); - if (optionDefinition.doc.length() > 0) { - sb.append(optionDefinition.doc); - sb.append(" "); - } - if (optionDefinition.optional && !optionDefinition.isCollection) { - sb.append("Default value: "); - sb.append(optionDefinition.defaultValue); - sb.append(". "); - } else if (!optionDefinition.isCollection){ - sb.append("Required. "); - } - Object[] enumConstants = getUnderlyingType(optionDefinition.field).getEnumConstants(); - if (enumConstants == null && getUnderlyingType(optionDefinition.field) == Boolean.class) { - enumConstants = TRUE_FALSE_VALUES; - } - if (enumConstants != null) { - sb.append("Possible values: {"); - for (int i = 0; i < enumConstants.length; ++i) { - if (i > 0) { - sb.append(", "); - } - sb.append(enumConstants[i].toString()); - } - sb.append("} "); - } - if (optionDefinition.isCollection) { - if (optionDefinition.minElements == 0) { - if (optionDefinition.maxElements == Integer.MAX_VALUE) { - sb.append("This option may be specified 0 or more times."); - } else { - sb.append("This option must be specified no more than " + optionDefinition.maxElements + "times."); - } - } else if (optionDefinition.maxElements == Integer.MAX_VALUE) { - sb.append("This option must be specified at least " + optionDefinition.minElements + " times."); - } else { - sb.append("This option may be specified between " + optionDefinition.minElements + - " and " + optionDefinition.maxElements + " times."); - } - } - if (!optionDefinition.mutuallyExclusive.isEmpty()) { - sb.append(" Cannot be used in conjuction with option(s)"); - for (String option : optionDefinition.mutuallyExclusive) { - OptionDefinition mutextOptionDefinition = optionMap.get(option); - sb.append(" ").append(mutextOptionDefinition.name); - if (mutextOptionDefinition.shortName.length() > 0) { - sb.append(" (").append(mutextOptionDefinition.shortName).append(")"); - } - } - } - final String wrappedDescription = StringUtil.wordWrap(sb.toString(), DESCRIPTION_COLUMN_WIDTH); - final String[] descriptionLines = wrappedDescription.split("\n"); - for (int i = 0; i < descriptionLines.length; ++i) { - if (i > 0) { - printSpaces(stream, OPTION_COLUMN_WIDTH); - } - stream.println(descriptionLines[i]); - } - stream.println(); - } - - private void printSpaces(final PrintStream stream, final int numSpaces) { - final StringBuilder sb = new StringBuilder(); - for (int i = 0; i < numSpaces; ++i) { - sb.append(" "); - } - stream.print(sb); - } - - private void handleOptionAnnotation(final Field field) { - try { - final Option optionAnnotation = field.getAnnotation(Option.class); - final boolean isCollection = isCollectionField(field); - if (isCollection) { - if (optionAnnotation.maxElements() == 0) { - throw new CommandLineParserDefinitionException("@Option member " + field.getName() + - "has maxElements = 0"); - } - if (optionAnnotation.minElements() > optionAnnotation.maxElements()) { - throw new CommandLineParserDefinitionException("In @Option member " + field.getName() + - ", minElements cannot be > maxElements"); - } - } - if (!canBeMadeFromString(getUnderlyingType(field))) { - throw new CommandLineParserDefinitionException("@Option member " + field.getName() + - " must have a String ctor or be an enum"); - } - - final OptionDefinition optionDefinition = new OptionDefinition(field, - field.getName(), - optionAnnotation.shortName(), - optionAnnotation.doc(), optionAnnotation.optional() || (field.get(callerOptions) != null), - isCollection, optionAnnotation.minElements(), - optionAnnotation.maxElements(), field.get(callerOptions), - optionAnnotation.mutex()); - - for (String option : optionAnnotation.mutex()) { - OptionDefinition mutextOptionDef = optionMap.get(option); - if (mutextOptionDef != null) { - mutextOptionDef.mutuallyExclusive.add(field.getName()); - } - } - if (optionMap.containsKey(optionDefinition.name)) { - throw new CommandLineParserDefinitionException(optionDefinition.name + " has already been used"); - } - optionMap.put(optionDefinition.name, optionDefinition); - if (optionDefinition.shortName.length() > 0) { - if (optionMap.containsKey(optionDefinition.shortName)) { - throw new CommandLineParserDefinitionException(optionDefinition.shortName + " has already been used"); - } - optionMap.put(optionDefinition.shortName, optionDefinition); - } - optionDefinitions.add(optionDefinition); - } catch (IllegalAccessException e) { - throw new CommandLineParserDefinitionException(field.getName() + - " must have public visibility to have @Option annotation"); - } - } - - private void handleUsageAnnotation(final Field field) { - if (usagePreamble != null) { - throw new CommandLineParserDefinitionException - ("@Usage cannot be used more than once in an option class."); - } - try { - usagePreamble = (String)field.get(callerOptions); - final Usage usageAnnotation = field.getAnnotation(Usage.class); - if (usageAnnotation.programVersion().length() > 0) { - usagePreamble += "Version: " + usageAnnotation.programVersion() + "\n"; - } - } catch (IllegalAccessException e) { - throw new CommandLineParserDefinitionException("@Usage data member must be public"); - } catch (ClassCastException e) { - throw new CommandLineParserDefinitionException - ("@Usage can only be applied to a String data member."); - } - } - - private void handlePositionalArgumentAnnotation(final Field field) { - if (positionalArguments != null) { - throw new CommandLineParserDefinitionException - ("@PositionalArguments cannot be used more than once in an option class."); - } - positionalArguments = field; - if (!isCollectionField(field)) { - throw new CommandLineParserDefinitionException("@PositionalArguments must be applied to a Collection"); - } - - if (!canBeMadeFromString(getUnderlyingType(field))) { - throw new CommandLineParserDefinitionException("@PositionalParameters member " + field.getName() + - "does not have a String ctor"); - } - - final PositionalArguments positionalArgumentsAnnotation = field.getAnnotation(PositionalArguments.class); - minPositionalArguments = positionalArgumentsAnnotation.minElements(); - maxPositionalArguments = positionalArgumentsAnnotation.maxElements(); - if (minPositionalArguments > maxPositionalArguments) { - throw new CommandLineParserDefinitionException("In @PositionalArguments, minElements cannot be > maxElements"); - } - } - - private boolean isCollectionField(final Field field) { - try { - field.getType().asSubclass(Collection.class); - return true; - } catch (ClassCastException e) { - return false; - } - } - - private Class getUnderlyingType(final Field field) { - if (isCollectionField(field)) { - final ParameterizedType clazz = (ParameterizedType)(field.getGenericType()); - final Type[] genericTypes = clazz.getActualTypeArguments(); - if (genericTypes.length != 1) { - throw new CommandLineParserDefinitionException("Strange collection type for field " + field.getName()); - } - return (Class)genericTypes[0]; - - } else { - return field.getType(); - } - } - - // True if clazz is an enum, or if it has a ctor that takes a single String argument. - private boolean canBeMadeFromString(final Class clazz) { - if (clazz.isEnum()) { - return true; - } - try { - clazz.getConstructor(String.class); - return true; - } catch (NoSuchMethodException e) { - return false; - } - } - - private Object constructFromString(final Class clazz, final String s) { - try { - if (clazz.isEnum()) { - try { - return Enum.valueOf(clazz, s); - } catch (IllegalArgumentException e) { - throw new CommandLineParseException("'" + s + "' is not a valid value for " + - clazz.getSimpleName() + ".", e); - } - } - final Constructor ctor = clazz.getConstructor(String.class); - return ctor.newInstance(s); - } catch (NoSuchMethodException e) { - // Shouldn't happen because we've checked for presence of ctor - throw new CommandLineParseException(e); - } catch (InstantiationException e) { - throw new CommandLineParseException("Abstract class '" + clazz.getSimpleName() + - "'cannot be used for an option value type.", e); - } catch (IllegalAccessException e) { - throw new CommandLineParseException("String constructor for option value type '" + clazz.getSimpleName() + - "' must be public.", e); - } catch (InvocationTargetException e) { - throw new CommandLineParseException("Problem constructing " + clazz.getSimpleName() + " from the string '" + s + "'.", - e.getCause()); - } - } - - public String[] getArgv() { - return argv; - } - - private class OptionDefinition { - final Field field; - final String name; - final String shortName; - final String doc; - final boolean optional; - final boolean isCollection; - final int minElements; - final int maxElements; - final String defaultValue; - boolean hasBeenSet = false; - boolean hasBeenSetFromOptionsFile = false; - Set mutuallyExclusive; - - private OptionDefinition(final Field field, final String name, final String shortName, final String doc, final boolean optional, final boolean collection, - final int minElements, final int maxElements, final Object defaultValue, String[] mutuallyExclusive) { - this.field = field; - this.name = name.toUpperCase(); - this.shortName = shortName.toUpperCase(); - this.doc = doc; - this.optional = optional; - isCollection = collection; - this.minElements = minElements; - this.maxElements = maxElements; - if (defaultValue != null) { - this.defaultValue = defaultValue.toString(); - } else { - this.defaultValue = "null"; - } - this.mutuallyExclusive = new HashSet(Arrays.asList(mutuallyExclusive)); - } - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java deleted file mode 100644 index 088755e2a..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineParserDefinitionException.java +++ /dev/null @@ -1,27 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -public class CommandLineParserDefinitionException extends RuntimeException { - public CommandLineParserDefinitionException() { - } - - public CommandLineParserDefinitionException(String s) { - super(s); - } - - public CommandLineParserDefinitionException(String s, Throwable throwable) { - super(s, throwable); - } - - public CommandLineParserDefinitionException(Throwable throwable) { - super(throwable); - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java deleted file mode 100644 index 10ee7635f..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineProgram.java +++ /dev/null @@ -1,141 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.picard.util.StringUtil; -import edu.mit.broad.picard.metrics.Header; -import edu.mit.broad.picard.metrics.StringHeader; -import edu.mit.broad.picard.metrics.MetricsFile; -import edu.mit.broad.picard.metrics.MetricBase; - -import java.io.File; -import java.util.Date; -import java.util.List; -import java.util.ArrayList; - -/** - * Abstract class to facilitate writing command-line programs. - * - * To use: - * - * 1. Extend this class with a concrete class that has data members annotated with @Option, @PositionalArguments - * and/or @Usage annotations. - * - * 2. If there is any custom command-line validation, override customCommandLineValidation(). When this method is - * called, the command line has been parsed and set into the data members of the concrete class. - * - * 3. Implement a method doWork(). This is called after successful comand-line processing. The value it returns is - * the exit status of the program. It is assumed that the concrete class emits any appropriate error message before - * returning non-zero. doWork() may throw unchecked exceptions, which are caught and reported appropriately. - * - * 4. Implement the following static method in the concrete class: - * - * public static void main(String[] argv) { - System.exit(new MyConcreteClass().instanceMain(argv)); - } - - - */ -public abstract class CommandLineProgram { - - @Option - public File TMP_DIR = new File(System.getProperty("java.io.tmpdir"), System.getProperty("user.name")); - - @Option(doc = "Control verbosity of logging") - public Log.LogLevel VERBOSITY = Log.LogLevel.INFO; - - @Option(doc = "Whether to suppress job-summary info on System.out") - public Boolean QUIET = false; - - private final String standardUsagePreamble = CommandLineParser.getStandardUsagePreamble(getClass()); - - /** - * Initialized in parseArgs. Subclasses may want to access this to do - * their own validation, and then print usage using clp. - */ - protected CommandLineParser clp; - - private final List
defaultHeaders = new ArrayList
(); - - /** - * Do the work after command line has been parsed. - * RuntimeException may be thrown by this method, and are reported appropriately. - * @return program exit status. - */ - protected abstract int doWork(); - - public int instanceMain(final String[] argv) { - // Build the default headers - final Date startDate = new Date(); - final String cmdline = getClass().getName() + " " + StringUtil.join(" ", argv); - this.defaultHeaders.add(new StringHeader(cmdline)); - this.defaultHeaders.add(new StringHeader("Started on: " + startDate)); - - if (!parseArgs(argv)) { - return 1; - } - - Log.setGlobalLogLevel(VERBOSITY); - - if (!TMP_DIR.exists()) { - // Intentially not checking the return value, because it may be that the program does not - // need a tmp_dir. If this fails, the problem will be discovered downstream. - TMP_DIR.mkdir(); - } - System.setProperty("java.io.tmpdir", TMP_DIR.getAbsolutePath()); - if (!QUIET) { - System.out.println("[" + new Date() + "] " + cmdline); - } - final int ret = doWork(); - if (!QUIET) { - System.out.println("[" + new Date() + "] " + getClass().getName() + " done."); - System.out.println("Runtime.totalMemory()=" + Runtime.getRuntime().totalMemory()); - } - return ret; - } - - /** - * Put any custom command-line validation in an override of this method. - * clp is initialized at this point and can be used to print usage and access argv. - * Any options set by command-line parser can be validated. - * @return true if command line is valid. - */ - protected boolean customCommandLineValidation() { - return true; - } - - /** - * - * @return true if command line is valid - */ - protected boolean parseArgs(final String[] argv) { - clp = new CommandLineParser(this); - final boolean ret = clp.parseOptions(System.err, argv); - if (!ret) { - return false; - } - return customCommandLineValidation(); - } - - /** Gets a MetricsFile with default headers already written into it. */ - protected MetricsFile getMetricsFile() { - final MetricsFile file = new MetricsFile(); - for (final Header h : this.defaultHeaders) { - file.addHeader(h); - } - - return file; - } - - public String getStandardUsagePreamble() { - return standardUsagePreamble; - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java b/java/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java deleted file mode 100644 index 0702f3bc7..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/CommandLineUtils.java +++ /dev/null @@ -1,39 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.io.*; -import java.util.regex.Pattern; - -public class CommandLineUtils { - /** Regex for splitting on spaces. */ - public static final Pattern SPACE_SPLITTER = Pattern.compile(" "); - - // Regexes to split things apart on white space - public static final Pattern TAB_SPLITTER = Pattern.compile("\\t"); - - /** Checks that a file exists and is readable, and then returns a buffered reader for it. */ - public static BufferedReader getReader(File file) throws IOException { - return new BufferedReader(new InputStreamReader(getInputStream(file))); - } - - /** Checks that a file exists and is readable, and then returns a input stream for it. */ - public static InputStream getInputStream(File file) throws IOException { - if (!file.exists()) { - throw new RuntimeException("Specified file does not exist: " + file); - } - - if (!file.canRead()) { - throw new RuntimeException("Specified file is not readable: " + file); - } - - return new FileInputStream(file); - } -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/Option.java b/java/lib/edu/mit/broad/picard/cmdline/Option.java deleted file mode 100644 index b7ffebdd9..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/Option.java +++ /dev/null @@ -1,60 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.lang.annotation.Documented; -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -/** - * Used to annotate which fields of a CommandLineProgram are options given at the command line. - * If a command line call looks like "cmd option=foo x=y bar baz" the CommandLineProgram - * would have annotations on fields to handle the values of option and x. All options - * must be in the form name=value on the command line. The java type of the option - * will be inferred from the type of the field or from the generic type of the collection - * if this option is allowed more than once. The type must be an enum or - * have a constructor with a single String parameter. - * - * @author Alec Wysoker - */ -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.FIELD) -@Documented -public @interface Option { - /** The name of the option as it would appear on the command line. */ - String shortName() default ""; - - /** Text that appears for this option in text describing usage of the command line program. */ - String doc() default ""; - - /** - * If set to false, an exception will be thrown if the option is not specified. - * If 2 options are mutually exclusive and both have optional=false it will be - * interpreted as one or the other is required and an exception will only be thrown if - * neither are specified. - */ - boolean optional() default false; - - /** - * Array of option names that cannot be used in conjunction with this one. - * If 2 options are mutually exclusive and both have optional=false it will be - * interpreted as one OR the other is required and an exception will only be thrown if - * neither are specified. - */ - String[] mutex() default {}; - - /** The minimum number of times that this option is required. */ - int minElements() default 0; - - /** The maximum number of times this option is allowed. */ - int maxElements() default Integer.MAX_VALUE; -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java b/java/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java deleted file mode 100644 index f45301439..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/PositionalArguments.java +++ /dev/null @@ -1,38 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.lang.annotation.Documented; -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -/** - * Used to annotate which field of a CommandLineProgram should store parameters given at the - * command line which are not options. Fields with this annotation must be a Collection - * (and probably should be a List if order is important). - * If a command line call looks like "cmd option=foo x=y bar baz" the values "bar" and "baz" - * would be added to the collection with this annotation. The java type of the arguments - * will be inferred from the generic type of the collection. The type must be an enum or - * have a constructor with a single String parameter. - * - * @author Alec Wysoker - */ -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.FIELD) -@Documented -public @interface PositionalArguments { - /** The minimum number of arguments required. */ - int minElements() default 0; - - /** The maximum number of arguments allowed. */ - int maxElements() default Integer.MAX_VALUE; -} diff --git a/java/lib/edu/mit/broad/picard/cmdline/Usage.java b/java/lib/edu/mit/broad/picard/cmdline/Usage.java deleted file mode 100644 index 13aef9467..000000000 --- a/java/lib/edu/mit/broad/picard/cmdline/Usage.java +++ /dev/null @@ -1,26 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.cmdline; - -import java.lang.annotation.Documented; -import java.lang.annotation.ElementType; -import java.lang.annotation.Retention; -import java.lang.annotation.RetentionPolicy; -import java.lang.annotation.Target; - -/** - * Annotates the field that contains text to be displayed in a usage message. - */ -@Retention(RetentionPolicy.RUNTIME) -@Target(ElementType.FIELD) -@Documented -public @interface Usage { - String programVersion() default ""; -} diff --git a/java/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java b/java/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java deleted file mode 100644 index 75fb98b16..000000000 --- a/java/lib/edu/mit/broad/picard/directed/ArachneMapToIntervalList.java +++ /dev/null @@ -1,62 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.util.BasicTextFileParser; -import edu.mit.broad.picard.util.Interval; -import edu.mit.broad.picard.util.FormatUtil; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMSequenceRecord; - -import java.io.File; -import java.util.List; - -/** - * Converts an arachne style map file to the new interval list format. - * - * @author Tim Fennell - */ -public class ArachneMapToIntervalList extends CommandLineProgram { - @Option(shortName="M", doc="The path to an archne style map file") public File MAP; - @Option(shortName="SD", doc="A sequence dictionary in SAM or BAM format") public File SEQUENCE_DICTIONARY; - @Option(shortName="O", doc="The output file to write the interval list to") public File OUTPUT; - @Option(shortName="P", doc="Prefix to use when generating names") public String PREFIX; - - /** Stock main method. */ - public static void main(String[] argv) { - System.exit(new ArachneMapToIntervalList().instanceMain(argv)); - } - - protected int doWork() { - IoUtil.assertFileIsReadable(MAP); - IoUtil.assertFileIsReadable(SEQUENCE_DICTIONARY); - IoUtil.assertFileIsWritable(OUTPUT); - - SAMFileReader sam = new SAMFileReader(SEQUENCE_DICTIONARY); - SAMFileHeader header = sam.getFileHeader(); - List seqs = header.getSequences(); - IntervalList list = new IntervalList(header); - - BasicTextFileParser parser = new BasicTextFileParser(true, 3, MAP); - FormatUtil format = new FormatUtil(); - int i=1; - - while (parser.hasNext()) { - String[] fields = parser.next(); - int seqIndex = format.parseInt(fields[0]); - int start = format.parseInt(fields[1]) + 1; - int end = format.parseInt(fields[2]) + 1; - String seq = seqs.get(seqIndex).getSequenceName(); - - Interval interval = new Interval(seq, start, end, false, PREFIX + "_" + i++); - list.add(interval); - } - - list.sort(); - list.write(OUTPUT); - - return 0; - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java b/java/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java deleted file mode 100644 index d3be86825..000000000 --- a/java/lib/edu/mit/broad/picard/directed/CalculateHsMetrics.java +++ /dev/null @@ -1,51 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.metrics.MetricsFile; -import edu.mit.broad.sam.SAMFileReader; - -import java.io.File; - -/** - * Calculates a set of HS metrics from a sam or bam file. - * - * @author Tim Fennell - */ -public class CalculateHsMetrics extends CommandLineProgram { - @Usage public final String USAGE = - "Calculates a set of Hybrid Selection specific metrics from an aligned SAM" + - "or BAM file."; - @Option(shortName="BI") public File BAIT_INTERVALS; - @Option(shortName="TI") public File TARGET_INTERVALS; - @Option(shortName="I") public File INPUT; - @Option(shortName="M") public File METRICS_FILE; - - /** Stock main method. */ - public static void main(String[] argv) { - System.exit(new CalculateHsMetrics().instanceMain(argv)); - } - - /** - * Asserts that files are readable and writable and then fires off an - * HsMetricsCalculator instance to do the real work. - */ - protected int doWork() { - IoUtil.assertFileIsReadable(BAIT_INTERVALS); - IoUtil.assertFileIsReadable(TARGET_INTERVALS); - IoUtil.assertFileIsReadable(INPUT); - IoUtil.assertFileIsWritable(METRICS_FILE); - - HsMetricsCalculator calculator = new HsMetricsCalculator(BAIT_INTERVALS, TARGET_INTERVALS); - SAMFileReader sam = new SAMFileReader(INPUT); - calculator.analyze(sam.iterator()); - - MetricsFile metrics = getMetricsFile(); - metrics.addMetric(calculator.getMetrics()); - - metrics.write(METRICS_FILE); - return 0; - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/GenomeMask.java b/java/lib/edu/mit/broad/picard/directed/GenomeMask.java deleted file mode 100644 index 27be5df71..000000000 --- a/java/lib/edu/mit/broad/picard/directed/GenomeMask.java +++ /dev/null @@ -1,52 +0,0 @@ -package edu.mit.broad.picard.directed; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.BitSet; -import java.util.SortedMap; -import java.util.TreeMap; - -/** - * Utility class to store coordinates of interest in per-sequence bitmasks. - */ -public class GenomeMask { - - // if memory usage becomes a problem... this could be changed to a SparseBitSet - // http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html - private SortedMap data = new TreeMap(); - - - public GenomeMask() { - } - - public boolean get(int contig, int position) { - BitSet bits = data.get(contig); - return (bits != null) && bits.get(position); - } - - public BitSet get(int contig) { - return data.get(contig); - } - - /** - * Get an existing BitSet for the given contig, or create one if not already present. This is - * useful when initializing a GenomeMask from an external source. - * @param contig which BitSet - * @param numBits if there was not already a BitSet for this contig, one is created and initialized to this size. - * @return the BitSet for the given contig, creating one if necessary - */ - public BitSet getOrCreate(int contig, int numBits) { - BitSet ret = data.get(contig); - if (ret == null) { - ret = new BitSet(numBits); - data.put(contig, ret); - } - return ret; - } - - public int getMaxContig() { - return data.lastKey(); - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java b/java/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java deleted file mode 100644 index ba81a7eb6..000000000 --- a/java/lib/edu/mit/broad/picard/directed/GenomeMaskFactory.java +++ /dev/null @@ -1,47 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.picard.util.Interval; -import edu.mit.broad.picard.io.IoUtil; - -import java.util.List; -import java.util.BitSet; -import java.io.File; - -/** - * Create a GenomeMask from an IntervalList or a file containing an IntervalList - */ -public class GenomeMaskFactory { - - public GenomeMask makeGenomeMaskFromIntervalList(IntervalList intervalList) { - if (intervalList.getHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { - intervalList.sort(); - } - List uniqueIntervals = intervalList.getUniqueIntervals(); - GenomeMask ret = new GenomeMask(); - - SAMFileHeader samHeader = intervalList.getHeader(); - - for (Interval interval : uniqueIntervals) { - // TODO: Maybe figure out more intelligently how big the bitset might be? - BitSet bitSet = ret.getOrCreate(samHeader.getSequenceIndex(interval.getSequence()), interval.getEnd() + 1); - bitSet.set(interval.getStart(), interval.getEnd() + 1); - } - return ret; - } - - public GenomeMask makeGenomeMaskFromIntervalList(File intervalListFile) { - IoUtil.assertFileIsReadable(intervalListFile); - IntervalList intervalList = IntervalList.fromFile(intervalListFile); - return makeGenomeMaskFromIntervalList(intervalList); - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/HsMetrics.java b/java/lib/edu/mit/broad/picard/directed/HsMetrics.java deleted file mode 100644 index 74817f919..000000000 --- a/java/lib/edu/mit/broad/picard/directed/HsMetrics.java +++ /dev/null @@ -1,108 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.metrics.MetricBase; - -/** - * The set of metrics captured that are specific to a hybrid selection analysis. - * - * @author Tim Fennell - */ -public class HsMetrics extends MetricBase { - /** The name of the bait set used in the hybrid selection. */ - public String BAIT_SET; - - /** The number of bases in the reference genome used for alignment. */ - public long GENOME_SIZE; - - /** The number of bases which have one or more baits on top of them. */ - public long BAIT_TERRITORY; - - /** The unique number of target bases in the experiment where target is usually exons etc. */ - public long TARGET_TERRITORY; - - /** Target terrirtoy / bait territory. 1 == perfectly efficient, 0.5 = half of baited bases are not target. */ - public double BAIT_DESIGN_EFFICIENCY; - - /** The total number of reads in the SAM or BAM file examine. */ - public int TOTAL_READS; - - /** The number of reads that pass the vendor's filter. */ - public int PF_READS; - - /** The number of PF reads that are not marked as duplicates. */ - public int PF_UNIQUE_READS; - - /** PF reads / total reads. The percent of reads passing filter. */ - public double PCT_PF_READS; - - /** PF Unique Reads / Total Reads. */ - public double PCT_PF_UQ_READS; - - /** The number of PF reads that are aligned with mapping score > 0 to the reference genome. */ - public int PF_READS_ALIGNED; - - /** PF Reads Aligned / PF Reads. */ - public double PCT_PF_READS_ALIGNED; - - /** The number of bases in the PF aligned reads that are mapped to a reference base. Accounts for clipping and gaps. */ - public int PF_BASES_ALIGNED; - - /** The number of PF aligned bases that mapped to a baited region of the genome. */ - public long ON_BAIT_BASES; - - /** The number of PF aligned bases that mapped to within a fixed interval of a baited region, but not on a baited region. */ - public long NEAR_BAIT_BASES; - - /** The number of PF aligned bases that mapped to neither on or near a bait. */ - public long OFF_BAIT_BASES; - - /** The number of PF aligned bases that mapped to a targetted region of the genome. */ - public long ON_TARGET_BASES; - - /** On+Near Bait Bases / PF Bases Aligned. */ - public double PCT_SELECTED_BASES; - - /** The percentage of aligned PF bases that mapped neither on or near a bait. */ - public double PCT_OFF_BAIT; - - /** The percentage of on+near bait bases that are on as opposed to near. */ - public double ON_BAIT_VS_SELECTED; - - /** The mean coverage of all baits in the experiment. */ - public double MEAN_BAIT_COVERAGE; - - /** The mean coverage of targets that recieved at least coverage depth = 2 at one base. */ - public double MEAN_TARGET_COVERAGE; - - /** The fold by which the baited region has been amplified above genomic background. */ - public double FOLD_ENRICHMENT; - - /** The number of targets that did not reach coverage=2 over any base. */ - public double ZERO_CVG_TARGETS_PCT; - - /** - * The fold over-coverage necessary to raise 80% of bases in "non-zero-cvg" targets to - * the mean coverage level in those targets. - */ - public double FOLD_80_BASE_PENALTY; - - - /** - * Calculates the metrics in this class that can be derived from other metrics in the class. - */ - public void calculateDerivedMetrics() { - BAIT_DESIGN_EFFICIENCY = (double) TARGET_TERRITORY / (double) BAIT_TERRITORY; - - PCT_PF_READS = PF_READS / (double) TOTAL_READS; - PCT_PF_UQ_READS = PF_UNIQUE_READS / (double) TOTAL_READS; - PCT_PF_READS_ALIGNED = PF_READS_ALIGNED / (double) PF_UNIQUE_READS; - - double denominator = (ON_BAIT_BASES + NEAR_BAIT_BASES + OFF_BAIT_BASES); - - PCT_SELECTED_BASES = (ON_BAIT_BASES + NEAR_BAIT_BASES) / denominator; - PCT_OFF_BAIT = OFF_BAIT_BASES / denominator; - ON_BAIT_VS_SELECTED = ON_BAIT_BASES / (double) (ON_BAIT_BASES + NEAR_BAIT_BASES); - MEAN_BAIT_COVERAGE = ON_BAIT_BASES / (double) BAIT_TERRITORY; - FOLD_ENRICHMENT = (ON_BAIT_BASES/ denominator) / ((double) BAIT_TERRITORY / GENOME_SIZE); - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java b/java/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java deleted file mode 100644 index a454642a7..000000000 --- a/java/lib/edu/mit/broad/picard/directed/HsMetricsCalculator.java +++ /dev/null @@ -1,207 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.util.*; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.AlignmentBlock; -import edu.mit.broad.sam.SAMSequenceRecord; - -import java.util.*; -import java.io.*; - -/** - * Calculates HS metrics for a given SAM or BAM file. Requires the input of a list of - * target intervals and a list of bait intervals. Can be invoked either on an entire - * iterator of SAMRecords or be passed SAMRecords one at a time. - * - * @author Tim Fennell - */ -public class HsMetricsCalculator { - // What is considered "near" to the bait - private static final int NEAR_BAIT_DISTANCE = 250; - private static final Log log = Log.getInstance(HsMetricsCalculator.class); - - // Holds file names and other parameter related junk - private SAMFileReader sam; - private File baitFile; - private File targetFile; - private IntervalList baits; - private IntervalList targets; - - // Overlap detector for finding overlaps between reads and the experimental targets - private OverlapDetector targetDetector = new OverlapDetector(0,0); - - // Overlap detector for finding overlaps between the reads and the baits (and the near bait space) - private OverlapDetector baitDetector = new OverlapDetector(-NEAR_BAIT_DISTANCE,0); - - // A Map to accumulate per-bait-region (i.e. merge of overlapping baits) coverage. */ - private Map coverageByTarget = new HashMap(); - - private HsMetrics metrics = new HsMetrics(); - - /** - * Constructor that parses the squashed reference to genome reference file and stores the - * information in a map for later use. - */ - public HsMetricsCalculator(File baits, File targets) { - this.baitFile = baits; - this.targetFile = targets; - this.baits = IntervalList.fromFile(baits); - this.targets = IntervalList.fromFile(targets); - - this.metrics.BAIT_SET = baits.getName(); - int tmp = this.metrics.BAIT_SET.indexOf("."); - if (tmp > 0) { - this.metrics.BAIT_SET = this.metrics.BAIT_SET.substring(0, tmp); - } - - List uniqueBaits = this.baits.getUniqueIntervals(); - this.baitDetector.addAll(uniqueBaits, uniqueBaits); - this.metrics.BAIT_TERRITORY = Interval.countBases(uniqueBaits); - - List uniqueTargets = this.targets.getUniqueIntervals(); - this.targetDetector.addAll(uniqueTargets, uniqueTargets); - this.metrics.TARGET_TERRITORY = Interval.countBases(uniqueTargets); - - for (SAMSequenceRecord seq : this.baits.getHeader().getSequences()) { - this.metrics.GENOME_SIZE += seq.getSequenceLength(); - } - - // Populate the coverage by target map - for (Interval target : this.targets.getIntervals()) { - this.coverageByTarget.put(target, new Coverage(target, 0)); - } - } - - /** Iterates over all records in the file and collects metrics. */ - public void analyze(Iterator records) { - int i = 0; - while (records.hasNext()) { - analyze(records.next()); - - if (++i % 1000000 == 0) { - log.info("Processed " + i + " records so far."); - } - } - } - - /** Adds information about an individual SAMRecord to the statistics. */ - public void analyze(SAMRecord rec) { - // Just plain avoid records that are marked as not-primary - if (rec.getNotPrimaryAlignmentFlag()) return; - - this.metrics.TOTAL_READS += 1; - - // Check for PF reads - if (rec.getReadFailsVendorQualityCheckFlag()) { - return; - } - else { - ++this.metrics.PF_READS; - } - - // Check for reads that are marked as duplicates - if (rec.getDuplicateReadFlag()) { - return; - } - else { - ++this.metrics.PF_UNIQUE_READS; - } - - // Don't bother with reads that didn't align uniquely - if (rec.getReadUnmappedFlag() || rec.getMappingQuality() == 0) { - return; - } - - this.metrics.PF_READS_ALIGNED += 1; - for (AlignmentBlock block : rec.getAlignmentBlocks()) { - this.metrics.PF_BASES_ALIGNED += block.getLength(); - } - - Interval read = new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd()); - - // Find the target overlaps - Collection targets = this.targetDetector.getOverlaps(read); - if (targets != null && !targets.isEmpty()) { - for (Interval target : targets) { - Coverage coverage = this.coverageByTarget.get(target); - - for (AlignmentBlock block : rec.getAlignmentBlocks()) { - int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength()); - for (int pos=block.getReferenceStart(); pos<=end; ++ pos) { - if (pos >= target.getStart() && pos <= target.getEnd()) { - ++this.metrics.ON_TARGET_BASES; - coverage.addBase(pos - target.getStart()); - } - } - } - } - } - - // Now do the bait overlaps - int mappedBases = 0; - for (AlignmentBlock block : rec.getAlignmentBlocks()) mappedBases += block.getLength(); - Collection baits = this.baitDetector.getOverlaps(read); - int onBaitBases = 0; - - if (baits != null && !baits.isEmpty()) { - for (Interval bait : baits) { - for (AlignmentBlock block : rec.getAlignmentBlocks()) { - int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength()); - - for (int pos=block.getReferenceStart(); pos<=end; ++pos) { - if (pos >= bait.getStart() && pos <= bait.getEnd()) ++onBaitBases; - } - } - } - - this.metrics.ON_BAIT_BASES += onBaitBases; - this.metrics.NEAR_BAIT_BASES += (mappedBases - onBaitBases); - } - else { - this.metrics.OFF_BAIT_BASES += mappedBases; - } - - } - - /** Calculates a few last summary metrics and then returns the metrics calculated. */ - public HsMetrics getMetrics() { - this.metrics.calculateDerivedMetrics(); - calculateTargetCoverageMetrics(); - return this.metrics; - } - - /** Calculates how much additional sequencing is needed to raise 80% of bases to the mean for the lane. */ - private void calculateTargetCoverageMetrics() { - short[] depths = new short[(int) this.metrics.TARGET_TERRITORY]; // may not use entire array - int zeroCoverageTargets = 0; - int depthIndex = 0; - double totalCoverage = 0; - int basesConsidered = 0; - - for (Coverage c : this.coverageByTarget.values()) { - if (!c.hasCoverage()) { - ++zeroCoverageTargets; - continue; - } - - final short[] targetDepths = c.getDepths(); - basesConsidered += targetDepths.length; - - for (short depth : targetDepths) { - depths[depthIndex++] = depth; - totalCoverage += depth; - } - } - - this.metrics.MEAN_TARGET_COVERAGE = totalCoverage / basesConsidered; - - // Sort the array (ASCENDING) and then find the base the coverage value that lies at the 80% - // line, which is actually at 20% into the array now - Arrays.sort(depths); - int indexOf80thPercentile = (depths.length - basesConsidered) + (int) (basesConsidered * 0.2); - int coverageAt80thPercentile = depths[indexOf80thPercentile]; - this.metrics.FOLD_80_BASE_PENALTY = this.metrics.MEAN_TARGET_COVERAGE / coverageAt80thPercentile; - this.metrics.ZERO_CVG_TARGETS_PCT = zeroCoverageTargets / (double) this.targets.getIntervals().size(); - } -} diff --git a/java/lib/edu/mit/broad/picard/directed/IntervalList.java b/java/lib/edu/mit/broad/picard/directed/IntervalList.java deleted file mode 100644 index 087537c0a..000000000 --- a/java/lib/edu/mit/broad/picard/directed/IntervalList.java +++ /dev/null @@ -1,240 +0,0 @@ -package edu.mit.broad.picard.directed; - -import edu.mit.broad.picard.util.Interval; -import edu.mit.broad.picard.util.FormatUtil; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMTextHeaderCodec; -import edu.mit.broad.sam.util.StringLineReader; - -import java.util.*; -import java.io.*; - -/** - * Represents a list of intervals against a reference sequence that can be written to - * and read from a file. The file format is relatively simple and reflects the SAM - * alignment format to a degree. - * - * A SAM style header must be present in the file which lists the sequence records - * against which the intervals are described. After the header the file then contains - * records one per line in text format with the following values tab-separated: - * - Sequence name - * - Start position (1-based) - * - End position (1-based, end inclusive) - * - Strand (either + or -) - * - Interval name (an, ideally unique, name for the interval) - * - * @author Tim Fennell - */ -public class IntervalList implements Iterable { - private SAMFileHeader header; - private List intervals = new ArrayList(); - - /** Constructs a new interval list using the supplied header information. */ - public IntervalList(SAMFileHeader header) { - if (header == null) { - throw new IllegalArgumentException("SAMFileHeader must be supplied."); - } - this.header = header; - } - - /** Gets the header (if there is one) for the interval list. */ - public SAMFileHeader getHeader() { return header; } - - /** Returns an iterator over the intervals. */ - public Iterator iterator() { return this.intervals.iterator(); } - - /** Adds an interval to the list of intervals. */ - public void add(Interval interval) { this.intervals.add(interval); } - - /** Sorts the internal collection of intervals by coordinate. */ - public void sort() { - Collections.sort(this.intervals, new IntervalCoordinateComparator(this.header)); - this.header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - } - - /** Gets the set of intervals as held internally. */ - public List getIntervals() { - return Collections.unmodifiableList(this.intervals); - } - - /** - * Merges the list of intervals and then reduces them down where regions overlap - * or are directly adjacent to one another. During this process the "merged" interval - * will retain the strand and name of the 5' most interval merged. - * - * @return the set of unique intervals condensed from the contained intervals - */ - public List getUniqueIntervals() { - List unique = new ArrayList(); - ListIterator iterator = this.intervals.listIterator(); - Interval previous = iterator.next(); - - while (iterator.hasNext()) { - Interval next = iterator.next(); - if (previous.intersects(next) || previous.abuts(next)) { - previous = new Interval(previous.getSequence(), - previous.getStart(), - Math.max(previous.getEnd(), next.getEnd()), - previous.isNegativeStrand(), - previous.getName()); - } - else { - unique.add(previous); - previous = next; - } - } - - if (previous != null) unique.add(previous); - - return unique; - } - - /** Gets the (potentially redundant) sum of the length of the intervals in the list. */ - public long getBaseCount() { - return Interval.countBases(this.intervals); - } - - /** Gets the count of unique bases represented by the intervals in the list. */ - public long getUniqueBaseCount() { - return Interval.countBases(getUniqueIntervals()); - } - - /** - * Parses an interval list from a file. - * @param file the file containing the intervals - * @return an IntervalList object that contains the headers and intervals from the file - */ - public static IntervalList fromFile(File file) { - BufferedReader in = new BufferedReader(new InputStreamReader(IoUtil.openFileForReading(file))); - - try { - // Setup a reader and parse the header - StringBuilder builder = new StringBuilder(4096); - String line = null; - - while ((line = in.readLine()) != null) { - if (line.startsWith("@")) { - builder.append(line).append('\n'); - } - else { - break; - } - } - - if (builder.length() == 0) { - throw new IllegalStateException("Interval list file must contain header: " + file.getAbsolutePath()); - } - - StringLineReader headerReader = new StringLineReader(builder.toString()); - SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); - IntervalList list = new IntervalList(codec.decode(headerReader, file)); - - // Then read in the intervals - FormatUtil format = new FormatUtil(); - do { - if (line.trim().length() == 0) continue; // skip over blank lines - - // Make sure we have the right number of fields - String fields[] = line.split("\t"); - if (fields.length != 5) { - throw new PicardException("Invalid interval record contains " + - fields.length + " fields: " + line); - } - - // Then parse them out - String seq = fields[0]; - int start = format.parseInt(fields[1]); - int end = format.parseInt(fields[2]); - - boolean negative; - if (fields[3].equals("-")) negative = true; - else if (fields[3].equals("+")) negative = false; - else throw new IllegalArgumentException("Invalid strand field: " + fields[3]); - - String name = fields[4]; - - Interval interval = new Interval(seq, start, end, negative, name); - list.intervals.add(interval); - } - while ((line = in.readLine()) != null); - - return list; - } - catch (IOException ioe) { - throw new PicardException("Error parsing interval list file: " + file.getAbsolutePath(), ioe); - } - finally { - try { in.close(); } catch (Exception e) { /* do nothing */ } - } - } - - /** - * Writes out the list of intervals to the supplied file. - * @param file a file to write to. If exists it will be overwritten. - */ - public void write(File file) { - try { - BufferedWriter out = new BufferedWriter(new OutputStreamWriter(IoUtil.openFileForWriting(file))); - FormatUtil format = new FormatUtil(); - - // Write out the header - if (this.header != null) { - SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); - codec.encode(out, this.header); - } - - // Write out the intervals - for (Interval interval : this) { - out.write(interval.getSequence()); - out.write('\t'); - out.write(format.format(interval.getStart())); - out.write('\t'); - out.write(format.format(interval.getEnd())); - out.write('\t'); - out.write(interval.isPositiveStrand() ? '+' : '-'); - out.write('\t'); - out.write(interval.getName()); - out.newLine(); - } - - out.flush(); - out.close(); - } - catch (IOException ioe) { - throw new PicardException("Error writing out interval list to file: " + file.getAbsolutePath(), ioe); - } - } -} - -/** - * Comparator that orders intervals based on their sequence index, by coordinate - * then by strand and finally by name. - */ -class IntervalCoordinateComparator implements Comparator { - private SAMFileHeader header; - - /** Constructs a comparator using the supplied sequence header. */ - IntervalCoordinateComparator(SAMFileHeader header) { - this.header = header; - } - - public int compare(Interval lhs, Interval rhs) { - int lhsIndex = this.header.getSequenceIndex(lhs.getSequence()); - int rhsIndex = this.header.getSequenceIndex(rhs.getSequence()); - int retval = lhsIndex - rhsIndex; - - if (retval == 0) retval = lhs.getStart() - rhs.getStart(); - if (retval == 0) retval = lhs.getEnd() - rhs.getEnd(); - if (retval == 0) { - if (lhs.isPositiveStrand() && rhs.isNegativeStrand()) retval = -1; - else if (lhs.isNegativeStrand() && rhs.isPositiveStrand()) retval = 1; - } - if (retval == 0) { - retval = lhs.getName().compareTo(rhs.getName()); - } - - return retval; - } -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/picard/filter/AggregateFilter.java b/java/lib/edu/mit/broad/picard/filter/AggregateFilter.java deleted file mode 100644 index 3ee558c99..000000000 --- a/java/lib/edu/mit/broad/picard/filter/AggregateFilter.java +++ /dev/null @@ -1,46 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; - -import java.util.List; - -/** - * Aggregates multiple filters and provides a method for applying them all to a given record with - * one method call. - */ -public class AggregateFilter implements SamRecordFilter { - - private final List filters; - - /** - * Constructor - * @param filters the list of filters that this Aggregator applies - */ - public AggregateFilter(List filters) { - this.filters = filters; - } - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches at least one filter, otherwise false - */ - public boolean filterOut(SAMRecord record) { - for (SamRecordFilter filter : filters) { - if (filter.filterOut(record)) { - return true; - } - } - return false; - } -} diff --git a/java/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java b/java/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java deleted file mode 100644 index 3e0c9bb3f..000000000 --- a/java/lib/edu/mit/broad/picard/filter/FailsVendorReadQualityFilter.java +++ /dev/null @@ -1,28 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; - -/** - * Filter for filtering out reads that do not pass the quality filter - */ -public class FailsVendorReadQualityFilter implements SamRecordFilter { - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches the filter, otherwise false - */ - public boolean filterOut(SAMRecord record) { - return record.getReadFailsVendorQualityCheckFlag(); - } -} diff --git a/java/lib/edu/mit/broad/picard/filter/FilteringIterator.java b/java/lib/edu/mit/broad/picard/filter/FilteringIterator.java deleted file mode 100644 index ddb85b9d7..000000000 --- a/java/lib/edu/mit/broad/picard/filter/FilteringIterator.java +++ /dev/null @@ -1,94 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.picard.util.CloserUtil; - -import java.util.Iterator; -import java.util.NoSuchElementException; - -/** - * Filtering Iterator which takes a filter and an iterator and iterates - * through only those records which are not rejected by the filter. - * - * @author Kathleen Tibbetts - */ -public class FilteringIterator implements CloseableIterator { - - private final Iterator iterator; - private final SamRecordFilter filter; - private SAMRecord next = null; - - /** - * Constructor - * - * @param iterator the backing iterator - * @param filter the filter (which may be a FilterAggregator) - */ - public FilteringIterator(Iterator iterator, SamRecordFilter filter) { - this.iterator = iterator; - this.filter = filter; - next = getNextRecord(); - } - - /** - * Returns true if the iteration has more elements. - * - * @return true if the iteration has more elements. Otherwise returns false. - */ - public boolean hasNext() { - return next != null; - } - - /** - * Returns the next element in the iteration. - * - * @return the next element in the iteration - * @throws java.util.NoSuchElementException - */ - public SAMRecord next() { - if (next == null) { - throw new NoSuchElementException("Iterator has no more elements."); - } - SAMRecord result = next; - next = getNextRecord(); - return result; - } - - /** - * Required method for Iterator API. - * - * @throws UnsupportedOperationException - */ - public void remove() { - throw new UnsupportedOperationException("Remove() not supported by FilteringIterator"); - } - - public void close() { - CloserUtil.close(iterator); - } - - /** - * Gets the next record from the underlying iterator that passes the filter - * - * @return SAMRecord the next filter-passing record - */ - private SAMRecord getNextRecord() { - while (iterator.hasNext()) { - SAMRecord record = iterator.next(); - if (!filter.filterOut(record)) { - return record; - } - } - return null; - } -} diff --git a/java/lib/edu/mit/broad/picard/filter/SamRecordFilter.java b/java/lib/edu/mit/broad/picard/filter/SamRecordFilter.java deleted file mode 100644 index d8936ca8a..000000000 --- a/java/lib/edu/mit/broad/picard/filter/SamRecordFilter.java +++ /dev/null @@ -1,26 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; - -/** - * API for filtering SAMRecords - */ -public interface SamRecordFilter { - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches the filter, otherwise false - */ - public boolean filterOut(SAMRecord record); -} diff --git a/java/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java b/java/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java deleted file mode 100644 index 9969ae2e3..000000000 --- a/java/lib/edu/mit/broad/picard/filter/SolexaNoiseFilter.java +++ /dev/null @@ -1,37 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.picard.util.SequenceUtil; -import edu.mit.broad.sam.SAMRecord; - -/** - * Filter to determine whether a read is "noisy" due to a poly-A run that is a sequencing artifact. - * Currently we filter out only reads that are composed entirely of As. - */ -public class SolexaNoiseFilter implements SamRecordFilter { - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches the filter, otherwise false - */ - public boolean filterOut(SAMRecord record) { - byte sequence[] = record.getReadBases(); - for (byte base : sequence) { - if (base != 'A' && base != 'a' && - !SequenceUtil.isNoCall(base)) { - return false; - } - } - return true; - } -} diff --git a/java/lib/edu/mit/broad/picard/filter/TagFilter.java b/java/lib/edu/mit/broad/picard/filter/TagFilter.java deleted file mode 100644 index f35957ba0..000000000 --- a/java/lib/edu/mit/broad/picard/filter/TagFilter.java +++ /dev/null @@ -1,56 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.filter; - -import edu.mit.broad.sam.SAMRecord; - -import java.util.List; -import java.util.Arrays; - -/** - * Filter class for matching tag attributes in SAMRecords - */ -public class TagFilter implements SamRecordFilter { - - private final String tag; // The key of the tag to match - private final List values; // The list of matching values - - /** - * Constructor for a single value - * - * @param tag the key of the tag to match - * @param value the value to match - */ - public TagFilter(String tag, Object value) { - this.tag = tag; - this.values = Arrays.asList(value); - } - - /** - * Constructor for multiple values - * - * @param tag the key of the tag to match - * @param values the matching values - */ - public TagFilter(String tag, List values) { - this.tag = tag; - this.values = values; - } - - /** - * Determines whether a SAMRecord matches this filter - * - * @param record the SAMRecord to evaluate - * @return true if the SAMRecord matches the filter, otherwise false - */ - public boolean filterOut(SAMRecord record) { - return values.contains(record.getAttribute(tag)); - } - } diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliException.java b/java/lib/edu/mit/broad/picard/genotype/GeliException.java deleted file mode 100644 index 5d6fed76c..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliException.java +++ /dev/null @@ -1,30 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.genotype; - -import edu.mit.broad.picard.PicardException; - -/** - * Generic exception thrown by GELI format machinery. - * - * @author Doug Voet - */ -public class GeliException extends PicardException { - - public GeliException(String message, Throwable throwable) { - super(message, throwable); - } - - public GeliException(String message) { - super(message); - } - -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java deleted file mode 100644 index 6f1496251..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliFileConstants.java +++ /dev/null @@ -1,20 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.genotype; - -/** - * Misc constants for GELI format - * - * @author Doug Voet - */ -public interface GeliFileConstants { - public static final byte[] GELI_MAGIC = "GELI".getBytes(); -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliFileReader.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileReader.java deleted file mode 100644 index de72b1639..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliFileReader.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.picard.genotype; - - -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; - -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.util.BlockCompressedInputStream; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.RuntimeIOException; - - -/** - * Class for reading GELI (GEnotype LIkelihood) files. - * - * @author Doug Voet - */ -public class GeliFileReader implements Iterable -{ - private ReaderImplementation mReader = null; - - /** - * Internal interface for SAM/BAM file reader implementations. - * Implemented as an abstract class to enforce better access control. - */ - static abstract class ReaderImplementation { - abstract SAMFileHeader getFileHeader(); - abstract CloseableIterator getIterator(); - abstract void close(); - } - - - public GeliFileReader(final InputStream stream) { - try { - final BufferedInputStream bufferedStream = toBufferedStream(stream); - if (isValidGELIFile(bufferedStream)) { - mReader = new GeliFileReaderImplementation(bufferedStream); - } else { - throw new GeliException("Unrecognized file format"); - } - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - public GeliFileReader(final File file) { - try { - final BufferedInputStream bufferedStream = - new BufferedInputStream(new FileInputStream(file)); - if (isValidGELIFile(bufferedStream)) { - bufferedStream.close(); - final GeliFileReaderImplementation reader = new GeliFileReaderImplementation(file); - mReader = reader; - } else { - bufferedStream.close(); - throw new GeliException("Unrecognized file format"); - } - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - public void close() { - if (mReader != null) { - mReader.close(); - } - mReader = null; - } - - public SAMFileHeader getFileHeader() { - return mReader.getFileHeader(); - } - - public CloseableIterator iterator() { - return mReader.getIterator(); - } - - private boolean isValidGELIFile(final InputStream stream) - throws IOException { - return BlockCompressedInputStream.isValidFile(stream); - } - - private BufferedInputStream toBufferedStream(final InputStream stream) { - if (stream instanceof BufferedInputStream) { - return (BufferedInputStream) stream; - } else { - return new BufferedInputStream(stream); - } - } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java deleted file mode 100644 index 7f544532e..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliFileReaderImplementation.java +++ /dev/null @@ -1,189 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.picard.genotype; - - -import java.io.DataInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.LineNumberReader; -import java.io.StringReader; -import java.util.Arrays; - -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMSequenceRecord; -import edu.mit.broad.sam.SAMTextHeaderCodec; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.BlockCompressedInputStream; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.StringLineReader; - -/** - * Internal class for reading GELI files. - */ -class GeliFileReaderImplementation extends GeliFileReader.ReaderImplementation { - - private boolean mIsSeekable = false; - private BinaryCodec mStream = null; - private final BlockCompressedInputStream mCompressedInputStream; - private SAMFileHeader mFileHeader = null; - private long mFirstRecordPointer = 0; - private CloseableIterator mCurrentIterator = null; - - - GeliFileReaderImplementation(final InputStream stream) - throws IOException { - mIsSeekable = false; - mCompressedInputStream = new BlockCompressedInputStream(stream); - mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); - readHeader(null); - } - - GeliFileReaderImplementation(final File file) - throws IOException { - mIsSeekable = true; - mCompressedInputStream = new BlockCompressedInputStream(file); - mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); - readHeader(file); - mFirstRecordPointer = mCompressedInputStream.getFilePointer(); - } - - void close() { - if (mStream != null) { - mStream.close(); - } - mStream = null; - mFileHeader = null; - } - - SAMFileHeader getFileHeader() { - return mFileHeader; - } - - CloseableIterator getIterator() { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (mIsSeekable) { - try { - mCompressedInputStream.seek(mFirstRecordPointer); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - mCurrentIterator = new GELIFileIterator(); - return mCurrentIterator; - } - - private void readHeader(final File file) - throws IOException { - - final byte[] buffer = new byte[4]; - mStream.readBytes(buffer); - if (!Arrays.equals(buffer, GeliFileConstants.GELI_MAGIC)) { - throw new IOException("Invalid GELI file header"); - } - - final int headerTextLength = mStream.readInt(); - final String textHeader = mStream.readString(headerTextLength); - mFileHeader = new SAMTextHeaderCodec().decode(new StringLineReader(textHeader), - file); - - final int sequenceCount = mStream.readInt(); - if (sequenceCount != mFileHeader.getSequences().size()) { - throw new GeliException("Number of sequences in text header (" + mFileHeader.getSequences().size() + - ") != number of sequences in binary header (" + sequenceCount + ") for file " + file); - } - for (int i = 0; i < sequenceCount; i++) { - readSequenceRecord(file); -// final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); -// if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { -// throw new GELIException("For sequence " + i + ", text and binary have different names in file " + -// file); -// } -// if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { -// throw new GELIException("For sequence " + i + ", text and binary have different lengths in file " + -// file); -// } - } - } - - private SAMSequenceRecord readSequenceRecord(final File file) { - final int nameLength = mStream.readInt(); - if (nameLength <= 1) { - throw new GeliException("Invalid BAM file header: missing sequence name in file " + file); - } - final String sequenceName = mStream.readString(nameLength - 1); - // Skip the null terminator - mStream.readByte(); - final int sequenceLength = mStream.readInt(); - final SAMSequenceRecord record = new SAMSequenceRecord(sequenceName); - record.setSequenceLength(sequenceLength); - return record; - } - - private class GELIFileIterator - implements CloseableIterator { - - private GenotypeLikelihoods mNextRecord = null; - private final GenotypeLikelihoodsCodec likelihoodsCodec = new GenotypeLikelihoodsCodec(); - - - GELIFileIterator() { - this(true); - } - - GELIFileIterator(final boolean advance) { - likelihoodsCodec.setInputStream(mStream.getInputStream()); - if (advance) { - advance(); - } - } - - public void close() { - if (this != mCurrentIterator) { - throw new IllegalStateException("Attempt to close non-current iterator"); - } - mCurrentIterator = null; - } - - public boolean hasNext() { - return (mNextRecord != null); - } - - public GenotypeLikelihoods next() { - final GenotypeLikelihoods result = mNextRecord; - advance(); - return result; - } - - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - void advance() { - try { - mNextRecord = getNextRecord(); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - GenotypeLikelihoods getNextRecord() - throws IOException { - return likelihoodsCodec.decode(); - } - } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java b/java/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java deleted file mode 100644 index 84196b239..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GeliFileWriter.java +++ /dev/null @@ -1,168 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.genotype; - -import java.io.DataOutputStream; -import java.io.File; -import java.io.StringWriter; - -import edu.mit.broad.picard.genotype.GenotypeLikelihoods.GenotypeLikelihoodsComparator; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMSequenceRecord; -import edu.mit.broad.sam.SAMTextHeaderCodec; -import edu.mit.broad.sam.SAMFileHeader.SortOrder; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.BlockCompressedOutputStream; -import edu.mit.broad.sam.util.SortingCollection; - -/** - * Class for writing GELI (GEnotype LIkelihood) files. - */ -public class GeliFileWriter { - private static final int MAX_RECORDS_IN_RAM = 1000000; - private SAMFileHeader.SortOrder sortOrder = SortOrder.coordinate; - private SAMFileHeader header; - private SortingCollection likelihoodsSorter; - - // These two fields are for validating presorted records. - private GenotypeLikelihoods prevLikelihoods; - private GenotypeLikelihoodsComparator presortedComparator; - - // If true, records passed to addAlignment are already in the order specified by sortOrder - private boolean presorted; - protected final BinaryCodec outputBinaryCodec; - private GenotypeLikelihoodsCodec genotypeLikelihoodsCodec = null; - - public GeliFileWriter(final File path) { - this(path, false); - } - - public GeliFileWriter(final File path, boolean presorted) { - outputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(path))); - outputBinaryCodec.setOutputFileName(path.toString()); - this.presorted = presorted; - } - - /** - * Must be called before addAlignment. - * @param header - */ - public void setHeader(final SAMFileHeader header) - { - this.header = header; - header.setSortOrder(sortOrder); - final StringWriter headerTextBuffer = new StringWriter(); - new SAMTextHeaderCodec().encode(headerTextBuffer, header); - final String headerText = headerTextBuffer.toString(); - - writeHeader(headerText); - - if (presorted) { - presortedComparator = makeComparator(); - } else if (!sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { - likelihoodsSorter = SortingCollection.newInstance(GenotypeLikelihoods.class, - new GenotypeLikelihoodsCodec(), makeComparator(), MAX_RECORDS_IN_RAM); - } - } - - protected SAMFileHeader getHeader() { - return header; - } - - private GenotypeLikelihoodsComparator makeComparator() { - return new GenotypeLikelihoodsComparator(); - } - - public void addGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods) - { - if (presorted) { - assertPresorted(genotypeLikelihoods); - writeGenotypeLikelihoods(genotypeLikelihoods); - } else { - likelihoodsSorter.add(genotypeLikelihoods); - } - } - - private void assertPresorted(final GenotypeLikelihoods genotypeLikelihoods) { - if (prevLikelihoods != null) { - if (presortedComparator.compare(prevLikelihoods, genotypeLikelihoods) > 0) { - throw new IllegalArgumentException("GenotypeLikelihoods added out of order in GELIFileWriterImpl.addGenotypeLikelihoods for " + - getFilename() + ". Sort order is " + this.sortOrder + ". Offending records are at [" - + prevLikelihoods.getReferenceIndex() + ":" + prevLikelihoods.getPosition() + "] and [" - + genotypeLikelihoods.getReferenceIndex() + ":" + genotypeLikelihoods.getPosition() + "]"); - } - } - prevLikelihoods = genotypeLikelihoods; - } - - public final void close() - { - if (likelihoodsSorter != null) { - for (final GenotypeLikelihoods genotypeLikelihoods : likelihoodsSorter) { - writeGenotypeLikelihoods(genotypeLikelihoods); - } - likelihoodsSorter.cleanup(); - } - finish(); - } - - private void prepareToWriteAlignments() { - if (genotypeLikelihoodsCodec == null) { - genotypeLikelihoodsCodec = new GenotypeLikelihoodsCodec(); - genotypeLikelihoodsCodec.setOutputStream(outputBinaryCodec.getOutputStream()); - } - } - - /** - * Writes the record to disk. Sort order has been taken care of by the time - * this method is called. - * @param alignment - */ - protected void writeGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods) { - prepareToWriteAlignments(); - genotypeLikelihoodsCodec.encode(genotypeLikelihoods); - } - - /** - * Write the header to disk. Header object is available via getHeader(). - * @param textHeader for convenience if the implementation needs it. - */ - protected void writeHeader(final String textHeader) { - outputBinaryCodec.writeBytes(GeliFileConstants.GELI_MAGIC); - - // calculate and write the length of the SAM file header text and the header text - outputBinaryCodec.writeInt(textHeader.length()); - outputBinaryCodec.writeBytes(textHeader.getBytes()); - - // write the sequences binarily. This is redundant with the text header - outputBinaryCodec.writeInt(getHeader().getSequences().size()); - for (final SAMSequenceRecord sequenceRecord: getHeader().getSequences()) { - outputBinaryCodec.writeInt(sequenceRecord.getSequenceName().length() + 1); - outputBinaryCodec.writeBytes(sequenceRecord.getSequenceName().getBytes()); - outputBinaryCodec.writeByte(0); - outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength()); - } - } - - /** - * Do any required flushing here. - */ - protected void finish() { - outputBinaryCodec.close(); - } - - /** - * For producing error messages. - * @return Output filename, or null if there isn't one. - */ - protected String getFilename() { - return outputBinaryCodec.getOutputFileName(); - } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java b/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java deleted file mode 100644 index d19a637c4..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoods.java +++ /dev/null @@ -1,164 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.genotype; - -import java.util.Arrays; -import java.util.Comparator; - -/** - * Data object for Genotype Likelihoods. One object represents one row in a GELI file. - * - * @author Doug Voet - */ -public class GenotypeLikelihoods { - /** this is a guess at how much memory an instance of this object occupies */ - public static final int OBJECT_SIZE_BYTES = 150; - - public static final int AA_GENOTYPE = 0; - public static final int AC_GENOTYPE = 1; - public static final int AG_GENOTYPE = 2; - public static final int AT_GENOTYPE = 3; - public static final int CC_GENOTYPE = 4; - public static final int CG_GENOTYPE = 5; - public static final int CT_GENOTYPE = 6; - public static final int GG_GENOTYPE = 7; - public static final int GT_GENOTYPE = 8; - public static final int TT_GENOTYPE = 9; - - private static final char[][] GENOTYPES = { - "AA".toCharArray(), - "AC".toCharArray(), - "AG".toCharArray(), - "AT".toCharArray(), - "CC".toCharArray(), - "CG".toCharArray(), - "CT".toCharArray(), - "GG".toCharArray(), - "GT".toCharArray(), - "TT".toCharArray() - }; - - /** compares first by reference index then by position */ - public static class GenotypeLikelihoodsComparator implements Comparator { - @Override - public int compare(GenotypeLikelihoods thing1, GenotypeLikelihoods thing2) { - long refCompare = thing1.referenceIndex - thing2.referenceIndex; - if (refCompare == 0) { - long posCompare = thing1.position - thing2.position; - return (int) posCompare; - } else { - return (int) refCompare; - } - } - } - - - private long referenceIndex; - private long position; - private byte referenceBase; - private int numReads; - private short maxMappingQuality; - private float[] likelihoods = new float[10]; - private byte bestLikelihoodIndex = -1; // stored as byte to reduce memory footprint - private byte secondBestLikelihoodIndex = -1; // stored as byte to reduce memory footprint - - public static int getLikelihoodIndex(char[] genotype) { - char first = Character.isLowerCase(genotype[0]) ? Character.toUpperCase(genotype[0]) : genotype[0]; - char second = Character.isLowerCase(genotype[1]) ? Character.toUpperCase(genotype[1]) : genotype[1]; - if (first > second) { - char temp = first; - first = second; - second = temp; - } - for (int i=0; i>> 32)); - result = prime * result + referenceBase; - result = prime * result + (int) (referenceIndex ^ (referenceIndex >>> 32)); - return result; - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - GenotypeLikelihoods other = (GenotypeLikelihoods) obj; - if (!Arrays.equals(likelihoods, other.likelihoods)) - return false; - if (maxMappingQuality != other.maxMappingQuality) - return false; - if (numReads != other.numReads) - return false; - if (position != other.position) - return false; - if (referenceBase != other.referenceBase) - return false; - if (referenceIndex != other.referenceIndex) - return false; - return true; - } - - public long getReferenceIndex() { return referenceIndex; } - public void setReferenceIndex(long sequenceIndex) { this.referenceIndex = sequenceIndex; } - public long getPosition() { return position; } - public void setPosition(long position) { this.position = position; } - public byte getReferenceBase() { return referenceBase; } - public void setReferenceBase(byte referenceBase) { this.referenceBase = referenceBase; } - public int getNumReads() { return numReads; } - public void setNumReads(int numReads) { this.numReads = numReads; } - public short getMaxMappingQuality() { return maxMappingQuality; } - public void setMaxMappingQuality(short maxMappingQuality) { this.maxMappingQuality = maxMappingQuality; } - float[] getLikelihoods() { return likelihoods; } - public int getBestLikelihoodIndex() { return bestLikelihoodIndex; } - public void setBestLikelihoodIndex(int bestLikelihoodIndex) { this.bestLikelihoodIndex = (byte) bestLikelihoodIndex; } - public int getSecondBestLikelihoodIndex() { return secondBestLikelihoodIndex; } - public void setSecondBestLikelihoodIndex(int secondBestLikelihoodIndex) { this.secondBestLikelihoodIndex = (byte) secondBestLikelihoodIndex; } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java b/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java deleted file mode 100644 index aa0679941..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/GenotypeLikelihoodsCodec.java +++ /dev/null @@ -1,126 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.genotype; - -import java.io.InputStream; -import java.io.OutputStream; - -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.RuntimeEOFException; -import edu.mit.broad.sam.util.SortingCollection; - -public class GenotypeLikelihoodsCodec implements SortingCollection.Codec { - private static final int SIG_FIG_MULTIPLIER = 100; - private static final short BLOCK_SIZE = 12 + 10 * 4; - - private OutputStream os; - private InputStream is; - private BinaryCodec binaryCodec; - - /** Returns a new genotype likelihood codec. */ - public SortingCollection.Codec clone() { - return new GenotypeLikelihoodsCodec(); - } - - /** - * Write object to OutputStream. - * - * @param genotypeLikelihoods what to write - */ - public void encode(final GenotypeLikelihoods genotypeLikelihoods) { - this.binaryCodec.writeShort(BLOCK_SIZE); - this.binaryCodec.writeUInt(genotypeLikelihoods.getReferenceIndex()); - this.binaryCodec.writeUInt(genotypeLikelihoods.getPosition()); - this.binaryCodec.writeByte(genotypeLikelihoods.getReferenceBase()); - this.binaryCodec.writeUShort(genotypeLikelihoods.getNumReads()); - this.binaryCodec.writeByte(genotypeLikelihoods.getMaxMappingQuality()); - - for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) { - writeLikelihood(genotypeLikelihoods.getLikelihoods()[i]); - } - } - - /** - * Read the next record from the input stream and convert into a java object. - * - * @return null if no more records. Should throw exception if EOF is encountered in the middle of - * a record. - */ - public GenotypeLikelihoods decode() { - int recordLength = 0; - try { - recordLength = this.binaryCodec.readShort(); - } catch (RuntimeEOFException e) { - return null; - } - if (recordLength != BLOCK_SIZE) { - throw new GeliException("Invalid record length: " + recordLength); - } - - final GenotypeLikelihoods genotypeLikelihoods = new GenotypeLikelihoods(); - genotypeLikelihoods.setReferenceIndex(this.binaryCodec.readUInt()); - genotypeLikelihoods.setPosition(this.binaryCodec.readUInt()); - genotypeLikelihoods.setReferenceBase(this.binaryCodec.readByte()); - genotypeLikelihoods.setNumReads(this.binaryCodec.readUShort()); - genotypeLikelihoods.setMaxMappingQuality(this.binaryCodec.readByte()); - - int bestIndex = -1; - int secondBestIndex = -1; - for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) { - float likelihood = readLikelihood(); - genotypeLikelihoods.getLikelihoods()[i] = likelihood; - - if (bestIndex == -1 || genotypeLikelihoods.getLikelihood(bestIndex) < likelihood) { - secondBestIndex = bestIndex; - bestIndex = i; - } else if (secondBestIndex == -1 || genotypeLikelihoods.getLikelihood(secondBestIndex) < likelihood) { - secondBestIndex = i; - } - } - genotypeLikelihoods.setBestLikelihoodIndex(bestIndex); - genotypeLikelihoods.setSecondBestLikelihoodIndex(secondBestIndex); - - return genotypeLikelihoods; - } - - /** - * Where to write encoded output - * - * @param os - */ - public void setOutputStream(final OutputStream os) { - this.os = os; - this.binaryCodec = new BinaryCodec(os); - } - - /** - * Where to read encoded input from - * - * @param is - */ - public void setInputStream(final InputStream is) { - this.is = is; - this.binaryCodec = new BinaryCodec(is); - } - - private void writeLikelihood(float likelihood) { - float shiftedLikelihood = likelihood * SIG_FIG_MULTIPLIER; - this.binaryCodec.writeInt((int) Math.round(shiftedLikelihood)); - } - - /** - * @return - */ - private float readLikelihood() { - float likelihood = (float) this.binaryCodec.readInt() / SIG_FIG_MULTIPLIER; - return likelihood; - } - -} diff --git a/java/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java b/java/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java deleted file mode 100644 index 3893e7bd1..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/caller/AbstractAlleleCaller.java +++ /dev/null @@ -1,192 +0,0 @@ -package edu.mit.broad.picard.genotype.caller; - -import edu.mit.broad.picard.sam.SamLocusIterator; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.picard.reference.ReferenceSequenceFile; -import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; -import edu.mit.broad.picard.reference.ReferenceSequence; -import edu.mit.broad.picard.PicardException; - -import java.io.IOException; -import java.io.BufferedWriter; -import java.io.File; -import java.util.SortedSet; -import java.util.List; - -/** - * Base class for AlleleCallers. Handles efficient access to the reference, output of data to a - * standard file format, and application of priors - */ -public abstract class AbstractAlleleCaller { - // writer for output - private final BufferedWriter writer; - - // for providing access to reference data - private final ReferenceSequenceFile referenceSequenceFile; - private final SAMFileHeader samHeader; - private ReferenceSequence referenceSequence; - - public AbstractAlleleCaller(final File reference, final SAMFileHeader samHeader, final BufferedWriter writer) { - this.writer = writer; - this.referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(reference); - this.samHeader = samHeader; - } - - - /** - * emit allele calls to the writer specified in the constructor - * - * @param li Locus to call - */ - public void callAlleles(final SamLocusIterator.LocusInfo li) throws IOException { - - - cacheReferenceSequence(li.getSequenceIndex()); - - final char ref = Character.toUpperCase((char)(referenceSequence.getBases()[li.getPosition() - 1] & 0xff)); - - - // delegate to the specific implementation - final SortedSet likelihoods = call(ref, li.getBasesAsString(), li.getQualities()); - - - final GenotypeTheory bestTheory = likelihoods.first(); - GenotypeTheory nextBestTheory = null; - GenotypeTheory refTheory = null; - final String refString = new String(new char[]{ref,ref}); - final DiploidGenotype refGenotype = DiploidGenotype.valueOf(refString); - - - final StringBuilder theoryString = new StringBuilder(); - int k=0; - for(final GenotypeTheory t : likelihoods) { - if (k == 1) { nextBestTheory = t; } - if (t.getGenotype() == refGenotype) { refTheory = t; } - - theoryString.append(t.getGenotype()) - .append(":") - .append(String.format("%.2f",t.getLikelihood())) - .append(" "); - k++; - } - - final double btnb = bestTheory.getLikelihood() - nextBestTheory.getLikelihood(); - final double btr = bestTheory.getLikelihood() - refTheory.getLikelihood(); - - final DiploidGenotype gt = likelihoods.first().getGenotype(); - - final String type; - if (!gt.isHet() && gt.getAllele1() == ref) { - type = "homozygous"; - } else if (!gt.isHet() && gt.getAllele1() != ref) { - type = "homozygous-SNP"; - } else { - type = "heterozygous-SNP"; - } - - final String bases = li.getBasesAsString(); - int a = 0,c = 0,g = 0,t = 0; - for(int i=0; i= the arg in the previous - * call to this method. - */ - private void cacheReferenceSequence(int sequenceIndex) { - if (referenceSequence != null && referenceSequence.getContigIndex() == sequenceIndex) { - return; - } - referenceSequence = null; - for(referenceSequence = referenceSequenceFile.nextSequence(); - referenceSequence != null; - referenceSequence = referenceSequenceFile.nextSequence()) { - // Sanity check the sequence names against the sequence dictionary while scanning through. - if (!referenceSequence.getName().equals(samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName())) { - throw new PicardException("Sequence name mismatch at sequence index " + referenceSequence.getContigIndex() + - ": " + referenceSequence.getName() + " != " + - samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName()); - } - if (referenceSequence.getContigIndex() == sequenceIndex) { - break; - } - if (referenceSequence.getContigIndex() > sequenceIndex) { - throw new PicardException("Never found reference sequence with index " + sequenceIndex); - } - } - if (referenceSequence == null) { - throw new PicardException("Reference sequence with index " + sequenceIndex + " was not found"); - } - } - - /** - * Override this to implement a concrete genotype caller - * @param ref the reference base - * @param bases each element in the String is the base at current locus for a given read - * @param quals same length as bases. the ith element corresponds to the ith element of bases. - * @return - */ - abstract protected SortedSet call(char ref, String bases, List quals); - - - /** - * Apply a general population-based prior to the likelihood: - *
    - *
  • ref is .999
  • - *
  • het is 10^-3
  • - *
  • homozygous, non-reference is 10^-5
  • - * - * @param ref reference allele - * @return prior, given the reference and genotype alleles - */ - protected double getPrior(final char ref, final DiploidGenotype gt) { - final double prior; - if (gt.isHom() && gt.getAllele1() == ref) { - prior = 0.999; // reference - } else { - if (gt.getAllele1() != ref && gt.getAllele2() != ref) { - prior = 0.00001; // neither base is reference - } else { - prior = 0.001; // het, one base is reference - } - } - return prior; - } - - // -------------------------------------------------------------------------------------------- - // Helper methods below this point... - // -------------------------------------------------------------------------------------------- - - - public boolean isHet(final String alleles) { - return (alleles.charAt(0) != (alleles.charAt(1))); - } - - -} diff --git a/java/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java b/java/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java deleted file mode 100644 index 06b5a4200..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/caller/CallGenotypes.java +++ /dev/null @@ -1,93 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.genotype.caller; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.directed.GenomeMaskFactory; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.picard.sam.SamLocusIterator; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; - -/** - * Call genotypes given a SAM file of aligned reads, reference sequences, and optionally a target map. - */ -public class CallGenotypes extends CommandLineProgram { - // Usage and parameters - @Usage(programVersion="1.0") public String USAGE = "Basic Allele Caller\n"; - @Option(shortName="I", doc="SAM or BAM file for calling") public File INPUT_FILE; - @Option(shortName="O", doc="Allele Call output GELI file") public File OUTPUT_FILE; - @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE; - @Option(shortName="T", doc="IntervalList-format target map file", optional = true) public File TARGET_FILE; - @Option(shortName="Q", doc="Minimum quality score threshold to use in allele calling", optional = true) public Integer QUAL_SCORE_THRESHOLD; - - - /** Required main method implementation. */ - public static void main(final String[] argv) { - System.exit(new CallGenotypes().instanceMain(argv)); - } - - - protected int doWork() { - try { - final BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE)); - - final SAMFileReader samReader = getSamReader(INPUT_FILE); - - // TODO -- parameterize, or create separate executables... - // AbstractAlleleCaller caller = new FlatQualityAlleleCaller(reference, writer); - final AbstractAlleleCaller caller = new QualityScoreAlleleCaller(REF_FILE, samReader.getFileHeader(), writer); - final long startTime = System.currentTimeMillis(); - - final SamLocusIterator sli = new SamLocusIterator(samReader.iterator()); - - if (TARGET_FILE != null) { - sli.setGenomeMask(new GenomeMaskFactory().makeGenomeMaskFromIntervalList(TARGET_FILE)); - } - - if (QUAL_SCORE_THRESHOLD != null) { - System.out.println("Masking out bases with < Q"+QUAL_SCORE_THRESHOLD); - sli.setQualityScoreCutoff(QUAL_SCORE_THRESHOLD); - } - - for (final SamLocusIterator.LocusInfo li : sli) { - if (li != null) caller.callAlleles(li); - } - - final long elapsed = System.currentTimeMillis() - startTime; - System.out.println("Completed in " + elapsed + "ms"); - - writer.flush(); - writer.close(); - } catch (IOException ioe) { - throw new RuntimeException(ioe); - } - return 0; - } - - private SAMFileReader getSamReader(final File samFile) { - final SAMFileReader samReader = new SAMFileReader(samFile); - - // ensure the file is sorted - if (samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { - System.out.println("SAM Files must be coordinate-sorted, this is " + samReader.getFileHeader().getSortOrder()); - System.exit(1); - } - - return samReader; - } - -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java b/java/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java deleted file mode 100644 index 1d9e01f58..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/caller/DiploidGenotype.java +++ /dev/null @@ -1,27 +0,0 @@ -package edu.mit.broad.picard.genotype.caller; - -public enum DiploidGenotype { - AA('A','A'), - AC('A','C'), - AG('A','G'), - AT('A','T'), - CC('C','C'), - CG('C','G'), - CT('C','T'), - GG('G','G'), - GT('G','T'), - TT('T','T'); - - private final char allele1; - private final char allele2; - - private DiploidGenotype(final char allele1, final char allele2) { - this.allele1 = allele1; - this.allele2 = allele2; - } - - public char getAllele1() { return allele1; } - public char getAllele2() { return allele2; } - public boolean isHet() { return this.allele1 != this.allele2; } - public boolean isHom() { return this.allele1 == this.allele2; } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java b/java/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java deleted file mode 100644 index c437a911e..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/caller/FlatQualityAlleleCaller.java +++ /dev/null @@ -1,76 +0,0 @@ -package edu.mit.broad.picard.genotype.caller; - -import edu.mit.broad.sam.SAMFileHeader; - -import java.io.IOException; -import java.io.BufferedWriter; -import java.io.File; -import java.util.*; -import static java.lang.Math.*; - - -/** - * Bayesian-based allele caller using flat qualities and a 1e-3 error rate, based on CRD algorithm - */ -public class FlatQualityAlleleCaller extends AbstractAlleleCaller { - - public FlatQualityAlleleCaller(final File fastbReference, SAMFileHeader samHeader, final BufferedWriter writer) { - super(fastbReference, samHeader, writer); - } - - - protected SortedSet call(final char ref, final String bases, final List quals) { - final float eps = 1e-3f; - - // count up the base by nucleotide and put them into a map - final int depth = bases.length(); - int a = 0,c = 0,g = 0,t = 0; - for(int i=0; i< bases.length(); i++) { - if (bases.charAt(i) == 'A') { a++; } - else if (bases.charAt(i) == 'C') { c++; } - else if (bases.charAt(i) == 'G') { g++; } - else if (bases.charAt(i) == 'T') { t++; } - else { throw new RuntimeException("Unknown Base " + bases.charAt(i)); } - } - - final Map counts = new HashMap(); - counts.put('A', a); - counts.put('C', c); - counts.put('G', g); - counts.put('T', t); - - - // for each of the 10 theories, calculate the likelihood - final SortedSet results = new TreeSet(); - for(final DiploidGenotype theory : DiploidGenotype.values()) { - final double likelihood; - final char allele1 = theory.getAllele1(); - final char allele2 = theory.getAllele2(); - - if (!theory.isHet()) { - likelihood = log10(1-eps)*counts.get(allele1) + log10(eps)*(depth - counts.get(allele1)); - } else { - final int major_allele_counts; - final int minor_allele_counts; - if (counts.get(allele1) > counts.get(allele2)) { - major_allele_counts = counts.get(allele1); - minor_allele_counts = counts.get(allele2); - } else { - major_allele_counts = counts.get(allele2); - minor_allele_counts = counts.get(allele1); - } - - likelihood = log10(0.5 - (eps/2.0) )*major_allele_counts + - log10(0.5 - (eps/2.0) )*minor_allele_counts + - log10(eps)*(depth - major_allele_counts - minor_allele_counts); - } - - final double prior = getPrior(ref, theory); - results.add(new GenotypeTheory(theory, likelihood + log10(prior))); - } - - - return results; - - } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java b/java/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java deleted file mode 100644 index a97e83a97..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/caller/GenotypeTheory.java +++ /dev/null @@ -1,46 +0,0 @@ -package edu.mit.broad.picard.genotype.caller; - -/** - * Datastructure to hold a single genotype along with a likelihood. - */ -public class GenotypeTheory implements Comparable { - private DiploidGenotype genotype; - private double likelihood; - - public GenotypeTheory(final DiploidGenotype genotype, final double likelihood) { - this.genotype = genotype; - this.likelihood = likelihood; - } - - public DiploidGenotype getGenotype() { - return genotype; - } - - public void setGenotype(final DiploidGenotype genotype) { - this.genotype = genotype; - } - - public double getLikelihood() { - return likelihood; - } - - public void setLikelihood(final double likelihood) { - this.likelihood = likelihood; - } - - /** - * Genotype Theories are sorted first by descending likelihood (ie - * the GenotypeTheory with biggest likelihood comes first). Ties are - * broken by lexical sorting of the genotypes themselves - * - */ - public int compareTo(final GenotypeTheory other) { - if (this.getLikelihood() == other.getLikelihood()) { - return this.getGenotype().compareTo(other.getGenotype()); - } else if (this.getLikelihood() > other.getLikelihood()) { - return -1; - } else { - return 1; - } - } -} diff --git a/java/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java b/java/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java deleted file mode 100644 index f9863546d..000000000 --- a/java/lib/edu/mit/broad/picard/genotype/caller/QualityScoreAlleleCaller.java +++ /dev/null @@ -1,82 +0,0 @@ -package edu.mit.broad.picard.genotype.caller; - -import edu.mit.broad.sam.SAMFileHeader; - -import java.util.*; -import static java.lang.Math.log10; -import static java.lang.Math.pow; -import java.io.BufferedWriter; -import java.io.IOException; -import java.io.File; - -/** - * Bayesian-based allele caller using quality scores, based on CRD algorithm - */ -public class QualityScoreAlleleCaller extends AbstractAlleleCaller { - - public QualityScoreAlleleCaller(final File fastbReference, SAMFileHeader samHeader, final BufferedWriter writer) { - super(fastbReference, samHeader, writer); - } - - protected SortedSet call(final char ref, final String bases, final List quals) { - - // for each of the 10 theories, calculate the likelihood using quality scores - final SortedSet results = new TreeSet(); - for(final DiploidGenotype theory : DiploidGenotype.values()) { - double likelihood = 0; - - for(int i=0; i, Iterable, Closeable { - - private final File bustardDirectory; - private final int lane; - private final boolean pairedEnd; - private PasteParser parser; - private BustardReadData next = null; - private final FormatUtil formatter = new FormatUtil(); - private boolean iterating = false; - - /** - * Constructor - * - * @param bustardDirectory directory where the Bustard files can be located - * @param lane the lane to parse - * @param pairedEnd whether this is a paired-end run - */ - public BustardFileParser(File bustardDirectory, int lane, boolean pairedEnd) { - this.bustardDirectory = bustardDirectory; - this.lane = lane; - this.pairedEnd = pairedEnd; - initialize(); - } - - /** - * Finds the relevant files in the bustardDirectory, sorts them, and puts them into the - * sortedFiles iterator. Does some basic sanity checking to ensure that some files - * are found and that they are the expected multiple for paired-end or not. - * - */ - private void initialize() - { - final String qseq1Regex = "s_" + lane + "_1_\\d{4}_qseq.txt(.gz)?"; - final String qseq2Regex = "s_" + lane + "_2_\\d{4}_qseq.txt(.gz)?"; - final String intensityRegex = "s_" + lane + "_\\d{4}_sig2.txt(.gz)?"; - - File read1files[] = bustardDirectory.listFiles( new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.matches(qseq1Regex); - } - }); - - File read2files[] = bustardDirectory.listFiles( new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.matches(qseq2Regex); - } - }); - - File intensityFiles[] = bustardDirectory.listFiles( new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.matches(intensityRegex); - } - }); - - // Some basic sanity checking on file counts - if (read1files.length == 0 && read2files.length == 0 && intensityFiles.length == 0) { - throw new PicardException("No Bustard files found in " + - bustardDirectory.getAbsolutePath() + " for lane " + lane); - } - if (pairedEnd) { - if (read1files.length != read2files.length || read2files.length != intensityFiles.length) { - throw new PicardException("Incorrect number of Bustard files found in " + - bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Found " + - read1files.length + " read 1 qseq files, " + read2files.length + " read 2 " + - "qseq files, and " + intensityFiles.length + " sig2 files. There should be " + - "the same number of each type of file"); - } - } - else { - if (read1files.length != intensityFiles.length) { - throw new PicardException("Incorrect number of Bustard files found in " + - bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Found " + - read1files.length + " qseq files and " + intensityFiles.length + " sig2 files, " + - "which should be equal."); - } - if (read2files.length > 0) { - throw new PicardException("Read 2 Bustard files found in " + - bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Lane " + - " was specified as a non-PE run, and so should not have any read 2 data."); - } - } - - // Sort each set of reads and create a text parser for it - SortedSet sortedRead1 = new TreeSet(new BustardFilenameComparator()); - sortedRead1.addAll(Arrays.asList(read1files)); - read1files = sortedRead1.toArray(read1files); - BasicTextFileParser read1Parser = new BasicTextFileParser(true, read1files); - - SortedSet sortedIntensity = new TreeSet(new BustardFilenameComparator()); - sortedIntensity.addAll(Arrays.asList(intensityFiles)); - intensityFiles = sortedIntensity.toArray(intensityFiles); - BasicTextFileParser intensityParser = new BasicTextFileParser(true, intensityFiles); - - // And create a paste parser for all of them - if (pairedEnd) { - SortedSet sortedRead2 = new TreeSet(new BustardFilenameComparator()); - sortedRead2.addAll(Arrays.asList(read2files)); - read2files = sortedRead2.toArray(read2files); - BasicTextFileParser read2Parser = new BasicTextFileParser(true, read2files); - - parser = new PasteParser(read1Parser, read2Parser, intensityParser); - } - else { - parser = new PasteParser(read1Parser, intensityParser); - } - } - - /** - * Parses the next line from the parser and constructs a BustardReadData object from it - * The first 11 fields are the read1 data, the second 11 are the read2 data, and the remaining - * values are the intensities data. Note that the first four values in the intensity file - * are not intensities but rather lane, tiles, x, and y for the given cluster. - * - * @param validate whether to check that the expected number of intensity values are returned - * @return a fully populated BustardReadData object - */ - private BustardReadData readNext(boolean validate) { - if (!parser.hasNext()) { - return null; - } - String data[][] = parser.next(); - String machine = data[0][0]; - int run = formatter.parseInt(data[0][1]); - int lane = formatter.parseInt(data[0][2]); - int tile = formatter.parseInt(data[0][3]); - int x = formatter.parseInt(data[0][4]); - int y = formatter.parseInt(data[0][5]); - String firstSeq = data[0][8]; - String firstQual = data[0][9]; - boolean pf = formatter.parseInt(data[0][10]) == 1; - String secondSeq = null; - String secondQual = null; - - int intensityIndex = 1; - if (pairedEnd) { - secondSeq = data[1][8]; - secondQual = data[1][9]; - intensityIndex = 2; - } - - int numIntensities = firstSeq.length() * (pairedEnd ? 2 : 1); - - // Sanity check since some of those files look a little weird - if (validate) { - int remaining = data[intensityIndex].length - 4; - if ((remaining % 4 != 0) || (remaining/4) != numIntensities) { - throw new PicardException("Unexpected number of intensity fields for " + machine + "/" + run + - "/" + lane + "/" + tile + ": " + remaining); - } - } - - double intensities[][] = new double[numIntensities][4]; - int intensityArrayIndex = 4; - for (int i = 0; i < numIntensities; i++) { - for (int j = 0; j < 4; j++) { - intensities[i][j] = formatter.parseDouble(data[intensityIndex][intensityArrayIndex++]); - } - } - - return new BustardReadData( - machine, run, lane, tile, firstSeq, firstQual, secondSeq, secondQual, pf, intensities, x, y); - - } - - /** - * Returns an iterator over a set of elements of type BustardReadData. - * - * @return an iterator over a set of elements of type BustardReadData - */ - public Iterator iterator() { - if (iterating) { - throw new IllegalStateException("iterator() method can only be called once, before the" + - "first call to hasNext()"); - } - next = readNext(true); - iterating = true; - return this; - } - - /** - * Returns true if the iteration has more elements. - * - * @return true if the iteration has more elements. Otherwise returns false. - */ - public boolean hasNext() { - if (!iterating) { - next = readNext(true); - iterating = true; - } - return next != null; - } - - /** - * Returns the next element in the iteration. - * - * @return the next element in the iteration - * @throws java.util.NoSuchElementException - */ - public BustardReadData next() { - - if (!hasNext()) { - throw new NoSuchElementException("Iteration has no more elements."); - } - - BustardReadData result = next; - next = readNext(false); - return result; - } - - /** - * Required method for Iterator API. - * - * @throws UnsupportedOperationException - */ - public void remove() { - throw new UnsupportedOperationException("Remove() not supported."); - } - - /** - * Closes the underlying PasteParser - */ - public void close() { - if (parser != null) { - parser.close(); - } - } - - public int getLane() { return this.lane; } - public boolean isPairedEnd() { return this.pairedEnd; } -} diff --git a/java/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java b/java/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java deleted file mode 100644 index ad92377f1..000000000 --- a/java/lib/edu/mit/broad/picard/illumina/BustardFilenameComparator.java +++ /dev/null @@ -1,78 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.illumina; - -import java.io.File; -import java.util.Comparator; - -/** - * Comparator for getting Bustard files in "sorted" order for use by the BustardFileParser. Expected order is - * by lane in ascending order, then by tile in ascending order, then: - * the read 1 qseq file - * the read 2 qseq file - * the sig2 file - * - * IMPORTANT: Currently this class expects to receive ONLY qseq and sig2 files. - * - * @author Kathleen Tibbetts - */ -public class BustardFilenameComparator implements Comparator { - - /** - * Compares its two arguments for order. Returns a negative integer, zero, or a positive integer as - * the first argument is less than, equal to, or greater than the second. - * - * @param file1 - * @param file2 - * @return a negative integer, zero, or a positive integer as - * the first argument is less than, equal to, or greater than the second. - */ - public int compare(File file1, File file2) - { - Integer parts1[] = parseFileNameParts(file1.getName()); - Integer parts2[] = parseFileNameParts(file2.getName()); - - for (int i = 1; i < parts1.length; i++) - { - if (!parts1[i].equals(parts2[i])) { - return parts1[i].compareTo(parts2[i]); - } - } - return 0; - } - - /** - * Utility method that returns an array of integers that represent, in order, - * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any) - * represented by the given file name - * - * @param name - * @return an array of integers that represent, in order, - * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any) - * represented by the given file name - */ - private Integer[] parseFileNameParts(String name) - { - Integer parts[] = new Integer[4]; // Lane, tile, type, read - String src[] = name.split("_"); - parts[0] = new Integer(src[1]); // Lane is always the second part - if (src[2].length() == 4) { // Tile is 3rd or fourth - parts[1] = new Integer(src[2]); - } - else { - parts[1] = new Integer(src[3]); - } - parts[2] = (src[src.length-1].equals("qseq.txt")) ? 0 : 1; // qseq tests are lower - if (src[2].length() == 1) { // read is last - parts[3] = new Integer(src[2]); - } - return parts; - } -} diff --git a/java/lib/edu/mit/broad/picard/illumina/BustardReadData.java b/java/lib/edu/mit/broad/picard/illumina/BustardReadData.java deleted file mode 100644 index 6076f36e0..000000000 --- a/java/lib/edu/mit/broad/picard/illumina/BustardReadData.java +++ /dev/null @@ -1,128 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.illumina; - -/** - * Holds all the Bustard-level data we need (so far) about an individual read. - * - * @author Kathleen Tibbetts - */ -public class BustardReadData { - - private static final String PADDING ="00000"; - - final private String machineName; - final private int runNumber; - final private int laneNumber; - final private int tileNumber; - final private String firstReadSequence; - final private String firstReadQualities; - final private String secondReadSequence; - final private String secondReadQualities; - final private boolean pf; - final private double intensities[][]; - final private int xCoordinate; - final private int yCoordinate; - private final SolexaQualityConverter converter = new SolexaQualityConverter(); - - - /** - * Constructor that takes everything to populate this object - * - * @param machineName - * @param runNumber - * @param laneNumber - * @param tileNumber - * @param firstReadSequence - * @param firstReadQualities - * @param secondReadSequence - * @param secondReadQualities - * @param pf - * @param intensities - * @param xCoordinate - * @param yCoordinate - */ - public BustardReadData(String machineName, int runNumber, int laneNumber, int tileNumber, - String firstReadSequence, String firstReadQualities, - String secondReadSequence, String secondReadQualities, - boolean pf, double[][] intensities, int xCoordinate, int yCoordinate ) { - - this.machineName = machineName; - this.runNumber = runNumber; - this.laneNumber = laneNumber; - this.tileNumber = tileNumber; - this.firstReadSequence = firstReadSequence; - this.firstReadQualities = firstReadQualities; - this.secondReadSequence = secondReadSequence; - this.secondReadQualities = secondReadQualities; - this.pf = pf; - this.intensities = intensities; - this.xCoordinate = xCoordinate; - this.yCoordinate = yCoordinate; - } - - // TODO: Finalize read name -- ask Tim - /** - * Composes a name for this read from its values - * - * @return the read name - */ - public String getReadName() { - return this.machineName + ":" + this.laneNumber + ":" + this.tileNumber + - ":" + this.xCoordinate + ":" + this.yCoordinate; - } - - /** - * Gets Phred-style qualitites for the first read - * - * @return the String of qualities - */ - public String getFirstReadPhredQualities() { - return decodeSolexaQualitiesToPhred(getFirstReadQualities()); - } - - /** - * Gets Phred-style qualitites for the second read - * - * @return the String of qualities - */ - public String getSecondReadPhredQualities() { - return decodeSolexaQualitiesToPhred(getSecondReadQualities()); - } - - /** - * Converts a string of Solexa qualities to a Phred-style quality String - * - * @param qualities the Solexa qualities to decode - * @return the String of Phred qualities - */ - private String decodeSolexaQualitiesToPhred(String qualities) { - StringBuilder sb = new StringBuilder(); - for (char c : qualities.toCharArray()) { - // Quality char is phred score + 33 - sb.append((char)(converter.solexaToPhred((byte)c)+33)); - } - return sb.toString(); - } - - public String getMachineName() { return machineName; } - public int getRunNumber() { return runNumber; } - public int getLaneNumber() { return laneNumber; } - public int getTileNumber() { return tileNumber; } - public String getFirstReadSequence() { return firstReadSequence; } - public String getFirstReadQualities() { return firstReadQualities; } - public String getSecondReadSequence() { return secondReadSequence; } - public String getSecondReadQualities() { return secondReadQualities; } - public double[][] getIntensities() { return intensities; } - public boolean isPf() { return pf; } - public int getXCoordinate() { return xCoordinate; } - public int getYCoordinate() { return yCoordinate; } - -} diff --git a/java/lib/edu/mit/broad/picard/illumina/BustardToSam.java b/java/lib/edu/mit/broad/picard/illumina/BustardToSam.java deleted file mode 100644 index eb88e3465..000000000 --- a/java/lib/edu/mit/broad/picard/illumina/BustardToSam.java +++ /dev/null @@ -1,58 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.illumina; - -import java.io.File; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; - -/** - * CommandLineProgram to generate to invoke BustardToBamWriter - * - * @author Kathleen Tibbetts - */ -public class BustardToSam extends CommandLineProgram { - // The following attributes define the command-line arguments - @Usage(programVersion="1.0") - public String USAGE = - "Usage: " + getClass().getName() + " [options]\n\n" + - "Generate a BAM binary file from data in an illumina Bustard directory.\n"; - - @Option(shortName = "B", doc = "Bustard directory to parse. ") - public File BUSTARD_DIRECTORY; - - @Option(shortName = "F", doc = "The flowcell. ") - public String FLOWCELL; - - @Option(shortName = "L", doc = "The lane for which to parse data. ") - public Integer LANE; - - @Option(shortName = "P", doc = "Whether the lane was a paired-end run. ") - public Boolean PE; - - @Option(shortName = "O", doc = "The directory for the binary output file. ") - public File OUTPUT; - - @Override - protected int doWork() { - BustardToSamWriter writer = new BustardToSamWriter( - new BustardFileParser(BUSTARD_DIRECTORY, LANE, PE), OUTPUT, FLOWCELL); - writer.writeBamFile(); - return 0; - } - - public static void main(String[] argv) { - System.exit(new BustardToSam().instanceMain(argv)); - } - - -} diff --git a/java/lib/edu/mit/broad/picard/illumina/BustardToSamWriter.java b/java/lib/edu/mit/broad/picard/illumina/BustardToSamWriter.java deleted file mode 100644 index 05c92c246..000000000 --- a/java/lib/edu/mit/broad/picard/illumina/BustardToSamWriter.java +++ /dev/null @@ -1,138 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.illumina; - -import edu.mit.broad.sam.*; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.picard.filter.AggregateFilter; -import edu.mit.broad.picard.filter.SamRecordFilter; -import edu.mit.broad.picard.filter.SolexaNoiseFilter; -import edu.mit.broad.picard.sam.ReservedTagConstants; - -import java.io.File; -import java.util.*; - -/** - * Writes the data from a BustardFileParser to a BAM file - */ -public class BustardToSamWriter { - - private final BustardFileParser parser; - private SAMFileWriter writer; - private final File outputFile; - private AggregateFilter filters; - private int recordsWritten = 0; - private Log log = Log.getInstance(BustardToSamWriter.class); - - /** - * Constructor - * - * @param parser The parser for the Bustard data - * @param outputDirectory The directory in which to write the BAM file - * @param flowcell The flowcell from which the data is drawn - */ - public BustardToSamWriter(BustardFileParser parser, File outputDirectory, String flowcell) { - this.parser = parser; - this.outputFile = getOutputFile(outputDirectory, flowcell); - initializeFilters(); - } - - /** - * Alternate constructor for testing - * - * @param parser The parser for the Bustard data - * @param outputFile The directory in which to write the BAM file - */ - BustardToSamWriter(BustardFileParser parser, File outputFile) { - this.parser = parser; - this.outputFile = outputFile; - initializeFilters(); - } - - private void initializeFilters() { - filters = new AggregateFilter(Arrays.asList( - (SamRecordFilter)new SolexaNoiseFilter() - )); - } - - - /** - * Writes all data from the BustardFileParser to a BAM file - */ - public void writeBamFile() { - SAMFileHeader header = new SAMFileHeader(); - header.setSortOrder(SAMFileHeader.SortOrder.unsorted); - writer = new SAMFileWriterFactory().makeBAMWriter(header, false, outputFile); - - while (parser.hasNext()) { - BustardReadData brd = parser.next(); - - SAMRecord sam = createSamRecord(brd, true); - writer.addAlignment(sam); - this.recordsWritten++; - - if (parser.isPairedEnd()) { - SAMRecord sam2 = createSamRecord(brd, false); - writer.addAlignment(sam2); - this.recordsWritten++; - } - - } - writer.close(); - - log.info("Wrote " + this.recordsWritten + " read records to BAM file " + - this.outputFile.getAbsolutePath()); - } - - /** - * Creates a SAMRecord from Bustard data - * - * @param brd The BustardReadData to use in populating the SAMRecord - * @param isFirstRead whether this is the first read of a pair - * @return SAMRecord fully populated SAMRecord - */ - private SAMRecord createSamRecord(BustardReadData brd, boolean isFirstRead) { - SAMRecord sam = new SAMRecord(); - sam.setReadName(brd.getReadName()); - sam.setReadString(isFirstRead ? brd.getFirstReadSequence() : brd.getSecondReadSequence()); - sam.setBaseQualityString(isFirstRead ? brd.getFirstReadPhredQualities() : brd.getSecondReadPhredQualities()); - - // Flag values - sam.setReadPairedFlag(parser.isPairedEnd()); - sam.setReadUmappedFlag(true); - sam.setReadFailsVendorQualityCheckFlag(!brd.isPf()); - sam.setMateUnmappedFlag(true); - if (parser.isPairedEnd()) { - sam.setFirstOfPairFlag(isFirstRead); - sam.setSecondOfPairFlag(!isFirstRead); - } - - if (filters.filterOut(sam)) { - sam.setAttribute(ReservedTagConstants.XN, 1); - } - return sam; - } - - /** - * Constructs the name for the output file, determines whether it is writeable, - * and returns the file - * - * @param outputDirectory the directory in which to write the BAM file - * @param flowcell the flowcell from which the data is drawn - * @return a new File object for the BAM file. - */ - private File getOutputFile(File outputDirectory, String flowcell) { - File result = new File(outputDirectory.getAbsolutePath() + "/" + - flowcell + "." + parser.getLane() + ".unmapped.bam"); - IoUtil.assertFileIsWritable(result); - return result; - } -} diff --git a/java/lib/edu/mit/broad/picard/illumina/GeraldParser.java b/java/lib/edu/mit/broad/picard/illumina/GeraldParser.java deleted file mode 100644 index a72f90dba..000000000 --- a/java/lib/edu/mit/broad/picard/illumina/GeraldParser.java +++ /dev/null @@ -1,235 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.illumina; - -import edu.mit.broad.picard.util.PasteParser; -import edu.mit.broad.picard.util.TabbedTextFileParser; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.sam.util.CloseableIterator; - -import java.io.File; -import java.util.Iterator; -import java.util.Arrays; -import java.util.regex.Pattern; -import java.text.ParsePosition; -import java.text.NumberFormat; - -/** - * Parse the pair of files (eland_extended.txt and export.txt) that correspond to an end of a Gerald run for a lane. - */ -public class GeraldParser implements Iterable, CloseableIterator { - private static final int EXPECTED_ELAND_FIELDS = 4; - // Regex used to split apart multiple alignments in the eland output - private static final Pattern ALIGN_SPLITTER = Pattern.compile("\\,+"); - - // export.txt constants - private static final int PASSING_FILTER_COLUMN = 21; - private static final int QUALITIES_COLUMN = 9; - private static final int REQUIRED_EXPORT_COLUMNS = PASSING_FILTER_COLUMN + 1; - - private final NumberFormat integerFormat = NumberFormat.getIntegerInstance(); - - private final SquashedCoordinateMap geraldToArachne; - private final PasteParser pasteParser; - private final File elandExtended; - private final File export; - private boolean iteratorCalled = false; - private final byte[] solexaToPhredQualityConverter = new SolexaQualityConverter().getSolexaToPhredConversionTable(); - - /** - * @param geraldToArachne for converting btw Gerald coordinate and genomic coordinate - */ - public GeraldParser(final SquashedCoordinateMap geraldToArachne, final File elandExtended, final File export) { - this.geraldToArachne = geraldToArachne; - this.elandExtended = elandExtended; - this.export = export; - final TabbedTextFileParser[] parsers = { - new TabbedTextFileParser(false, elandExtended), - new TabbedTextFileParser(false, export) - }; - pasteParser = new PasteParser(parsers); - } - - public Iterator iterator() { - if (iteratorCalled) { - throw new IllegalStateException("iterator() cannot be called more than once on a GeraldParser instance."); - } - iteratorCalled = true; - return this; - } - - public void close() { - pasteParser.close(); - } - - public boolean hasNext() { - return pasteParser.hasNext(); - } - - public GeraldAlignment next() { - final GeraldAlignment ret = new GeraldAlignment(); - final String[][] fields = pasteParser.next(); - - // Parse eland_extended.txt fields - final String[] elandExtendedFields = fields[0]; - if (elandExtendedFields.length < EXPECTED_ELAND_FIELDS) { - throw new PicardException("Not enough fields in file: " + elandExtended); - } - - ret.readName = elandExtendedFields[0].substring(1); - ret.readBases = elandExtendedFields[1]; - ret.readLength = ret.readBases.length(); - final String[] alignCounts = elandExtendedFields[2].split(":"); - if (alignCounts.length == 3) { - ret.zeroMismatchPlacements = Short.parseShort(alignCounts[0]); - ret.oneMismatchPlacements = Short.parseShort(alignCounts[1]); - ret.twoMismatchPlacements = Short.parseShort(alignCounts[2]); - } - - final String[] alignments = ALIGN_SPLITTER.split(elandExtendedFields[3]); - if (alignments.length == 1 && !"-".equals(alignments[0])) { - final int lastDot = alignments[0].lastIndexOf("."); - final int colon = alignments[0].indexOf(':'); - - final String tmp = alignments[0].substring(colon + 1); - final ParsePosition pos = new ParsePosition(0); - final long start = integerFormat.parse(tmp, pos).longValue(); - if (pos.getIndex() == 0) { - throw new RuntimeException("Problem parsing eland extended alignment record: " + Arrays.toString(elandExtendedFields)); - } - - final SimpleMapping m = new SimpleMapping(alignments[0].substring(lastDot+1, colon).trim(), - start, start + ret.readLength - 1, null); - geraldToArachne.convertToArachneCoords(m); - ret.primaryChrom = m.getSequenceName(); - ret.primaryStart = m.getStartPos(); - ret.primaryStop = m.getEndPos(); - ret.orientation = tmp.substring(pos.getIndex(), pos.getIndex() + 1); - ret.mismatchString = tmp.substring(pos.getIndex() + 1); - - // Count the mismatches in the alignment - for (int i=pos.getIndex(); i readGroups = new ArrayList(); - readGroups.add(readGroup); - readGroup.setSample(SAMPLE); - if (LIBRARY != null) { - readGroup.setLibrary(LIBRARY); - } - setRGAttributeIfNotNull(readGroup, DESCRIPTION, "DS"); - setRGAttributeIfNotNull(readGroup, RUN, "PU"); - setRGAttributeIfNotNull(readGroup, PI, SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG); - setRGAttributeIfNotNull(readGroup, CN, "CN"); - setRGAttributeIfNotNull(readGroup, RUN_DATE, SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG); - setRGAttributeIfNotNull(readGroup, PL, "PL"); - header.setReadGroups(readGroups); - } - } - - private void setRGAttributeIfNotNull(final SAMReadGroupRecord readGroup, final Object value, final String key) { - if (value == null) { - return; - } - readGroup.setAttribute(key, value); - } - - /** - * Iterate through the Gerald output and write alignments. eland_extended.txt and export.txt are - * iterated together using PasteParser. If paired end lane, then two PasteParsers are iterated in tandem, - * so that mate info is available when a SAMRecord is created. - */ - private void writeAlignments() { - final GeraldParserFactory geraldParserFactory = new GeraldParserFactory(GERALD_DIR, LANE, SQUASHED_MAP); - paired = geraldParserFactory.isPairedRun(); - final GeraldParser firstEndIterator = geraldParserFactory.makeParser(paired ? 1: null); - GeraldParser secondEndIterator = null; - if (paired) { - secondEndIterator = geraldParserFactory.makeParser(2); - } - int numAlignmentsOrPairsWritten = 0; - while (firstEndIterator.hasNext()) { - final GeraldParser.GeraldAlignment firstEnd = firstEndIterator.next(); - GeraldParser.GeraldAlignment secondEnd = null; - if (paired) { - hasNextAssert(secondEndIterator); - secondEnd = secondEndIterator.next(); - } - final SAMRecord firstEndAlignment = createSAMRecordFromGerald(firstEnd); - SAMRecord secondEndAlignment = null; - if (paired) { - secondEndAlignment = createSAMRecordFromGerald(secondEnd); - setMateInfo(secondEndAlignment, firstEnd); - setMateInfo(firstEndAlignment, secondEnd); - secondEndAlignment.setSecondOfPairFlag(true); - firstEndAlignment.setFirstOfPairFlag(true); - final boolean properPair = SamPairUtil.isProperPair(firstEndAlignment, secondEndAlignment, JUMPING_LIBRARY); - firstEndAlignment.setProperPairFlag(properPair); - secondEndAlignment.setProperPairFlag(properPair); - int insertSize = SamPairUtil.computeInsertSize(firstEndAlignment, secondEndAlignment); - firstEndAlignment.setInferredInsertSize(insertSize); - secondEndAlignment.setInferredInsertSize(-insertSize); - } - - writer.addAlignment(firstEndAlignment); - if (secondEndAlignment != null) { - writer.addAlignment(secondEndAlignment); - } - ++numAlignmentsOrPairsWritten; - if (MAX_ALIGNMENTS != null && numAlignmentsOrPairsWritten >= MAX_ALIGNMENTS) { - break; - } - if (numAlignmentsOrPairsWritten % 500000 == 0) { - log.info("Loaded " + numAlignmentsOrPairsWritten + " reads"); - } - } - if (MAX_ALIGNMENTS == null) { - noMoreAssert(firstEndIterator); - if (paired) { - noMoreAssert(secondEndIterator); - } - } - log.info("Done loading " + numAlignmentsOrPairsWritten + " reads"); - } - - /** - * Write into the samRecord the mate info from the mate gerald alignment - */ - private void setMateInfo(final SAMRecord samRecord, final GeraldParser.GeraldAlignment mateGeraldAlignment) { - final boolean isMapped = mateGeraldAlignment.getPrimaryChrom() != null; - if (isMapped) { - samRecord.setMateReferenceName(mateGeraldAlignment.getPrimaryChrom()); - samRecord.setMateAlignmentStart((int)mateGeraldAlignment.getPrimaryStart()); - samRecord.setMateNegativeStrandFlag(isNegativeStrand(mateGeraldAlignment)); - } else { - samRecord.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); - samRecord.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); - samRecord.setMateUnmappedFlag(true); - } - } - - private boolean isNegativeStrand(final GeraldParser.GeraldAlignment alignment) { - final String orientation = alignment.getOrientation(); - if (orientation.equals("F")) { - return false; - } else if (orientation.equals("R")) { - return true; - } else { - throw new RuntimeException("Strange orientation in eland_extended file"); - } - } - - private SAMRecord createSAMRecordFromGerald(final GeraldParser.GeraldAlignment alignment) { - final SAMRecord samRecord = new SAMRecord(); - // Consider an alignment with a negative start (i.e. that hangs off the beginning of the contig) - // to be unmapped. - final boolean isMapped = alignment.getPrimaryChrom() != null && alignment.getPrimaryStart() >= 0; - - String readName = alignment.getReadName(); - if (readName.endsWith("/1") || readName.endsWith("/2")) { - readName = readName.substring(0, readName.length() - 2); - } - samRecord.setReadName(readName); - - // Set all the flags - samRecord.setReadPairedFlag(paired); - samRecord.setReadUmappedFlag(!isMapped); - if (isMapped) { - samRecord.setReadNegativeStrandFlag(isNegativeStrand(alignment)); - } - // For now we are only taking the primary alignment - samRecord.setNotPrimaryAlignmentFlag(false); - String readBases = alignment.getReadBases(); - if (samRecord.getReadNegativeStrandFlag()) { - readBases = SequenceUtil.reverseComplement(readBases); - } - samRecord.setReadString(readBases); - final byte[] phredQualities = alignment.getPhredQualities(); - if (isMapped && samRecord.getReadNegativeStrandFlag()) { - ArrayUtil.reverseArray(phredQualities); - } - samRecord.setBaseQualities(phredQualities); - if (isMapped) { - /* - if ("23".equals(geraldReferenceName)) { - geraldReferenceName = "X"; - } else if ("24".equals(geraldReferenceName)) { - geraldReferenceName = "Y"; - } - return REFERENCE_PREFIX + geraldReferenceName; - */ - samRecord.setReferenceName(alignment.getPrimaryChrom()); - samRecord.setAlignmentStart((int)alignment.getPrimaryStart()); - samRecord.setMappingQuality(SAMRecord.UNKNOWN_MAPPING_QUALITY); - // CIGAR is trivial because there are no indels or clipping in Gerald - final String cigar = Integer.toString(alignment.getReadLength()) + "M"; - samRecord.setCigarString(cigar); - // We've decided not to bother with this, and just load the reference - // if we want to determine mismatches. - // samRecord.setAttribute("MD", alignment.getMismatchString()); - } else { - samRecord.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); - samRecord.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START); - samRecord.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); - samRecord.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR); - } - - if (SAMPLE != null) { - // There is a read group (id = READ_GROUP_ID) - samRecord.setAttribute("RG", READ_GROUP_ID); - } - - samRecord.setAttribute("PG", PROGRAM_RECORD_ID); - return samRecord; - } - - private void hasNextAssert(final Iterator iterator) { - if (!iterator.hasNext()) { - throw new RuntimeException("gerald output file ends unexpectedly."); - - } - } - - private void noMoreAssert(final Iterator iterator) { - if (iterator.hasNext()) { - throw new RuntimeException("gerald output file has more lines than expected."); - } - } - -} diff --git a/java/lib/edu/mit/broad/picard/illumina/SimpleMapping.java b/java/lib/edu/mit/broad/picard/illumina/SimpleMapping.java deleted file mode 100644 index a1797d58d..000000000 --- a/java/lib/edu/mit/broad/picard/illumina/SimpleMapping.java +++ /dev/null @@ -1,117 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.illumina; - -import edu.mit.broad.sam.util.CoordMath; - -class SimpleMapping implements Comparable { - String arachneIndex; - long startPos; - long endPos; - String sequenceName; - - public SimpleMapping(final String arachneIndex, final long startPos, final long endPos, final String sequenceName) { - this.arachneIndex = arachneIndex; - this.startPos = startPos; - this.endPos = endPos; - this.sequenceName = sequenceName; - - if (this.endPos < this.startPos) throw new IllegalArgumentException("startPos must be less than endPos!"); - } - - public String getArachneIndex() { - return arachneIndex; - } - - public void setArachneIndex(final String arachneIndex) { - this.arachneIndex = arachneIndex; - } - - public long getStartPos() { - return startPos; - } - - public void setStartPos(final long startPos) { - this.startPos = startPos; - } - - public long getEndPos() { - return endPos; - } - - public void setEndPos(final long endPos) { - this.endPos = endPos; - } - - public String getSequenceName() { - return sequenceName; - } - - public void setSequenceName(final String sequenceName) { - this.sequenceName = sequenceName; - } - - public SimpleMapping intersection(final SimpleMapping other) { - if (this.intersects(other)) { - return new SimpleMapping(this.getArachneIndex(), - (this.getStartPos() >= other.getStartPos())?this.getStartPos():other.getStartPos(), - (this.getEndPos() <= other.getEndPos())?this.getEndPos():other.getEndPos(), this.getSequenceName()); - } - - return null; - } - - public boolean intersects(final SimpleMapping other) { - return (this.getArachneIndex().equals(other.getArachneIndex()) && - CoordMath.overlaps(this.getStartPos(), this.getEndPos(), other.getStartPos(), other.getEndPos())); - } - - public long length() { - return CoordMath.getLength(startPos, endPos); - } - - /** - * Sort based on sequence.compareTo, then start pos, then end pos - * with null objects coming lexically last - */ - public int compareTo(final SimpleMapping that) { - if (that == null) return -1; // nulls last - - int result = this.getArachneIndex().compareTo(that.getArachneIndex()); - if (result == 0) { - if (this.getStartPos() == that.getStartPos()) { - result = ((int) (this.getEndPos() - that.getEndPos())); - } else { - result = ((int) (this.getStartPos() - that.getStartPos())); - } - } - - // normalize to -1, 0, 1 - if (result > 1) result = 1; - else if (result < -1) result = -1; - return result; - } - - public boolean equals(final SimpleMapping that) { - return (this.compareTo(that) == 0); - } - - public int hashCode() { - int result; - result = arachneIndex.hashCode(); - result = 31 * result + (int) (startPos ^ (startPos >>> 32)); - result = 31 * result + (int) (endPos ^ (endPos >>> 32)); - return result; - } - - public String toString() { - return getArachneIndex() + ":" + getStartPos() + "-" + getEndPos(); - } -} diff --git a/java/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java b/java/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java deleted file mode 100644 index 80633fb72..000000000 --- a/java/lib/edu/mit/broad/picard/illumina/SolexaQualityConverter.java +++ /dev/null @@ -1,58 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.illumina; - -/** - * Optimized method for converting Solexa ASCII qualities into Phred scores. - * Pre-computes all values in order to eliminate repeated computation. - */ -public class SolexaQualityConverter { - - /** - * This value is added to a Solexa quality score to make it printable ASCII - */ - private static int SOLEXA_ADDEND = 64; - - /** - * Mapping from ASCII value in Gerald export file to phred score - */ - private final byte[] phredScore = new byte[256]; - - public SolexaQualityConverter() { - for (int i = 0; i < SOLEXA_ADDEND; ++i) { - phredScore[i] = 0; - } - for (int i = SOLEXA_ADDEND; i < phredScore.length; ++i) { - phredScore[i] = decodeSolexaQualityToPhred(i); - } - } - - - /** Converts a solexa character quality into a phred numeric quality. */ - private byte decodeSolexaQualityToPhred(final int solexaQuality) { - return (byte) Math.round(10d * Math.log10(1d+Math.pow(10d, (solexaQuality - SOLEXA_ADDEND)/10d))); - } - - /** - * Convert a solexa quality ASCII character into a phred score. - */ - public byte solexaToPhred(final byte solexaQuality) { - return phredScore[solexaQuality]; - } - - /** - * @return a byte array that can be indexed by Solexa ASCII quality, with value - * of corresponding Phred score. Elements 0-63 are invalid because Solexa qualities - * should all be >= 64. Do not modify this array! - */ - public byte[] getSolexaToPhredConversionTable() { - return phredScore; - } -} diff --git a/java/lib/edu/mit/broad/picard/illumina/SquashedCoordinateMap.java b/java/lib/edu/mit/broad/picard/illumina/SquashedCoordinateMap.java deleted file mode 100644 index 92011add2..000000000 --- a/java/lib/edu/mit/broad/picard/illumina/SquashedCoordinateMap.java +++ /dev/null @@ -1,75 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.illumina; - -import edu.mit.broad.sam.util.CoordMath; -import edu.mit.broad.picard.cmdline.CommandLineUtils; - -import java.util.Map; -import java.util.HashMap; -import java.io.File; -import java.io.BufferedReader; -import java.io.IOException; - -public class SquashedCoordinateMap { - private final Map geraldToArachne = new HashMap(); - private long genomeSize; - - public SquashedCoordinateMap(final File squashedMapFile) { - try { - final BufferedReader in = CommandLineUtils.getReader(squashedMapFile); - String line; - genomeSize = 0; - - while ((line = in.readLine()) != null) { - final String[] fields = CommandLineUtils.SPACE_SPLITTER.split(line); - final String arachneIndex = fields[0].trim().intern(); - final String squashedRefIndex = fields[1].trim().intern(); - final long squashedStart = Long.parseLong(fields[2]); - final long length = Long.parseLong(fields[3]); - final String sequenceName = fields[4]; - - final SimpleMapping mapping = new SimpleMapping(squashedRefIndex, squashedStart, - CoordMath.getEnd(squashedStart, length), sequenceName); - geraldToArachne.put(mapping, arachneIndex); - - genomeSize += length; - } - - in.close(); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - /* Converts a read's mapping from Gerald's vretarded space to arachne index + coords. */ - public void convertToArachneCoords(final SimpleMapping read) { - if (this.geraldToArachne == null || this.geraldToArachne.isEmpty()) { - throw new IllegalStateException("Cannot invoke convertToArachneCoords before parseSquashedMapFile"); - } - - for (final Map.Entry entry : this.geraldToArachne.entrySet()) { - final SimpleMapping chunk = entry.getKey(); - if (chunk.intersects(read)) { - read.setArachneIndex(entry.getValue()); - read.setStartPos( read.getStartPos() - chunk.getStartPos() ); - read.setEndPos( read.getEndPos() - chunk.getStartPos() ); - read.setSequenceName(chunk.getSequenceName()); - return; - } - } - - throw new RuntimeException("Could not convert read: " + read); - } - - long getGenomeSize() { - return genomeSize; - } -} diff --git a/java/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java b/java/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java deleted file mode 100644 index 8bd01c755..000000000 --- a/java/lib/edu/mit/broad/picard/importer/genotype/BedFileReader.java +++ /dev/null @@ -1,82 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.importer.genotype; - -import java.io.Closeable; -import java.io.File; - -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.sam.util.BinaryCodec; - -/** - * - * - * @author Doug Voet - */ -public class BedFileReader implements Closeable { - private static final int LOWEST_2_BIT_MASK = 3; // binary 11 - private static final short BED_MAGIC_NUMBER = 7020; -// private static final short BED_MAGIC_NUMBER = Short.parseShort("0110110000011011", 2); - - public static final byte MODE_INDIVIDUAL_MAJOR = 0; - public static final byte MODE_SNP_MAJOR = 1; - - public static final byte GENOTYPE_AA = 0; // binary 00 - public static final byte GENOTYPE_NO_CALL = 1; // binary 01 - public static final byte GENOTYPE_AB = 2; // binary 10 - public static final byte GENOTYPE_BB = 3; // binary 11 - - private final byte mode; - private final BinaryCodec codec; - private byte currentBlock; - private int genotypeCount = 0; - - public BedFileReader(File bedFile) { - this.codec = new BinaryCodec(bedFile, false); - short fileMagicNumber = this.codec.readShort(); - if (fileMagicNumber != BED_MAGIC_NUMBER) { - this.codec.close(); - throw new PicardException("Given file [" + bedFile.getAbsolutePath() + - "] is not in bed file format... magic number does not match"); - } - this.mode = codec.readByte(); - } - - public byte getMode() { - return mode; - } - - @Override - public void close() { - this.codec.close(); - } - - public byte nextGenotype() { - // there are 4 genotypes per byte so get a new byte every 4 genotypes read - if (this.genotypeCount++ % 4 == 0) { - this.currentBlock = this.codec.readByte(); - } - - // the 2 lowest order bits of currentBlock are the next genotype, pop them off - byte genotype = (byte) (LOWEST_2_BIT_MASK & this.currentBlock); - this.currentBlock >>>= 2; - - return genotype; - } - - /** - * Call this method when moving on to the next individual (in indiv-major mode) or next - * snp (in snp-major mode). - */ - public void dropRemainingBlock() { - this.genotypeCount = 0; - } -} diff --git a/java/lib/edu/mit/broad/picard/importer/genotype/BedToGeli.java b/java/lib/edu/mit/broad/picard/importer/genotype/BedToGeli.java deleted file mode 100644 index 8a735207e..000000000 --- a/java/lib/edu/mit/broad/picard/importer/genotype/BedToGeli.java +++ /dev/null @@ -1,371 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.importer.genotype; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; - -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.genotype.GeliFileWriter; -import edu.mit.broad.picard.genotype.GenotypeLikelihoods; -import edu.mit.broad.picard.genotype.GenotypeLikelihoodsCodec; -import edu.mit.broad.picard.genotype.GenotypeLikelihoods.GenotypeLikelihoodsComparator; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.util.BasicTextFileParser; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMSequenceRecord; -import edu.mit.broad.sam.SAMTextHeaderCodec; -import edu.mit.broad.sam.util.AsciiLineReader; -import edu.mit.broad.sam.util.SortingCollection; - -/** - * Converts a BED/BIM/FAM file trio to a number of GELI files (1 per individual). - * BED files come in 2 formats, individual-major and snp-major. The former lists all SNPs for the - * first individual then all SNPs for the second individual, etc. The latter list all individuals - * for first SNP then all individuals for second SNP, etc. The order for snps is dictated by - * the bim file and the order for individuals is dictated by the fam file. - *

    - * See this page for details - * of the format. - * - * @author Doug Voet - */ -public class BedToGeli extends CommandLineProgram { - static final float LIKELIHOOD = 500; - private static final Log log = Log.getInstance(BedToGeli.class); - - @Usage(programVersion="1.0") - public final String USAGE = ""; - - @Option(doc="The bed file name.", mutex="BFILE") - public File BED; - - @Option(doc="The bim file name.", mutex="BFILE") - public File BIM; - - @Option(doc="The fam file name.", mutex="BFILE") - public File FAM; - - @Option(doc="The root file name of the bed, bim & fam files.", mutex={"BED", "BIM", "FAM"}) - public String BFILE; - - @Option(doc="The directory to write the output GELI files", shortName="D") - public File OUTPUT_DIR; - - @Option(doc="Set to 'true' if the family name should be included in the output file names, default false", - shortName="F", - optional=true) - public Boolean USE_FAMILY = Boolean.FALSE; - - @Option(doc="Name of file containing sequence dictionary to embed in new GELI files", - shortName="DICT") - public File SEQUENCE_DICTIONARY; - - private List snpCache; - private List geliFileNames; - private List sequenceDictionary; - private Map referenceIndexes; - - public static void main(String[] argv) { - System.exit(new BedToGeli().instanceMain(argv)); - } - - @Override - protected int doWork() { - populateFileNames(); - IoUtil.assertFileIsReadable(this.BED); - IoUtil.assertFileIsReadable(this.BIM); - IoUtil.assertFileIsReadable(this.FAM); - IoUtil.assertFileIsReadable(this.SEQUENCE_DICTIONARY); - IoUtil.assertDirectoryIsWritable(this.OUTPUT_DIR); - - populateSequenceDictionary(); - - BedFileReader bedReader = new BedFileReader(this.BED); - if (bedReader.getMode() == BedFileReader.MODE_INDIVIDUAL_MAJOR) { - log.debug("Detected BED file in individual-major mode"); - parseIndividualMajor(bedReader); - } else { - log.debug("Detected BED file in snp-major mode"); - parseSnpMajor(bedReader); - } - - return 0; - } - - /** - * loads the SEQUENCE_DICTIONARY file - */ - private void populateSequenceDictionary() { - try { - final SAMFileHeader header = new SAMTextHeaderCodec().decode(new AsciiLineReader(new FileInputStream(this.SEQUENCE_DICTIONARY)), null); - this.sequenceDictionary = header.getSequences(); - - this.referenceIndexes = new HashMap(); - for (byte i = 0; i < sequenceDictionary.size(); i++) { - this.referenceIndexes.put(sequenceDictionary.get(i).getSequenceName().intern(), i); - } - } catch (FileNotFoundException e) { - throw new PicardException("Unexpected exception", e); - } - } - - private void parseIndividualMajor(BedFileReader bedReader) { - cacheSnps(); - BasicTextFileParser famReader = new BasicTextFileParser(true, this.FAM); - for (String[] famFields : famReader) { - GeliFileWriter geliWriter = getGeliFileWriter(getGeliFileName(famFields[0], famFields[1]), false); - for (SNP snp : this.snpCache) { - GenotypeLikelihoods genotypeLikelihoods = constructGenotypeLikelihoods( - bedReader, snp); - if (genotypeLikelihoods != null) { - geliWriter.addGenotypeLikelihoods(genotypeLikelihoods); - } - } - bedReader.dropRemainingBlock(); - geliWriter.close(); - } - famReader.close(); - } - - /** - * @return null if for a no-call or the snp has no position on the genome - */ - private char[] getNextGenotype(BedFileReader bedReader, SNP snp) { - char[] genotype = new char[2]; - byte genotypeCode = bedReader.nextGenotype(); - if (snp == null) { - // unplaced marker... we need to read the genotype off the reader so we don't lose - // our place, but we cannot put the marker in the geli file. - return null; - } - switch (genotypeCode) { - case BedFileReader.GENOTYPE_AA: - genotype[0] = (char) snp.getAllele1(); - genotype[1] = (char) snp.getAllele1(); - break; - case BedFileReader.GENOTYPE_AB: - genotype[0] = (char) snp.getAllele1(); - genotype[1] = (char) snp.getAllele2(); - break; - case BedFileReader.GENOTYPE_BB: - genotype[0] = (char) snp.getAllele2(); - genotype[1] = (char) snp.getAllele2(); - break; - case BedFileReader.GENOTYPE_NO_CALL: - // don't record a genotype likelihood for a no call - return null; - default: - throw new PicardException("Unknown genotype code: " + Integer.toBinaryString(genotypeCode)); - } - return genotype; - } - - private void cacheSnps() { - BasicTextFileParser bimReader = null; - try { - bimReader = new BasicTextFileParser(true, this.BIM); - this.snpCache = new LinkedList(); - for (String[] bimFields : bimReader) { - SNP snp = constructSnp(bimFields); - snpCache.add(snp); - } - } finally { - try { - bimReader.close(); - } catch (Exception e) { - } - } - } - - private SNP constructSnp(String[] bimFields) { - byte referenceIndex = getReferenceIndex(bimFields[0]); - if (referenceIndex == -1) { - return null; - } - SNP snp = new SNP( - referenceIndex, - Integer.parseInt(bimFields[3]), - bimFields[4].toUpperCase().getBytes()[0], - bimFields[5].toUpperCase().getBytes()[0]); - return snp; - } - - /** - * determines the index in the sequence dictionary for the given chromosome - */ - private byte getReferenceIndex(String chromosome) { - final String referenceName; - int chromosomeNumber; - try { - chromosomeNumber = Integer.parseInt(chromosome); - } catch (NumberFormatException e) { - chromosomeNumber = -1; - } - - if (chromosomeNumber >= 1 && chromosomeNumber <= 22) { - referenceName = ("chr" + chromosome).intern(); - } else if (chromosomeNumber == 26 || chromosome.equalsIgnoreCase("MT")) { - referenceName = "chrM"; - } else if (chromosomeNumber == 23 || chromosomeNumber == 25 || - chromosome.equalsIgnoreCase("XY") || chromosome.equalsIgnoreCase("X")) { - referenceName = "chrX"; - } else if (chromosomeNumber == 24 || chromosome.equalsIgnoreCase("Y")) { - referenceName = "chrY"; - } else { - // unplaced marker - return -1; - } - - Byte referenceIndex = this.referenceIndexes.get(referenceName); - if (referenceIndex == null) { - throw new PicardException("Reference sequence [" + referenceName + "] not found in sequence dictionary"); - } - return referenceIndex; - } - - private void cacheGELIFileNames() { - BasicTextFileParser famReader = null; - try { - famReader = new BasicTextFileParser(true, this.FAM); - this.geliFileNames = new LinkedList(); - for (String[] fields : famReader) { - this.geliFileNames.add(getGeliFileName(fields[0], fields[1])); - } - } finally { - try { - famReader.close(); - } catch (Exception e) { - } - } - } - - private void parseSnpMajor(BedFileReader bedReader) { - cacheGELIFileNames(); - BasicTextFileParser bimReader = new BasicTextFileParser(true, this.BIM); - Map> likelihoodsByFile = - new HashMap>( - (int) Math.ceil(this.geliFileNames.size() * 1.34)); - - int maxRecordsInRam = calculateMaxRecordsInRam(); - for (String geliFileName : this.geliFileNames) { - likelihoodsByFile.put(geliFileName, SortingCollection.newInstance( - GenotypeLikelihoods.class, - new GenotypeLikelihoodsCodec(), - new GenotypeLikelihoodsComparator(), - maxRecordsInRam)); - } - - for (String[] bimFields : bimReader) { - for (String fileName : this.geliFileNames) { - SNP snp = constructSnp(bimFields); - GenotypeLikelihoods genotypeLikelihoods = constructGenotypeLikelihoods( - bedReader, snp); - if (genotypeLikelihoods != null) { - likelihoodsByFile.get(fileName).add(genotypeLikelihoods); - } - } - bedReader.dropRemainingBlock(); - } - bimReader.close(); - - writeGeliFiles(likelihoodsByFile); - } - - /** - * @return - */ - private int calculateMaxRecordsInRam() { - Runtime.getRuntime().gc(); - double memoryToUse = Runtime.getRuntime().maxMemory() * .8; // use up to 80% - int objectCountLimit = (int) (memoryToUse / GenotypeLikelihoods.OBJECT_SIZE_BYTES); - return objectCountLimit / this.geliFileNames.size(); - } - - /** - * @param likelihoodsByFile - */ - private void writeGeliFiles( - Map> likelihoodsByFile) { - - for (Map.Entry> entry : likelihoodsByFile.entrySet()) { - GeliFileWriter fileWriter = getGeliFileWriter(entry.getKey(), true); - for (GenotypeLikelihoods likelihoods : entry.getValue()) { - fileWriter.addGenotypeLikelihoods(likelihoods); - } - fileWriter.close(); - } - } - - private GeliFileWriter getGeliFileWriter( - String fileName, boolean presorted) { - File geliFile = new File(this.OUTPUT_DIR, fileName); - GeliFileWriter fileWriter = new GeliFileWriter(geliFile, presorted); - SAMFileHeader header = new SAMFileHeader(); - header.setAttribute(SAMFileHeader.VERSION_TAG, "1.0"); - header.setSequences(this.sequenceDictionary); - fileWriter.setHeader(header); - return fileWriter; - } - - /** - * @param bedReader - * @param snp - * @return - */ - private GenotypeLikelihoods constructGenotypeLikelihoods( - BedFileReader bedReader, SNP snp) { - char[] genotype = getNextGenotype(bedReader, snp); - if (genotype == null) { - // no call or unplaced marker - return null; - } - - GenotypeLikelihoods genotypeLikelihoods = new GenotypeLikelihoods(); - genotypeLikelihoods.setLikelihood( - GenotypeLikelihoods.getLikelihoodIndex(genotype), - LIKELIHOOD); - genotypeLikelihoods.setReferenceIndex(snp.getReferenceIndex()); - genotypeLikelihoods.setPosition(snp.getPosition()); - return genotypeLikelihoods; - } - - /** - * populates bed/bim/fam if bfile option is used - */ - private void populateFileNames() { - if (this.BFILE != null) { - this.BED = new File(this.BFILE + ".bed"); - this.BIM = new File(this.BFILE + ".bim"); - this.FAM = new File(this.BFILE + ".fam"); - } - } - - /** - * @return the appropriate name taking into account this.USE_FAMILY - */ - private String getGeliFileName(String family, String individual) { - StringBuilder fileName = new StringBuilder(individual).append(".geli"); - if (this.USE_FAMILY) { - fileName.insert(0, "_").insert(0, family); - } - return fileName.toString(); - } -} diff --git a/java/lib/edu/mit/broad/picard/importer/genotype/SNP.java b/java/lib/edu/mit/broad/picard/importer/genotype/SNP.java deleted file mode 100644 index d9fce76cf..000000000 --- a/java/lib/edu/mit/broad/picard/importer/genotype/SNP.java +++ /dev/null @@ -1,35 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.importer.genotype; - -/** - * data class for storing snp info - * - * @author Doug Voet - */ -public class SNP { - private final byte referenceIndex; - private final int position; - private final byte allele1; - private final byte allele2; - - public SNP(byte chromosome, int position, byte allele1, byte allele2) { - this.referenceIndex = chromosome; - this.position = position; - this.allele1 = allele1; - this.allele2 = allele2; - } - - public byte getReferenceIndex() { return referenceIndex; } - public int getPosition() { return position; } - public byte getAllele1() { return allele1; } - public byte getAllele2() { return allele2; } -} diff --git a/java/lib/edu/mit/broad/picard/io/IoUtil.java b/java/lib/edu/mit/broad/picard/io/IoUtil.java deleted file mode 100644 index 14688a7c1..000000000 --- a/java/lib/edu/mit/broad/picard/io/IoUtil.java +++ /dev/null @@ -1,183 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.io; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; - -import edu.mit.broad.picard.PicardException; - -/** - * A class for utility methods that wrap or aggregate functionality in Java IO. - * - * @author Tim Fennell - */ -public class IoUtil { - /** - * Checks that a file is non-null, exists, is not a directory and is readable. If any - * condition is false then a runtime exception is thrown. - * - * @param file the file to check for readability - */ - public static void assertFileIsReadable(File file) { - if (file == null) { - throw new IllegalArgumentException("Cannot check readability of null file."); - } else if (!file.exists()) { - throw new PicardException("Cannot read non-existent file: " + file.getAbsolutePath()); - } - else if (file.isDirectory()) { - throw new PicardException("Cannot read file because it is a directory: " + file.getAbsolutePath()); - } - else if (!file.canRead()) { - throw new PicardException("File exists but is not readable: " + file.getAbsolutePath()); - } - } - - /** - * Checks that a file is non-null, and is either extent and writable, or non-existent but - * that the parent directory exists and is writable. If any - * condition is false then a runtime exception is thrown. - * - * @param file the file to check for writability - */ - public static void assertFileIsWritable(File file) { - if (file == null) { - throw new IllegalArgumentException("Cannot check readability of null file."); - } else if (!file.exists()) { - // If the file doesn't exist, check that it's parent directory does and is writable - File parent = file.getAbsoluteFile().getParentFile(); - if (!parent.exists()) { - throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " + - "Neither file nor parent directory exist."); - } - else if (!parent.isDirectory()) { - throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " + - "File does not exist and parent is not a directory."); - } - else if (!parent.canWrite()) { - throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " + - "File does not exist and parent directory is not writable.."); - } - } - else if (file.isDirectory()) { - throw new PicardException("Cannot write file because it is a directory: " + file.getAbsolutePath()); - } - else if (!file.canWrite()) { - throw new PicardException("File exists but is not writable: " + file.getAbsolutePath()); - } - } - - /** - * Checks that a directory is non-null, extent, writable and a directory - * otherwise a runtime exception is thrown. - * - * @param dir the dir to check for writability - */ - public static void assertDirectoryIsWritable(File dir) { - if (dir == null) { - throw new IllegalArgumentException("Cannot check readability of null file."); - } - else if (!dir.exists()) { - throw new PicardException("Directory does not exist: " + dir.getAbsolutePath()); - } - else if (!dir.isDirectory()) { - throw new PicardException("Cannot write to directory because it is not a directory: " + dir.getAbsolutePath()); - } - else if (!dir.canWrite()) { - throw new PicardException("Directory exists but is not writable: " + dir.getAbsolutePath()); - } - } - - /** - * Opens a file for reading, decompressing it if necessary - * - * @param file The file to open - * @return the input stream to read from - */ - public static InputStream openFileForReading(File file) { - - try { - if (file.getName().endsWith(".gz") || - file.getName().endsWith(".bfq") || - file.getName().endsWith(".map")) { - return new GZIPInputStream(new FileInputStream(file)); - } - //TODO: Other compression formats - else { - return new FileInputStream(file); - } - } - catch (IOException ioe) { - throw new PicardException("File not found: " + file.getName(), ioe); - } - - } - - /** - * Opens a file for writing, overwriting the file if it already exists - * - * @param file the file to write to - * @return the output stream to write to - */ - public static OutputStream openFileForWriting(File file) { - return openFileForWriting(file, false); - } - - /** - * Opens a file for writing - * - * @param file the file to write to - * @param append whether to append to the file if it already exists (we overwrite it if false) - * @return the output stream to write to - */ - public static OutputStream openFileForWriting(File file, boolean append) { - - try { - if (file.getName().endsWith(".gz") || - file.getName().endsWith(".bfq") || - file.getName().endsWith(".map")) { - return new GZIPOutputStream(new FileOutputStream(file, append)); - } - //TODO: Other compression formats - else { - return new FileOutputStream(file, append); - } - } - catch (IOException ioe) { - throw new PicardException("Error opening file for writing: " + file.getName(), ioe); - } - } - - /** - * Utility method to copy the contents of input to output. The caller is responsible for - * opening and closing both streams. - * - * @param input contents to be copied - * @param output destination - */ - public static void copyStream(InputStream input, OutputStream output) { - try { - byte[] buffer = new byte[1024]; - int bytesRead = 0; - while((bytesRead = input.read(buffer)) > 0) { - output.write(buffer, 0, bytesRead); - } - } catch (IOException e) { - throw new PicardException("Exception copying stream", e); - } - } - -} diff --git a/java/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java b/java/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java deleted file mode 100644 index fa611ff09..000000000 --- a/java/lib/edu/mit/broad/picard/metrics/AggregateMetricCollector.java +++ /dev/null @@ -1,50 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.metrics; - -import edu.mit.broad.sam.SAMRecord; - -public class AggregateMetricCollector implements MetricCollector { - private final MetricCollector[] collectors; - - public AggregateMetricCollector(MetricCollector... collectors) { - if (collectors.length == 0) { - throw new IllegalArgumentException("Must supply at least one collector."); - } - this.collectors = collectors; - } - - @Override - public void addRecord(SAMRecord record) { - for (MetricCollector collector : this.collectors) { - collector.addRecord(record); - } - } - - @Override - public void onComplete() { - for (MetricCollector collector : this.collectors) { - collector.onComplete(); - } - } - - @Override - public void setMetrics(T metrics) { - for (MetricCollector collector : this.collectors) { - collector.setMetrics(metrics); - } - } - - @Override - public T getMetrics() { - return this.collectors[0].getMetrics(); - } -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/picard/metrics/Header.java b/java/lib/edu/mit/broad/picard/metrics/Header.java deleted file mode 100644 index 3ae8f2179..000000000 --- a/java/lib/edu/mit/broad/picard/metrics/Header.java +++ /dev/null @@ -1,17 +0,0 @@ -package edu.mit.broad.picard.metrics; - -/** - * A header for a metrics file. A header simply consists of a type and some arbitrary - * data, but must be able to turn itself into a String and parse it's data back out - * of that String at a later date. - * - * @author Tim Fennell - */ -public interface Header { - /** Converts the header to a String for persisting to a file. */ - public String toString(); - - /** Parses the data contained in the String version of the header. */ - public void parse(String in); - -} diff --git a/java/lib/edu/mit/broad/picard/metrics/MetricBase.java b/java/lib/edu/mit/broad/picard/metrics/MetricBase.java deleted file mode 100644 index 21c1226cd..000000000 --- a/java/lib/edu/mit/broad/picard/metrics/MetricBase.java +++ /dev/null @@ -1,77 +0,0 @@ -package edu.mit.broad.picard.metrics; - -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.util.FormatUtil; - -import java.lang.reflect.Field; - -/** - * A base class from which all Metric classes should inherit. - * - * @author Tim Fennell - */ -public class MetricBase { - /** - * An equals method that checks equality by asserting that the classes are of the exact - * same type and that all public fields are equal. - * - * @param o an instance to compare to - * @return true if they are equal, false otherwise - */ - public boolean equals(Object o) { - if (o == null) return false; - if (o.getClass() != getClass()) return false; - - // Loop through all the fields and check that they are either - // null in both objects or equal in both objects - for (Field f : getClass().getFields()) { - try { - Object lhs = f.get(this); - Object rhs = f.get(o); - - if (lhs == null) { - if (rhs == null) { - // keep going - } - else if (rhs != null) { - return false; - } - } - else { - if (lhs.equals(rhs)) { - // keep going - } - else { - return false; - } - } - } - catch (IllegalAccessException iae) { - throw new PicardException("Could not read field " + f.getName() + " from a " + getClass().getSimpleName()); - } - } - - // If we got this far all the fields are equal - return true; - } - - /** Converts the metric class to a human readable string. */ - public String toString() { - StringBuilder buffer = new StringBuilder(); - FormatUtil formatter = new FormatUtil(); - - for (Field f : getClass().getFields()) { - try { - buffer.append(f.getName()); - buffer.append("\t"); - buffer.append(formatter.format(f.get(this))); - buffer.append("\n"); - } - catch (IllegalAccessException iae) { - throw new PicardException("Could not read field " + f.getName() + " from a " + getClass().getSimpleName()); - } - } - - return buffer.toString(); - } -} diff --git a/java/lib/edu/mit/broad/picard/metrics/MetricCollector.java b/java/lib/edu/mit/broad/picard/metrics/MetricCollector.java deleted file mode 100644 index e84fed450..000000000 --- a/java/lib/edu/mit/broad/picard/metrics/MetricCollector.java +++ /dev/null @@ -1,24 +0,0 @@ -package edu.mit.broad.picard.metrics; - -import edu.mit.broad.sam.SAMRecord; - -/** - * Interface for objects that collect metrics about SAMRecords. - */ -public interface MetricCollector { - T getMetrics(); - - /** Called after collector is constructed to populate the metrics object. */ - void setMetrics(T metrics); - - /** - * Called when collection is complete. Implementations can do any calculations - * that must wait until all records are visited at this time. - */ - void onComplete(); - - /** - * Visitor method called to have the record considered by the collector. - */ - void addRecord(SAMRecord record); -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/picard/metrics/MetricsFile.java b/java/lib/edu/mit/broad/picard/metrics/MetricsFile.java deleted file mode 100644 index 72c6da842..000000000 --- a/java/lib/edu/mit/broad/picard/metrics/MetricsFile.java +++ /dev/null @@ -1,370 +0,0 @@ -package edu.mit.broad.picard.metrics; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.io.Reader; -import java.io.Writer; -import java.lang.reflect.Field; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.util.FormatUtil; -import edu.mit.broad.picard.util.Histogram; -import edu.mit.broad.picard.util.StringUtil; - -/** - * Contains a set of metrics that can be written to a file and parsed back - * again. The set of metrics is composed of zero or more instances of a class, - * BEAN, that extends {@link MetricBase} (all instances must be of the same type) - * and may optionally include a histogram of data. - * - * @author Tim Fennell - */ -public class MetricsFile { - public static final String MAJOR_HEADER_PREFIX = "## "; - public static final String MINOR_HEADER_PREFIX = "# "; - public static final String SEPARATOR = "\t"; - public static final String HISTO_HEADER = "## HISTOGRAM\t"; - public static final String METRIC_HEADER = "## METRICS CLASS\t"; - - private List

    headers = new ArrayList
    (); - private List metrics = new ArrayList(); - private Histogram histogram; - - /** Adds a header to the collection of metrics. */ - public void addHeader(Header h) { this.headers.add(h); } - - /** Returns the list of headers. */ - public List
    getHeaders() { return Collections.unmodifiableList(this.headers); } - - /** Adds a bean to the collection of metrics. */ - public void addMetric(BEAN bean) { this.metrics.add(bean); } - - /** Returns the list of headers. */ - public List getMetrics() { return Collections.unmodifiableList(this.metrics); } - - /** Returns the histogram contained in the metrics file if any. */ - public Histogram getHistogram() { return histogram; } - - /** Sets the histogram contained in the metrics file. */ - public void setHistogram(Histogram histogram) { this.histogram = histogram; } - - /** Returns the list of headers with the specified type. */ - public List
    getHeaders(Class type) { - List
    tmp = new ArrayList
    (); - for (Header h : this.headers) { - if (h.getClass().equals(type)) { - tmp.add(h); - } - } - - return tmp; - } - - /** - * Writes out the metrics file to the supplied file. The file is written out - * headers first, metrics second and histogram third. - * - * @param f a File into which to write the metrics - */ - public void write(File f) { - FileWriter w = null; - try { - w = new FileWriter(f); - write(w); - } - catch (IOException ioe) { - throw new PicardException("Could not write metrics to file: " + f.getAbsolutePath(), ioe); - } - finally { - if (w != null) { - try { - w.close(); - } catch (IOException e) { - } - } - } - } - - /** - * Writes out the metrics file to the supplied writer. The file is written out - * headers first, metrics second and histogram third. - * - * @param w a Writer into which to write the metrics - */ - public void write(Writer w) { - try { - FormatUtil formatter = new FormatUtil(); - BufferedWriter out = new BufferedWriter(w); - printHeaders(out); - out.newLine(); - - printBeanMetrics(out, formatter); - out.newLine(); - - printHistogram(out, formatter); - out.newLine(); - out.flush(); - } - catch (IOException ioe) { - throw new PicardException("Could not write metrics file.", ioe); - } - } - - /** Prints the headers into the provided PrintWriter. */ - private void printHeaders(BufferedWriter out) throws IOException { - for (Header h : this.headers) { - out.append(MAJOR_HEADER_PREFIX); - out.append(h.getClass().getName()); - out.newLine(); - out.append(MINOR_HEADER_PREFIX); - out.append(h.toString()); - out.newLine(); - } - } - - /** Prints each of the metrics entries into the provided PrintWriter. */ - private void printBeanMetrics(BufferedWriter out, FormatUtil formatter) throws IOException { - if (this.metrics.isEmpty()) { - return; - } - - // Write out a header row with the type of the metric class - out.append(METRIC_HEADER + getBeanType().getName()); - out.newLine(); - - // Write out the column headers - Field[] fields = getBeanType().getFields(); - final int fieldCount = fields.length; - - for (int i=0; i.Bin bin : this.histogram.values()) { - out.append(StringUtil.assertCharactersNotInString(formatter.format(bin.getId()), '\t', '\n')); - out.append(MetricsFile.SEPARATOR); - out.append(formatter.format(bin.getValue())); - out.newLine(); - } - } - } - - /** Gets the type of the metrics bean being used. */ - private Class getBeanType() { - if (this.metrics == null || this.metrics.isEmpty()) { - return null; - } else { - return this.metrics.get(0).getClass(); - } - } - - /** Reads the Metrics in from the given reader. */ - public void read(Reader r) { - BufferedReader in = new BufferedReader(r); - FormatUtil formatter = new FormatUtil(); - String line = null; - - try { - // First read the headers - Header header = null; - boolean inHeader = true; - while ((line = in.readLine()) != null && inHeader) { - line = line.trim(); - // A blank line signals the end of the headers, otherwise parse out - // the header types and values and build the headers. - if ("".equals(line)) { - inHeader = false; - } - else if (line.startsWith(MAJOR_HEADER_PREFIX)) { - if (header != null) { - throw new IllegalStateException("Consecutive header class lines encountered."); - } - - String className = line.substring(MAJOR_HEADER_PREFIX.length()).trim(); - try { - header = (Header) Class.forName(className).newInstance(); - } - catch (Exception e) { - throw new PicardException("Error load and/or instantiating an instance of " + className, e); - } - } - else if (line.startsWith(MINOR_HEADER_PREFIX)) { - if (header == null) { - throw new IllegalStateException("Header class must precede header value:" + line); - } - header.parse(line.substring(MINOR_HEADER_PREFIX.length())); - this.headers.add(header); - header = null; - } - else { - throw new PicardException("Illegal state. Found following string in metrics file header: " + line); - } - } - - // Then read the metrics if there are any - while (!line.startsWith(MAJOR_HEADER_PREFIX)) { - line = in.readLine().trim(); - } - if (line.startsWith(METRIC_HEADER)) { - // Get the metric class from the header - String className = line.split(SEPARATOR)[1]; - Class type = null; - try { - type = Class.forName(className); - } - catch (ClassNotFoundException cnfe) { - throw new PicardException("Could not locate class with name " + className, cnfe); - } - - // Read the next line with the column headers - String[] fieldNames = in.readLine().split(SEPARATOR); - Field[] fields = new Field[fieldNames.length]; - for (int i=0; i 0) { - value = formatter.parseObject(values[i], fields[i].getType()); - } - - try { fields[i].set(bean, value); } - catch (Exception e) { - throw new PicardException("Error setting field " + fields[i].getName() + - " on class of type " + type.getName(), e); - } - } - - this.metrics.add(bean); - } - } - } - - // Then read the histogram if it is present - while (line != null && !line.startsWith(MAJOR_HEADER_PREFIX)) { - line = in.readLine(); - } - if (line != null && line.startsWith(HISTO_HEADER)) { - // Get the key type of the histogram - String keyClassName = line.split(SEPARATOR)[1].trim(); - Class keyClass = null; - - try { keyClass = Class.forName(keyClassName); } - catch (ClassNotFoundException cnfe) { throw new PicardException("Could not load class with name " + keyClassName); } - - // Read the next line with the bin and value labels - String[] labels = in.readLine().split(SEPARATOR); - this.histogram = new Histogram(labels[0], labels[1]); - - // Read the entries in the histogram - while ((line = in.readLine()) != null && !"".equals(line)) { - String[] fields = line.trim().split(SEPARATOR); - HKEY key = (HKEY) formatter.parseObject(fields[0], keyClass); - double value = formatter.parseDouble(fields[1]); - this.histogram.increment(key, value); - } - } - } - catch (IOException ioe) { - throw new PicardException("Could not read metrics from reader.", ioe); - } - } - - /** Checks that the headers, metrics and histogram are all equal. */ - @Override - public boolean equals(Object o) { - if (getClass() != o.getClass()) { - return false; - } - MetricsFile that = (MetricsFile) o; - - if (!this.headers.equals(that.headers)) { - return false; - } - if (!this.metrics.equals(that.metrics)) { - return false; - } - if (this.histogram == null && that.histogram == null) { - return true; - } else if (this.histogram != null) { - return this.histogram.equals(that.histogram); - } else if (that.histogram != null) { - return that.histogram.equals(this.histogram); - } - - return true; - } -} diff --git a/java/lib/edu/mit/broad/picard/metrics/StringHeader.java b/java/lib/edu/mit/broad/picard/metrics/StringHeader.java deleted file mode 100644 index 6798def88..000000000 --- a/java/lib/edu/mit/broad/picard/metrics/StringHeader.java +++ /dev/null @@ -1,43 +0,0 @@ -package edu.mit.broad.picard.metrics; - -import edu.mit.broad.picard.util.StringUtil; - -/** - * A simple header who's data type is a single String. Should not be used for anything other - * than comments or descriptive text. - * - * @author Tim Fennell - */ -public class StringHeader implements Header { - private String value; - - /** Default constructor. */ - public StringHeader() {} - - /** Constructor that uses the supplied value as the value of the header. */ - public StringHeader(String value) { - setValue(value); - } - - public void parse(String in) { value = in.trim(); } - public String toString() { return value; } - - public String getValue() { return value; } - public void setValue(String value) { this.value = StringUtil.assertCharactersNotInString(value, '\n'); } - - /** Checks equality on the value of the header. */ - public boolean equals(Object o) { - if (o != null && o instanceof StringHeader) { - StringHeader that = (StringHeader) o; - if (this.value == null) { - return that.value == null; - } - else { - return this.value.equals(that.value); - } - } - else { - return false; - } - } -} diff --git a/java/lib/edu/mit/broad/picard/metrics/VersionHeader.java b/java/lib/edu/mit/broad/picard/metrics/VersionHeader.java deleted file mode 100644 index 665f39ecf..000000000 --- a/java/lib/edu/mit/broad/picard/metrics/VersionHeader.java +++ /dev/null @@ -1,50 +0,0 @@ -package edu.mit.broad.picard.metrics; - -import edu.mit.broad.picard.util.StringUtil; - -/** - * Header that stores information about the version of some piece of software or - * data used to create the metrics file. Payload consists of a name or description - * of the versioned item and a version string. - * - * @author Tim Fennell - */ -public class VersionHeader implements Header { - private String versionedItem; - private String versionString; - - public void parse(String in) { - String[] fields = in.split("\t"); - this.versionedItem = fields[0]; - this.versionString = fields[1]; - } - - public String toString() { - return this.versionedItem + "\t" + this.versionString; - } - - public String getVersionedItem() { return versionedItem; } - public void setVersionedItem(String versionedItem) { - this.versionedItem = StringUtil.assertCharactersNotInString(versionedItem, '\t', '\n'); - } - - public String getVersionString() { return versionString; } - public void setVersionString(String versionString) { - this.versionString = StringUtil.assertCharactersNotInString(versionString, '\t', '\n'); - } - - /** Equals method that checks that both the item and version string are equal. */ - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - VersionHeader that = (VersionHeader) o; - - if (versionString != null ? !versionString.equals(that.versionString) : that.versionString != null) - return false; - if (versionedItem != null ? !versionedItem.equals(that.versionedItem) : that.versionedItem != null) - return false; - - return true; - } -} diff --git a/java/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java b/java/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java deleted file mode 100644 index 9aa59618f..000000000 --- a/java/lib/edu/mit/broad/picard/quality/CalibrateQualityScores.java +++ /dev/null @@ -1,148 +0,0 @@ -package edu.mit.broad.picard.quality; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.reference.ReferenceSequenceFile; -import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; -import edu.mit.broad.picard.variation.DbSnpFileReader; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMFileWriter; -import edu.mit.broad.sam.SAMFileWriterFactory; -import edu.mit.broad.sam.SAMRecord; - -import java.io.File; -import java.io.PrintStream; - -/** - * Command line program to calibrate quality scores using alignment and dbsnp data. Calibrates - * qualities cycle by cycle and separately for reads one and two in a pair. Bases that fall - * within dbSNP loci are ignored otherwise the empircal mismatch rate is calculated for - * each quality at each cycle and used to calculate the calibrated quality value. - * - * @author Tim Fennell - */ -public class CalibrateQualityScores extends CommandLineProgram { - @Option(shortName="A", doc="A file of aligned reads in SAM or BAM format") - public File ALIGNED_SAM; - - @Option(shortName="I", doc="A SAM or BAM file to rewrite with calibrated qualities. If omitted ALIGNED_SAM is used.", optional=true) - public File INPUT; - - @Option(shortName="O", doc="The SAM or BAM file to write with updated qualities.") - public File OUTPUT; - - @Option(shortName="R", doc="Reference sequence file") - public File REFERENCE; - - @Option(shortName="SNP", doc="Binary file of dbSNP information", optional=true) - public File DBSNP_FILE; - - @Option(shortName="TABLE", doc="A file to output the calibration table(s) to.") - public File CALIBRATION_TABLE_OUT; - - @Option(doc="Optional limit to the number of aligned reads that should be procesed", optional=true) - public Integer READ_LIMIT = -1; - - /** Stock main method for a command line program. */ - public static void main(String[] argv) { - System.exit(new CalibrateQualityScores().instanceMain(argv)); - } - - /** - * Main method for the program. Checks that all input files are present and - * readable and that the output file can be written to. Then loads up all the - * data and calibrates the quality scores and proceeds to write an output file - * with calibrated quality scores instead of the input quality scores. - */ - protected int doWork() { - final Log log = Log.getInstance(getClass()); - - // Some quick parameter checking - if (INPUT == null) INPUT = ALIGNED_SAM; - - IoUtil.assertFileIsReadable(ALIGNED_SAM); - IoUtil.assertFileIsReadable(REFERENCE); - IoUtil.assertFileIsReadable(INPUT); - IoUtil.assertFileIsWritable(OUTPUT); - IoUtil.assertFileIsWritable(CALIBRATION_TABLE_OUT); - - log.info("Reading input files and calculating calibration matrices."); - - // Load things up and calculate the quality score calibrations - SAMFileReader sam = new SAMFileReader(ALIGNED_SAM); - ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE); - DbSnpFileReader dbsnp = null; - - if (DBSNP_FILE != null) { - IoUtil.assertFileIsReadable(DBSNP_FILE); - dbsnp = new DbSnpFileReader(DBSNP_FILE); - } - - QualityScoreCalibrator calibrator = new QualityScoreCalibrator(sam, ref, dbsnp); - calibrator.calibrate(READ_LIMIT); - - // Dump the calibration tables - log.info("Writing out calibration table."); - PrintStream stream = new PrintStream(IoUtil.openFileForWriting(CALIBRATION_TABLE_OUT)); - stream.println("Read 1 Calibration Table:"); - print(stream, calibrator.getRead1Matrix().getCalibratedQualities()); - - if (!calibrator.getRead2Matrix().isEmpty()) { - stream.println(); - stream.println("Read 2 Calibration Table:"); - print(stream, calibrator.getRead2Matrix().getCalibratedQualities()); - } - - // And then load up the input and rewrite with calibrated qualities - log.info("Writing file with calibrated qualities."); - SAMFileReader in = new SAMFileReader(INPUT); - SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT); - - for (SAMRecord rec : in) { - byte[] quals = rec.getBaseQualities(); - byte[] calibrated = new byte[quals.length]; - QualityScoreMatrix matrix = rec.getFirstOfPairFlag() ? calibrator.getRead1Matrix() : calibrator.getRead2Matrix(); - - for (int i=0; i samIterator = this.sam.iterator(); - SAMRecord read = samIterator.next(); - int readsProcessed = 0; - - // Quality score matrixes for reads 1 and 2 separately - this.read1Matrix = new QualityScoreMatrix(); - this.read2Matrix = new QualityScoreMatrix(); - - - refloop: while ((reference = this.ref.nextSequence()) != null) { - final byte[] refBases = reference.getBases(); - final BitSet snps = getDbSnpMask(reference); - - while (read != null && read.getReferenceIndex(header) == reference.getContigIndex()) { - if (!read.getReadUnmappedFlag() && !read.getNotPrimaryAlignmentFlag()) { - final QualityScoreMatrix matrix = read.getFirstOfPairFlag() ? this.read1Matrix : this.read2Matrix; - final byte[] readBases = read.getReadBases(); - final byte[] qualities = read.getBaseQualities(); - - for (AlignmentBlock block : read.getAlignmentBlocks()) { - final int readIndex = block.getReadStart() - 1; - final int refIndex = block.getReferenceStart() - 1; - final int length = block.getLength(); - - for (int i=0; i 0 && ++readsProcessed >= readLimit) { - break refloop; - } - } - - // Advance the sam iterator - if (samIterator.hasNext()) { - read = samIterator.next(); - } - else { - read = null; - } - } - } - - this.read1Matrix.computeCalibratedQualities(); - if (!this.read2Matrix.isEmpty()) this.read2Matrix.computeCalibratedQualities(); - } - - /** Gets the calibration matrix for the first read. */ - public QualityScoreMatrix getRead1Matrix() { return read1Matrix; } - - /** Gets the calibration matrix for the second read. May be empty if there was no second read data. */ - public QualityScoreMatrix getRead2Matrix() { return read2Matrix; } - - /** - * Returns a BitSet that denotes whether a dbSNP entry is present at each - * base in the reference sequence. The set is reference.length() + 1 so that - * it can be indexed by 1-based reference base. True means dbSNP present, - * false means no dbSNP present. - */ - private BitSet getDbSnpMask(ReferenceSequence reference) { - int index = reference.getContigIndex(); - BitSet bits = new BitSet(reference.length() + 1); - - /* Just return an all false bit set if we don't have dbsnp data. */ - if (this.dbsnp == null) { - return bits; - } - - /* Read off the next contig's worth of data. */ - while (this.dbsnp.hasNext()) { - KnownVariant variant = this.dbsnp.peek(); - - if (variant.getSequenceIndex() < index) { - this.dbsnp.next(); - } - else if (variant.getSequenceIndex() == index) { - variant = this.dbsnp.next(); - - for (int i=variant.getStartPos(); i<=variant.getEndPos(); ++i) { - bits.set(i, true); - } - } - else { - break; - } - } - - return bits; - } -} diff --git a/java/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java b/java/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java deleted file mode 100644 index c5c1674c6..000000000 --- a/java/lib/edu/mit/broad/picard/quality/QualityScoreMatrix.java +++ /dev/null @@ -1,133 +0,0 @@ -package edu.mit.broad.picard.quality; - -import edu.mit.broad.picard.util.Histogram; - -import java.util.TreeMap; -import java.util.Map; -import java.util.SortedMap; - -/** - *

    Holds all the information necessary to perform quality score calibration for a single - * end/read for a lane or run of sequencing. General usage is to construct an instance - * an call {@link #addObservation(int, int, boolean)} repeatedly and when all input data - * is consumed call {@link #computeCalibratedQualities()}.

    - * - *

    Once this is done then {@link #getCalibratedQualities()} can be called to get a matrix - * of quality score calibrations by cycle and input quality. However it is preferred to call - * {@link #getCalibratedQuality(int, int)} which will attempt to infer the correct value in the - * case that the input quality was not observed in the training data.

    - * - * @author Tim Fennell - */ -public class QualityScoreMatrix { - // Maps by cycle, histograms by quality - private SortedMap> observations = new TreeMap>(); - private SortedMap> errors = new TreeMap>(); - - private int[][] calibratedQualities = null; - - /** - * Adds an observation to the matrix. - * @param cycle the cycle in the read (1-based) - * @param quality the uncalibrated quality - * @param error true if the base did not match the reference, false otherwise - */ - public void addObservation(int cycle, int quality, boolean error) { - Histogram obs = this.observations.get(cycle); - if (obs == null) { - obs = new Histogram(); - this.observations.put(cycle, obs); - } - obs.increment(quality); - - if (error) { - Histogram errs = this.errors.get(cycle); - if (errs == null) { - errs = new Histogram(); - this.errors.put(cycle, errs); - } - errs.increment(quality); - } - } - - /** - * Takes the input observations so far and builds a matrix of input cycle and - * uncalibrated quality to calibrated quality value. - */ - public void computeCalibratedQualities() { - this.calibratedQualities = new int[this.observations.lastKey() + 1][]; - - for (int cycle=1; cycle obs = this.observations.get(cycle); - Histogram err = this.errors.get(cycle); - - this.calibratedQualities[cycle] = new int[obs.lastKey() + 1]; - - for (Integer qual : obs.keySet()) { - double o = obs.get(qual).getValue(); - Histogram.Bin errBin = err.get(qual); - double e = (errBin == null) ? 1 : errBin.getValue(); - - this.calibratedQualities[cycle][qual] = computePhredScore(e, o); - } - } - } - - /** - * Returns the set of calibrated quality scores from the training data. The array is - * indexed first by the cycle (1-based, index 0 is empty) and then by input quality - * (again, the actualy quality, not shifted). - * - * @return an array of calibrated qualities for the read - */ - public int[][] getCalibratedQualities() { - return calibratedQualities; - } - - /** - * Accesses the calibrated quality for the given input cycle and quality. If the quality - * is outside the range given in the training data then the upper or lower bound of - * the calibrated qualities is used instead. - * - * @param cycle the input cycle (1-based) - * @param quality the uncalibrated quality - * @return the calibrated quality for the cycle and uncalibrated quality - */ - public final int getCalibratedQuality(int cycle, int quality) { - final int[] quals = this.calibratedQualities[cycle]; - - // TODO: proper iterpolation where we don't have the right quality - try { - int retval = quals[quality]; - - // If we didn't calibrate this quality value, search up and down for non-zero - for (int i=quality; i>0 && retval == 0; --i) { - if (quals[i] != 0) retval = quals[i]; - } - - for (int i=quality; i sequenceDictionary; - private String cachedLine = null; - private int index = -1; - - /** Constructs a FastaSequenceFile that reads from the specified file. */ - FastaSequenceFile(File file) { - this.file = file; - this.in = new BufferedReader(new InputStreamReader(IoUtil.openFileForReading(file))); - - // Try and locate the dictionary - String dictionaryName = file.getAbsolutePath(); - dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta")); - dictionaryName += ".dict"; - File dictionary = new File(dictionaryName); - if (dictionary.exists()) { - IoUtil.assertFileIsReadable(dictionary); - - try { - SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); - SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary); - if (header.getSequences() != null && header.getSequences().size() > 0) { - this.sequenceDictionary = header.getSequences(); - } - } - catch (Exception e) { - throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e); - } - } - } - - /** - * Returns the list of sequence records associated with the reference sequence if found - * otherwise null. - */ - public List getSequenceDictionary() { - return this.sequenceDictionary; - } - - public ReferenceSequence nextSequence() { - String line = null; - String name = null; - - // Scan forward to a header line - while ((line = readNextLine()) != null) { - if (line.startsWith(">")) { - name = line.substring(1).trim(); - this.index += 1; - break; - } - } - - // No more! - if (name == null) return null; - - // Read the sequence - int basesRead = 0; - byte[] bases = new byte[250000000]; // big enough to hold human chr1! - while ((line = readNextLine()) != null) { - if (line.startsWith(">")) { - pushBackLine(line); - break; - } - else { - final byte[] nextBases = line.getBytes(ASCII); - final int lineLength = nextBases.length; - - // If the array isn't big enough to hold the next chunk, resize it - if (basesRead + lineLength > bases.length) { - byte[] tmp = new byte[bases.length * 2]; - System.arraycopy(bases, 0, tmp, 0, basesRead); - bases = tmp; - } - - // Now shunt the most recent bases onto the end of the array - System.arraycopy(nextBases, 0, bases, basesRead, lineLength); - basesRead += lineLength; - } - } - - // And lastly resize the array down to the right size - if (basesRead != bases.length) { - byte[] tmp = new byte[basesRead]; - System.arraycopy(bases, 0, tmp, 0, basesRead); - bases = tmp; - } - - return new ReferenceSequence(name, this.index, bases); - } - - /** - * Reads the next line from the file, or if we've saved a line earlier, returns that - * instead. - */ - private String readNextLine() { - // If we have a cached line use it - if (this.cachedLine != null) { - String tmp = this.cachedLine; - this.cachedLine = null; - return tmp; - } - else { - try { return this.in.readLine(); } - catch (IOException ioe) { - throw new PicardException("Error reading line from file: " + this.file.getAbsolutePath(), ioe); - } - } - } - - /** Pushed a line back so that the next call to readNextLine() will return it. */ - private void pushBackLine(String line) { - this.cachedLine = line; - } -} - diff --git a/java/lib/edu/mit/broad/picard/reference/ReferenceSequence.java b/java/lib/edu/mit/broad/picard/reference/ReferenceSequence.java deleted file mode 100644 index 24aebc7b5..000000000 --- a/java/lib/edu/mit/broad/picard/reference/ReferenceSequence.java +++ /dev/null @@ -1,48 +0,0 @@ -package edu.mit.broad.picard.reference; - -/** - * Wrapper around a reference sequence that has been read from a reference file. - * - * @author Tim Fennell - */ -public class ReferenceSequence { - private String name; - private byte[] bases; - private int contigIndex; - private int length; - - /** - * Package level constructor that creates a fully formed ReferenceSequence - * - * @param name the name of the sequence from the source file - * @param index the zero based index of this contig in the source file - * @param bases the bases themselves stored as one-byte characters - */ - ReferenceSequence(String name, int index, byte[] bases) { - this.name = name; - this.contigIndex = index; - this.bases = bases; - this.length = bases.length; - } - - /** Gets the set of names given to this sequence in the source file. */ - public String getName() { return name; } - - /** - * Gets the array of bases that define this sequence. The bases can include any - * letter and possibly include masking information in the form of lower case - * letters. This array is mutable (obviously!) and it NOT a clone of the array - * held interally. Do not modify it!!! - */ - public byte[] getBases() { return bases; } - - /** Gets the 0-based index of this contig in the source file from which it came. */ - public int getContigIndex() { return contigIndex; } - - /** Gets the length of this reference sequence in bases. */ - public int length() { return length; } - - public String toString() { - return "ReferenceSequence " + getName(); - } -} diff --git a/java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFile.java b/java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFile.java deleted file mode 100644 index 34accc3f6..000000000 --- a/java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFile.java +++ /dev/null @@ -1,29 +0,0 @@ -package edu.mit.broad.picard.reference; - -import edu.mit.broad.sam.SAMSequenceRecord; - -import java.util.List; - -/** - * An interface for working with files of reference sequences regardless of the file format - * being used. - * - * @author Tim Fennell - */ -public interface ReferenceSequenceFile { - - /** - * Must return a sequence dictionary with at least the following fields completed - * for each sequence: name, length. - * - * @return a list of sequence records representing the sequences in this reference file - */ - public List getSequenceDictionary(); - - /** - * Retrieves the next whole sequences from the file. - * @return a ReferenceSequence or null if at the end of the file - */ - public ReferenceSequence nextSequence(); - -} diff --git a/java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java b/java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java deleted file mode 100644 index 57b5907d1..000000000 --- a/java/lib/edu/mit/broad/picard/reference/ReferenceSequenceFileFactory.java +++ /dev/null @@ -1,28 +0,0 @@ -package edu.mit.broad.picard.reference; - -import java.io.File; - -/** - * Factory class for creating ReferenceSequenceFile instances for reading reference - * sequences store in various formats. - * - * @author Tim Fennell - */ -public class ReferenceSequenceFileFactory { - - /** - * Attempts to determine the type of the reference file and return an instance - * of ReferenceSequenceFile that is appropriate to read it. - * - * @param file the reference sequence file on disk - */ - public static ReferenceSequenceFile getReferenceSequenceFile(File file) { - String name = file.getName(); - if (name.endsWith(".fasta") || name.endsWith("fasta.gz") || name.endsWith(".txt") || name.endsWith(".txt.gz")) { - return new FastaSequenceFile(file); - } - else { - throw new IllegalArgumentException("File is not a supported reference file type: " + file.getAbsolutePath()); - } - } -} diff --git a/java/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java b/java/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java deleted file mode 100644 index a3bc8fed8..000000000 --- a/java/lib/edu/mit/broad/picard/sam/CollectAlignmentSummaryMetrics.java +++ /dev/null @@ -1,352 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ - -package edu.mit.broad.picard.sam; - -import java.io.File; - -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.metrics.AggregateMetricCollector; -import edu.mit.broad.picard.metrics.MetricBase; -import edu.mit.broad.picard.metrics.MetricCollector; -import edu.mit.broad.picard.metrics.MetricsFile; -import edu.mit.broad.picard.metrics.StringHeader; -import edu.mit.broad.picard.reference.ReferenceSequence; -import edu.mit.broad.picard.reference.ReferenceSequenceFile; -import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; -import edu.mit.broad.picard.sam.CollectAlignmentSummaryMetrics.AlignmentSummaryMetrics.Type; -import edu.mit.broad.picard.util.CoordMath; -import edu.mit.broad.picard.util.Histogram; -import edu.mit.broad.picard.util.SequenceUtil; -import edu.mit.broad.sam.AlignmentBlock; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.util.CloseableIterator; - -/** - * A command line tool to read a BAM file and produce standard alignment metrics that would be applicable to any alignment. - * Metrics to include, but not limited to: - *
      - *
    • Total number of reads (total, period, no exclusions)
    • - *
    • Total number of PF reads (PF == does not fail vendor check flag)
    • - *
    • Number of PF noise reads (does not fail vendor check and has noise attr set)
    • - *
    • Total aligned PF reads (any PF read that has a sequence and position)
    • - *
    • High quality aligned PF reads (high quality == mapping quality >= 20)
    • - *
    • High quality aligned PF bases (actual aligned bases, calculate off alignment blocks)
    • - *
    • High quality aligned PF Q20 bases (subset of above where base quality >= 20)
    • - *
    • Median mismatches in HQ aligned PF reads (how many aligned bases != ref on average)
    • - *
    • Reads aligned in pairs (vs. reads aligned with mate unaligned/not present)
    • - *
    • Read length (how to handle mixed lengths?)
    • - *
    • Bad Cycles - how many machine cycles yielded combined no-call and mismatch rates of >= 80%
    • - *
    • Strand balance - reads mapped to positive strand / total mapped reads
    • - *
    - * Metrics are written for the first read of a pair, the second read, and combined for the pair. - * - * @author Doug Voet - */ -public class CollectAlignmentSummaryMetrics extends CommandLineProgram { - private static final int MAPPING_QUALITY_THRESHOLD = 20; - private static final int BASE_QUALITY_THRESHOLD = 20; - - // Usage and parameters - @Usage(programVersion="1.0") - public String USAGE = "Reads a SAM or BAM file and writes a file containing summary metrics.\n"; - @Option(shortName="I", doc="SAM or BAM file") public File INPUT; - @Option(shortName="O", doc="File to write insert size metrics to") public File OUTPUT; - @Option(shortName="R", doc="Reference sequence file") public File REFERENCE; - @Option(doc="If true (default), \"unsorted\" SAM/BAM files will be considerd coordinate sorted") - public Boolean ASSUME_COODINATE_SORTED = Boolean.TRUE; - - private ReferenceSequenceFile ref; - private ReferenceSequence refSequence; - private SAMFileHeader samFileHeader; - - /** Required main method implementation. */ - public static void main(String[] argv) { - System.exit(new CollectAlignmentSummaryMetrics().instanceMain(argv)); - } - - @Override - protected int doWork() { - IoUtil.assertFileIsReadable(INPUT); - IoUtil.assertFileIsReadable(REFERENCE); - IoUtil.assertFileIsWritable(OUTPUT); - SAMFileReader in = new SAMFileReader(INPUT); - assertCoordinateSortOrder(in); - - this.ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE); - this.samFileHeader = in.getFileHeader(); - - MetricsFile> file = collectMetrics(in.iterator()); - in.close(); - - file.write(OUTPUT); - - return 0; - } - - private void assertCoordinateSortOrder(SAMFileReader in) { - switch (in.getFileHeader().getSortOrder()) { - case coordinate: - break; - case unsorted: - if (this.ASSUME_COODINATE_SORTED) { - break; - } - default: - throw new PicardException("Cannot collect summary statistics in file " + INPUT.getAbsoluteFile() + - " because it is not sorted in coordinate order."); - } - } - - private ReferenceSequence getReference(SAMRecord record) { - while (refSequence == null || - record.getReferenceIndex(samFileHeader) > refSequence.getContigIndex()) { - - refSequence = ref.nextSequence(); - } - - if (refSequence == null || record.getReferenceIndex() != refSequence.getContigIndex()) { - throw new PicardException("Cannot find reference sequence [" + - record.getReferenceIndex() + "] in reference file"); - } - - return refSequence; - } - - /** - * Does all the work of iterating through the sam file and collecting summary alignment metrics. - */ - private MetricsFile> collectMetrics( - CloseableIterator samIterator) { - - final MetricCollector unpairedCollector = - constructCollector(Type.UNPAIRED); - final MetricCollector firstOfPairCollector = - constructCollector(Type.FIRST_OF_PAIR); - final MetricCollector secondOfPairCollector = - constructCollector(Type.SECOND_OF_PAIR); - final MetricCollector pairCollector = - constructCollector(Type.PAIR); - - while (samIterator.hasNext()) { - SAMRecord record = samIterator.next(); - - if (record.getReadPairedFlag()) { - if (record.getFirstOfPairFlag()) { - firstOfPairCollector.addRecord(record); - } else { - secondOfPairCollector.addRecord(record); - } - pairCollector.addRecord(record); - } else { - unpairedCollector.addRecord(record); - } - } - - firstOfPairCollector.onComplete(); - secondOfPairCollector.onComplete(); - pairCollector.onComplete(); - unpairedCollector.onComplete(); - - MetricsFile> file = getMetricsFile(); - file.addHeader(new StringHeader("Input file: " + INPUT.getAbsolutePath())); - file.addHeader(new StringHeader("Output file: " + OUTPUT.getAbsolutePath())); - file.addHeader(new StringHeader("Reference file: " + REFERENCE.getAbsolutePath())); - - if (firstOfPairCollector.getMetrics().TOTAL_READS > 0) { - file.addMetric(firstOfPairCollector.getMetrics()); - // override how bad cycle is determined for paired reads, it should be - // the sum of first and second reads - pairCollector.getMetrics().BAD_CYCLES = - firstOfPairCollector.getMetrics().BAD_CYCLES + - secondOfPairCollector.getMetrics().BAD_CYCLES; - file.addMetric(secondOfPairCollector.getMetrics()); - file.addMetric(pairCollector.getMetrics()); - } - if (unpairedCollector.getMetrics().TOTAL_READS > 0) { - file.addMetric(unpairedCollector.getMetrics()); - } - - return file; - } - - private MetricCollector constructCollector(Type type) { - MetricCollector collector = - new AggregateMetricCollector(new ReadCounter(), new QualityMappingCounter()); - collector.setMetrics(new AlignmentSummaryMetrics()); - collector.getMetrics().TYPE = type; - return collector; - } - - public static class AlignmentSummaryMetrics extends MetricBase { - public enum Type { UNPAIRED, FIRST_OF_PAIR, SECOND_OF_PAIR, PAIR } - public Type TYPE; - public long TOTAL_READS; - public long PF_READS; - public long PF_NOISE_READS; - public long PF_READS_ALIGNED; - public long PF_HQ_ALIGNED_READS; - public long PF_HQ_ALIGNED_BASES; - public long PF_HQ_ALIGNED_Q20_BASES; - public double PF_HQ_MEDIAN_MISMATCHES; - public double MEAN_READ_LENGTH; - public long READS_ALIGNED_IN_PAIRS; - public long BAD_CYCLES; - public double STRAND_BALANCE; - } - - /** counts reads that match various conditions */ - private class ReadCounter implements MetricCollector { - private long numPositiveStrand = 0; - private Histogram readLengthHistogram = new Histogram(); - private AlignmentSummaryMetrics metrics; - - @Override - public void addRecord(SAMRecord record) { - if (record.getNotPrimaryAlignmentFlag()) { - // only want 1 count per read so skip non primary alignments - return; - } - - metrics.TOTAL_READS++; - readLengthHistogram.increment(record.getReadBases().length); - - if (!record.getReadFailsVendorQualityCheckFlag()) { - metrics.PF_READS++; - - if (isNoiseRead(record)) { - metrics.PF_NOISE_READS++; - } - if (!record.getReadUnmappedFlag()) { - metrics.PF_READS_ALIGNED++; - } - } - - if (!record.getReadUnmappedFlag() && - record.getReadPairedFlag() && - !record.getMateUnmappedFlag()) { - metrics.READS_ALIGNED_IN_PAIRS++; - } - - if (!record.getReadNegativeStrandFlag()) { - numPositiveStrand++; - } - } - - @Override - public void onComplete() { - metrics.MEAN_READ_LENGTH = readLengthHistogram.getMean(); - metrics.STRAND_BALANCE = numPositiveStrand / (double) metrics.TOTAL_READS; - } - - private boolean isNoiseRead(SAMRecord record) { - final Object noiseAttribute = record.getAttribute(ReservedTagConstants.XN); - return (noiseAttribute != null && noiseAttribute.equals(1)); - } - - @Override - public void setMetrics(AlignmentSummaryMetrics metrics) { - this.metrics = metrics; - } - - @Override - public AlignmentSummaryMetrics getMetrics() { - return this.metrics; - } - } - - /** counts quality mappings & base calls that match various conditions */ - private class QualityMappingCounter implements MetricCollector { - private Histogram mismatchHistogram = new Histogram(); - private Histogram badCycleHistogram = new Histogram(); - private AlignmentSummaryMetrics metrics; - - @Override - public void addRecord(SAMRecord record) { - if (record.getNotPrimaryAlignmentFlag()) { - return; - } - if (record.getReadUnmappedFlag()) { - final byte[] readBases = record.getReadBases(); - for (int i = 0; i < readBases.length; i++) { - if (SequenceUtil.isNoCall(readBases[i])) { - badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); - } - } - } else { - boolean highQualityMapping = isHighQualityMapping(record); - if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++; - - final byte[] readBases = record.getReadBases(); - final byte[] refBases = getReference(record).getBases(); - final byte[] qualities = record.getBaseQualities(); - long mismatchCount = 0; - - for (AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) { - final int readIndex = alignmentBlock.getReadStart() - 1; - final int refIndex = alignmentBlock.getReferenceStart() - 1; - final int length = alignmentBlock.getLength(); - if (highQualityMapping) metrics.PF_HQ_ALIGNED_BASES += alignmentBlock.getLength(); - - for (int i=0; i= BASE_QUALITY_THRESHOLD) { - metrics.PF_HQ_ALIGNED_Q20_BASES++; - } - if (mismatch) { - mismatchCount++; - } - } - if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) { - badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i)); - } - } - } - mismatchHistogram.increment(mismatchCount); - } - } - - private boolean isHighQualityMapping(SAMRecord record) { - return !record.getReadFailsVendorQualityCheckFlag() && - record.getMappingQuality() >= MAPPING_QUALITY_THRESHOLD; - } - - @Override - public void onComplete() { - metrics.PF_HQ_MEDIAN_MISMATCHES = mismatchHistogram.getMedian(); - metrics.BAD_CYCLES = 0; - - for (Histogram.Bin cycleBin : badCycleHistogram.values()) { - double badCyclePercentage = cycleBin.getValue() / metrics.TOTAL_READS; - if (badCyclePercentage >= .8) { - metrics.BAD_CYCLES++; - } - } - } - - @Override - public void setMetrics(AlignmentSummaryMetrics metrics) { - this.metrics = metrics; - } - - @Override - public AlignmentSummaryMetrics getMetrics() { - return this.metrics; - } - } -} diff --git a/java/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java b/java/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java deleted file mode 100644 index c25d88cc9..000000000 --- a/java/lib/edu/mit/broad/picard/sam/CollectInsertSizeMetrics.java +++ /dev/null @@ -1,154 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.sam; - -import java.io.File; - -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.metrics.MetricsFile; -import edu.mit.broad.picard.util.Histogram; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.picard.util.RExecutor; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.util.CloseableIterator; - -/** - * Command line program to read non-duplicate insert sizes, create a histogram - * and report distribution statistics. - * - * @author Doug Voet - */ -public class CollectInsertSizeMetrics extends CommandLineProgram { - private static Log log = Log.getInstance(CollectInsertSizeMetrics.class); - private static final String HISTOGRAM_R_SCRIPT = "edu/mit/broad/picard/sam/insertSizeHistogram.R"; - // Usage and parameters - @Usage(programVersion="1.0") - public String USAGE = "Reads a SAM or BAM file and writes a file containing metrics about " + - "the statistical distribution of insert size (excluding duplicates) " + - "and generates a histogram plot.\n"; - @Option(shortName="I", doc="SAM or BAM file") public File INPUT; - @Option(shortName="O", doc="File to write insert size metrics to") public File OUTPUT; - @Option(shortName="H", doc="File to write insert size histogram chart to") public File HISTOGRAM_FILE; - - /** Required main method implementation. */ - public static void main(String[] argv) { - System.exit(new CollectInsertSizeMetrics().instanceMain(argv)); - } - - @Override - protected int doWork() { - IoUtil.assertFileIsReadable(INPUT); - IoUtil.assertFileIsWritable(OUTPUT); - IoUtil.assertFileIsWritable(HISTOGRAM_FILE); - - SAMFileReader in = new SAMFileReader(INPUT); - MetricsFile file = collectMetrics(in.iterator()); - in.close(); - - file.write(OUTPUT); - - if (file.getMetrics().get(0).READ_PAIRS == 0) { - log.warn("Input file did not contain any records with insert size information."); - } else { - int rResult = RExecutor.executeFromClasspath( - HISTOGRAM_R_SCRIPT, - OUTPUT.getAbsolutePath(), - HISTOGRAM_FILE.getAbsolutePath(), - INPUT.getName()); - - if (rResult != 0) { - throw new PicardException("R script " + HISTOGRAM_R_SCRIPT + " failed with return code " + rResult); - } - } - - return 0; - } - - /** - * Does all the work of iterating through the sam file and collecting insert size metrics. - */ - MetricsFile collectMetrics(CloseableIterator samIterator) { - Histogram insertSizeHistogram = new Histogram("insert_size", "count"); - while (samIterator.hasNext()) { - SAMRecord record = samIterator.next(); - if (skipRecord(record)) { - continue; - } - - int insertSize = Math.abs(record.getInferredInsertSize()); - insertSizeHistogram.increment(insertSize); - } - - MetricsFile file = new MetricsFile(); - file.setHistogram(insertSizeHistogram); - InsertSizeMetrics metrics = new InsertSizeMetrics(); - metrics.READ_PAIRS = (long) insertSizeHistogram.getCount(); - metrics.MAX_INSERT_SIZE = (int) insertSizeHistogram.getMax(); - metrics.MIN_INSERT_SIZE = (int) insertSizeHistogram.getMin(); - metrics.MEAN_INSERT_SIZE = insertSizeHistogram.getMean(); - metrics.STANDARD_DEVIATION = insertSizeHistogram.getStandardDeviation(); - metrics.MEDIAN_INSERT_SIZE = insertSizeHistogram.getMedian(); - - final double total = insertSizeHistogram.getCount(); - final double median = insertSizeHistogram.getMedian(); - double covered = 0; - double low = median; - double high = median; - - while (low >= insertSizeHistogram.getMin() || high <= insertSizeHistogram.getMax()) { - Histogram.Bin lowBin = insertSizeHistogram.get((int) low); - if (lowBin != null) covered += lowBin.getValue(); - - if (low != high) { - Histogram.Bin highBin = insertSizeHistogram.get((int) high); - if (highBin != null) covered += highBin.getValue(); - } - - double percentCovered = covered / total; - int distance = (int) (high - low) + 1; - if (percentCovered >= 0.1 && metrics.WIDTH_OF_10_PERCENT == 0) metrics.WIDTH_OF_10_PERCENT = distance; - if (percentCovered >= 0.2 && metrics.WIDTH_OF_20_PERCENT == 0) metrics.WIDTH_OF_20_PERCENT = distance; - if (percentCovered >= 0.3 && metrics.WIDTH_OF_30_PERCENT == 0) metrics.WIDTH_OF_30_PERCENT = distance; - if (percentCovered >= 0.4 && metrics.WIDTH_OF_40_PERCENT == 0) metrics.WIDTH_OF_40_PERCENT = distance; - if (percentCovered >= 0.5 && metrics.WIDTH_OF_50_PERCENT == 0) metrics.WIDTH_OF_50_PERCENT = distance; - if (percentCovered >= 0.6 && metrics.WIDTH_OF_60_PERCENT == 0) metrics.WIDTH_OF_60_PERCENT = distance; - if (percentCovered >= 0.7 && metrics.WIDTH_OF_70_PERCENT == 0) metrics.WIDTH_OF_70_PERCENT = distance; - if (percentCovered >= 0.8 && metrics.WIDTH_OF_80_PERCENT == 0) metrics.WIDTH_OF_80_PERCENT = distance; - if (percentCovered >= 0.9 && metrics.WIDTH_OF_90_PERCENT == 0) metrics.WIDTH_OF_90_PERCENT = distance; - if (percentCovered >= 0.99 && metrics.WIDTH_OF_99_PERCENT == 0) metrics.WIDTH_OF_99_PERCENT = distance; - - --low; - ++high; - } - - file.addMetric(metrics); - - return file; - } - - /** - * Figures out whether or not the record should be included in the counting of insert sizes - */ - private boolean skipRecord(SAMRecord record) { - return !record.getReadPairedFlag() || - record.getMateUnmappedFlag() || - record.getFirstOfPairFlag() || - record.getNotPrimaryAlignmentFlag() || - record.getDuplicateReadFlag() || - record.getInferredInsertSize() == 0; - } - -} diff --git a/java/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java b/java/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java deleted file mode 100644 index 819811720..000000000 --- a/java/lib/edu/mit/broad/picard/sam/ComparableSamRecordIterator.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright Jan 22, 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.picard.sam; - -import edu.mit.broad.picard.util.PeekableIterator; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMRecord; - -import java.util.Comparator; - -/** - * Iterator for SAM records that implements comparable to enable sorting of iterators. - * The comparison is performed by comparing the next record in the iterator to the next - * record in another iterator and returning the ordering between those SAM records. - */ -class ComparableSamRecordIterator extends PeekableIterator implements Comparable { - private Comparator comparator; - private SAMFileReader reader; - - /** - * Constructs an iterator for iteration over the supplied SAM file that will be - * able to compare itself to other ComparableSAMRecordIterator instances using - * the supplied comparator for ordering SAMRecords. - * - * @param sam the SAM file to read records from - * @param comparator the Comparator to use to provide ordering fo SAMRecords - */ - public ComparableSamRecordIterator(SAMFileReader sam, Comparator comparator) { - super(sam.iterator()); - this.reader = sam; - this.comparator = comparator; - } - - /** Returns the reader from which this iterator was constructed. */ - public SAMFileReader getReader() { - return reader; - } - - /** - * Compares this iterator to another comparable iterator based on the next record - * available in each iterator. If the two comparable iterators have different - * comparator types internally an exception is thrown. - * - * @param that another iterator to compare to - * @return a negative, 0 or positive number as described in the Comparator interface - */ - public int compareTo(ComparableSamRecordIterator that) { - if (this.comparator.getClass() != that.comparator.getClass()) { - throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " + - "have different orderings internally"); - } - - SAMRecord record = this.peek(); - SAMRecord record2 = that.peek(); - return comparator.compare(record, record2); - } -} diff --git a/java/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java b/java/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java deleted file mode 100644 index 01a71fd85..000000000 --- a/java/lib/edu/mit/broad/picard/sam/CreateSequenceDictionary.java +++ /dev/null @@ -1,145 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.sam; - -import edu.mit.broad.sam.SAMSequenceRecord; -import edu.mit.broad.sam.SAMFileWriter; -import edu.mit.broad.sam.SAMFileWriterFactory; -import edu.mit.broad.sam.SAMFileHeader; -import edu.mit.broad.picard.reference.ReferenceSequenceFile; -import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; -import edu.mit.broad.picard.reference.ReferenceSequence; -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.PicardException; - -import java.util.List; -import java.util.ArrayList; -import java.io.File; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.math.BigInteger; - -/** - * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no - * SAMRecords, and the header contains only sequence records. - */ -public class CreateSequenceDictionary extends CommandLineProgram { - - private static final String PROGRAM_VERSION = "1.0"; - - // The following attributes define the command-line arguments - @Usage(programVersion=PROGRAM_VERSION) - public String USAGE = - "Usage: " + getClass().getName() + " [options]\n\n" + - "Read fasta or fasta.gz containing reference sequences, and write as a SAM or BAM file with only sequence dictionary.\n"; - - @Option(doc = "Input reference fasta or fasta.gz") - public File REFERENCE; - - @Option(doc = "Output SAM or BAM file containing only the sequence dictionary") - public File OUTPUT; - - @Option(doc = "Put into AS field of sequence dictionary entry if supplied", optional = true) - public String GENOME_ASSEMBLY; - - @Option(doc = "Put into UIR field of sequence dictionary entry. If not supplied, input reference file is used", - optional = true) - public String URI; - - @Option(doc = "Put into SP field of sequence dictionary entry", optional = true) - public String SPECIES; - - private final MessageDigest md5; - - public CreateSequenceDictionary() { - try { - md5 = MessageDigest.getInstance("MD5"); - } catch (NoSuchAlgorithmException e) { - throw new PicardException("MD5 algorithm not found", e); - } - } - - public static void main(final String[] argv) { - System.exit(new CreateSequenceDictionary().instanceMain(argv)); - } - - /** - * Use reference filename to create URI to go into header if URI was not passed on cmd line. - */ - protected boolean customCommandLineValidation() { - if (URI == null) { - URI = "file:" + REFERENCE.getAbsolutePath(); - } - return true; - } - - /** - * Do the work after command line has been parsed. - * RuntimeException may be thrown by this method, and are reported appropriately. - * - * @return program exit status. - */ - protected int doWork() { - final List sequences = makeSequenceDictionary(REFERENCE); - final SAMFileHeader samHeader = new SAMFileHeader(); - samHeader.setSequences(sequences); - final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(samHeader, false, OUTPUT); - samWriter.close(); - return 0; - } - - - /** - * Read all the sequences from the given reference file, and convert into SAMSequenceRecords - * @param referenceFile fasta or fasta.gz - * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments. - */ - List makeSequenceDictionary(final File referenceFile) { - final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile); - ReferenceSequence refSeq; - final List ret = new ArrayList(); - while ((refSeq = refSeqFile.nextSequence()) != null) { - ret.add(makeSequenceRecord(refSeq)); - } - return ret; - } - - /** - * Create one SAMSequenceRecord from a single fasta sequence - */ - private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) { - final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName()); - ret.setSequenceLength(refSeq.length()); - - // Compute MD5 of upcased bases - final byte[] bases = refSeq.getBases(); - for (int i = 0; i < bases.length; ++i) { - bases[i] = (byte) (Character.toUpperCase(bases[i]) & 0xff); - } - - ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases)); - if (GENOME_ASSEMBLY != null) { - ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY); - } - ret.setAttribute(SAMSequenceRecord.URI_TAG, URI); - if (SPECIES != null) { - ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES); - } - return ret; - } - - private String md5Hash(final byte[] bytes) { - md5.reset(); - md5.update(bytes); - return new BigInteger(1, md5.digest()).toString(16); - } -} diff --git a/java/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java b/java/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java deleted file mode 100644 index 689e2b806..000000000 --- a/java/lib/edu/mit/broad/picard/sam/DuplicationMetrics.java +++ /dev/null @@ -1,116 +0,0 @@ -package edu.mit.broad.picard.sam; - -import edu.mit.broad.picard.metrics.MetricBase; -import edu.mit.broad.picard.util.Histogram; - -/** - * Metrics that are calculated during the process of marking duplicates - * within a stream of SAMRecords. - */ -public class DuplicationMetrics extends MetricBase { - /** The number of mapped reads examined which did not have a mapped mate pair. */ - public long UNPAIRED_READS_EXAMINED; - - /** The number of mapped read pairs examined. */ - public long READ_PAIRS_EXAMINED; - - /** The total number of unmapped reads examined. */ - public long UNMAPPED_READS; - - /** The number of fragments that were marked as duplicates. */ - public long UNPAIRED_READ_DUPLICATES; - - /** The number of read pairs that were marked as duplicates. */ - public long READ_PAIR_DUPLICATES; - - /** The percentage of mapped sequence that is marked as duplicate. */ - public Double PERCENT_DUPLICATION; - - /** The estimated number of unique molecules in the library based on PE duplication. */ - public Long ESTIMATED_LIBRARY_SIZE; - - /** - * Fills in the ESTIMATED_LIBRARY_SIZE based on the paired read data examined where - * possible and the PERCENT_DUPLICATION. - */ - public void calculateDerivedMetrics() { - if (READ_PAIRS_EXAMINED > 0) { - // Following code "borrowed" from CRD codebase - long n = READ_PAIRS_EXAMINED; - long c = READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES; - - double m = 1.0, M = 100.0; - - if (c >= n || f(m*c, c, n) <= 0) { - throw new IllegalStateException("Invalid values for pairs and unique pairs: " - + n + ", " + c); - - } - - while( f(M*c, c, n) >= 0 ) M *= 10.0; - - for (int i=0; i<40; i++ ) { - double r = (m+M)/2.0; - double u = f( r * c, c, n ); - if ( u == 0 ) break; - else if ( u > 0 ) m = r; - else if ( u < 0 ) M = r; - } - - this.ESTIMATED_LIBRARY_SIZE = (long) (c * (m+M)/2.0); - } - - PERCENT_DUPLICATION = (UNPAIRED_READ_DUPLICATES + READ_PAIR_DUPLICATES *2) /(double) (UNPAIRED_READS_EXAMINED + READ_PAIRS_EXAMINED *2); - } - - /** Method that is used in the computation of estimated library size. */ - private double f(double x, double c, double n) { - return c/x - 1 + Math.exp(-n/x); - } - - /** - * Estimates the ROI (return on investment) that one would see if a library was sequenced to - * x higher coverage than the observed coverage. - * - * @param estimatedLibrarySize the estimated number of molecules in the library - * @param x the multiple of sequencing to be simulated (i.e. how many X sequencing) - * @param pairs the number of pairs observed in the actual sequencing - * @param uniquePairs the number of unique pairs observed in the actual sequencing - * @return a number z <= x that estimates if you had pairs*x as your sequencing then you - * would observe uniquePairs*z unique pairs. - */ - private double estimateRoi(long estimatedLibrarySize, double x, long pairs, long uniquePairs) { - return estimatedLibrarySize * ( 1 - Math.exp(-(x*pairs)/estimatedLibrarySize) ) / uniquePairs; - } - - /** - * Calculates a histogram using the estimateRoi method to estimate the effective yield - * doing x sequencing for x=1..10. - */ - public Histogram calculateRoiHistogram() { - if (ESTIMATED_LIBRARY_SIZE == null) { - try { calculateDerivedMetrics(); } - catch (IllegalStateException ise) { return null; } - } - - long uniquePairs = READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES; - Histogram histo = new Histogram(); - - for (double x=1; x<=10; x+=1) { - histo.increment(x, estimateRoi(ESTIMATED_LIBRARY_SIZE, x, READ_PAIRS_EXAMINED, uniquePairs)); - } - - return histo; - } - - // Main method used for debugging the derived metrics -// public static void main(String[] args) { -// DuplicationMetrics m = new DuplicationMetrics(); -// m.PAIRS_EXAMINED = Integer.parseInt(args[0]); -// m.DUPLICATE_PAIRS = m.PAIRS_EXAMINED - Integer.parseInt(args[1]); -// m.calculateDerivedMetrics(); -// System.out.println("Percent Duplication: " + m.PERCENT_DUPLICATION); -// System.out.println("Est. Library Size : " + m.ESTIMATED_LIBRARY_SIZE); -// System.out.println(m.calculateRoiHistogram()); -// } -} diff --git a/java/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java b/java/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java deleted file mode 100644 index fdc9c4707..000000000 --- a/java/lib/edu/mit/broad/picard/sam/InsertSizeMetrics.java +++ /dev/null @@ -1,38 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.sam; - -import edu.mit.broad.picard.metrics.MetricBase; - -/** - * Metrics class for insert size statistics - * - * @author Doug Voet - */ -public class InsertSizeMetrics extends MetricBase { - public double MEDIAN_INSERT_SIZE; - public int MIN_INSERT_SIZE; - public int MAX_INSERT_SIZE; - public double MEAN_INSERT_SIZE; - public double STANDARD_DEVIATION; - public long READ_PAIRS; - - public int WIDTH_OF_10_PERCENT; - public int WIDTH_OF_20_PERCENT; - public int WIDTH_OF_30_PERCENT; - public int WIDTH_OF_40_PERCENT; - public int WIDTH_OF_50_PERCENT; - public int WIDTH_OF_60_PERCENT; - public int WIDTH_OF_70_PERCENT; - public int WIDTH_OF_80_PERCENT; - public int WIDTH_OF_90_PERCENT; - public int WIDTH_OF_99_PERCENT; -} diff --git a/java/lib/edu/mit/broad/picard/sam/MarkDuplicates.java b/java/lib/edu/mit/broad/picard/sam/MarkDuplicates.java deleted file mode 100644 index 75321bf82..000000000 --- a/java/lib/edu/mit/broad/picard/sam/MarkDuplicates.java +++ /dev/null @@ -1,461 +0,0 @@ -package edu.mit.broad.picard.sam; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.metrics.MetricsFile; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.sam.util.SortingCollection; -import edu.mit.broad.sam.*; - -import java.io.*; -import java.util.*; - -/** - * A better duplication marking algorithm that handles all cases including clipped - * and gapped alignments. - * - * @author Tim Fennell - */ -public class MarkDuplicates extends CommandLineProgram { - private static final Log log = Log.getInstance(MarkDuplicates.class); - - @Usage public final String USAGE = - "Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. " + - "All records are then written to the output file with the duplicate records flagged."; - @Option(shortName="I", doc="The input SAM or BAM file to analyze") public File INPUT; - @Option(shortName="O", doc="The output file to right marked records to") public File OUTPUT; - @Option(shortName="M", doc="File to write duplication metrics to") public File METRICS_FILE; - - private SortingCollection pairSort; - private SortingCollection fragSort; - private long[] duplicateIndexes = new long[1000000]; - private int nextIndex = 0; // The next offset into duplicateIndexes to use - - - /** Stock main method. */ - public static void main(String[] args) { - new MarkDuplicates().instanceMain(args); - } - - /** Little struct-like class to hold read pair (and fragment) end data. */ - private static class ReadEnds { - public static final int SIZE_OF = (1*1) + (2*1) + (4*4) + (8*2) + 8; // last 8 == reference overhead - public static final byte F=0, R=1, FF=2, FR=3, RR=4, RF=5; - - short score = 0; - byte orientation; - int read1Sequence = -1; - int read1Coordinate = -1; - long read1IndexInFile = -1; - int read2Sequence = -1; - int read2Coordinate = -1; - long read2IndexInFile = -1; - - boolean isPaired() { return this.read2Sequence != -1; } - } - - /** Comparator for ReadEnds that orders by read1 position then pair orientation then read2 position. */ - private static class ReadEndsComparator implements Comparator { - public int compare(ReadEnds lhs, ReadEnds rhs) { - int retval = lhs.read1Sequence - rhs.read1Sequence; - if (retval == 0) retval = lhs.read1Coordinate - rhs.read1Coordinate; - if (retval == 0) retval = lhs.orientation - rhs.orientation; - if (retval == 0) retval = lhs.read2Sequence - rhs.read2Sequence; - if (retval == 0) retval = lhs.read2Coordinate - rhs.read2Coordinate; - if (retval == 0) retval = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile); - if (retval == 0) retval = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile); - - return retval; - } - } - - /** Coded for ReadEnds that just outputs the primitive fields and reads them back. */ - private static class ReadEndsCodec implements SortingCollection.Codec { - private DataInputStream in; - private DataOutputStream out; - - public SortingCollection.Codec clone() { - return new ReadEndsCodec(); - } - - public void setOutputStream(OutputStream os) { this.out = new DataOutputStream(os); } - public void setInputStream(InputStream is) { this.in = new DataInputStream(is); } - - public void encode(ReadEnds read) { - try { - this.out.writeShort(read.score); - this.out.writeByte(read.orientation); - this.out.writeInt(read.read1Sequence); - this.out.writeInt(read.read1Coordinate); - this.out.writeLong(read.read1IndexInFile); - this.out.writeInt(read.read2Sequence); - - if (read.orientation > ReadEnds.R) { - this.out.writeInt(read.read2Coordinate); - this.out.writeLong(read.read2IndexInFile); - } - this.out.flush(); - } - catch (IOException ioe) { - throw new PicardException("Exception writing ReadEnds to file.", ioe); - } - } - - public ReadEnds decode() { - ReadEnds read = new ReadEnds(); - try { - // If the first read results in an EOF we've exhausted the stream - try { read.score = this.in.readShort(); } - catch (EOFException eof) { return null; } - - read.orientation = this.in.readByte(); - read.read1Sequence = this.in.readInt(); - read.read1Coordinate = this.in.readInt(); - read.read1IndexInFile = this.in.readLong(); - read.read2Sequence = this.in.readInt(); - - if (read.orientation > ReadEnds.R) { - read.read2Coordinate = this.in.readInt(); - read.read2IndexInFile = this.in.readLong(); - } - return read; - } - catch (IOException ioe) { - throw new PicardException("Exception writing ReadEnds to file.", ioe); - } - } - } - - /** - * Main work method. Reads the BAM file once and collects sorted information about - * the 5' ends of both ends of each read (or just one end in the case of pairs). - * Then makes a pass through those determining duplicates before re-reading the - * input file and writing it out with duplication flags set correctly. - */ - protected int doWork() { - log.info("Reading input file and constructing read end information."); - buildSortedReadEndLists(); - generateDuplicateIndexes(); - log.info("Marking " + this.duplicateIndexes.length + " records as duplicates."); - DuplicationMetrics metrics = new DuplicationMetrics(); - SAMFileReader in = new SAMFileReader(INPUT); - SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), - true, - OUTPUT); - - // Now copy over the file while marking all the necessary indexes as duplicates - long recordInFileIndex = 0; - long nextDuplicateIndex = (this.duplicateIndexes.length == 0 ? -1 : this.duplicateIndexes[0]); - int arrayIndex = 1; - - for (SAMRecord rec : in) { - // First bring the simple metrics up to date - if (rec.getReadUnmappedFlag()) { - ++metrics.UNMAPPED_READS; - } - else if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { - ++metrics.UNPAIRED_READS_EXAMINED; - } - else if (rec.getFirstOfPairFlag()){ - ++metrics.READ_PAIRS_EXAMINED; - } - - - if (recordInFileIndex++ == nextDuplicateIndex) { - rec.setDuplicateReadFlag(true); - - // Update the duplication metrics - if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { - ++metrics.UNPAIRED_READ_DUPLICATES; - } - else if (rec.getFirstOfPairFlag()) { - ++metrics.READ_PAIR_DUPLICATES; - } - - // Now try and figure out the next duplicate index - try { - nextDuplicateIndex = this.duplicateIndexes[arrayIndex++]; - } - catch (ArrayIndexOutOfBoundsException e) { - // Only happens once we've marked all the duplicates - nextDuplicateIndex = -1; - arrayIndex = -1; - } - } - - out.addAlignment(rec); - } - - out.close(); - - - // Write out the metrics - metrics.calculateDerivedMetrics(); - MetricsFile file = getMetricsFile(); - file.addMetric(metrics); - file.setHistogram(metrics.calculateRoiHistogram()); - file.write(METRICS_FILE); - - return 0; - } - - /** - * Goes through all the records in a file and generates a set of ReadEnds objects that - * hold the necessary information (reference sequence, 5' read coordinate) to do - * duplication, caching to disk as necssary to sort them. - */ - private void buildSortedReadEndLists() { - // TODO: take into account clipping/padding? - int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * 0.25) / ReadEnds.SIZE_OF); - this.pairSort = SortingCollection.newInstance(ReadEnds.class, - new ReadEndsCodec(), - new ReadEndsComparator(), - maxInMemory); - - this.fragSort = SortingCollection.newInstance(ReadEnds.class, - new ReadEndsCodec(), - new ReadEndsComparator(), - maxInMemory); - - Map tmp = new HashMap(); - SAMFileReader sam = new SAMFileReader(INPUT); - SAMFileHeader header = sam.getFileHeader(); - long index = 0; - - for (SAMRecord rec : sam) { - if (rec.getReadUnmappedFlag()) { - continue; - } - - ReadEnds fragmentEnd = buildReadEnds(header, index, rec); - this.fragSort.add(fragmentEnd); - - if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { - String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName(); - ReadEnds pairedEnds = tmp.remove(key); - - // See if we've already seen the first end or not - if (pairedEnds == null) { - pairedEnds = buildReadEnds(header, index, rec); - tmp.put(key, pairedEnds); - } - else { - int sequence = fragmentEnd.read1Sequence; - int coordinate = fragmentEnd.read1Coordinate; - - // If the second read is actually later, just add the second read data, else flip the reads - if (sequence > pairedEnds.read1Sequence || (sequence == pairedEnds.read1Sequence && coordinate >= pairedEnds.read1Coordinate)) { - pairedEnds.read2Sequence = sequence; - pairedEnds.read2Coordinate = coordinate; - pairedEnds.read2IndexInFile = index; - pairedEnds.orientation = getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag()); - } - else { - pairedEnds.read2Sequence = pairedEnds.read1Sequence; - pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; - pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile; - pairedEnds.read1Sequence = sequence; - pairedEnds.read1Coordinate = coordinate; - pairedEnds.read1IndexInFile = index; - pairedEnds.orientation = getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R); - } - - pairedEnds.score += getScore(rec); - this.pairSort.add(pairedEnds); - } - } - - ++index; - } - } - - /** Builds a read ends object that represents a single read. */ - private ReadEnds buildReadEnds(SAMFileHeader header, long index, SAMRecord rec) { - ReadEnds ends = new ReadEnds(); - ends.read1Sequence = rec.getReferenceIndex(header); - ends.read1Coordinate = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart(); - ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F; - ends.read1IndexInFile = index; - ends.score = getScore(rec); - - // Doing this lets the ends object know that it's part of a pair - if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { - ends.read2Sequence = rec.getMateReferenceIndex(header); - } - - return ends; - } - - /** - * Returns a single byte that encodes the orientation of the two reads in a pair. - */ - private byte getOrientationByte(boolean read1NegativeStrand, boolean read2NegativeStrand) { - if (read1NegativeStrand) { - if (read2NegativeStrand) return ReadEnds.RR; - else return ReadEnds.RF; - } - else { - if (read2NegativeStrand) return ReadEnds.FR; - else return ReadEnds.FF; - } - } - - - - /** Calculates a score for the read which is the sum of scores over Q20. */ - private short getScore(SAMRecord rec) { - short score = 0; - for (byte b : rec.getBaseQualities()) { - if (b >= 15) score += b; - } - - return score; - } - - /** - * Goes through the accumulated ReadEnds objects and determines which of them are - * to be marked as duplicates. - * - * @return an array with an ordered list of indexes into the source file - */ - private void generateDuplicateIndexes() { - ReadEnds firstOfNextChunk = null; - List nextChunk = new ArrayList(200); - - // First just do the pairs - log.info("Traversing read pair information and detecting duplicates."); - for (ReadEnds next : this.pairSort) { - if (firstOfNextChunk == null) { - firstOfNextChunk = next; - nextChunk.add(firstOfNextChunk); - } - else if (areComparableForDuplicates(firstOfNextChunk, next, true)) { - nextChunk.add(next); - } - else { - if (nextChunk.size() > 1) { - markDuplicatePairs(nextChunk); - } - - nextChunk.clear(); - nextChunk.add(next); - firstOfNextChunk = next; - } - } - markDuplicatePairs(nextChunk); - this.pairSort = null; - - // Now deal with the fragments - log.info("Traversing fragment information and detecting duplicates."); - boolean containsPairs = false; - boolean containsFrags = false; - - for (ReadEnds next : this.fragSort) { - if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false)) { - nextChunk.add(next); - containsPairs = containsPairs || next.isPaired(); - containsFrags = containsFrags || !next.isPaired(); - } - else { - if (nextChunk.size() > 1 && containsFrags) { - markDuplicateFragments(nextChunk, containsPairs); - } - - nextChunk.clear(); - nextChunk.add(next); - firstOfNextChunk = next; - containsPairs = next.isPaired(); - containsFrags = !next.isPaired(); - } - } - markDuplicateFragments(nextChunk, containsPairs); - this.fragSort = null; - - // Now shrink down the array and sort it - log.info("Sorting list of duplicate records."); - long[] tmp = new long[this.nextIndex]; - System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex); - this.duplicateIndexes = tmp; - Arrays.sort(this.duplicateIndexes); - } - - private boolean areComparableForDuplicates(final ReadEnds lhs, final ReadEnds rhs, final boolean compareRead2) { - boolean retval = lhs.read1Sequence == rhs.read1Sequence && - lhs.read1Coordinate == rhs.read1Coordinate && - lhs.orientation == rhs.orientation; - - if (compareRead2) { - retval = lhs.read2Sequence == rhs.read2Sequence && - lhs.read2Coordinate == rhs.read2Coordinate; - } - - return retval; - } - - private void addIndexAsDuplicate(final long bamIndex) { - if (this.nextIndex > this.duplicateIndexes.length - 1) { - long[] tmp = new long[this.duplicateIndexes.length * 2]; - System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex); - this.duplicateIndexes = tmp; - } - - this.duplicateIndexes[this.nextIndex++] = bamIndex; - } - - /** - * Takes a list of ReadEnds objects and removes from it all objects that should - * not be marked as duplicates. - * - * @param list - */ - private void markDuplicatePairs(final List list) { - short maxScore = 0; - ReadEnds best = null; - - for (final ReadEnds end : list) { - if (end.score > maxScore || best == null) { - maxScore = end.score; - best = end; - } - } - - for (final ReadEnds end : list) { - if (end != best) { - addIndexAsDuplicate(end.read1IndexInFile); - addIndexAsDuplicate(end.read2IndexInFile); - } - } - } - - /** - * Takes a list of ReadEnds objects and removes from it all objects that should - * not be marked as duplicates. - * - * @param list - */ - private void markDuplicateFragments(final List list, final boolean containsPairs) { - if (containsPairs) { - for (final ReadEnds end : list) { - if (!end.isPaired()) addIndexAsDuplicate(end.read1IndexInFile); - } - } - else { - short maxScore = 0; - ReadEnds best = null; - for (final ReadEnds end : list) { - if (end.score > maxScore || best == null) { - maxScore = end.score; - best = end; - } - } - - for (final ReadEnds end : list) { - if (end != best) { - addIndexAsDuplicate(end.read1IndexInFile); - } - } - } - } -} diff --git a/java/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java b/java/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java deleted file mode 100644 index 908f27f7d..000000000 --- a/java/lib/edu/mit/broad/picard/sam/MarkDuplicates2.java +++ /dev/null @@ -1,461 +0,0 @@ -package edu.mit.broad.picard.sam; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.metrics.MetricsFile; -import edu.mit.broad.picard.util.Log; -import edu.mit.broad.sam.util.SortingCollection; -import edu.mit.broad.sam.*; - -import java.io.*; -import java.util.*; - -/** - * A better duplication marking algorithm that handles all cases including clipped - * and gapped alignments. - * - * @author Tim Fennell - */ -public class MarkDuplicates2 extends CommandLineProgram { - private static final Log log = Log.getInstance(MarkDuplicates2.class); - - @Usage public final String USAGE = - "Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules. " + - "All records are then written to the output file with the duplicate records flagged."; - @Option(shortName="I", doc="The input SAM or BAM file to analyze") public File INPUT; - @Option(shortName="O", doc="The output file to right marked records to") public File OUTPUT; - @Option(shortName="M", doc="File to write duplication metrics to") public File METRICS_FILE; - - private SortingCollection pairSort; - private SortingCollection fragSort; - private long[] duplicateIndexes = new long[1000000]; - private int nextIndex = 0; // The next offset into duplicateIndexes to use - - - /** Stock main method. */ - public static void main(String[] args) { - new MarkDuplicates2().instanceMain(args); - } - - /** Little struct-like class to hold read pair (and fragment) end data. */ - private static class ReadEnds { - public static final int SIZE_OF = (1*1) + (2*1) + (4*4) + (8*2) + 8; // last 8 == reference overhead - public static final byte F=0, R=1, FF=2, FR=3, RR=4, RF=5; - - short score = 0; - byte orientation; - int read1Sequence = -1; - int read1Coordinate = -1; - long read1IndexInFile = -1; - int read2Sequence = -1; - int read2Coordinate = -1; - long read2IndexInFile = -1; - - boolean isPaired() { return this.read2Sequence != -1; } - } - - /** Comparator for ReadEnds that orders by read1 position then pair orientation then read2 position. */ - private static class ReadEndsComparator implements Comparator { - public int compare(ReadEnds lhs, ReadEnds rhs) { - int retval = lhs.read1Sequence - rhs.read1Sequence; - if (retval == 0) retval = lhs.read1Coordinate - rhs.read1Coordinate; - if (retval == 0) retval = lhs.orientation - rhs.orientation; - if (retval == 0) retval = lhs.read2Sequence - rhs.read2Sequence; - if (retval == 0) retval = lhs.read2Coordinate - rhs.read2Coordinate; - if (retval == 0) retval = (int) (lhs.read1IndexInFile - rhs.read1IndexInFile); - if (retval == 0) retval = (int) (lhs.read2IndexInFile - rhs.read2IndexInFile); - - return retval; - } - } - - /** Coded for ReadEnds that just outputs the primitive fields and reads them back. */ - private static class ReadEndsCodec implements SortingCollection.Codec { - private DataInputStream in; - private DataOutputStream out; - - public SortingCollection.Codec clone() { - return new ReadEndsCodec(); - } - - public void setOutputStream(OutputStream os) { this.out = new DataOutputStream(os); } - public void setInputStream(InputStream is) { this.in = new DataInputStream(is); } - - public void encode(ReadEnds read) { - try { - this.out.writeShort(read.score); - this.out.writeByte(read.orientation); - this.out.writeInt(read.read1Sequence); - this.out.writeInt(read.read1Coordinate); - this.out.writeLong(read.read1IndexInFile); - this.out.writeInt(read.read2Sequence); - - if (read.orientation > ReadEnds.R) { - this.out.writeInt(read.read2Coordinate); - this.out.writeLong(read.read2IndexInFile); - } - this.out.flush(); - } - catch (IOException ioe) { - throw new PicardException("Exception writing ReadEnds to file.", ioe); - } - } - - public ReadEnds decode() { - ReadEnds read = new ReadEnds(); - try { - // If the first read results in an EOF we've exhausted the stream - try { read.score = this.in.readShort(); } - catch (EOFException eof) { return null; } - - read.orientation = this.in.readByte(); - read.read1Sequence = this.in.readInt(); - read.read1Coordinate = this.in.readInt(); - read.read1IndexInFile = this.in.readLong(); - read.read2Sequence = this.in.readInt(); - - if (read.orientation > ReadEnds.R) { - read.read2Coordinate = this.in.readInt(); - read.read2IndexInFile = this.in.readLong(); - } - return read; - } - catch (IOException ioe) { - throw new PicardException("Exception writing ReadEnds to file.", ioe); - } - } - } - - /** - * Main work method. Reads the BAM file once and collects sorted information about - * the 5' ends of both ends of each read (or just one end in the case of pairs). - * Then makes a pass through those determining duplicates before re-reading the - * input file and writing it out with duplication flags set correctly. - */ - protected int doWork() { - log.info("Reading input file and constructing read end information."); - buildSortedReadEndLists(); - generateDuplicateIndexes(); - log.info("Marking " + this.duplicateIndexes.length + " records as duplicates."); - DuplicationMetrics metrics = new DuplicationMetrics(); - SAMFileReader in = new SAMFileReader(INPUT); - SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), - true, - OUTPUT); - - // Now copy over the file while marking all the necessary indexes as duplicates - long recordInFileIndex = 0; - long nextDuplicateIndex = (this.duplicateIndexes.length == 0 ? -1 : this.duplicateIndexes[0]); - int arrayIndex = 1; - - for (SAMRecord rec : in) { - // First bring the simple metrics up to date - if (rec.getReadUnmappedFlag()) { - ++metrics.UNMAPPED_READS; - } - else if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { - ++metrics.UNPAIRED_READS_EXAMINED; - } - else if (rec.getFirstOfPairFlag()){ - ++metrics.READ_PAIRS_EXAMINED; - } - - - if (recordInFileIndex++ == nextDuplicateIndex) { - rec.setDuplicateReadFlag(true); - - // Update the duplication metrics - if (!rec.getReadPairedFlag() || rec.getMateUnmappedFlag()) { - ++metrics.UNPAIRED_READ_DUPLICATES; - } - else if (rec.getFirstOfPairFlag()) { - ++metrics.READ_PAIR_DUPLICATES; - } - - // Now try and figure out the next duplicate index - try { - nextDuplicateIndex = this.duplicateIndexes[arrayIndex++]; - } - catch (ArrayIndexOutOfBoundsException e) { - // Only happens once we've marked all the duplicates - nextDuplicateIndex = -1; - arrayIndex = -1; - } - } - - out.addAlignment(rec); - } - - out.close(); - - - // Write out the metrics - metrics.calculateDerivedMetrics(); - MetricsFile file = getMetricsFile(); - file.addMetric(metrics); - file.setHistogram(metrics.calculateRoiHistogram()); - file.write(METRICS_FILE); - - return 0; - } - - /** - * Goes through all the records in a file and generates a set of ReadEnds objects that - * hold the necessary information (reference sequence, 5' read coordinate) to do - * duplication, caching to disk as necssary to sort them. - */ - private void buildSortedReadEndLists() { - // TODO: take into account clipping/padding? - int maxInMemory = (int) ((Runtime.getRuntime().maxMemory() * 0.25) / ReadEnds.SIZE_OF); - this.pairSort = SortingCollection.newInstance(ReadEnds.class, - new ReadEndsCodec(), - new ReadEndsComparator(), - maxInMemory); - - this.fragSort = SortingCollection.newInstance(ReadEnds.class, - new ReadEndsCodec(), - new ReadEndsComparator(), - maxInMemory); - - Map tmp = new HashMap(); - SAMFileReader sam = new SAMFileReader(INPUT); - SAMFileHeader header = sam.getFileHeader(); - long index = 0; - - for (SAMRecord rec : sam) { - if (rec.getReadUnmappedFlag()) { - continue; - } - - ReadEnds fragmentEnd = buildReadEnds(header, index, rec); - this.fragSort.add(fragmentEnd); - - if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { - String key = rec.getAttribute(ReservedTagConstants.READ_GROUP_ID) + ":" + rec.getReadName(); - ReadEnds pairedEnds = tmp.remove(key); - - // See if we've already seen the first end or not - if (pairedEnds == null) { - pairedEnds = buildReadEnds(header, index, rec); - tmp.put(key, pairedEnds); - } - else { - int sequence = fragmentEnd.read1Sequence; - int coordinate = fragmentEnd.read1Coordinate; - - // If the second read is actually later, just add the second read data, else flip the reads - if (sequence > pairedEnds.read1Sequence || (sequence == pairedEnds.read1Sequence && coordinate >= pairedEnds.read1Coordinate)) { - pairedEnds.read2Sequence = sequence; - pairedEnds.read2Coordinate = coordinate; - pairedEnds.read2IndexInFile = index; - pairedEnds.orientation = getOrientationByte(pairedEnds.orientation == ReadEnds.R, rec.getReadNegativeStrandFlag()); - } - else { - pairedEnds.read2Sequence = pairedEnds.read1Sequence; - pairedEnds.read2Coordinate = pairedEnds.read1Coordinate; - pairedEnds.read2IndexInFile = pairedEnds.read1IndexInFile; - pairedEnds.read1Sequence = sequence; - pairedEnds.read1Coordinate = coordinate; - pairedEnds.read1IndexInFile = index; - pairedEnds.orientation = getOrientationByte(rec.getReadNegativeStrandFlag(), pairedEnds.orientation == ReadEnds.R); - } - - pairedEnds.score += getScore(rec); - this.pairSort.add(pairedEnds); - } - } - - ++index; - } - } - - /** Builds a read ends object that represents a single read. */ - private ReadEnds buildReadEnds(SAMFileHeader header, long index, SAMRecord rec) { - ReadEnds ends = new ReadEnds(); - ends.read1Sequence = rec.getReferenceIndex(header); - ends.read1Coordinate = rec.getReadNegativeStrandFlag() ? rec.getUnclippedEnd() : rec.getUnclippedStart(); - ends.orientation = rec.getReadNegativeStrandFlag() ? ReadEnds.R : ReadEnds.F; - ends.read1IndexInFile = index; - ends.score = getScore(rec); - - // Doing this lets the ends object know that it's part of a pair - if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) { - ends.read2Sequence = rec.getMateReferenceIndex(header); - } - - return ends; - } - - /** - * Returns a single byte that encodes the orientation of the two reads in a pair. - */ - private byte getOrientationByte(boolean read1NegativeStrand, boolean read2NegativeStrand) { - if (read1NegativeStrand) { - if (read2NegativeStrand) return ReadEnds.RR; - else return ReadEnds.RF; - } - else { - if (read2NegativeStrand) return ReadEnds.FR; - else return ReadEnds.FF; - } - } - - - - /** Calculates a score for the read which is the sum of scores over Q20. */ - private short getScore(SAMRecord rec) { - short score = 0; - for (byte b : rec.getBaseQualities()) { - if (b >= 15) score += b; - } - - return score; - } - - /** - * Goes through the accumulated ReadEnds objects and determines which of them are - * to be marked as duplicates. - * - * @return an array with an ordered list of indexes into the source file - */ - private void generateDuplicateIndexes() { - ReadEnds firstOfNextChunk = null; - List nextChunk = new ArrayList(200); - - // First just do the pairs - log.info("Traversing read pair information and detecting duplicates."); - for (ReadEnds next : this.pairSort) { - if (firstOfNextChunk == null) { - firstOfNextChunk = next; - nextChunk.add(firstOfNextChunk); - } - else if (areComparableForDuplicates(firstOfNextChunk, next, true)) { - nextChunk.add(next); - } - else { - if (nextChunk.size() > 1) { - markDuplicatePairs(nextChunk); - } - - nextChunk.clear(); - nextChunk.add(next); - firstOfNextChunk = next; - } - } - markDuplicatePairs(nextChunk); - this.pairSort = null; - - // Now deal with the fragments - log.info("Traversing fragment information and detecting duplicates."); - boolean containsPairs = false; - boolean containsFrags = false; - - for (ReadEnds next : this.fragSort) { - if (firstOfNextChunk != null && areComparableForDuplicates(firstOfNextChunk, next, false)) { - nextChunk.add(next); - containsPairs = containsPairs || next.isPaired(); - containsFrags = containsFrags || !next.isPaired(); - } - else { - if (nextChunk.size() > 1 && containsFrags) { - markDuplicateFragments(nextChunk, containsPairs); - } - - nextChunk.clear(); - nextChunk.add(next); - firstOfNextChunk = next; - containsPairs = next.isPaired(); - containsFrags = !next.isPaired(); - } - } - markDuplicateFragments(nextChunk, containsPairs); - this.fragSort = null; - - // Now shrink down the array and sort it - log.info("Sorting list of duplicate records."); - long[] tmp = new long[this.nextIndex]; - System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex); - this.duplicateIndexes = tmp; - Arrays.sort(this.duplicateIndexes); - } - - private boolean areComparableForDuplicates(final ReadEnds lhs, final ReadEnds rhs, final boolean compareRead2) { - boolean retval = lhs.read1Sequence == rhs.read1Sequence && - lhs.read1Coordinate == rhs.read1Coordinate && - lhs.orientation == rhs.orientation; - - if (compareRead2) { - retval = lhs.read2Sequence == rhs.read2Sequence && - lhs.read2Coordinate == rhs.read2Coordinate; - } - - return retval; - } - - private void addIndexAsDuplicate(final long bamIndex) { - if (this.nextIndex > this.duplicateIndexes.length - 1) { - long[] tmp = new long[this.duplicateIndexes.length * 2]; - System.arraycopy(this.duplicateIndexes, 0, tmp, 0, this.nextIndex); - this.duplicateIndexes = tmp; - } - - this.duplicateIndexes[this.nextIndex++] = bamIndex; - } - - /** - * Takes a list of ReadEnds objects and removes from it all objects that should - * not be marked as duplicates. - * - * @param list - */ - private void markDuplicatePairs(final List list) { - short maxScore = 0; - ReadEnds best = null; - - for (final ReadEnds end : list) { - if (end.score > maxScore || best == null) { - maxScore = end.score; - best = end; - } - } - - for (final ReadEnds end : list) { - if (end != best) { - addIndexAsDuplicate(end.read1IndexInFile); - addIndexAsDuplicate(end.read2IndexInFile); - } - } - } - - /** - * Takes a list of ReadEnds objects and removes from it all objects that should - * not be marked as duplicates. - * - * @param list - */ - private void markDuplicateFragments(final List list, final boolean containsPairs) { - if (containsPairs) { - for (final ReadEnds end : list) { - if (!end.isPaired()) addIndexAsDuplicate(end.read1IndexInFile); - } - } - else { - short maxScore = 0; - ReadEnds best = null; - for (final ReadEnds end : list) { - if (end.score > maxScore || best == null) { - maxScore = end.score; - best = end; - } - } - - for (final ReadEnds end : list) { - if (end != best) { - addIndexAsDuplicate(end.read1IndexInFile); - } - } - } - } -} diff --git a/java/lib/edu/mit/broad/picard/sam/MergeSamFiles.java b/java/lib/edu/mit/broad/picard/sam/MergeSamFiles.java deleted file mode 100644 index cae476956..000000000 --- a/java/lib/edu/mit/broad/picard/sam/MergeSamFiles.java +++ /dev/null @@ -1,95 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. -* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or -* functionality. -*/ -package edu.mit.broad.picard.sam; - -import java.io.File; -import java.util.ArrayList; -import java.util.List; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.sam.SAMFileHeader; -import static edu.mit.broad.sam.SAMFileHeader.SortOrder; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMFileWriter; -import edu.mit.broad.sam.SAMFileWriterFactory; -import edu.mit.broad.sam.SAMRecord; - -/** - * Reads a SAM or BAM file and combines the output to one file - * - * @author Dave Tefft - */ -public class MergeSamFiles extends CommandLineProgram { - // Usage and parameters - @Usage(programVersion="1.0") - public String USAGE = "Merges multiple SAM/BAM files into one file.\n"; - - @Option(shortName="I", doc="SAM or BAM input file", minElements=1) - public List INPUT = new ArrayList(); - - @Option(shortName="O", doc="SAM or BAM file to write merged result to") - public File OUTPUT; - - @Option(shortName="SO", doc="Sort order of output file", optional=true) - public SAMFileHeader.SortOrder SORT_ORDER = SAMFileHeader.SortOrder.coordinate; - - /** Required main method implementation. */ - public static void main(String[] argv) { - System.exit(new MergeSamFiles().instanceMain(argv)); - } - - /** Combines multiple SAM/BAM files into one. */ - @Override - protected int doWork() { - boolean matchedSortOrders = true; - - // Open the files for reading and writing - List readers = new ArrayList(); - for (File inFile : INPUT) { - IoUtil.assertFileIsReadable(inFile); - SAMFileReader in = new SAMFileReader(inFile); - readers.add(in); - matchedSortOrders = matchedSortOrders && in.getFileHeader().getSortOrder() == SORT_ORDER; - } - - // If all the input sort orders match the output sort order then just merge them and - // write on the fly, otherwise setup to merge and sort before writing out the final file - IoUtil.assertFileIsWritable(OUTPUT); - MergingSamRecordIterator iterator = null; - SAMFileWriter out = null; - - if (matchedSortOrders || SORT_ORDER == SortOrder.unsorted) { - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SORT_ORDER); - iterator = new MergingSamRecordIterator(headerMerger); - out = new SAMFileWriterFactory().makeSAMOrBAMWriter(headerMerger.getMergedHeader(), true, OUTPUT); - } - else { - SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(readers, SortOrder.unsorted); - iterator = new MergingSamRecordIterator(headerMerger); - SAMFileHeader header = headerMerger.getMergedHeader(); - header.setSortOrder(SORT_ORDER); - out = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT); - } - - // Lastly loop through and write out the records - while (iterator.hasNext()) { - SAMRecord record = iterator.next(); - out.addAlignment(record); - } - - out.close(); - return 0; - } - -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java b/java/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java deleted file mode 100644 index 5641512af..000000000 --- a/java/lib/edu/mit/broad/picard/sam/MergingSamRecordIterator.java +++ /dev/null @@ -1,136 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. -* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or -* functionality. -*/ -package edu.mit.broad.picard.sam; - -import edu.mit.broad.sam.*; -import static edu.mit.broad.sam.SAMFileHeader.SortOrder; -import edu.mit.broad.picard.PicardException; - -import java.util.*; -import java.lang.reflect.Constructor; - -/** - * Provides an iterator interface for merging multiple underlying iterators into a single - * iterable stream. The underlying iterators/files must all have the same sort order unless - * the requested output format is unsorted, in which case any combination is valid. - */ -public class MergingSamRecordIterator implements Iterator { - private final PriorityQueue pq; - private final SamFileHeaderMerger samHeaderMerger; - private final SAMFileHeader.SortOrder sortOrder; - - /** - * Constructs a new merging iterator with the same set of readers and sort order as - * provided by the header merger parameter. - */ - public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger) { - this.samHeaderMerger = headerMerger; - this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); - final SAMRecordComparator comparator = getComparator(); - - final Collection readers = headerMerger.getReaders(); - this.pq = new PriorityQueue(readers.size()); - - for (final SAMFileReader reader : readers) { - if (this.sortOrder != SortOrder.unsorted && reader.getFileHeader().getSortOrder() != this.sortOrder){ - throw new PicardException("Files are not compatible with sort order"); - } - - final ComparableSamRecordIterator iterator = new ComparableSamRecordIterator(reader, comparator); - addIfNotEmpty(iterator); - } - } - - /** Returns true if any of the underlying iterators has more records, otherwise false. */ - public boolean hasNext() { - return !this.pq.isEmpty(); - } - - /** Returns the next record from the top most iterator during merging. */ - public SAMRecord next() { - final ComparableSamRecordIterator iterator = this.pq.poll(); - final SAMRecord record = iterator.next(); - addIfNotEmpty(iterator); - - if (this.samHeaderMerger.hasGroupIdDuplicates()) { - final String id = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID); - final String newId = this.samHeaderMerger.getReadGroupId(iterator.getReader(), id); - record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newId); - } - final String oldProgramGroupId = (String) record.getAttribute(SAMTag.PG.toString()); - if (oldProgramGroupId != null) { - final String newProgramGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader(), oldProgramGroupId); - record.setAttribute(SAMTag.PG.toString(), newProgramGroupId); - } - - return record; - } - - /** - * Adds iterator to priority queue. If the iterator has more records it is added - * otherwise it is closed and not added. - */ - private void addIfNotEmpty(final ComparableSamRecordIterator iterator) { - if (iterator.hasNext()) { - pq.offer(iterator); - } - else { - iterator.close(); - } - } - - /** Unsupported operation. */ - public void remove() { - throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()"); - } - - /** - * Get the right comparator for a given sort order (coordinate, alphabetic). In the - * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive - * ordering. - */ - private SAMRecordComparator getComparator() { - // For unsorted build a fake comparator that compares based on object ID - if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) { - return new SAMRecordComparator() { - public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) { - return System.identityHashCode(lhs) - System.identityHashCode(rhs); - } - - public int compare(final SAMRecord lhs, final SAMRecord rhs) { - return fileOrderCompare(lhs, rhs); - } - }; - } - - // Otherwise try and figure out what kind of comparator to return and build it - final Class type = this.sortOrder.getComparator(); - - try { - final Constructor ctor = type.getConstructor(SAMFileHeader.class); - return ctor.newInstance(this.samHeaderMerger.getMergedHeader()); - } - catch (Exception e) { - try { - final Constructor ctor = type.getConstructor(); - return ctor.newInstance(); - } - catch (Exception e2) { - throw new PicardException("Could not instantiate a comparator for sort order: " + this.sortOrder, e2); - } - } - } - - /** Returns the merged header that the merging iterator is working from. */ - public SAMFileHeader getMergedHeader() { - return this.samHeaderMerger.getMergedHeader(); - } -} diff --git a/java/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java b/java/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java deleted file mode 100644 index 2f4d3ef91..000000000 --- a/java/lib/edu/mit/broad/picard/sam/ReservedTagConstants.java +++ /dev/null @@ -1,18 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.sam; - -/** - * Constants for tags used in our SAM/BAM files - */ -public class ReservedTagConstants { - public static final String READ_GROUP_ID = "RG"; // Specified in the SAM spec doc - public static final String XN = "XN"; // Present and set to 1 if a read is a noise read -} diff --git a/java/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java b/java/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java deleted file mode 100644 index 6c69678ad..000000000 --- a/java/lib/edu/mit/broad/picard/sam/SamFileHeaderMerger.java +++ /dev/null @@ -1,286 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. -* Neither the Broad Institute nor MIT can be responsible for its use, misuse, or -* functionality. -*/ -package edu.mit.broad.picard.sam; - -import edu.mit.broad.sam.*; -import edu.mit.broad.picard.PicardException; - -import java.util.*; - -/** - * Merges SAMFileHeaders that have the same sequences into a single merged header - * object while providing read group translation for cases where read groups - * clash across input headers. - * - * @author Dave Tefft - */ -public class SamFileHeaderMerger { - //Super Header to construct - private final SAMFileHeader mergedHeader; - private final Collection readers; - - //Translation of old group ids to new group ids - private final Map> samGroupIdTranslation = - new HashMap>(); - - //the groups from different files use the same group ids - private boolean hasGroupIdDuplicates = false; - - //Translation of old program group ids to new program group ids - private final Map> samProgramGroupIdTranslation = - new HashMap>(); - - //Letters to construct new ids from a counter - private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; - - - /** - * Create SAMFileHeader with additional information - * - * @param readers same file readers to combine - * @param sortOrder sort order new header should have - */ - public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder) { - this.readers = readers; - this.mergedHeader = new SAMFileHeader(); - - // Set sequences first because if it throws exception there is no need to continue - final List sequences = getSAMSequences(readers); - this.mergedHeader.setSequences(sequences); - - // Set program that creates input alignments - for (final SAMProgramRecord program : mergeSAMProgramRecordLists(readers)) { - this.mergedHeader.addProgramRecord(program); - } - - // Set read groups for merged header - final List readGroups = getReadGroups(readers); - this.mergedHeader.setReadGroups(readGroups); - this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none); - - this.mergedHeader.setSortOrder(sortOrder); - } - - /** - * Checks to see if there are clashes where different readers are using the same read - * group IDs. If they are then a new set of unique read group IDs are generated (across all - * read groups) otherwise the original read group headers are returned. - * - * @param readers readers to combine - * @return new list of readgroups constructed from all the readers - */ - private List getReadGroups(final Collection readers) { - // Read groups as read from the readers - final List orginalReadGroups = new ArrayList(); - - // Read group with new ids that don't confict - final List modifiedReadGroups = new ArrayList(); - - //set to see if there are duplicate group ids and whether or not we need to modify them - final Set groupIdsSeenBefore = new HashSet(); - - int x = 0; - this.hasGroupIdDuplicates = false; - - for (final SAMFileReader reader : readers) { - final SAMFileHeader header = reader.getFileHeader(); - final Map idTranslation = new HashMap(); - - // Iterate over read groups to find conflicting ids - for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { - final String groupId = readGroup.getReadGroupId(); - final String newGroupId = createNewId(x++); - - // Check to see if same group id is used in two different readers - if (groupIdsSeenBefore.contains(groupId)) { - hasGroupIdDuplicates = true; - } - groupIdsSeenBefore.add(groupId); - - // Creates a new read group with the new id and copies all it's attributes - final SAMReadGroupRecord groupRecordWithNewId = copyReadGroup(readGroup, newGroupId); - - orginalReadGroups.add(readGroup); - modifiedReadGroups.add(groupRecordWithNewId); - - idTranslation.put(groupId, newGroupId); - } - - // Add id tranlation for updating SamRecords with new ids if neccessary - this.samGroupIdTranslation.put(reader, idTranslation); - } - - // return approriate readgroups whether or not the new ids have to be used - if (this.hasGroupIdDuplicates) { - return modifiedReadGroups; - } - else { - return orginalReadGroups; - } - } - - /** - * Get the sequences off the SAMFileReader header. Throws runtime exception if the sequence - * are different from one another - * - * @param readers readers to pull sequences from - * @return sequences from files. Each file should have the same sequence - */ - private List getSAMSequences(final Collection readers) { - List sequences = null; - for (final SAMFileReader reader : readers) { - final SAMFileHeader header = reader.getFileHeader(); - - if (sequences == null) { - sequences = header.getSequences(); - } - else { - final List currentSequences = header.getSequences(); - if (!sequenceListsEqual(sequences, currentSequences)) { - throw new PicardException("Files are not compatible with each other. They can not be combined"); - } - } - } - return sequences; - } - - /** - * Checks the equality of two lists of sequence records using the isSameSequence - * method instead of the equals method which is a more strict identity check. - * @param s1 a list of sequence headers - * @param s2 a second list of sequence headers - * @return true if the two lists match otherwise false - */ - private boolean sequenceListsEqual(final List s1, final List s2) { - if (s1.size() != s2.size()) { - return false; - } - for (int i = 0; i < s1.size(); ++i) { - if (!s1.get(i).isSameSequence(s2.get(i))) { - return false; - } - } - return true; - } - - /** - * Find the alignment program that produced the readers. If there are more than one - * generate a new program represents that - * - * @param readers SAMFileReaders to pull program information from - * @return SAMProgram record that represents all the readers - */ - // TODO: this needs to be fixed up to support multiple program records (PIC-15) - private List mergeSAMProgramRecordLists(final Collection readers) { - final boolean programMixed = false; - final List ret = new ArrayList(); - int nextProgramGroupId = 0; - for (final SAMFileReader reader : readers) { - final SAMFileHeader header = reader.getFileHeader(); - final Map idTranslation = new HashMap(); - for (final SAMProgramRecord oldProgramRecord : header.getProgramRecords()) { - boolean foundMatch = false; - for (final SAMProgramRecord newProgramRecord : ret) { - if (newProgramRecord.equivalent(oldProgramRecord)) { - idTranslation.put(oldProgramRecord.getProgramGroupId(), newProgramRecord.getProgramGroupId()); - foundMatch = true; - break; - } - } - if (!foundMatch) { - final SAMProgramRecord newProgramRecord = new SAMProgramRecord(Integer.toString(nextProgramGroupId++)); - copyProgramGroupAttributes(oldProgramRecord, newProgramRecord); - ret.add(newProgramRecord); - idTranslation.put(oldProgramRecord.getProgramGroupId(), newProgramRecord.getProgramGroupId()); - } - } - samProgramGroupIdTranslation.put(reader, idTranslation); - } - return ret; - } - - private void copyProgramGroupAttributes(final SAMProgramRecord oldProgramRecord, final SAMProgramRecord newProgramRecord) { - for (final Map.Entry entry : oldProgramRecord.getAttributes()) { - newProgramRecord.setAttribute(entry.getKey(), entry.getValue()); - } - } - - - /** - * Copies all the attribute of a readgroup to a new readgroup with a new id - * - * @param readGroup the group to be copied - * @param modifiedId the id for the new readgroup - * @return new read group - */ - private SAMReadGroupRecord copyReadGroup(final SAMReadGroupRecord readGroup, final String modifiedId) { - final SAMReadGroupRecord retval = new SAMReadGroupRecord(modifiedId); - retval.setLibrary(readGroup.getLibrary()); - retval.setSample(readGroup.getSample()); - - for (final Map.Entry attr : readGroup.getAttributes()) { - retval.setAttribute(attr.getKey(), attr.getValue()); - } - - return retval; - } - - - /** - * Creates a base 26 representation of an int - * - * @param n int to covert to letter representation - * @return string rep for an int eg 0 = A 27 = AB - */ - protected static String createNewId(int n) { - final int base = ALPHABET.length(); - - String s = ""; - while (true) { - final int r = n % base; - s = ALPHABET.charAt(r) + s; - n = n / base; - if (n == 0) { - return s; - } - n -= 1; - } - } - - /** Returns the read group id that should be used for the input read and RG id. */ - public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { - return this.samGroupIdTranslation.get(reader).get(originalReadGroupId); - } - - /** - * @param reader one of the input files - * @param originalProgramGroupId a program group ID from the above input file - * @return new ID from the merged list of program groups in the output file - */ - public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) { - return this.samProgramGroupIdTranslation.get(reader).get(originalProgramGroupId); - } - - /** Returns true if there are read group duplicates within the merged headers. */ - public boolean hasGroupIdDuplicates() { - return this.hasGroupIdDuplicates; - } - - /** Returns the merged header that should be written to any output merged file. */ - public SAMFileHeader getMergedHeader() { - return this.mergedHeader; - } - - /** Returns the collection of readers that this header merger is working with. */ - public Collection getReaders() { - return this.readers; - } -} diff --git a/java/lib/edu/mit/broad/picard/sam/SamLocusIterator.java b/java/lib/edu/mit/broad/picard/sam/SamLocusIterator.java deleted file mode 100644 index f7a52ae90..000000000 --- a/java/lib/edu/mit/broad/picard/sam/SamLocusIterator.java +++ /dev/null @@ -1,280 +0,0 @@ -package edu.mit.broad.picard.sam; - -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.NotPrimarySkippingIterator; -import edu.mit.broad.picard.directed.GenomeMask; - -import java.util.*; - -/** - * Iterator that traverses a SAM File, accumulating information on a per-locus basis - */ -public class SamLocusIterator implements Iterable, CloseableIterator { - - /** - * The unit of iteration. Holds the locus, plus the base, quality and strand for each read at that locus. - */ - public static class LocusInfo { - protected final int sequenceIndex; - protected final int position; - protected final List bases = new ArrayList(100); - protected final List qualities = new ArrayList(100); - protected final List negativeStrandFlags = new ArrayList(100); - - LocusInfo(final int sequenceIndex, final int position) { - this.sequenceIndex = sequenceIndex; - this.position = position; - } - - /** - * Accumulate info for one read at the locus. - */ - public void add(final Byte readBase, final Byte baseQuality, final boolean strand) { - bases.add(readBase); - qualities.add(baseQuality); - negativeStrandFlags.add(strand); - } - - public int getSequenceIndex() { return sequenceIndex; } - public int getPosition() { return position; } - public List getBases() { return bases; } - public List getQualities() { return qualities; } - public List getNegativeStrandFlags() { return negativeStrandFlags; } - - public String getBasesAsString() { return bytesToString(bases); } - - private static String bytesToString(final List data) { - if (data == null || data.size() == 0) { - return ""; - } - - final char[] chars = new char[data.size()]; - for (int i = 0; i < data.size(); i++) { - chars[i] = (char) (data.get(i) & 0xFF); - } - return new String(chars); - } - } - - - - - private final CloseableIterator underlyingIterator; - private final NotPrimarySkippingIterator it; - private final LinkedList complete = new LinkedList(); - private final LinkedList accumulator = new LinkedList(); - - private boolean includeNonPfReads = false; - private boolean includeDuplicates = false; - private int qualityScoreCutoff = -Integer.MAX_VALUE; - - private GenomeMask mask; - private int lastContig = 0; - private int lastPosition = 0; - - private boolean finishedAlignedReads = false; - - - // this should probably take a SAM - public SamLocusIterator(final CloseableIterator samIterator) { - this.underlyingIterator = samIterator; - this.it = new NotPrimarySkippingIterator(samIterator); - } - - public Iterator iterator() { - return this; - } - - public void close() { - this.underlyingIterator.close(); - } - - private boolean samHasMore() { - return !finishedAlignedReads && it.hasCurrent(); - } - public boolean hasNext() { - return ((complete.size() > 0) || (accumulator.size() > 0) || (samHasMore()) || hasRemainingMaskBases()); - } - - private boolean hasRemainingMaskBases() { - if (mask == null) return false; - - // if there are more contigs in the mask, by definition some of them must have - // marked bases otherwise if we're in the last contig, but we're not at the last marked position, - // there is also more in the mask - return (lastContig <= mask.getMaxContig() || - (lastContig == mask.getMaxContig() && lastPosition <= mask.get(lastContig).nextSetBit(lastPosition+1))); - } - - public LocusInfo next() { - - // if we don't have any completed entries to return, try and make some! - while(complete.size() == 0 && samHasMore()) { - final SAMRecord rec = it.getCurrent(); - final String cigar = rec.getCigarString(); - - // as soon as we hit our first non-aligned read, we can stop! - if (cigar.equals("*")) { - this.finishedAlignedReads = true; - continue; - } - - // skip dupe reads, if so requested - if (!isIncludeDuplicates() && rec.getDuplicateReadFlag()) { it.advance(); continue; } - - // skip non-PF reads, if so requested - if (!isIncludeNonPfReads() && rec.getReadFailsVendorQualityCheckFlag()) { it.advance(); continue; } - - // when we switch contigs, emit everything in the accumulator - if (accumulator.size() > 0 && accumulator.getFirst().sequenceIndex != rec.getReferenceIndex()) { - while (accumulator.size() > 0) { - popLocus(); - } - } - - // pop off things we're not going to accumulate more coverage at the locus in question - while(accumulator.size() > 0 && accumulator.getFirst().position < rec.getAlignmentStart()) { - popLocus(); - } - - // check that it's a non-gapped alignment for now! - // TODO: handle gapped and clipped alignments - if (!cigar.matches("[0-9]+M")) { - System.out.println("Cannot deal with clipped or gapped alignments. CIGAR="+cigar); - System.exit(1); - } - - // at this point, either the list is empty or the head should - // be the same position as the first base of the read - - // interpret the CIGAR string and add the base info - for(int j=0; j < rec.getReadBases().length; j++) { - // if the position is empty, initialize it - if (j > accumulator.size() - 1) { - accumulator.add(new LocusInfo(rec.getReferenceIndex(), rec.getAlignmentStart() + j)); - } - - // if the quality score cutoff is met, accumulate the base info - if (rec.getBaseQualities()[j] >= getQualityScoreCutoff()) { - accumulator.get(j).add(rec.getReadBases()[j], rec.getBaseQualities()[j], rec.getReadNegativeStrandFlag()); - } - } - - - it.advance(); - } - - // if we have nothing to return to the user, and we're at the end of the SAM iterator, - // push everything into the complete queue - if (complete.size() == 0 && !samHasMore()) { - while(accumulator.size() > 0) { - popLocus(); - } - } - - // if there are completed entries, return those - if (complete.size() > 0) { - return complete.removeFirst(); - } else { - - // In this case... we're past the last read from SAM so see if we can - // fill out any more (zero coverage) entries from the mask - LocusInfo zeroResult = null; - while (zeroResult == null && lastContig <= mask.getMaxContig()) { - final int nextbit = mask.get(lastContig).nextSetBit(lastPosition+1); - - // try the next contig - if (nextbit == -1) { - lastContig++; - lastPosition = 0; - } else { - lastPosition = nextbit; - zeroResult = new LocusInfo(lastContig, lastPosition); - } - } - - return zeroResult; - } - } - - /** - * Pop the first entry from the LocusInfo accumulator into the complete queue. In addition, - * check the GenomeMask and if there are intervening mask positions between the last popped base and the one - * about to be popped, put those on the complete queue as well. - */ - private void popLocus() { - final LocusInfo li = accumulator.removeFirst(); - - // fill in any gaps based on our genome mask - final int liContig = li.getSequenceIndex(); - - // if we're not on the same contig, fill in the rest of the bits for the previous contig first... - if (lastContig < liContig) { - while (lastContig < liContig) { - int nextbit = 0; - - if (mask != null && mask.get(lastContig) != null) { - while (nextbit != -1) { - nextbit = mask.get(lastContig).nextSetBit(lastPosition + 1); - if (nextbit > -1) { - complete.addLast(new LocusInfo(lastContig, nextbit)); - lastPosition = nextbit; - } - } - } - lastPosition=0; - lastContig++; - } - } - - // now that we're on the same contig, fill in any unfilled positions - // if we have some bits in the mask to fill in... - if (mask != null && mask.get(lastContig) != null && lastPosition + 1 < li.getPosition()) { - while (lastPosition + 1 < li.getPosition()) { - - final int nextbit = mask.get(lastContig).nextSetBit(lastPosition + 1); - - // if there are no more mask bits, or the next mask bit is - // at or after the current data, just continue on - if (nextbit == -1 || nextbit >= li.getPosition()) { break; } - - // otherwise, pop on the desired empty locus info - complete.addLast(new LocusInfo(lastContig, nextbit)); - lastPosition = nextbit; - } - } - - // only add to the complete queue if it's in the mask (or we have no mask!) - if (mask == null || mask.get(li.getSequenceIndex(), li.getPosition())) { - complete.addLast(li); - } - - lastContig = liContig; - lastPosition = li.getPosition(); - - - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - // -------------------------------------------------------------------------------------------- - // Helper methods below this point... - // -------------------------------------------------------------------------------------------- - - public void setGenomeMask(final GenomeMask mask) { this.mask = mask; } - public GenomeMask getGenomeMask() { return this.mask; } - - public boolean isIncludeNonPfReads() { return includeNonPfReads; } - public void setIncludeNonPfReads(final boolean includeNonPfReads) { this.includeNonPfReads = includeNonPfReads; } - - public boolean isIncludeDuplicates() { return includeDuplicates; } - public void setIncludeDuplicates(final boolean includeDuplicates) { this.includeDuplicates = includeDuplicates; } - - public int getQualityScoreCutoff() { return qualityScoreCutoff; } - public void setQualityScoreCutoff(final int qualityScoreCutoff) { this.qualityScoreCutoff = qualityScoreCutoff; } - - -} diff --git a/java/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java b/java/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java deleted file mode 100644 index 74dd1e12a..000000000 --- a/java/lib/edu/mit/broad/picard/util/AbstractTextFileParser.java +++ /dev/null @@ -1,203 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.util; - -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.sam.util.CloseableIterator; - -import java.util.Iterator; -import java.util.NoSuchElementException; -import java.io.Closeable; - -/** - * Class for parsing text files where each line consists of fields separated by whitespace. - * Code is abstracted into this class so that we can optimize its performance over time. - * - * This class assumes that every line will have the same number of whitespace-separated "words" - * and that lines that start with "#" are comments and should be ignored. - * - * Classes that extend this parser can do so simply by implementing their own constructors and the - * readNextLine(), close(), and getFileName() methods. - * - * @author Kathleen Tibbetts - */ -public abstract class AbstractTextFileParser implements Iterable, CloseableIterator { - - private boolean treatGroupedDelimitersAsOne = true; // Whether multiple delimiters in succession should be treated as one - private byte nextLine[] = null; - private int wordCount = 0; /* The number of delimiter-separated "words" per line of the file. - We can save a little caclulation, or handle files with varying numbers of - words per line, by specifying this if known in advance */ - private boolean iterating = false; - - /** - * Closes this stream and releases any system resources associated with it. - */ - public abstract void close(); - - /** - * @return the next line of text from the underlying stream(s) or null if there is no next line - */ - protected abstract byte[] readNextLine(); - - /** - * @return the name(s) of the file(s) being parsed, or null if no name is available - */ - protected abstract String getFileName(); - - /** - * @return an iterator over a set of elements of type String[] - */ - public Iterator iterator() { - if (iterating) { - throw new IllegalStateException("iterator() method can only be called once, before the" + - "first call to hasNext()"); - } - nextLine = readNextLine(); - iterating = true; - return this; - } - - /** - * Returns true if the iteration has more elements. - * - * @return true if the iteration has more elements. Otherwise returns false. - */ - public boolean hasNext() { - // If this is the start of iteration, queue up the first item - if(!iterating) { - nextLine = readNextLine(); - iterating = true; - } - return nextLine != null; - } - - /** - * Returns the next element in the iteration. - * - * @return the next tlement in the iteration - * @throws java.util.NoSuchElementException - */ - public String[] next() { - - if (!hasNext()) { - throw new NoSuchElementException("Iteration from text file(s) " + - getFileName() + " has no more elements."); - } - - String[] result = parseLine(nextLine); - do { - nextLine = readNextLine(); - } - while (nextLine != null && isComment(nextLine)); - return result; - } - - /** - * This method represents the most efficient way (so far) to parse a line of whitespace-delimited text - * - * @param line the line to parse - * @return an array of all the "words" - */ - private String[] parseLine(byte line[]) { - - if (getWordCount() == 0) { - calculateWordCount(line); - } - String parts[] = new String[getWordCount()]; - boolean delimiter = true; - int index=0; - int start = 0; - - try - { - for (int i = 0; i < line.length; i++) { - if (isDelimiter(line[i])) { - if (!delimiter) { - parts[index++] = new String(line,start,i-start); - } - else if(!isTreatGroupedDelimitersAsOne()) { - parts[index++] = null; - } - delimiter=true; - } - else { - if (delimiter) start = i; - delimiter = false; - } - } - if (!delimiter) { - parts[index] = new String(line,start,line.length-start); - } - } - catch (ArrayIndexOutOfBoundsException e) { - throw new PicardException("Unexpected number of elements found when parsing file " + - this.getFileName() + ": " + index + ". Expected a maximum of " + - this.getWordCount() + " elements per line."); - } - return parts; - } - - /** - * Calculates the number of delimiter-separated "words" in a line and sets the value of wordCount - * - * @param line representative line from the file - */ - protected void calculateWordCount(byte line[]) { - int words = 0; - boolean delimiter = true; - for (byte b : line) { - if (isDelimiter(b)) { - if (delimiter && !isTreatGroupedDelimitersAsOne()) words++; - delimiter = true; - } else { - if (delimiter) words++; - delimiter = false; - } - } - setWordCount(words); - } - - /** - * Required method for Iterator API. - * - * @throws UnsupportedOperationException - */ - public void remove() { - throw new UnsupportedOperationException("Remove() not supported."); - } - - /** - * Determines whether a given line is a comment - * - * @param line the line to evaluate - * @return true if the line is a comment (and should be ignored) otherwise false - */ - protected boolean isComment(byte line[]) { - return line[0] == '#'; - } - - /** - * Determines whether a given character is a delimiter - * - * @param b the character to evaluate - * @return true if b is a delimiter; otherwise false - */ - protected boolean isDelimiter(byte b) { - return b == ' ' || b == '\t'; - } - - protected int getWordCount() { return wordCount; } - protected void setWordCount(int wordCount) { this.wordCount = wordCount; } - protected boolean isTreatGroupedDelimitersAsOne() { return treatGroupedDelimitersAsOne; } - protected void setTreatGroupedDelimitersAsOne(boolean treatGroupedDelimitersAsOne) { - this.treatGroupedDelimitersAsOne = treatGroupedDelimitersAsOne; - } -} diff --git a/java/lib/edu/mit/broad/picard/util/ArrayUtil.java b/java/lib/edu/mit/broad/picard/util/ArrayUtil.java deleted file mode 100644 index 7ca7e3883..000000000 --- a/java/lib/edu/mit/broad/picard/util/ArrayUtil.java +++ /dev/null @@ -1,33 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.util; - -public class ArrayUtil { - - /** - * Reverse the elements of the given array in place - */ - public static void reverseArray(T[] array) { - for (int left=0, right=array.length-1; left files = new ArrayList(); - String currentFileName = null; - - /** - * Constructor. Opens up a buffered reader and reads the first line. - * - * @param files the file(s) to parse, in order - */ - public BasicTextFileParser(boolean treatGroupedDelimitersAsOne, File... files) { - if (files.length == 0) { - throw new IllegalArgumentException("At least one file must be specified."); - } - this.files.addAll(Arrays.asList(files)); - File f = this.files.remove(0); - currentFileName = f.getAbsolutePath(); - reader = new AsciiLineReader(IoUtil.openFileForReading(f)); - this.setTreatGroupedDelimitersAsOne(treatGroupedDelimitersAsOne); - } - - /** - * Constructor. In addition to opening and priming the files, it sets the number of - * whitespace-separated "words" per line. - * - * @param files the file(s) to parse - * @param wordCount number of whitespace-separated "words" per line - */ - public BasicTextFileParser(boolean treatGroupedDelimitersAsOne, int wordCount, File... files) { - this(treatGroupedDelimitersAsOne, files); - setWordCount(wordCount); - } - /** - * Workhorse method that reads the next line from the underlying reader - * - * @return String or null if there is no next line - */ - protected byte[] readNextLine() - { - try { - String line = reader.readLine(); - if (line != null) { - return line.getBytes(); - } - if (files.size() > 0) { - currentFileName = files.get(0).getAbsolutePath(); - reader = new AsciiLineReader(IoUtil.openFileForReading(files.remove(0))); - return readNextLine(); - } - return null; - } - catch(RuntimeIOException ioe) { - throw new PicardException("Error reading from file " + currentFileName, ioe); - } - } - - /** - * Closes the underlying stream - */ - public void close() { - if (reader != null) { - reader.close(); - } - } - - /** - * Gets the name of the file being parsed - * - * @return the name of the file being parsed - */ - protected String getFileName() { - return this.currentFileName; - } -} diff --git a/java/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java b/java/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java deleted file mode 100644 index 909901652..000000000 --- a/java/lib/edu/mit/broad/picard/util/CloseableIteratorWrapper.java +++ /dev/null @@ -1,42 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.util; - -import java.util.Iterator; - -import edu.mit.broad.sam.util.CloseableIterator; - -public class CloseableIteratorWrapper implements CloseableIterator { - Iterator wrappedIterator; - - public CloseableIteratorWrapper(Iterator wrappedIterator) { - this.wrappedIterator = wrappedIterator; - } - - @Override - public boolean hasNext() { - return wrappedIterator.hasNext(); - } - - @Override - public T next() { - return wrappedIterator.next(); - } - - @Override - public void remove() { - wrappedIterator.remove(); - } - - @Override - public void close() { - } -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/picard/util/CloserUtil.java b/java/lib/edu/mit/broad/picard/util/CloserUtil.java deleted file mode 100644 index 8b5f702ef..000000000 --- a/java/lib/edu/mit/broad/picard/util/CloserUtil.java +++ /dev/null @@ -1,50 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.util; - -import java.util.List; -import java.util.Arrays; -import java.io.Closeable; -import java.io.IOException; - -/** - * Utility to close things that implement Closeable - * - * @author Kathleen Tibbetts - */ -public class CloserUtil { - - /** - * Calls close() on obj if it implements Closeable - * - * @param obj The potentially closeable object - */ - public static void close(Object obj) { - close(Arrays.asList(obj)); - } - - /** - * Calls close() on all elements of objs that implement Closeable - * - * @param objs A list of potentially closeable objects - */ - public static void close(List objs) { - for (Object o : objs) { - if (o instanceof Closeable) { - try { - ((Closeable)o).close(); - } - catch (IOException ioe) { - // Do nothing - } - } - } - } -} diff --git a/java/lib/edu/mit/broad/picard/util/CoordMath.java b/java/lib/edu/mit/broad/picard/util/CoordMath.java deleted file mode 100644 index 981b494c0..000000000 --- a/java/lib/edu/mit/broad/picard/util/CoordMath.java +++ /dev/null @@ -1,59 +0,0 @@ -/* - The Broad Institute - SOFTWARE COPYRIGHT NOTICE AGREEMENT - This software and its documentation are copyright 2005 by the - Broad Institute/Massachusetts Institute of Technology. All rights are - reserved. - - This software is supplied without any warranty or guaranteed support - whatsoever. Neither the Broad Institute nor MIT can be responsible for its - use, misuse, or functionality. -*/ -package edu.mit.broad.picard.util; - - -/** - * Basic coordinate-based math utils, so it's encapsulated in one place! Assumes - * a one-based coordinate system and then 'end' is always inclusive - */ -public class CoordMath { - - /** Gets the length of an interval given the start and the end. */ - public static int getLength(int start, int end) { return (end - start) + 1; } - - /** Gets the start of an interval given the end and the length. */ - public static int getStart(int end, int length) { return end - length + 1; } - - /** Gets the end of an interval given the start and the length. */ - public static int getEnd(int start, int length) { return start + length - 1; } - - /** Checks to see if the two sets of coordinates have any overlap. */ - public static boolean overlaps(int start, int end, int start2, int end2) { - return (start2 >= start && start2 <= end) || (end2 >=start && end2 <= end) || - encloses(start2, end2, start, end); - } - - /** Returns true if the "inner" coords and totally enclosed by the "outer" coords. */ - public static boolean encloses(int outerStart, int outerEnd, int innerStart, int innerEnd) { - return innerStart >= outerStart && innerEnd <= outerEnd; - } - - /** - * Determines the amount of overlap between two coordinate ranges. Assumes that the two ranges - * actually do overlap and therefore may produce strange results when they do not! - */ - public static int getOverlap(int start, int end, int start2, int end2) { - return getLength(Math.max(start, start2), Math.min(end, end2)); - } - - /** - * Determines the read cycle number for the base - * - * @param isNegativeStrand true if the read is negative strand - * @param readLength - * @param readBaseIndex the 0-based index of the read base in question - */ - public static int getCycle(boolean isNegativeStrand, int readLength, final int readBaseIndex) { - return isNegativeStrand ? readLength - readBaseIndex : readBaseIndex + 1; - } -} diff --git a/java/lib/edu/mit/broad/picard/util/Coverage.java b/java/lib/edu/mit/broad/picard/util/Coverage.java deleted file mode 100644 index 26212f4fc..000000000 --- a/java/lib/edu/mit/broad/picard/util/Coverage.java +++ /dev/null @@ -1,36 +0,0 @@ -package edu.mit.broad.picard.util; - -/** - * A simple class that is used to store the coverage information about an interval. - * - * @author Tim Fennell - */ -public class Coverage { - private Interval interval; - private short[] depths; - - /** Constructs a new coverage object for the provided mapping with the desired padding either side. */ - public Coverage(Interval i, int padding) { - this.interval = i; - this.depths = new short[interval.length() + 2*padding]; - } - - /** Adds a single point of depth at the desired offset into the coverage array. */ - public void addBase(int offset) { - if (offset >= 0 && offset < this.depths.length) { - this.depths[offset] += 1; - } - } - - /** Returns true if any base in the range has coverage of > 1 */ - public boolean hasCoverage() { - for (short s : depths) { - if (s > 1) return true; - } - - return false; - } - - /** Gets the coverage depths as an array of shorts. */ - public short[] getDepths() { return this.depths; } -} diff --git a/java/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java b/java/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java deleted file mode 100644 index c7ba6c626..000000000 --- a/java/lib/edu/mit/broad/picard/util/CreateAnalysisDirectory.java +++ /dev/null @@ -1,88 +0,0 @@ -package edu.mit.broad.picard.util; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Usage; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.io.IoUtil; - -import java.io.File; -import java.util.Date; -import java.text.SimpleDateFormat; - -/** - * CommandLineProgram to create Picard analysis directory - * - * @author Kathleen Tibbetts - */ -public class CreateAnalysisDirectory extends CommandLineProgram { - - public static final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy_MM_dd"); - - // The following attributes define the command-line arguments - @Usage(programVersion="1.0") - public String USAGE = - "Usage: " + getClass().getName() + " [options]\n\n" + - "Create a new Picard analysis directory.\n"; - - - @Option(shortName = "P", doc = "Analysis directory prefix. ") - public String PREFIX = "/seq/picard"; - - @Option(shortName = "F", doc = "The flowcell. ") - public String FLOWCELL; - - @Option(shortName = "A", doc = "The first cycle being analyzed. ") - public Integer FIRST_CYCLE = 1; - - @Option(shortName = "O", doc = "The last cycle being analyzed. ") - public Integer LAST_CYCLE; - - @Option(shortName = "R", doc = "The run date in the format MM/dd/yyyy. ") - public Date RUNDATE; - - @Option(shortName = "L", doc = "Lane number. ") - public Integer LANE; - - @Option(shortName="LIB", doc = "Library this analysis is for (e.g. 'Solexa-1234'). ") - public String LIBRARY; - - @Option(shortName="S", doc = "Analysis start date in the format MM/dd/yyyy") - public Date ANALYSIS_START_DATE; - - @Override - protected int doWork() { - if (PREFIX.charAt(PREFIX.length()-1) == '/') { - PREFIX = PREFIX.substring(0, PREFIX.length()-1); - } - IoUtil.assertDirectoryIsWritable(new File(PREFIX)); - String parts[] = { PREFIX, FLOWCELL, "C" + FIRST_CYCLE + "-" + LAST_CYCLE + "_" + - dateFormat.format(RUNDATE) + "_" + dateFormat.format(ANALYSIS_START_DATE), - String.valueOf(LANE), LIBRARY }; - String directory = null; - - for (int i = 1; i < parts.length; i++) { - StringBuilder sb = new StringBuilder(); - for (int j=0; j <= i; j++) { - sb.append(parts[j]).append("/"); - } - directory = sb.toString(); - File dir = new File(directory); - if (!dir.exists()) { - if (!dir.mkdir()) { - System.err.println("Unable to create directory " + directory); - return 1; - } - } - } - System.out.print(directory); - return 0; - } - - public static void main(String[] argv) { - CreateAnalysisDirectory cmd = new CreateAnalysisDirectory(); - cmd.QUIET = true; - System.exit(cmd.instanceMain(argv)); - } - - -} diff --git a/java/lib/edu/mit/broad/picard/util/FormatUtil.java b/java/lib/edu/mit/broad/picard/util/FormatUtil.java deleted file mode 100644 index 94816c1fe..000000000 --- a/java/lib/edu/mit/broad/picard/util/FormatUtil.java +++ /dev/null @@ -1,135 +0,0 @@ -package edu.mit.broad.picard.util; - -import edu.mit.broad.picard.PicardException; - -import java.security.InvalidParameterException; -import java.text.DateFormat; -import java.text.NumberFormat; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.math.RoundingMode; - -/** - * Simple class used to format object values into a standard format for printing. - * - * @author Tim Fennell - */ -public class FormatUtil { - private DateFormat dateFormat; - private NumberFormat integerFormat; - private NumberFormat floatFormat; - - /** Constructs a new FormatUtil and initializes various internal formatters. */ - public FormatUtil() { - this.dateFormat = new SimpleDateFormat("yyyy-MM-dd"); - - this.integerFormat = NumberFormat.getIntegerInstance(); - this.integerFormat.setGroupingUsed(false); - - this.floatFormat = NumberFormat.getNumberInstance(); - this.floatFormat.setGroupingUsed(false); - this.floatFormat.setMaximumFractionDigits(6); - this.floatFormat.setRoundingMode(RoundingMode.HALF_DOWN); - } - - /** Formats a short to an integer string. */ - public String format(short value) { return this.integerFormat.format(value); } - - /** Formats an int to an integer string. */ - public String format(int value) { return this.integerFormat.format(value); } - - /** Formats a long to an integer string. */ - public String format(long value) { return this.integerFormat.format(value); } - - /** Formats a float to a floating point string. */ - public String format(float value) {return this.floatFormat.format(value); } - - /** Formats a double to a floating point string. */ - public String format(double value) {return this.floatFormat.format(value); } - - /** Formats an enum to the String representation of an enum. */ - public String format(Enum value) { return value.name(); } - - /** Formats a date to a date string without time. */ - public String format(Date value) { return this.dateFormat.format(value); } - - /** Formats a boolean value to a String. */ - public String format(boolean value) { if (value) return "Y"; else return "N"; } - - /** Attempts to determine the type of value and format it appropriately. */ - public String format(Object value) { - if (value == null) return ""; - if (value instanceof Short) return format( ((Short) value).shortValue() ); - if (value instanceof Integer) return format( ((Integer) value).intValue() ); - if (value instanceof Long) return format( ((Long) value).longValue() ); - if (value instanceof Float) return format( ((Float) value).floatValue() ); - if (value instanceof Double) return format( ((Double) value).doubleValue() ); - if (value instanceof Enum) return format( ((Enum) value) ); - if (value instanceof Date) return format( ((Date) value) ); - if (value instanceof Boolean) return format( ((Boolean) value).booleanValue() ); - return value.toString(); - } - - /////////////////////////////////////////////////////////////////////////// - // Parsing methods - /////////////////////////////////////////////////////////////////////////// - - /** Parses a String into a short. */ - public short parseShort(String value) { return Short.parseShort(value); } - - /** Parses a String into an int. */ - public int parseInt(String value) { return Integer.parseInt(value); } - - /** Parses a String into a long. */ - public long parseLong(String value) { return Long.parseLong(value); } - - /** Parses a String into a float. */ - public float parseFloat(String value) { return Float.parseFloat(value); } - - /** Parses a String into a double. */ - public double parseDouble(String value) { return Double.parseDouble(value); } - - /** Parses a String into an Enum of the given type. */ - public E parseEnum(String value, Class type) { return (E) Enum.valueOf(type, value); } - - /** Parses a String into a date. */ - public Date parseDate(String value) { - try { - return this.dateFormat.parse(value); - } - catch (ParseException pe) { - throw new PicardException("Could not parse value as date: " + value, pe); - } - } - - /** Parses a String into a boolean. */ - public boolean parseBoolean(String value) { - if (value == null || value.length() == 0) return false; - char ch = Character.toUpperCase(value.charAt(0)); - - return (ch == 'Y'); - } - - /** - * Attempts to determine the correct parse method to call based on the desired - * return type and then parses the String and returns the value. - * - * @param value the String value to be parsed - * @param returnType the desired return type - * @return an object of the returnType - */ - public Object parseObject(String value, Class returnType) { - if (returnType == Short.class || returnType == Short.TYPE) return parseShort(value); - if (returnType == Integer.class || returnType == Integer.TYPE) return parseInt(value); - if (returnType == Long.class || returnType == Long.TYPE) return parseLong(value); - if (returnType == Float.class || returnType == Float.TYPE) return parseFloat(value); - if (returnType == Double.class || returnType == Double.TYPE) return parseDouble(value); - if (returnType == Boolean.class || returnType == Boolean.TYPE) return parseBoolean(value); - if (returnType == Date.class) return parseDate(value); - if (Enum.class.isAssignableFrom(returnType)) return parseEnum(value, (Class)returnType); - if (returnType == String.class) return value; - - throw new InvalidParameterException("Don't know how to convert a String to a " + returnType.getName()); - } -} diff --git a/java/lib/edu/mit/broad/picard/util/Histogram.java b/java/lib/edu/mit/broad/picard/util/Histogram.java deleted file mode 100644 index 3d1f3f807..000000000 --- a/java/lib/edu/mit/broad/picard/util/Histogram.java +++ /dev/null @@ -1,152 +0,0 @@ -package edu.mit.broad.picard.util; - -import edu.mit.broad.picard.util.Histogram.Bin; - -import java.util.TreeMap; - -/** - * Class for computing and accessing histogram type data. Stored internally in - * a sorted Map so that keys can be iterated in order. - * - * @author Tim Fennell - */ -public class Histogram extends TreeMap { - private String binLabel = "BIN"; - private String valueLabel = "VALUE"; - private double count = 0; - private Double mean; - - /** Constructs a new Histogram with default bin and value labels. */ - public Histogram() { } - - /** Constructs a new Histogram with supplied bin and value labels. */ - public Histogram(String binLabel, String valueLabel) { - this.binLabel = binLabel; - this.valueLabel = valueLabel; - } - - /** Represents a bin in the Histogram. */ - public class Bin { - private final K id; - private double value = 0; - - /** Constructs a new bin with the given ID. */ - private Bin(K id) { this.id = id; } - - /** Gets the ID of this bin. */ - public K getId() { return id; } - - /** Gets the value in the bin. */ - public double getValue() { return value; } - - /** Returns the String format for the value in the bin. */ - public String toString() { return String.valueOf(this.value); } - - /** Checks the equality of the bin by ID and value. */ - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - Bin bin = (Bin) o; - - if (Double.compare(bin.value, value) != 0) return false; - if (!id.equals(bin.id)) return false; - - return true; - } - - public double getIdValue() { - if (id instanceof Number) { - return ((Number) id).doubleValue(); - } else { - throw new UnsupportedOperationException("getIdValue only supported for Histogram"); - } - } - } - - /** Prefill the histogram with the supplied set of bins. */ - public void prefillBins(K... ids) { - for (K id : ids) { - put(id, new Bin(id)); - } - } - - /** Increments the value in the designated bin by 1. */ - public void increment(K id) { - increment(id, 1d); - } - - /** Increments the value in the designated bin by the supplied increment. */ - public void increment(K id, double increment) { - Bin bin = get(id); - if (bin == null) { - bin = new Bin(id); - put(id, bin); - } - - bin.value += increment; - count += increment; - mean = null; - } - - public String getBinLabel() { return binLabel; } - public void setBinLabel(String binLabel) { this.binLabel = binLabel; } - - public String getValueLabel() { return valueLabel; } - public void setValueLabel(String valueLabel) { this.valueLabel = valueLabel; } - - /** Checks that the labels and values in the two histograms are identical. */ - public boolean equals(Object o) { - return o != null && - (o instanceof Histogram) && - ((Histogram) o).binLabel.equals(this.binLabel) && - ((Histogram) o).valueLabel.equals(this.valueLabel) && - super.equals(o); - } - - public double getMean() { - if (mean == null) { - double total = 0; - for (Bin bin : values()) { - total += bin.getValue() * bin.getIdValue(); - } - - mean = total / count; - } - - return mean; - } - - public double getStandardDeviation() { - double total = 0; - for (Bin bin : values()) { - total += bin.getValue() * bin.getIdValue() * bin.getIdValue(); - } - - return Math.sqrt((total / count) - (getMean() * getMean())); - } - - public double getMedian() { - double total = 0; - double halfCount = count / 2; - for (Bin bin : values()) { - total += bin.getValue(); - if (total >= halfCount) { - return bin.getIdValue(); - } - } - return 0; - } - - public double getMin() { - return firstEntry().getValue().getIdValue(); - } - - public double getMax() { - return lastEntry().getValue().getIdValue(); - } - - public double getCount() { - return count; - } -} diff --git a/java/lib/edu/mit/broad/picard/util/Interval.java b/java/lib/edu/mit/broad/picard/util/Interval.java deleted file mode 100644 index 79a091807..000000000 --- a/java/lib/edu/mit/broad/picard/util/Interval.java +++ /dev/null @@ -1,139 +0,0 @@ -package edu.mit.broad.picard.util; - -import edu.mit.broad.picard.PicardException; - -import java.util.List; -import java.util.Collection; - -/** - * Represents a simple interval on a sequence. Coordinates are 1-based closed ended. - * - * @author Tim Fennell - */ -public class Interval implements Comparable, Cloneable { - private String sequence; - private int start; - private int end; - private boolean negativeStrand; - private String name; - - /** - * Constructs an interval with the supplied sequence and start and end. If the end - * position is less than the start position an exception is thrown. - * - * @param sequence the name of the sequence - * @param start the start position of the interval on the sequence - * @param end the end position of the interval on the sequence - */ - public Interval(String sequence, int start, int end) { - this.sequence = sequence; - this.start = start; - this.end = end; - - if (this.end < this.start) throw new IllegalArgumentException("start must be less than or equal to end!"); - } - - /** - * Constructs an interval with the supplied sequence and start, end, strand and name. - * If the end position is less than the start position an exception is thrown. - * - * @param sequence the name of the sequence - * @param start the start position of the interval on the sequence - * @param end the end position of the interval on the sequence - * @param negative true to indicate negative strand, false otherwise - * @param name the name (possibly null) of the interval - * - */ - public Interval(String sequence, int start, int end, boolean negative, String name) { - this(sequence, start, end); - this.negativeStrand = negative; - this.name = name; - } - - /** Gets the name of the sequence on which the interval resides. */ - public String getSequence() { return sequence; } - - /** Gets the 1-based start position of the interval on the sequence. */ - public int getStart() { return start; } - - /** Gets the 1-based closed-ended end position of the interval on the sequence. */ - public int getEnd() { return end; } - - /** Returns true if the interval is on the negative strand, otherwise false. */ - public boolean isNegativeStrand() { return this.negativeStrand; } - - /** Returns true if the interval is on the positive strand, otherwise false. */ - public boolean isPositiveStrand() { return !this.negativeStrand; } - - /** Returns the name of the interval, possibly null. */ - public String getName() { return this.name; } - - /** Returns true if this interval overlaps the other interval, otherwise false. */ - public boolean intersects(Interval other) { - return (this.getSequence().equals(other.getSequence()) && - CoordMath.overlaps(this.start, this.end, other.start, other.end)); - } - - /** Returns true if this interval overlaps the other interval, otherwise false. */ - public boolean abuts(Interval other) { - return this.getSequence().equals(other.getSequence()) && - (this.start == other.end + 1 || other.start == this.end + 1); - } - - /** Gets the length of this interval. */ - public int length() { return this.end - this.start + 1; } - - /** Counts the total number of bases a collection of intervals. */ - public static long countBases(Collection intervals) { - long total = 0; - for (Interval i : intervals) { - total += i.length(); - } - - return total; - } - - - /** - * Sort based on sequence.compareTo, then start pos, then end pos - * with null objects coming lexically last - */ - public int compareTo(Interval that) { - if (that == null) return -1; // nulls last - - int result = this.getSequence().compareTo(that.getSequence()); - if (result == 0) { - if (this.start == that.start) { - result = this.end - that.end; - } - else { - result = this.start - that.start; - } - } - - return result; - } - - /** Equals method that agrees with {@link #compareTo(Interval)}. */ - public boolean equals(Interval that) { - return (this.compareTo(that) == 0); - } - - public int hashCode() { - int result; - result = sequence.hashCode(); - result = 31 * result + (start ^ (start >>> 32)); - result = 31 * result + (end ^ (end >>> 32)); - return result; - } - - public String toString() { - return getSequence() + ":" + start + "-" + end; - } - - @Override - public Interval clone() { - try { return (Interval) super.clone(); } - catch (CloneNotSupportedException cnse) { throw new PicardException("That's unpossible", cnse); } - } -} diff --git a/java/lib/edu/mit/broad/picard/util/IntervalTree.java b/java/lib/edu/mit/broad/picard/util/IntervalTree.java deleted file mode 100644 index 8821b25d0..000000000 --- a/java/lib/edu/mit/broad/picard/util/IntervalTree.java +++ /dev/null @@ -1,1304 +0,0 @@ -/* - * $Id: IntervalTree.java 51146 2007-11-05 17:48:24Z tsharpe $ - * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT - * Software and documentation are copyright 2005 by the Broad Institute. - * All rights are reserved. - * - * Users acknowledge that this software is supplied without any warranty or support. - * The Broad Institute is not responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.picard.util; - -import java.util.ConcurrentModificationException; -import java.util.Iterator; -import java.util.NoSuchElementException; - -/** - * A Red-Black tree with intervals for keys. - * Not thread-safe, and cannot be made so. - * - * 7/24/2008: This was copied from the tedUtils package. - * IMPORTANT!!! It has been modified to use the Reseq way of - * handling coordinates (end-inclusive). - * - * @author tsharpe - */ -public class IntervalTree implements Iterable> -{ - /** - * Return the number of intervals in the tree. - * @return The number of intervals. - */ - public int size() - { - return mRoot == null ? 0 : mRoot.getSize(); - } - - /** - * Remove all entries. - */ - public void clear() - { - mRoot = null; - } - - /** - * Put a new interval into the tree (or update the value associated with an existing interval). - * If the interval is novel, the special sentinel value is returned. - * @param interval The interval. - * @param value The associated value. - * @return The old value associated with that interval, or the sentinel. - */ - public V put( HalfOpenInterval interval, V value ) - { - return put(interval.getStart(),interval.getEnd(),value); - } - - /** - * Put a new interval into the tree (or update the value associated with an existing interval). - * If the interval is novel, the special sentinel value is returned. - * @param start The interval's start. - * @param end The interval's end. - * @param value The associated value. - * @return The old value associated with that interval, or the sentinel. - */ - @SuppressWarnings("null") - public V put( int start, int end, V value ) - { - if ( start > end ) - throw new IllegalArgumentException("Start cannot exceed end."); - - V result = mSentinel; - - if ( mRoot == null ) - { - mRoot = new Node(start,end,value); - } - else - { - Node parent = null; - Node node = mRoot; - int cmpVal = 0; - - while ( node != null ) - { - parent = node; // last non-null node - cmpVal = node.compare(start,end); - if ( cmpVal == 0 ) - { - break; - } - - node = cmpVal < 0 ? node.getLeft() : node.getRight(); - } - - if ( cmpVal == 0 ) - { - result = parent.setValue(value); - } - else - { - if ( cmpVal < 0 ) - { - mRoot = parent.insertLeft(start,end,value,mRoot); - } - else - { - mRoot = parent.insertRight(start,end,value,mRoot); - } - } - } - - return result; - } - - /** - * Remove an interval from the tree. If the interval does not exist in the tree the - * special sentinel value is returned. - * @param interval The interval to remove. - * @return The value associated with that interval, or the sentinel. - */ - public V remove( HalfOpenInterval interval ) - { - return remove(interval.getStart(),interval.getEnd()); - } - - /** - * Remove an interval from the tree. If the interval does not exist in the tree the - * special sentinel value is returned. - * @param start The interval's start. - * @param end The interval's end. - * @return The value associated with that interval, or the sentinel. - */ - public V remove( int start, int end ) - { - V result = mSentinel; - Node node = mRoot; - - while ( node != null ) - { - int cmpVal = node.compare(start,end); - if ( cmpVal == 0 ) - { - result = node.getValue(); - mRoot = node.remove(mRoot); - break; - } - - node = cmpVal < 0 ? node.getLeft() : node.getRight(); - } - - return result; - } - - /** - * Find an interval. - * @param interval The interval sought. - * @return The Node that represents that interval, or null. - */ - public Node find( HalfOpenInterval interval ) - { - return find(interval.getStart(),interval.getEnd()); - } - - /** - * Find an interval. - * @param start The interval's start. - * @param end The interval's end. - * @return The Node that represents that interval, or null. - */ - public Node find( int start, int end ) - { - Node node = mRoot; - - while ( node != null ) - { - int cmpVal = node.compare(start,end); - if ( cmpVal == 0 ) - { - break; - } - - node = cmpVal < 0 ? node.getLeft() : node.getRight(); - } - - return node; - } - - /** - * Find the nth interval in the tree. - * @param idx The rank of the interval sought (from 0 to size()-1). - * @return The Node that represents the nth interval. - */ - public Node findByIndex( int idx ) - { - return Node.findByRank(mRoot,idx+1); - } - - /** - * Find the rank of the specified interval. If the specified interval is not in the - * tree, then -1 is returned. - * @param interval The interval for which the index is sought. - * @return The rank of that interval, or -1. - */ - public int getIndex( HalfOpenInterval interval ) - { - return getIndex(interval.getStart(),interval.getEnd()); - } - - /** - * Find the rank of the specified interval. If the specified interval is not in the - * tree, then -1 is returned. - * @param start The interval's start. - * @param end The interval's end. - * @return The rank of that interval, or -1. - */ - public int getIndex( int start, int end ) - { - return Node.getRank(mRoot,start,end) - 1; - } - - /** - * Find the least interval in the tree. - * @return The earliest interval, or null if the tree is empty. - */ - public Node min() - { - Node result = null; - Node node = mRoot; - - while ( node != null ) - { - result = node; - node = node.getLeft(); - } - - return result; - } - - /** - * Find the earliest interval in the tree greater than or equal to the specified interval. - * @param interval The interval sought. - * @return The earliest >= interval, or null if there is none. - */ - public Node min( HalfOpenInterval interval ) - { - return min(interval.getStart(),interval.getEnd()); - } - - /** - * Find the earliest interval in the tree greater than or equal to the specified interval. - * @param start The interval's start. - * @param end The interval's end. - * @return The earliest >= interval, or null if there is none. - */ - @SuppressWarnings("null") - public Node min( int start, int end ) - { - Node result = null; - Node node = mRoot; - int cmpVal = 0; - - while ( node != null ) - { - result = node; - cmpVal = node.compare(start,end); - if ( cmpVal == 0 ) - { - break; - } - - node = cmpVal < 0 ? node.getLeft() : node.getRight(); - } - - if ( cmpVal > 0 ) - { - result = result.getNext(); - } - - return result; - } - - /** - * Find the earliest interval in the tree that overlaps the specified interval. - * @param interval The interval sought. - * @return The earliest overlapping interval, or null if there is none. - */ - public Node minOverlapper( HalfOpenInterval interval ) - { - return minOverlapper(interval.getStart(),interval.getEnd()); - } - - /** - * Find the earliest interval in the tree that overlaps the specified interval. - * @param start The interval's start. - * @param end The interval's end. - * @return The earliest overlapping interval, or null if there is none. - */ - public Node minOverlapper( int start, int end ) - { - Node result = null; - Node node = mRoot; - - if ( node != null && node.getMaxEnd() >= start ) - { - while ( true ) - { - if ( node.getStart() <= end && start <= node.getEnd() ) - { // this node overlaps. there might be a lesser overlapper down the left sub-tree. - // no need to consider the right sub-tree: even if there's an overlapper, if won't be minimal - result = node; - node = node.getLeft(); - if ( node == null || node.getMaxEnd() < start ) - break; // no left sub-tree or all nodes end too early - } - else - { // no overlap. if there might be a left sub-tree overlapper, consider the left sub-tree. - Node left = node.getLeft(); - if ( left != null && left.getMaxEnd() >= start ) - { - node = left; - } - else - { // left sub-tree cannot contain an overlapper. consider the right sub-tree. - if ( node.getStart() > end ) - break; // everything in the right sub-tree is past the end of the query interval - - node = node.getRight(); - if ( node == null || node.getMaxEnd() < start ) - break; // no right sub-tree or all nodes end too early - } - } - } - } - - return result; - } - - /** - * Find the greatest interval in the tree. - * @return The latest interval, or null if the tree is empty. - */ - public Node max() - { - Node result = null; - Node node = mRoot; - - while ( node != null ) - { - result = node; - node = node.getRight(); - } - - return result; - } - - /** - * Find the latest interval in the tree less than or equal to the specified interval. - * @param interval The interval sought. - * @return The latest <= interval, or null if there is none. - */ - public Node max( HalfOpenInterval interval ) - { - return max(interval.getStart(),interval.getEnd()); - } - - /** - * Find the latest interval in the tree less than or equal to the specified interval. - * @param start The interval's start. - * @param end The interval's end. - * @return The latest >= interval, or null if there is none. - */ - @SuppressWarnings("null") - public Node max( int start, int end ) - { - Node result = null; - Node node = mRoot; - int cmpVal = 0; - - while ( node != null ) - { - result = node; - cmpVal = node.compare(start,end); - if ( cmpVal == 0 ) - { - break; - } - - node = cmpVal < 0 ? node.getLeft() : node.getRight(); - } - - if ( cmpVal < 0 ) - { - result = result.getPrev(); - } - - return result; - } - - /** - * Return an iterator over the entire tree. - * @return An iterator. - */ - public Iterator> iterator() - { - return new FwdIterator(min()); - } - - /** - * Return an iterator over all intervals greater than or equal to the specified interval. - * @param interval The minimum interval. - * @return An iterator. - */ - public Iterator> iterator( HalfOpenInterval interval ) - { - return new FwdIterator(min(interval.getStart(),interval.getEnd())); - } - - /** - * Return an iterator over all intervals greater than or equal to the specified interval. - * @param start The interval's start. - * @param end The interval's end. - * @return An iterator. - */ - public Iterator> iterator( int start, int end ) - { - return new FwdIterator(min(start,end)); - } - - /** - * Return an iterator over all intervals overlapping the specified range. - * @param start The range start. - * @param end The range end. - * @return An iterator. - */ - public Iterator> overlappers( int start, int end ) - { - return new OverlapIterator(start,end); - } - - /** - * Return an iterator over the entire tree that returns intervals in reverse order. - * @return An iterator. - */ - public Iterator> reverseIterator() - { - return new RevIterator(max()); - } - - /** - * Return an iterator over all intervals less than or equal to the specified interval, in reverse order. - * @param interval The maximum interval. - * @return An iterator. - */ - public Iterator> reverseIterator( HalfOpenInterval interval ) - { - return new RevIterator(max(interval.getStart(),interval.getEnd())); - } - - /** - * Return an iterator over all intervals less than or equal to the specified interval, in reverse order. - * @param start The interval's start. - * @param end The interval's end. - * @return An iterator. - */ - public Iterator> reverseIterator( int start, int end ) - { - return new RevIterator(max(start,end)); - } - - /** - * Get the special sentinel value that will be used to signal novelty when putting a new interval - * into the tree, or to signal "not found" when removing an interval. This is null by default. - * @return The sentinel value. - */ - public V getSentinel() - { - return mSentinel; - } - - /** - * Set the special sentinel value that will be used to signal novelty when putting a new interval - * into the tree, or to signal "not found" when removing an interval. - * @param sentinel The new sentinel value. - * @return The old sentinel value. - */ - public V setSentinel( V sentinel ) - { - V result = mSentinel; - mSentinel = sentinel; - return result; - } - - void removeNode( Node node ) - { - mRoot = node.remove(mRoot); - } - - private Node mRoot; - private V mSentinel; - - public static class Node - implements HalfOpenInterval - { - Node( int start, int end, V1 value ) - { - mStart = start; - mEnd = end; - mValue = value; - mSize = 1; - mMaxEnd = mEnd; - mIsBlack = true; - } - - Node( Node parent, int start, int end, V1 value ) - { - mParent = parent; - mStart = start; - mEnd = end; - mValue = value; - mMaxEnd = mEnd; - mSize = 1; - } - - public int getStart() - { - return mStart; - } - - public int getEnd() - { - return mEnd; - } - - public int getLength() - { - return mEnd - mStart; - } - - public int getRelationship( HalfOpenInterval interval ) - { - int result = 0; - if ( mStart < interval.getStart() ) - result = HalfOpenInterval.HAS_LESSER_PART; - if ( mEnd > interval.getEnd() ) - result |= HalfOpenInterval.HAS_GREATER_PART; - if ( mStart < interval.getEnd() && interval.getStart() < mEnd ) - result |= HalfOpenInterval.HAS_OVERLAPPING_PART; - return result; - } - - public boolean isAdjacent( HalfOpenInterval interval ) - { - return mStart == interval.getEnd() || mEnd == interval.getStart(); - } - - public V1 getValue() - { - return mValue; - } - - public V1 setValue( V1 value ) - { - V1 result = mValue; - mValue = value; - return result; - } - - int getSize() - { - return mSize; - } - - int getMaxEnd() - { - return mMaxEnd; - } - - Node getLeft() - { - return mLeft; - } - - Node insertLeft( int start, int end, V1 value, Node root ) - { - mLeft = new Node(this,start,end,value); - return insertFixup(mLeft,root); - } - - Node getRight() - { - return mRight; - } - - Node insertRight( int start, int end, V1 value, Node root ) - { - mRight = new Node(this,start,end,value); - return insertFixup(mRight,root); - } - - Node getNext() - { - Node result; - - if ( mRight != null ) - { - result = mRight; - while ( result.mLeft != null ) - { - result = result.mLeft; - } - } - else - { - Node node = this; - result = mParent; - while ( result != null && node == result.mRight ) - { - node = result; - result = result.mParent; - } - } - - return result; - } - - Node getPrev() - { - Node result; - - if ( mLeft != null ) - { - result = mLeft; - while ( result.mRight != null ) - { - result = result.mRight; - } - } - else - { - Node node = this; - result = mParent; - while ( result != null && node == result.mLeft ) - { - node = result; - result = result.mParent; - } - } - - return result; - } - - boolean wasRemoved() - { - return mSize == 0; - } - - Node remove( Node root ) - { - if ( mSize == 0 ) - { - throw new IllegalStateException("Entry was already removed."); - } - - if ( mLeft == null ) - { - if ( mRight == null ) - { // no children - if ( mParent == null ) - { - root = null; - } - else if ( mParent.mLeft == this ) - { - mParent.mLeft = null; - fixup(mParent); - - if ( mIsBlack ) - root = removeFixup(mParent,null,root); - } - else - { - mParent.mRight = null; - fixup(mParent); - - if ( mIsBlack ) - root = removeFixup(mParent,null,root); - } - } - else - { // single child on right - root = spliceOut(mRight,root); - } - } - else if ( mRight == null ) - { // single child on left - root = spliceOut(mLeft,root); - } - else - { // two children - Node next = getNext(); - root = next.remove(root); - - // put next into tree in same position as this, effectively removing this - if ( (next.mParent = mParent) == null ) - root = next; - else if ( mParent.mLeft == this ) - mParent.mLeft = next; - else - mParent.mRight = next; - - if ( (next.mLeft = mLeft) != null ) - { - mLeft.mParent = next; - } - - if ( (next.mRight = mRight) != null ) - { - mRight.mParent = next; - } - - next.mIsBlack = mIsBlack; - next.mSize = mSize; - } - - mSize = 0; - return root; - } - - // backwards comparison! compares start+end to this. - int compare( int start, int end ) - { - int result = 0; - - if ( start > mStart ) - result = 1; - else if ( start < mStart ) - result = -1; - else if ( end > mEnd ) - result = 1; - else if ( end < mEnd ) - result = -1; - - return result; - } - - @SuppressWarnings("null") - static Node getNextOverlapper( Node node, int start, int end ) - { - do - { - Node nextNode = node.mRight; - if ( nextNode != null && nextNode.mMaxEnd >= start ) - { - node = nextNode; - while ( (nextNode = node.mLeft) != null && nextNode.mMaxEnd >= start ) - node = nextNode; - } - else - { - nextNode = node; - while ( (node = nextNode.mParent) != null && node.mRight == nextNode ) - nextNode = node; - } - - if ( node != null && node.mStart > end ) - node = null; - } - while ( node != null && !(node.mStart <= end && start <= node.mEnd) ); - - return node; - } - - static Node findByRank( Node node, int rank ) - { - while ( node != null ) - { - int nodeRank = node.getRank(); - if ( rank == nodeRank ) - break; - - if ( rank < nodeRank ) - { - node = node.mLeft; - } - else - { - node = node.mRight; - rank -= nodeRank; - } - } - - return node; - } - - static int getRank( Node node, int start, int end ) - { - int rank = 0; - - while ( node != null ) - { - int cmpVal = node.compare(start,end); - if ( cmpVal < 0 ) - { - node = node.mLeft; - } - else - { - rank += node.getRank(); - if ( cmpVal == 0 ) - return rank; // EARLY RETURN!!! - - node = node.mRight; - } - } - - return 0; - } - - private int getRank() - { - int result = 1; - if ( mLeft != null ) - result = mLeft.mSize + 1; - return result; - } - - private Node spliceOut( Node child, Node root ) - { - if ( (child.mParent = mParent) == null ) - { - root = child; - child.mIsBlack = true; - } - else - { - if ( mParent.mLeft == this ) - mParent.mLeft = child; - else - mParent.mRight = child; - fixup(mParent); - - if ( mIsBlack ) - root = removeFixup(mParent,child,root); - } - - return root; - } - - private Node rotateLeft( Node root ) - { - Node child = mRight; - - int childSize = child.mSize; - child.mSize = mSize; - mSize -= childSize; - - if ( (mRight = child.mLeft) != null ) - { - mRight.mParent = this; - mSize += mRight.mSize; - } - - if ( (child.mParent = mParent) == null ) - root = child; - else if ( this == mParent.mLeft ) - mParent.mLeft = child; - else - mParent.mRight = child; - - child.mLeft = this; - mParent = child; - - setMaxEnd(); - child.setMaxEnd(); - - return root; - } - - private Node rotateRight( Node root ) - { - Node child = mLeft; - - int childSize = child.mSize; - child.mSize = mSize; - mSize -= childSize; - - if ( (mLeft = child.mRight) != null ) - { - mLeft.mParent = this; - mSize += mLeft.mSize; - } - - if ( (child.mParent = mParent) == null ) - root = child; - else if ( this == mParent.mLeft ) - mParent.mLeft = child; - else - mParent.mRight = child; - - child.mRight = this; - mParent = child; - - setMaxEnd(); - child.setMaxEnd(); - - return root; - } - - private void setMaxEnd() - { - mMaxEnd = mEnd; - if ( mLeft != null ) - mMaxEnd = Math.max(mMaxEnd,mLeft.mMaxEnd); - if ( mRight != null ) - mMaxEnd = Math.max(mMaxEnd,mRight.mMaxEnd); - } - - private static void fixup( Node node ) - { - do - { - node.mSize = 1; - node.mMaxEnd = node.mEnd; - if ( node.mLeft != null ) - { - node.mSize += node.mLeft.mSize; - node.mMaxEnd = Math.max(node.mMaxEnd,node.mLeft.mMaxEnd); - } - if ( node.mRight != null ) - { - node.mSize += node.mRight.mSize; - node.mMaxEnd = Math.max(node.mMaxEnd,node.mRight.mMaxEnd); - } - } - while ( (node = node.mParent) != null ); - } - - private static Node insertFixup( Node daughter, Node root ) - { - Node mom = daughter.mParent; - fixup(mom); - - while( mom != null && !mom.mIsBlack ) - { - Node gramma = mom.mParent; - Node auntie = gramma.mLeft; - if ( auntie == mom ) - { - auntie = gramma.mRight; - if ( auntie != null && !auntie.mIsBlack ) - { - mom.mIsBlack = true; - auntie.mIsBlack = true; - gramma.mIsBlack = false; - daughter = gramma; - } - else - { - if ( daughter == mom.mRight ) - { - root = mom.rotateLeft(root); - mom = daughter; - } - mom.mIsBlack = true; - gramma.mIsBlack = false; - root = gramma.rotateRight(root); - break; - } - } - else - { - if ( auntie != null && !auntie.mIsBlack ) - { - mom.mIsBlack = true; - auntie.mIsBlack = true; - gramma.mIsBlack = false; - daughter = gramma; - } - else - { - if ( daughter == mom.mLeft ) - { - root = mom.rotateRight(root); - mom = daughter; - } - mom.mIsBlack = true; - gramma.mIsBlack = false; - root = gramma.rotateLeft(root); - break; - } - } - mom = daughter.mParent; - } - root.mIsBlack = true; - return root; - } - - private static Node removeFixup( Node parent, Node node, Node root ) - { - do - { - if ( node == parent.mLeft ) - { - Node sister = parent.mRight; - if ( !sister.mIsBlack ) - { - sister.mIsBlack = true; - parent.mIsBlack = false; - root = parent.rotateLeft(root); - sister = parent.mRight; - } - if ( (sister.mLeft == null || sister.mLeft.mIsBlack) && (sister.mRight == null || sister.mRight.mIsBlack) ) - { - sister.mIsBlack = false; - node = parent; - } - else - { - if ( sister.mRight == null || sister.mRight.mIsBlack ) - { - sister.mLeft.mIsBlack = true; - sister.mIsBlack = false; - root = sister.rotateRight(root); - sister = parent.mRight; - } - sister.mIsBlack = parent.mIsBlack; - parent.mIsBlack = true; - sister.mRight.mIsBlack = true; - root = parent.rotateLeft(root); - node = root; - } - } - else - { - Node sister = parent.mLeft; - if ( !sister.mIsBlack ) - { - sister.mIsBlack = true; - parent.mIsBlack = false; - root = parent.rotateRight(root); - sister = parent.mLeft; - } - if ( (sister.mLeft == null || sister.mLeft.mIsBlack) && (sister.mRight == null || sister.mRight.mIsBlack) ) - { - sister.mIsBlack = false; - node = parent; - } - else - { - if ( sister.mLeft == null || sister.mLeft.mIsBlack ) - { - sister.mRight.mIsBlack = true; - sister.mIsBlack = false; - root = sister.rotateLeft(root); - sister = parent.mLeft; - } - sister.mIsBlack = parent.mIsBlack; - parent.mIsBlack = true; - sister.mLeft.mIsBlack = true; - root = parent.rotateRight(root); - node = root; - } - } - parent = node.mParent; - } - while ( parent != null && node.mIsBlack ); - - node.mIsBlack = true; - return root; - } - - private Node mParent; - private Node mLeft; - private Node mRight; - private int mStart; - private int mEnd; - private V1 mValue; - private int mSize; - private int mMaxEnd; - private boolean mIsBlack; - } - - public class FwdIterator - implements Iterator> - { - public FwdIterator( Node node ) - { - mNext = node; - } - - public boolean hasNext() - { - return mNext != null; - } - - public Node next() - { - if ( mNext == null ) - { - throw new NoSuchElementException("No next element."); - } - - if ( mNext.wasRemoved() ) - { - mNext = min(mNext.getStart(),mNext.getEnd()); - if ( mNext == null ) - throw new ConcurrentModificationException("Current element was removed, and there are no more elements."); - } - mLast = mNext; - mNext = mNext.getNext(); - return mLast; - } - - public void remove() - { - if ( mLast == null ) - { - throw new IllegalStateException("No entry to remove."); - } - - removeNode(mLast); - mLast = null; - } - - private Node mNext; - private Node mLast; - } - - public class RevIterator - implements Iterator> - { - public RevIterator( Node node ) - { - mNext = node; - } - - public boolean hasNext() - { - return mNext != null; - } - - public Node next() - { - if ( mNext == null ) - throw new NoSuchElementException("No next element."); - if ( mNext.wasRemoved() ) - { - mNext = max(mNext.getStart(),mNext.getEnd()); - if ( mNext == null ) - throw new ConcurrentModificationException("Current element was removed, and there are no more elements."); - } - mLast = mNext; - mNext = mNext.getPrev(); - return mLast; - } - - public void remove() - { - if ( mLast == null ) - { - throw new IllegalStateException("No entry to remove."); - } - - removeNode(mLast); - mLast = null; - } - - private Node mNext; - private Node mLast; - } - - public class OverlapIterator - implements Iterator> - { - public OverlapIterator( int start, int end ) - { - mNext = minOverlapper(start,end); - mStart = start; - mEnd = end; - } - - public boolean hasNext() - { - return mNext != null; - } - - public Node next() - { - if ( mNext == null ) - { - throw new NoSuchElementException("No next element."); - } - - if ( mNext.wasRemoved() ) - { - throw new ConcurrentModificationException("Current element was removed."); - } - - mLast = mNext; - mNext = Node.getNextOverlapper(mNext,mStart,mEnd); - return mLast; - } - - public void remove() - { - if ( mLast == null ) - { - throw new IllegalStateException("No entry to remove."); - } - - removeNode(mLast); - mLast = null; - } - - private Node mNext; - private Node mLast; - private int mStart; - private int mEnd; - } - - public static class ValuesIterator - implements Iterator - { - public ValuesIterator( Iterator> itr ) - { - mItr = itr; - } - - public boolean hasNext() - { - return mItr.hasNext(); - } - - public V1 next() - { - return mItr.next().getValue(); - } - - public void remove() - { - mItr.remove(); - } - - private Iterator> mItr; - } -} - -/** - * Semi-open interval on the integer number line. - * Turf covered runs from the start value inclusive, up to, but not including, the end value. - * - * @author tsharpe - * @version $Revision: 51146 $ - */ -interface HalfOpenInterval -{ - // bit-wise definitions from which the other constants are composed - static final int HAS_LESSER_PART = 1; - static final int HAS_OVERLAPPING_PART = 2; - static final int HAS_GREATER_PART = 4; - - static final int IS_ADJACENT_AND_EMPTY = 0; - static final int IS_STRICTLY_LESS = HAS_LESSER_PART; // 1 - static final int IS_SUBSET = HAS_OVERLAPPING_PART; // 2 - static final int IS_LEFT_OVERHANGING_OVERLAPPER = HAS_LESSER_PART | HAS_OVERLAPPING_PART; // 3 - static final int IS_STRICTLY_GREATER = HAS_GREATER_PART; // 4 - // there is no value that equals 5, since that would imply overhanging on left and right without overlapping - static final int IS_RIGHT_OVERHANGING_OVERLAPPER = HAS_GREATER_PART | HAS_OVERLAPPING_PART; // 6 - static final int IS_SUPERSET = HAS_LESSER_PART | HAS_OVERLAPPING_PART | HAS_GREATER_PART; // 7 - - /** - * Returns the starting point of the interval. - * @return The start. - */ - int getStart(); - - /** - * Returns the ending point of the interval. - * The interval is not regarded as including this point. - * @return The end. - */ - int getEnd(); - - /** - * End - start. - */ - int getLength(); - - /** - * Returns a constant that describes the relationship of this interval - * to a specified interval with regard to position on the number line. - * @param interval The interval to compare this one to. - * @return One of the IS_* constants defined above. - */ - int getRelationship( HalfOpenInterval interval ); - - /** - * Returns true if this interval ends where the specified interval starts, - * or vice versa. - * @param interval The interval to compare this one to. - * @return True, if adjacent. - */ - boolean isAdjacent( HalfOpenInterval interval ); -} diff --git a/java/lib/edu/mit/broad/picard/util/ListMap.java b/java/lib/edu/mit/broad/picard/util/ListMap.java deleted file mode 100644 index bee27cc18..000000000 --- a/java/lib/edu/mit/broad/picard/util/ListMap.java +++ /dev/null @@ -1,24 +0,0 @@ -package edu.mit.broad.picard.util; - -import java.util.List; -import java.util.HashMap; -import java.util.ArrayList; - -/** - * A Map class that holds a list of entries under each key instead of a single entry, and - * provides utility methods for adding an entry under a key. - * - * @author Tim Fennell - */ -public class ListMap extends HashMap> { - /** Adds a single value to the list stored under a key. */ - public void add(K key, V value) { - List values = get(key); - if (values == null) { - values = new ArrayList(); - put(key, values); - } - - values.add(value); - } -} diff --git a/java/lib/edu/mit/broad/picard/util/Log.java b/java/lib/edu/mit/broad/picard/util/Log.java deleted file mode 100644 index 43a628bdb..000000000 --- a/java/lib/edu/mit/broad/picard/util/Log.java +++ /dev/null @@ -1,182 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.util; - -import java.io.PrintStream; -import java.util.Arrays; - -/** - *

    A wafer thin wrapper around System.out that uses var-args to make it - * much more efficient to call the logging methods in without having to - * surround every call site with calls to Log.isXXXEnabled(). All the methods on this - * class take a variable length list of arguments and, only if logging is enabled for - * the level and channel being logged to, will those arguments be toString()'d and - * appended together.

    - * - * @author Tim Fennell - */ -public final class Log { - /** Enumeration for setting log levels. */ - public static enum LogLevel { ERROR, WARNING, INFO, DEBUG } - - private static LogLevel globalLogLevel = LogLevel.DEBUG; - - private final Class clazz; - private final String className; - private final LogLevel level = globalLogLevel; - private final PrintStream out = System.out; - - /** - * Private constructor - */ - private Log(final Class clazz) { - this.clazz = clazz; - this.className = clazz.getSimpleName(); - } - - /** - * Get a Log instance to perform logging within the Class specified. Returns an instance - * of this class which wraps an instance of the commons logging Log class. - * @param clazz the Class which is going to be doing the logging - * @return a Log instance with which to log - */ - public static Log getInstance(final Class clazz) { - return new Log(clazz); - } - - public static void setGlobalLogLevel(final LogLevel logLevel) { - globalLogLevel = logLevel; - } - - /** Returns true if the specified log level is enabled otherwise false. */ - public final boolean isEnabled(final LogLevel level) { - return level.ordinal() <= this.level.ordinal(); - } - - /** - * Private method that does the actual printing of messages to a PrintWriter. Outputs the log level, - * class name and parts followed by the stack trace if a throwable is provided. - * - * @param level the Log level being logged at - * @param throwable a Throwable if one is available otherwise null - * @param parts the parts of the message to be concatenated - */ - private final void emit(final LogLevel level, final Throwable throwable, final Object... parts) { - if (isEnabled(level)) { - this.out.print(level.name()); - this.out.print('\t'); - this.out.print(this.className); - this.out.print('\t'); - - for (final Object part : parts) { - if (part != null && part.getClass().isArray()) { - final Class component = part.getClass().getComponentType(); - if (component.equals(Boolean.TYPE)) this.out.print(Arrays.toString( (boolean[]) part)); - else if (component.equals(Byte.TYPE)) this.out.print(Arrays.toString( (byte[]) part)); - else if (component.equals(Character.TYPE)) this.out.print(Arrays.toString( (char[]) part)); - else if (component.equals(Double.TYPE)) this.out.print(Arrays.toString( (double[]) part)); - else if (component.equals(Float.TYPE)) this.out.print(Arrays.toString( (float[]) part)); - else if (component.equals(Integer.TYPE)) this.out.print(Arrays.toString( (int[]) part)); - else if (component.equals(Long.TYPE)) this.out.print(Arrays.toString( (long[]) part)); - else if (component.equals(Short.TYPE)) this.out.print(Arrays.toString( (short[]) part)); - else this.out.print(Arrays.toString( (Object[]) part)); - } - else { - this.out.print(part); - } - } - - this.out.println(); - - // Print out the exception if there is one - if (throwable != null) { - throwable.printStackTrace(this.out); - } - } - } - - /** - * Logs a Throwable and optional message parts at level error. - * @param throwable an instance of Throwable that should be logged with stack trace - * @param messageParts zero or more objects which should be combined, by calling toString() - * to form the log message. - */ - public final void error(final Throwable throwable, final Object... messageParts) { - emit(LogLevel.ERROR, throwable, messageParts); - } - - /** - * Logs a Throwable and optional message parts at level warn. - * @param throwable an instance of Throwable that should be logged with stack trace - * @param messageParts zero or more objects which should be combined, by calling toString() - * to form the log message. - */ - public final void warn(final Throwable throwable, final Object... messageParts) { - emit(LogLevel.WARNING, throwable, messageParts); - } - - /** - * Logs a Throwable and optional message parts at level info. - * @param throwable an instance of Throwable that should be logged with stack trace - * @param messageParts zero or more objects which should be combined, by calling toString() - * to form the log message. - */ - public final void info(final Throwable throwable, final Object... messageParts) { - emit(LogLevel.INFO, throwable, messageParts); - } - - /** - * Logs a Throwable and optional message parts at level debug. - * @param throwable an instance of Throwable that should be logged with stack trace - * @param messageParts zero or more objects which should be combined, by calling toString() - * to form the log message. - */ - public final void debug(final Throwable throwable, final Object... messageParts) { - emit(LogLevel.DEBUG, throwable, messageParts); - } - - // Similar methods, but without Throwables, follow - - /** - * Logs one or more message parts at level error. - * @param messageParts one or more objects which should be combined, by calling toString() - * to form the log message. - */ - public final void error(final Object... messageParts) { - emit(LogLevel.ERROR, null, messageParts); - } - - /** - * Logs one or more message parts at level warn. - * @param messageParts one or more objects which should be combined, by calling toString() - * to form the log message. - */ - public final void warn(final Object... messageParts) { - emit(LogLevel.WARNING, null, messageParts); - } - - /** - * Logs one or more message parts at level info. - * @param messageParts one or more objects which should be combined, by calling toString() - * to form the log message. - */ - public final void info(final Object... messageParts) { - emit(LogLevel.INFO, null, messageParts); - } - - /** - * Logs one or more message parts at level debug. - * @param messageParts one or more objects which should be combined, by calling toString() - * to form the log message. - */ - public final void debug(final Object... messageParts) { - emit(LogLevel.DEBUG, null, messageParts); - } -} diff --git a/java/lib/edu/mit/broad/picard/util/MathUtil.java b/java/lib/edu/mit/broad/picard/util/MathUtil.java deleted file mode 100644 index 0cf6de4b7..000000000 --- a/java/lib/edu/mit/broad/picard/util/MathUtil.java +++ /dev/null @@ -1,33 +0,0 @@ -package edu.mit.broad.picard.util; - -/** - * General math utilities - * - * @author Tim Fennell - */ -public class MathUtil { - /** Calculated the mean of an array of doubles. */ - public static double mean(double[] in, int start, int stop) { - double total = 0; - for (int i=start; i { - private Map>> cache = new HashMap>>(); - private final int lhsBuffer; - private final int rhsBuffer; - - /** - * Constructs an overlap detector. - * @param lhsBuffer the amount by which to "trim" coordinates of mappings on the left - * hand side when calculating overlaps - * @param rhsBuffer the amount by which to "trim" coordinates of mappings on the right - * hand side when calculating overlaps - */ - public OverlapDetector(int lhsBuffer, int rhsBuffer) { - this.lhsBuffer = lhsBuffer; - this.rhsBuffer = rhsBuffer; - } - - /** Adds a mapping to the set of mappings against which to match candidates. */ - public void addLhs(T object, Interval interval) { - Object seqId = interval.getSequence(); - - IntervalTree> tree = this.cache.get(seqId); - if (tree == null) { - tree = new IntervalTree>(); - this.cache.put(seqId, tree); - } - - int start = interval.getStart() + this.lhsBuffer; - int end = interval.getEnd() - this.lhsBuffer; - - Set objects = new HashSet(); - objects.add(object); - if (start <= end) // Don't put in sequences that have no overlappable bases - { - Set alreadyThere = tree.put(start, end, objects); - if (alreadyThere != null) - { - alreadyThere.add(object); - tree.put(start, end, alreadyThere); - } - } - } - - /** Adds all items to the overlap detector. */ - public void addAll(List objects, List intervals) { - if (objects.size() != intervals.size()) { - throw new IllegalArgumentException("Objects and intervals must be the same size."); - } - - for (int i=0; i getOverlaps(Interval rhs) { - Collection matches = new ArrayList(); - - Object seqId = rhs.getSequence(); - IntervalTree> tree = this.cache.get(seqId); - int start = rhs.getStart() + this.rhsBuffer; - int end = rhs.getEnd() - this.rhsBuffer; - - if (tree != null && start <= end) - { - Iterator>> it = tree.overlappers(start, end); - while (it.hasNext()) - { - IntervalTree.Node> node = it.next(); - matches.addAll(node.getValue()); - } - } - - return matches; - } - - /** Gets all the objects that could be returned by the overlap detector. */ - public Collection getAll() { - Collection all = new HashSet(); - for (IntervalTree> tree : this.cache.values()) { - for (IntervalTree.Node> node : tree) { - all.addAll(node.getValue()); - } - } - - return all; - } -} diff --git a/java/lib/edu/mit/broad/picard/util/PasteParser.java b/java/lib/edu/mit/broad/picard/util/PasteParser.java deleted file mode 100644 index 2b785a52f..000000000 --- a/java/lib/edu/mit/broad/picard/util/PasteParser.java +++ /dev/null @@ -1,132 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.util; - -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.sam.util.CloseableIterator; - -import java.util.Iterator; -import java.util.NoSuchElementException; - -/** - * Class to merge files horizontally (like the Unix paste command), so that the first line of each file - * is merged together in one big line, then the second lines, etc. - * - * @author Kathleen Tibbetts - */ -public class PasteParser implements Iterable, CloseableIterator{ - - private final CloseableIterator[] iterators; - private boolean iterating = false; - private String[][] next = null; - - /** - * Constructor - * - * @param iterators The iterators containing the files to merge together - */ - public PasteParser(CloseableIterator... iterators) { - this.iterators = iterators; - } - - /** - * Merges the "next" line from each of the underying iterators and returns an array of the results. - * - * @return An array of the lines from each iterator - * @throws PicardException if the files are not exhausted at the same time - */ - protected String[][] readNextLine() { - String result[][] = new String[iterators.length][]; - boolean oneFinished = false; - boolean oneNotFinished = false; - - for (int i = 0; i < iterators.length; i++) { - if (!iterators[i].hasNext()) { - oneFinished = true; - } - else { - result[i] = iterators[i].next(); - oneNotFinished = true; - } - } - if (oneFinished) { - if (oneNotFinished) { - throw new PicardException("Mismatched file lengths in PasteParser"); - } - else { - return null; - } - } - return result; - } - - /** - * Closes the underlying iterators. - */ - public void close() { - for (CloseableIterator iterator : iterators) { - iterator.close(); - } - } - - /** - * Required method for Iterator API. - * - * @throws UnsupportedOperationException - */ - public void remove() { - throw new UnsupportedOperationException("Remove() not supported."); - } - - /** - * Returns an iterator over a set of elements of type BustardReadData. - * - * @return an iterator over a set of elements of type BustardReadData - */ - public Iterator iterator() { - if (iterating) { - throw new IllegalStateException("iterator() method can only be called once, before the" + - "first call to hasNext()"); - } - next = readNextLine(); - iterating = true; - return this; - } - - /** - * Returns true if the iteration has more elements. - * - * @return true if the iteration has more elements. Otherwise returns false. - */ - public boolean hasNext() { - if (!iterating) { - next = readNextLine(); - iterating = true; - } - return next != null; - } - - /** - * Returns the next element in the iteration. - * - * @return the next element in the iteration - * @throws java.util.NoSuchElementException - */ - public String[][] next() { - - if (!hasNext()) { - throw new NoSuchElementException("Iteration has no more elements."); - } - - String[][] result = next; - next = readNextLine(); - return result; - } -} diff --git a/java/lib/edu/mit/broad/picard/util/PeekableIterator.java b/java/lib/edu/mit/broad/picard/util/PeekableIterator.java deleted file mode 100644 index eae31253d..000000000 --- a/java/lib/edu/mit/broad/picard/util/PeekableIterator.java +++ /dev/null @@ -1,65 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright Jan 22, 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ -package edu.mit.broad.picard.util; - -import edu.mit.broad.sam.util.CloseableIterator; - -/** - * Generic Closable Iterator that allows you to peek at the next value before calling next - */ -public class PeekableIterator implements CloseableIterator { - private CloseableIterator iterator; - private Object nextObject; - - /** Constructs a new iterator that wraps the supplied iterator. */ - public PeekableIterator(CloseableIterator iterator) { - this.iterator = iterator; - advance(); - } - - /** Closes the underlying iterator. */ - public void close() { - this.iterator.close(); - } - - /** True if there are more items, in which case both next() and peek() will return a value. */ - public boolean hasNext() { - return this.nextObject != null; - } - - /** Returns the next object and advances the iterator. */ - public Object next() { - Object retval = this.nextObject; - advance(); - return retval; - } - - /** - * Returns the next object but does not advance the iterator. Subsequent calls to peek() - * and next() will return the same object. - */ - public Object peek(){ - return this.nextObject; - } - - private void advance(){ - if (this.iterator.hasNext()) { - this.nextObject = iterator.next(); - } - else { - this.nextObject = null; - } - } - - /** Unsupported Operation. */ - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } -} diff --git a/java/lib/edu/mit/broad/picard/util/ProcessExecutor.java b/java/lib/edu/mit/broad/picard/util/ProcessExecutor.java deleted file mode 100644 index 6655e37cd..000000000 --- a/java/lib/edu/mit/broad/picard/util/ProcessExecutor.java +++ /dev/null @@ -1,121 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -package edu.mit.broad.picard.util; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.ThreadFactory; - -import edu.mit.broad.picard.PicardException; - -/** - * Utility class that will execute sub processes via Runtime.getRuntime().exec(...) and read - * off the output from stderr and stdout of the sub process. This implementation uses a different - * thread to read each stream: the current thread for stdout and another, internal thread for - * stderr. This utility is able to handle concurrent executions, spawning as many threads as - * are required to handle the concurrent load. - * - * @author Doug Voet - */ -public class ProcessExecutor { - private static Log log = Log.getInstance(ProcessExecutor.class); - private static ExecutorService executorService = Executors.newCachedThreadPool(new ThreadFactory() { - @Override - public Thread newThread(Runnable r) { - return new Thread(r, "ProcessExecutor Thread"); - } - }); - - /** - * Executes the command via Runtime.getRuntime().exec() then writes stderr to log.error - * and stdout to log.info and blocks until the command is complete. - * - * @see Runtime#exec(String) - * - * @param command command string - * @return return code of command - */ - public static int execute(String command) { - try { - Process process = Runtime.getRuntime().exec(command); - return readStreamsAndWaitFor(process); - } catch (Throwable t) { - throw new PicardException("Unexpected exception executing [" + StringUtil.join(" ", command) + "]", t); - } - } - - /** - * Executes the command via Runtime.getRuntime().exec() then writes stderr to log.error - * and stdout to log.info and blocks until the command is complete. - * - * @see Runtime#exec(String[]) - * - * @param commandParts command string - * @return return code of command - */ - public static int execute(String[] commandParts) { - try { - Process process = Runtime.getRuntime().exec(commandParts); - return readStreamsAndWaitFor(process); - } catch (Throwable t) { - throw new PicardException("Unexpected exception executing [" + StringUtil.join(" ", commandParts) + "]", t); - } - } - - private static int readStreamsAndWaitFor(Process process) - throws InterruptedException, ExecutionException { - Future stderrReader = executorService.submit(new LogErrorProcessOutputReader(process.getErrorStream())); - new LogInfoProcessOutputReader(process.getInputStream()).run(); - // wait for stderr reader to be done - stderrReader.get(); - return process.waitFor(); - } - - /** - * Runnable that reads off the given stream and logs it somewhere. - */ - private static abstract class ProcessOutputReader implements Runnable { - private BufferedReader reader; - public ProcessOutputReader(InputStream stream) { - reader = new BufferedReader(new InputStreamReader(stream)); - } - - @Override - public void run() { - try { - String line; - while ((line = reader.readLine()) != null) { - log(line); - } - } catch (IOException e) { - throw new PicardException("Unexpected exception reading from process stream", e); - } - } - - protected abstract void log(String message); - } - - private static class LogErrorProcessOutputReader extends ProcessOutputReader { - public LogErrorProcessOutputReader(InputStream stream) { super(stream); } - @Override protected void log(String message) { log.error(message); } - } - - private static class LogInfoProcessOutputReader extends ProcessOutputReader { - public LogInfoProcessOutputReader(InputStream stream) { super(stream); } - @Override protected void log(String message) { log.info(message); } - } -} diff --git a/java/lib/edu/mit/broad/picard/util/RExecutor.java b/java/lib/edu/mit/broad/picard/util/RExecutor.java deleted file mode 100644 index 7faa23a9c..000000000 --- a/java/lib/edu/mit/broad/picard/util/RExecutor.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2009 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. Neither - * the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. - */ - -package edu.mit.broad.picard.util; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.io.IoUtil; - -/** - * Util class for executing R scripts. - * - * @author Doug Voet - */ -public class RExecutor { - private static final String R_EXE = "Rscript"; - - /** - * Executes the given R script that is stored in a file on the classpath. The script file - * is read from the classpath and written to a temp file then executed by a call to Rscript. - * Blocks until the R script is complete. - * - * @param rScriptName the fully qualified name of the classpath resource of the script - * @param arguments any arguments required by the script - * @return the return code of the R process - */ - public static int executeFromClasspath(String rScriptName, String... arguments) { - File scriptFile = writeScriptFile(rScriptName); - int returnCode = executeFromFile(scriptFile, arguments); - scriptFile.delete(); - return returnCode; - } - - /** - * Executes the given R script that is stored in a file by a call to Rscript. - * Blocks until the R script is complete. - * - * @param scriptFile the file object for the script - * @param arguments any arguments required by the script - * @return the return code of the R process - */ - public static int executeFromFile(File scriptFile, String... arguments) { - String[] command = new String[arguments.length + 2]; - command[0] = R_EXE; - command[1] = scriptFile.getAbsolutePath(); - System.arraycopy(arguments, 0, command, 2, arguments.length); - return ProcessExecutor.execute(command); - } - - /** - * Writes the classpath resource named by rScriptName to the temp dir. - */ - private static File writeScriptFile(String rScriptName) { - InputStream scriptStream = null; - OutputStream scriptFileStream = null; - try { - scriptStream = RExecutor.class.getClassLoader().getResourceAsStream(rScriptName); - if (scriptStream == null) { - throw new IllegalArgumentException("Script [" + rScriptName + "] not found in classpath"); - } - File scriptFile = File.createTempFile("script", ".R"); - scriptFileStream = IoUtil.openFileForWriting(scriptFile); - IoUtil.copyStream(scriptStream, scriptFileStream); - return scriptFile; - } catch (IOException e) { - throw new PicardException("Unexpected exception creating R script file", e); - } finally { - if (scriptStream != null) { - try { - scriptStream.close(); - } catch (IOException e) { - } - } - if (scriptFileStream != null) { - try { - scriptFileStream.close(); - } catch (IOException e) { - } - } - } - } -} diff --git a/java/lib/edu/mit/broad/picard/util/SamPairUtil.java b/java/lib/edu/mit/broad/picard/util/SamPairUtil.java deleted file mode 100644 index 4d78019db..000000000 --- a/java/lib/edu/mit/broad/picard/util/SamPairUtil.java +++ /dev/null @@ -1,74 +0,0 @@ -package edu.mit.broad.picard.util; - -import edu.mit.broad.sam.SAMRecord; - -/** - * Utility mthods for pairs of SAMRecords - */ -public class SamPairUtil { - - // TODO: KT and TF say this is more complicated than what I have here - public static boolean isProperPair(final SAMRecord firstEnd, final SAMRecord secondEnd, boolean jumpingLibrary) { - if (firstEnd.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { - return false; - } - if (!firstEnd.getReferenceName().equals(secondEnd.getReferenceName())) { - return false; - } - if (firstEnd.getReadNegativeStrandFlag() == secondEnd.getReadNegativeStrandFlag()) { - return false; - } - final SAMRecord positiveEnd; - final SAMRecord negativeEnd; - if (firstEnd.getReadNegativeStrandFlag()) { - positiveEnd = secondEnd; - negativeEnd = firstEnd; - } else { - positiveEnd = firstEnd; - negativeEnd = secondEnd; - } - if (!jumpingLibrary) { - return positiveEnd.getAlignmentStart() < negativeEnd.getAlignmentStart() + negativeEnd.getReadBases().length; - } else { - return negativeEnd.getAlignmentStart() < positiveEnd.getAlignmentStart() + positiveEnd.getReadBases().length; - } - } - - public static int computeInsertSize(final SAMRecord firstEnd, final SAMRecord secondEnd) { - if (firstEnd.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { - return 0; - } - if (!firstEnd.getReferenceName().equals(secondEnd.getReferenceName())) { - return 0; - } - int firstEnd5PrimePosition = firstEnd.getReadNegativeStrandFlag()? firstEnd.getAlignmentEnd(): firstEnd.getAlignmentStart(); - int secondEnd5PrimePosition = secondEnd.getReadNegativeStrandFlag()? secondEnd.getAlignmentEnd(): secondEnd.getAlignmentStart(); - return secondEnd5PrimePosition - firstEnd5PrimePosition; - } - - /** - * Write the mate info for two SAMRecords - */ - public static void setMateInfo(final SAMRecord samRecord, final SAMRecord mate) { - if (!samRecord.getMateUnmappedFlag()) { - samRecord.setMateReferenceName(mate.getReferenceName()); - samRecord.setMateAlignmentStart(mate.getAlignmentStart()); - samRecord.setMateNegativeStrandFlag(mate.getReadNegativeStrandFlag()); - } else { - samRecord.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); - samRecord.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); - samRecord.setMateUnmappedFlag(true); - } - if (!mate.getMateUnmappedFlag()) { - mate.setMateReferenceName(samRecord.getReferenceName()); - mate.setMateAlignmentStart(samRecord.getAlignmentStart()); - mate.setMateNegativeStrandFlag(samRecord.getReadNegativeStrandFlag()); - } else { - mate.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME); - mate.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); - mate.setMateUnmappedFlag(true); - } - } - - -} diff --git a/java/lib/edu/mit/broad/picard/util/SequenceUtil.java b/java/lib/edu/mit/broad/picard/util/SequenceUtil.java deleted file mode 100644 index e847611c6..000000000 --- a/java/lib/edu/mit/broad/picard/util/SequenceUtil.java +++ /dev/null @@ -1,76 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.util; - -public class SequenceUtil { - /** - * Calculate the reverse complement of the specified sequence - * (Stolen from Reseq) - * - * @param sequenceData - * @return reverse complement - */ - public static String reverseComplement(String sequenceData) { - - final char[] original = sequenceData.toCharArray(); - final char[] complement = new char[original.length]; - - for (int i=0, j=complement.length-1; i 90) lhs -= 32; - if (rhs > 90) rhs -= 32; - } - - return lhs == rhs; - } - - /** - * returns true if the value of base represents a no call - */ - public static boolean isNoCall(byte base) { - return base == 'N' || base == 'n' || base == '.'; - } - -} diff --git a/java/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java b/java/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java deleted file mode 100644 index fbc4798b9..000000000 --- a/java/lib/edu/mit/broad/picard/util/StringSortingCollectionFactory.java +++ /dev/null @@ -1,121 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.util; - -import edu.mit.broad.sam.util.SortingCollection; -import edu.mit.broad.sam.util.RuntimeIOException; - -import java.util.Comparator; -import java.nio.ByteBuffer; -import java.io.OutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.File; - -/** - * Factory to create new String SortingCollections - * - * @author Kathleen Tibbetts - */ -public class StringSortingCollectionFactory { - - private static final File TEMP_DIR = new File(System.getProperty("java.io.tmpdir"), "StringSortingCollectionFactory"); - private static final int MAX_RECORDS_IN_RAM = 20000; - - private StringSortingCollectionFactory() { - } - - public static SortingCollection newCollection() { - return SortingCollection.newInstance( - String.class, new StringCodec(), new StringComparator(), MAX_RECORDS_IN_RAM, TEMP_DIR); - } - - static class StringCodec implements SortingCollection.Codec { - ByteBuffer byteBuffer = ByteBuffer.allocate(4); - OutputStream os; - InputStream is; - - /** Returns a new StringCodec. */ - public SortingCollection.Codec clone() { - return new StringCodec(); - } - - /** - * Where to write encoded output - * - * @param os the output stream to encode output - */ - public void setOutputStream(final OutputStream os) { - this.os = os; - } - - /** - * Where to read encoded input from - * - * @param is where to read encoded input from - */ - public void setInputStream(final InputStream is) { - this.is = is; - } - - /** - * Write object to file - * - * @param val what to write - */ - public void encode(final String val) { - try { - byteBuffer.clear(); - byteBuffer.putInt(val.length()); - os.write(byteBuffer.array()); - os.write(val.getBytes()); - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - /** - * Read the next record from the input stream and convert into a java object. - * - * @return null if no more records. Should throw exception if EOF is encountered in the middle of - * a record. - */ - public String decode() { - try { - byteBuffer.clear(); - int bytesRead = is.read(byteBuffer.array()); - if (bytesRead == -1) { - return null; - } - if (bytesRead != 4) { - throw new RuntimeException("Unexpected EOF in middle of record"); - } - byteBuffer.limit(4); - final int length = byteBuffer.getInt(); - final byte[] buf = new byte[length]; - bytesRead = is.read(buf); - if (bytesRead != length) { - throw new RuntimeException("Unexpected EOF in middle of record"); - } - return new String(buf); - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - } - - static class StringComparator implements Comparator { - - public int compare(final String s, final String s1) { - return s.compareTo(s1); - } - } - -} diff --git a/java/lib/edu/mit/broad/picard/util/StringUtil.java b/java/lib/edu/mit/broad/picard/util/StringUtil.java deleted file mode 100644 index 2cf15de82..000000000 --- a/java/lib/edu/mit/broad/picard/util/StringUtil.java +++ /dev/null @@ -1,108 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.util; - -/** - * Utilities that are useful when dealing with Strings. - * - * @author Tim Fennell - */ -public class StringUtil { - /** - * Return input string with newlines inserted to ensure that all lines - * have length <= maxLineLength. if a word is too long, it is simply broken - * at maxLineLength. Does not handle tabs intelligently (due to implementer laziness). - */ - public static String wordWrap(String s, int maxLineLength) { - String[] lines = s.split("\n"); - StringBuilder sb = new StringBuilder(); - for (String line: lines) { - if (sb.length() > 0) { - sb.append("\n"); - } - sb.append(wordWrapSingleLine(line, maxLineLength)); - } - if (s.endsWith("\n")) { - sb.append("\n"); - } - return sb.toString(); - } - - public static String wordWrapSingleLine(String s, int maxLineLength) { - if (s.length() <= maxLineLength) { - return s; - } - StringBuilder sb = new StringBuilder(); - int startCopyFrom = 0; - while (startCopyFrom < s.length()) { - int lastSpaceIndex = startCopyFrom; - int i; - // Find break point (if it exists) - for (i = startCopyFrom; i < s.length() && i - startCopyFrom < maxLineLength; ++i) { - if (Character.isWhitespace(s.charAt(i))) { - lastSpaceIndex = i; - } - } - if (i - startCopyFrom < maxLineLength) { - lastSpaceIndex = i; - } - // Include any trailing whitespace - for (; lastSpaceIndex < s.length() && Character.isWhitespace(s.charAt(lastSpaceIndex)); ++lastSpaceIndex) {} - if (sb.length() > 0) { - sb.append("\n"); - } - // Handle situation in which there is no word break. Just break the word in the middle. - if (lastSpaceIndex == startCopyFrom) { - lastSpaceIndex = i; - } - sb.append(s.substring(startCopyFrom, lastSpaceIndex)); - startCopyFrom = lastSpaceIndex; - } - return sb.toString(); - } - - /** - * - * @param separator String to interject between each string in strings arg - * @param strings List of strings to be joined. - * @return String that concatenates each item of strings arg, with separator btw each of them. - */ - public static String join(String separator, String... strings) { - if (strings.length == 0) { - return ""; - } - StringBuilder ret = new StringBuilder(strings[0]); - for (int i = 1; i < strings.length; ++i) { - ret.append(separator); - ret.append(strings[i]); - } - return ret.toString(); - } - - /** - * Checks that a String doesn't contain one or more characters of interest. - * - * @param s the String to check - * @param chars the characters to check for - * @return String the input String for convenience - * @throws IllegalArgumentException if the String contains one or more of the characters - */ - public static String assertCharactersNotInString(final String s, final char... chars) { - for (char ch : s.toCharArray()) { - for (int i=0; ib is a delimiter; otherwise false - */ - protected boolean isDelimiter(byte b) { - return b == '\t'; - } -} diff --git a/java/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java b/java/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java deleted file mode 100644 index 5f44c972a..000000000 --- a/java/lib/edu/mit/broad/picard/variation/DbSnpFileGenerator.java +++ /dev/null @@ -1,172 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.variation; - -import java.io.*; -import java.util.*; -import edu.mit.broad.sam.SAMSequenceRecord; -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.picard.io.IoUtil; -import edu.mit.broad.picard.util.TabbedTextFileParser; -import edu.mit.broad.picard.util.Log; - -/** - * Generates a binary version of the data for all dbSnps from a UCSU snp###.txt file. Files with SNP data - * can be downloaded here: http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/. See KnownVariantCodec.java - * for binary file format. - */ -public class DbSnpFileGenerator { - // Codes from the DbSnp file that we will handle. All others are ignored. - // Package visibility for testing purposes. - static final String snp = "single"; // code in DbSnp file for a SNP - static final String insertion = "insertion"; // code in DbSnp file for an insertion - static final String deletion = "deletion"; // code in DbSnp file for a deletion - static final String indel = "in-del"; // code in DbSnp file for an insertion/deletion - - private File snpFile; - private File seqDictionaryFile; - private Map sequenceToIndex = new HashMap(); - private List dictionary; - private BinaryCodec codec; - private KnownVariantCodec kvCodec = new KnownVariantCodec(); - private Map> sequenceToSnps; - - private final Log log = Log.getInstance(DbSnpFileGenerator.class); - - /** - * Protected constructor so we can use a temporary file during testing - * @param snpFile The UCSC dbSnp file - * @param seqDictionaryFile The Sequence Dictionary - * @param tempOutputFile The binary file to write to - */ - DbSnpFileGenerator(File snpFile, File seqDictionaryFile, File tempOutputFile) { - this.snpFile = snpFile; - this.seqDictionaryFile = seqDictionaryFile; - this.codec = new BinaryCodec(new DataOutputStream(IoUtil.openFileForWriting(tempOutputFile))); - } - - /** - * Writes the full binary dbSnp file and calls close on the BinaryCodec. - */ - public void writeDbSnpFile() { - kvCodec.encode(KnownVariantCodec.MAGIC_NUMBER, codec); - writeReferenceSequences(); - writeDbSnpRecords(); - codec.close(); - } - - /** - * Writes the number of reference sequences and then the sequences themselves - */ - private void writeReferenceSequences() { - SAMFileReader sam = new SAMFileReader(this.seqDictionaryFile); - this.dictionary = sam.getFileHeader().getSequences(); - kvCodec.encode(this.dictionary, codec); - } - - /** - * Writes all the dbSnp records to the file in the order of the reference sequences - * in the sequence dictionary file. - */ - private void writeDbSnpRecords() { - sequenceToSnps = new HashMap>(); - int count = 0; - - TabbedTextFileParser parser = new TabbedTextFileParser(true, snpFile); - while(parser.hasNext()) { - String parts[] = parser.next(); - String sequence = parts[1]; - - // If we don't have this sequence in our dictionary, ignore it - if (!getSequenceToIndex().containsKey(sequence)) { - continue; - } - - int start = Integer.parseInt(parts[2]) + 1; // We go from a zero-based to a 1-based system. - int end = Integer.parseInt(parts[3]); - - String var = parts[11]; - - // We only care about SNPs, insertions, and deletions; otherwise skip it - VariantType type = null; - if (var.equals(snp)) { - type = VariantType.SNP; - end = start; // For SNPs, we mark the start and end as the same location - } - // For insertions and deletions, we mark the base on either side of the affected reference sequence - else if (var.equals(insertion)) { - type = VariantType.insertion; - end = start + 1; // Insertions are always length 1 - } - else if (var.equals(deletion)) { - type = VariantType.deletion; - start = start - 1; - end++; - } - else if (var.equals(indel)) { // For indels, we do one each of an insertion (here) and a deletion (below) - type = VariantType.insertion; - start = start - 1; - end = start + 1; - } - else { - continue; - } - - if (!sequenceToSnps.containsKey(sequence)) { - sequenceToSnps.put(sequence, new TreeSet()); - } - SortedSet sequenceVars = sequenceToSnps.get(sequence); - - boolean validated = !parts[12].equals("unknown"); - String name = parts[4]; - - sequenceVars.add(new KnownVariant(name, getSequenceToIndex().get(sequence), start, end, type, validated)); - count++; - - // If it's an in-del, we add it as a deletion (in addition to the insertion we also added) so we - // will have two records in our binary format for the one record in the text file - if (var.equals(indel)) { - sequenceVars.add(new KnownVariant(name, getSequenceToIndex().get(sequence), start, - Integer.parseInt(parts[3])+1, VariantType.deletion, validated)); - count++; - } - } - - codec.writeInt(count); - // Loop through the sequences from the sequence dictionary in order - for (int i = 0; i < dictionary.size(); i++) { - // And write their known variants in order - if (sequenceToSnps.containsKey(dictionary.get(i).getSequenceName())) { - for (Iterator it = sequenceToSnps.get(dictionary.get(i).getSequenceName()).iterator(); - it.hasNext();) { - kvCodec.encode(it.next(), codec); - } - } - } - log.info("Wrote " + count + " dbSnp records."); - } - - /** - * Returns the map of sequences to their index in the reference dictionary, - * creating it if it does not already exist - * - * @return the map of sequences to their index in the reference dictionary - */ - private Map getSequenceToIndex() { - if (sequenceToIndex.keySet().size() == 0) { - for (int i = 0; i < dictionary.size(); i++) { - sequenceToIndex.put(dictionary.get(i).getSequenceName(), i); - } - } - return sequenceToIndex; - } - -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java b/java/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java deleted file mode 100644 index dbee370d2..000000000 --- a/java/lib/edu/mit/broad/picard/variation/DbSnpFileReader.java +++ /dev/null @@ -1,149 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.variation; - -import java.io.*; -import java.util.*; -import edu.mit.broad.sam.SAMSequenceRecord; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.picard.PicardException; -import edu.mit.broad.picard.io.IoUtil; - -/** - * Reader for DbSnp binary files. See DbSnpFileGenerator for file format. - */ -public class DbSnpFileReader implements KnownVariantIterator -{ - private BinaryCodec codec = null; - private KnownVariantCodec kvCodec = new KnownVariantCodec(); - List dictionary; - private Map refIndexToName = new HashMap(); - private KnownVariant next = null; - private int dbSnpCount = -1; - - /** - * Constructor - * - * @param dbSnpFile The binary dbSnp file to read - */ - public DbSnpFileReader(File dbSnpFile) - { - codec = new BinaryCodec(new DataInputStream(IoUtil.openFileForReading(dbSnpFile))); - readHeader(); - next = readNextDbSnp(); - } - - /** - * Returns an iterator over a set of elements of type KnownVariant. - * - * @return an Iterator - */ - public Iterator iterator() - { - return this; - } - - /** - * Returns true if the iteration has more elements. - * - * @return true if the iterator has more elements. - */ - public boolean hasNext() - { - return next != null; - } - - /** - * Returns the next element in the iteration. - * - * @return the next KnownVariant in the iteratoion - */ - public KnownVariant next() - { - if (!hasNext()) throw new NoSuchElementException(); - KnownVariant result = next; - next = readNextDbSnp(); - return result; - } - - /** Allows peeking at the next value without advaning the iterator. */ - public KnownVariant peek() { - return this.next; - } - - /** - * Not supported. - * - * @throws UnsupportedOperationException - */ - public void remove() - { - throw new UnsupportedOperationException("Remove() not supported."); - } - - /** - * Closes the underlying stream, via the BinaryCodec's close() method - */ - public void close() - { - codec.close(); - } - - /** - * Reads the header data from the binary file, validates the version, and populates refIndexToName - * - * @throws IOException - */ - private void readHeader() - { - // Verify that we are using the correct version - String ver = kvCodec.decodeMagicNumber(codec); - if (!ver.equals(KnownVariantCodec.MAGIC_NUMBER)) - { - throw new RuntimeException("Unsupported dbSnp file version: " + ver); - } - - // Read the number of reference sequences and then the sequences themselves - dictionary = kvCodec.decodeSequenceDictionary(codec); - for (int i = 0; i < dictionary.size(); i++) - { - refIndexToName.put(i, dictionary.get(i)); - } - - dbSnpCount = codec.readInt(); - } - - /** - * Reads the next dbSnp record from the binary file - * - * @return the populated KnownVariant object - */ - private KnownVariant readNextDbSnp() { - KnownVariant kv = kvCodec.decodeKnownVariant(codec); - if (kv != null) { - kv.setRefrenceSequence(refIndexToName.get(kv.getSequenceIndex()).getSequenceName()); - } - return kv; - } - - /** - * Returns the SequenceDictionary for this file in SAM format - * - * @return an ordered List of SAMSequenceRecords - */ - public List getSequenceDictionary() { return dictionary; } - - /** - * Returns the total number of dbSnp records encoded in the file - * - * @return total dbSnps encoded in the file - */ - public int getCountDbSnpRecords() { return dbSnpCount; } -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java b/java/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java deleted file mode 100644 index 65c8570b6..000000000 --- a/java/lib/edu/mit/broad/picard/variation/GenerateDbSnpFile.java +++ /dev/null @@ -1,51 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.variation; - -import java.io.File; - -import edu.mit.broad.picard.cmdline.CommandLineProgram; -import edu.mit.broad.picard.cmdline.Option; -import edu.mit.broad.picard.cmdline.Usage; - -/** - * CommandLineProgram to generate to invoke DbSnpFileGenerator - * - * @author Kathleen Tibbetts - */ -public class GenerateDbSnpFile extends CommandLineProgram -{ - // The following attributes define the command-line arguments - @Usage(programVersion="1.0") - public String USAGE = - "Usage: " + getClass().getName() + " [options]\n\n" + - "Generate a KnownVariant binary file from a UCSC DbSnp text file.\n"; - - @Option(shortName = "S", doc = "UCSC SNP file. ") - public File SNP_FILE; - - @Option(shortName = "D", doc = "Sequence Dictionary for the genome in SAM or BAM format. ") - public File SEQUENCE_DICTIONARY; - - @Option(shortName = "O", doc = "The binary output file. ") - public File OUTPUT; - - @Override - protected int doWork() { - DbSnpFileGenerator generator = new DbSnpFileGenerator(SNP_FILE, SEQUENCE_DICTIONARY, OUTPUT); - generator.writeDbSnpFile(); - return 0; - } - - public static void main(String[] argv) { - System.exit(new GenerateDbSnpFile().instanceMain(argv)); - } - -} diff --git a/java/lib/edu/mit/broad/picard/variation/KnownVariant.java b/java/lib/edu/mit/broad/picard/variation/KnownVariant.java deleted file mode 100644 index 26b2f33be..000000000 --- a/java/lib/edu/mit/broad/picard/variation/KnownVariant.java +++ /dev/null @@ -1,115 +0,0 @@ -package edu.mit.broad.picard.variation; - -/** - * Utility class to hold data about a population or somatic variant. - * - * IMPORTANT! Regardless of the coordinate system of the data from which it is drawn, the data - * in this class should be 1-based. Start and end coordinates should be as follows: - * For SNPs, start and end should be the same base. - * For insertions and deletions, the base on either side of the affected reference sequence - * will be the start and end. For insertions, this means they will always be 1 base apart. - */ -public class KnownVariant implements Comparable -{ - private final String name; - private final int sequenceIndex; - private final int startPos; - private final int endPos; - private final VariantType type; - private final boolean validated; - private transient String referenceSequence; - - /** - * Constructor - * - * @param name - * @param sequenceIndex - * @param startPos - * @param endPos - * @param type - * @param validated - */ - public KnownVariant(String name, int sequenceIndex, int startPos, int endPos, - VariantType type, boolean validated) - { - this.name = name; - this.sequenceIndex = sequenceIndex; - this.startPos = startPos; - this.endPos = endPos; - this.type = type; - this.validated = validated; - } - - /** - * Compares this object with the specified object for order. Returns a negative integer, zero, or a positive - * integer as this object is less than, equal to, or greater than the specified object. - * - * @param that The KnownVariant to compare - * @return a negative integer, zero, or a positive integer as this object is less than, equal to, - * or greater than the specified object - */ - public int compareTo(KnownVariant that) - { - if (this.getSequenceIndex() != that.getSequenceIndex()) - { - return (this.getSequenceIndex() > that.getSequenceIndex()) ? 1 : -1; - } - else if (this.getStartPos() != that.getStartPos()) - { - return (this.getStartPos() > that.getStartPos()) ? 1 : -1; - } - else if (this.getEndPos() != that.getEndPos()) - { - return (this.getEndPos() > that.getEndPos()) ? 1 : -1; - } - else if (!this.getName().equals(that.getName())) - { - return this.getName().compareTo(that.getName()); - } - else if (this.getType() != that.getType()) - { - return this.getType().compareTo(that.getType()); - } - else if (this.isValidated() != that.isValidated()) - { - return this.isValidated() ? 1 : -1; - } - return 0; - } - - public boolean equals(Object o) - { - if (!(o instanceof KnownVariant)) { - return false; - } - KnownVariant that = (KnownVariant)o; - return (this.name.equals(that.name) && - this.sequenceIndex == that.sequenceIndex && - this.startPos == that.startPos && - this.endPos == that.endPos && - this.type == that.type && - this.validated == that.validated); - } - - public int hasCode() - { - int result = 17; - result = 37*result + name.hashCode(); - result = 37*result + sequenceIndex; - result = 37*result + startPos; - result = 37*result + endPos; - result = 37*result + type.hashCode(); - result = 37*result + (validated ? 1 : 0); - return result; - } - - public String getName() { return name; } - public int getSequenceIndex() { return sequenceIndex; } - public String getRefrenceSequence() { return referenceSequence; } - public void setRefrenceSequence(String referenceSequence) { this.referenceSequence = referenceSequence; } - public int getStartPos() { return startPos; } - public int getEndPos() { return endPos; } - public VariantType getType() { return type; } - public boolean isValidated() { return validated; } - -} diff --git a/java/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java b/java/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java deleted file mode 100644 index 2258e756c..000000000 --- a/java/lib/edu/mit/broad/picard/variation/KnownVariantCodec.java +++ /dev/null @@ -1,179 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.variation; - -import edu.mit.broad.sam.SAMSequenceRecord; -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.RuntimeEOFException; - -import java.util.ArrayList; -import java.util.List; - -/** - * Class for encoding and deconding binary data about KnownVariants - * - * IMPORTANT! This class assumes that a KnownVariant instance is 1-based and end-inclusive - * and that the binary format is 0-based and end-exclusive. - * - * The format for the binary dbSnp file is as follows: - * - * Field Description Type Value - * ----- ----------- ---- ----- - * magic Known variant magic number char[4] DBS\1 - * n_ref # reference sequences int32 - * - * -- List of references information (n = n_ref) - * l_name length of the reference name plus 1 (including NULL) int32 - * name Name; NULL terminated char[l_name] - * t_ref Length of the reference sequence int32 - * - * - * n_snps # of Known Variant records int32 - * - * -- List of DBSnps - * block_size Length of the remainder of the block - * rID Reference sequence ID (-1 <= rId <= n_ref) int32 - * pos 0-based leftmost coordinate int32 - * snp_len Length of the dbSnp int32 - * type type of SNP int8 0 = deletion - * 1 = het - * 2 = in-del - * 3 = insertion - * 4 = microsatellite - * 5 = mixed - * 6 = mnp - * 7 = named - * 8 = single - * 9 = unknown - * validated whether the SNP has been validated int8 1 | 0 - * name name of the dbSnp; NULL terminated char[block_size-15] - * - * @author Kathleen Tibbetts - **/ -public class KnownVariantCodec -{ - public static final String MAGIC_NUMBER = "DBS\1"; - private static final int KV_RECORD_LENGTH_LESS_NAME = 15; - - /** - * Reads data about a known variant from the BinaryCodec and instantiates a KnownVariant - * object with those values - * - * @param codec The BinaryCodec from which to read - * @return a populated KnownVariant object - */ - public KnownVariant decodeKnownVariant(BinaryCodec codec) - { - int blockSize; - try { - blockSize = codec.readInt(); - } - catch (RuntimeEOFException e) { - return null; - } - int seqIndex = codec.readInt(); - int startPos = codec.readInt() + 1; // Switch to 1-based - int endPos = codec.readInt(); - byte[] buffer = new byte[1]; - codec.readBytes(buffer); - VariantType type = VariantType.getVariantTypeFromOrdinal((int) buffer[0]); - codec.readBytes(buffer); - boolean validated = ((int) buffer[0]) == 1; - String name = codec.readString(blockSize - KV_RECORD_LENGTH_LESS_NAME); - codec.readBytes(buffer); // Skip the null terminator - return new KnownVariant(name, seqIndex, startPos, endPos, type, validated); - - } - - /** - * Writes data from a KnownVariant in the expected format to the BinaryCodec - * - * @param variant The KnownVariant to encode - * @param codec The BinaryCodec to which to write - */ - public void encode(KnownVariant variant, BinaryCodec codec) - { - codec.writeInt(variant.getName().length() + KV_RECORD_LENGTH_LESS_NAME);// Length of the rest of the block - codec.writeInt(variant.getSequenceIndex()); // Index of the reference sequence - codec.writeInt((int)variant.getStartPos()-1); // Switch to 0-based leftmost coordinate - codec.writeInt((int)variant.getEndPos()); // end position, exclusive - byte b[] = new byte[1]; - b[0] = (byte)variant.getType().ordinal(); // Type - codec.writeBytes(b); - b[0] = (byte)(variant.isValidated() ? 1 : 0); // Validated - codec.writeBytes(b); - codec.writeString(variant.getName(), false, true); // The null-terminated name - } - - /** - * Reads data about the Sequence Dictionary from the BinaryCodec and instantiates a List of - * SAMSequenceRecords with those values - * - * @param codec The BinaryCodec from which to read - * @return a populated List of SAMSequenceRecords - */ - public List decodeSequenceDictionary(BinaryCodec codec) - { - int total = codec.readInt(); - List dictionary = new ArrayList(total); - for (int i = 0; i < total; i++) - { - int len = codec.readInt(); - // Read the name, leaving off and then skipping the null terminator - String name = codec.readString(len-1); - byte[] buffer = new byte[1]; - codec.readBytes(buffer); - int seqLength = codec.readInt(); - SAMSequenceRecord rec = new SAMSequenceRecord(name); - rec.setSequenceLength(seqLength); - dictionary.add(rec); - } - return dictionary; - } - - /** - * Writes a Sequence Dictionary in the format excpected to the BinaryCodec - * - * @param dictionary The list of SAMSequenceRecords to encode - * @param codec The BinaryCodec to which to write - */ - public void encode(List dictionary, BinaryCodec codec) - { - codec.writeInt(dictionary.size()); - for (SAMSequenceRecord sequence : dictionary) - { - codec.writeString(sequence.getSequenceName(), true, true); - codec.writeInt(sequence.getSequenceLength()); - } - - } - - /** - * Reads data about the Magic Number from the BinaryCodec and returns a string with its value - * - * @param codec The BinaryCodec from which to read - * @return a Magic Number - */ - public String decodeMagicNumber(BinaryCodec codec) - { - return codec.readString(4); - } - - /** - * Writes a Magic Number in the format excpected to the BinaryCodec - * - * @param magicNumber The magic number to encode - * @param codec The BinaryCodec to which to write - */ - public void encode(String magicNumber, BinaryCodec codec) - { - codec.writeString(magicNumber, false, false); - } -} diff --git a/java/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java b/java/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java deleted file mode 100644 index 6cb0712e1..000000000 --- a/java/lib/edu/mit/broad/picard/variation/KnownVariantIterator.java +++ /dev/null @@ -1,31 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.variation; - -import edu.mit.broad.sam.SAMSequenceRecord; - -import java.util.Iterator; -import java.util.List; - -/** - * API for iterating over records representing known variations - * - * @author Kathleen Tibbetts - */ -public interface KnownVariantIterator extends Iterable, Iterator -{ - /** - * Return the list of sequence dictionary (list of SAMSequenceRecords in order) - * for this KnownVariantIterator - * - * @return The SAMSequenceRecords that comprise the sequence dictionary for this iterator, in order - */ - public List getSequenceDictionary(); -} diff --git a/java/lib/edu/mit/broad/picard/variation/VariantType.java b/java/lib/edu/mit/broad/picard/variation/VariantType.java deleted file mode 100644 index 354e04723..000000000 --- a/java/lib/edu/mit/broad/picard/variation/VariantType.java +++ /dev/null @@ -1,30 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.picard.variation; - -/** - * Enum to hold the possible types of dbSnps. Note that these correspsond to the names used - * in the dbSnp database with the exception of indel (which is in-del in dbSnp). - */ -public enum VariantType -{ - SNP, insertion, deletion; - - /** - * Gets the enum for a given ordinal - * - * @param ordinal - * @return VariantType - */ - public static VariantType getVariantTypeFromOrdinal(int ordinal) - { - return VariantType.class.getEnumConstants()[ordinal]; - } -} diff --git a/java/lib/edu/mit/broad/sam/AlignmentBlock.java b/java/lib/edu/mit/broad/sam/AlignmentBlock.java deleted file mode 100644 index ef1ec841c..000000000 --- a/java/lib/edu/mit/broad/sam/AlignmentBlock.java +++ /dev/null @@ -1,31 +0,0 @@ -package edu.mit.broad.sam; - -/** - * Represents the contiguous alignment of a subset of read bases to a reference - * sequence. Simply put an alignment block tells you that read bases from - * readStart are aligned to the reference (matching or mismatching) from - * referenceStart for length bases. - * - * @author Tim Fennell - */ -public class AlignmentBlock { - private int readStart; - private int referenceStart; - private int length; - - /** Constructs a new alignment block with the supplie read and ref starts and length. */ - AlignmentBlock(int readStart, int referenceStart, int length) { - this.readStart = readStart; - this.referenceStart = referenceStart; - this.length = length; - } - - /** The first, 1-based, base in the read that is aligned to the reference reference. */ - public int getReadStart() { return readStart; } - - /** The first, 1-based, position in the reference to which the read is aligned. */ - public int getReferenceStart() { return referenceStart; } - - /** The number of contiguous bases aligned to the reference. */ - public int getLength() { return length; } -} diff --git a/java/lib/edu/mit/broad/sam/BAMFileConstants.java b/java/lib/edu/mit/broad/sam/BAMFileConstants.java deleted file mode 100644 index 7b5cf6c70..000000000 --- a/java/lib/edu/mit/broad/sam/BAMFileConstants.java +++ /dev/null @@ -1,33 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -class BAMFileConstants { - /** - * The beginning of a BAMRecord is a fixed-size block of 8 int32s - */ - static final int FIXED_BLOCK_SIZE = 8 * 4; - - /** - * Sanity check -- we never expect BAMRecords to be as big as this. - */ - static final int MAXIMUM_RECORD_LENGTH = 1024 * 1024; - - /** - * BAM file magic number. This is what is present in the gunzipped version of the file, - * which never exists on disk. - */ - - static final byte[] BAM_MAGIC = "BAM\1".getBytes(); - /** - * BAM index file magic number. - */ - static final byte[] BAM_INDEX_MAGIC = "BAI\1".getBytes(); -} diff --git a/java/lib/edu/mit/broad/sam/BAMFileIndex.java b/java/lib/edu/mit/broad/sam/BAMFileIndex.java deleted file mode 100644 index d6624b76d..000000000 --- a/java/lib/edu/mit/broad/sam/BAMFileIndex.java +++ /dev/null @@ -1,277 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - - -import edu.mit.broad.sam.util.RuntimeEOFException; -import edu.mit.broad.sam.util.RuntimeIOException; - -import java.io.*; -import java.nio.*; -import java.nio.channels.*; -import java.util.*; - -/** - * Internal class for reading BAM file indexes. - */ -class BAMFileIndex -{ - private static final int MAX_BINS = 37450; // =(8^6-1)/7+1 - private static final int BAM_LIDX_SHIFT = 16; - - private File mFile = null; - private FileInputStream mFileStream = null; - private MappedByteBuffer mFileBuffer = null; - - - BAMFileIndex(final File file) { - mFile = file; - } - - void close() { - closeFileStream(); - } - - long[] getSearchBins(int referenceIndex, int startPos, int endPos) { - - openIndex(); - seek(4); - - int sequenceCount = readInteger(); - // System.out.println("# Sequence count: " + sequenceCount); - if (referenceIndex >= sequenceCount) { - return null; - } - - BitSet regionBins = regionToBins(startPos, endPos); - if (regionBins == null) { - return null; - } - - for (int i = 0; i < referenceIndex; i++) { - // System.out.println("# Sequence TID: " + i); - int nBins = readInteger(); - // System.out.println("# nBins: " + nBins); - for (int j = 0; j < nBins; j++) { - int bin = readInteger(); - int nChunks = readInteger(); - // System.out.println("# bin[" + j + "] = " + bin + ", nChunks = " + nChunks); - skipBytes(16 * nChunks); - } - int nLinearBins = readInteger(); - // System.out.println("# nLinearBins: " + nLinearBins); - skipBytes(8 * nLinearBins); - } - - // System.out.println("# Sequence target TID: " + referenceIndex); - int nIndexBins = readInteger(); - // System.out.println("# nBins: " + nIndexBins); - if (nIndexBins == 0) { - return null; - } - - List chunkList = new ArrayList(); - for (int i = 0; i < nIndexBins; i++) { - int indexBin = readInteger(); - int nChunks = readInteger(); - // System.out.println("# bin[" + i + "] = " + indexBin + ", nChunks = " + nChunks); - if (regionBins.get(indexBin)) { - for (int ci = 0; ci < nChunks; ci++) { - long chunkBegin = readLong(); - long chunkEnd = readLong(); - chunkList.add(new Chunk(chunkBegin, chunkEnd)); - } - } else { - skipBytes(16 * nChunks); - } - } - - if (chunkList.isEmpty()) { - return null; - } - - int start = (startPos <= 0) ? 0 : startPos-1; - int regionLinearBin = start >> BAM_LIDX_SHIFT; - int nLinearBins = readInteger(); - // System.out.println("# nLinearBins: " + nLinearBins); - // System.out.println("# regionLinearBin: " + regionLinearBin); - long minimumOffset = 0; - if (regionLinearBin < nLinearBins) { - skipBytes(8 * regionLinearBin); - minimumOffset = readLong(); - } - chunkList = optimizeChunkList(chunkList, minimumOffset); - return convertToArray(chunkList); - } - - private List optimizeChunkList(List chunkList, long minimumOffset) { - Chunk lastChunk = null; - Collections.sort(chunkList); - List result = new ArrayList(); - for (Chunk chunk : chunkList) { - if (chunk.getChunkEnd() <= minimumOffset) { - continue; - } - if (result.isEmpty()) { - result.add(chunk); - lastChunk = chunk; - continue; - } - // Coalesce chunks that are in adjacent file blocks. - // This is a performance optimization. - long lastFileBlock = getFileBlock(lastChunk.getChunkEnd()); - long chunkFileBlock = getFileBlock(chunk.getChunkStart()); - if (chunkFileBlock - lastFileBlock > 1) { - result.add(chunk); - lastChunk = chunk; - } else { - if (chunk.getChunkEnd() > lastChunk.getChunkEnd()) { - lastChunk.setChunkEnd(chunk.getChunkEnd()); - } - } - } - return result; - } - - private long[] convertToArray(List chunkList) { - int count = chunkList.size() * 2; - if (count == 0) { - return null; - } - int index = 0; - long[] result = new long[count]; - for (Chunk chunk : chunkList) { - result[index++] = chunk.getChunkStart(); - result[index++] = chunk.getChunkEnd(); - } - return result; - } - - private BitSet regionToBins(int startPos, int endPos) { - int maxPos = 0x1FFFFFFF; - int start = (startPos <= 0) ? 0 : (startPos-1) & maxPos; - int end = (endPos <= 0) ? maxPos : (endPos-1) & maxPos; - if (start > end) { - return null; - } - int k; - BitSet bitSet = new BitSet(MAX_BINS); - bitSet.set(0); - for (k = 1 + (start>>26); k <= 1 + (end>>26); ++k) bitSet.set(k); - for (k = 9 + (start>>23); k <= 9 + (end>>23); ++k) bitSet.set(k); - for (k = 73 + (start>>20); k <= 73 + (end>>20); ++k) bitSet.set(k); - for (k = 585 + (start>>17); k <= 585 + (end>>17); ++k) bitSet.set(k); - for (k = 4681 + (start>>14); k <= 4681 + (end>>14); ++k) bitSet.set(k); - return bitSet; - } - - private long getFileBlock(long bgzfOffset) { - return ((bgzfOffset >> 16L) & 0xFFFFFFFFFFFFL); - } - - private void openIndex() { - if (mFileBuffer != null) { - return; - } - openFileStream(); - seek(0); - byte[] buffer = new byte[4]; - readBytes(buffer); - if (!Arrays.equals(buffer, BAMFileConstants.BAM_INDEX_MAGIC)) { - closeFileStream(); - throw new RuntimeException("Invalid file header in BAM index " + mFile + - ": " + new String(buffer)); - } - } - - private void readBytes(byte[] buffer) { - mFileBuffer.get(buffer); - } - - private int readInteger() { - return mFileBuffer.getInt(); - } - - private long readLong() { - return mFileBuffer.getLong(); - } - - private void skipBytes(int count) { - mFileBuffer.position(mFileBuffer.position() + count); - } - - private void seek(int position) { - mFileBuffer.position(position); - } - - private void openFileStream() { - if (mFileStream != null) { - return; - } - try { - mFileStream = new FileInputStream(mFile); - FileChannel channel = mFileStream.getChannel(); - mFileBuffer = channel.map(FileChannel.MapMode.READ_ONLY, 0L, channel.size()); - mFileBuffer.order(ByteOrder.LITTLE_ENDIAN); - } catch (IOException exc) { - throw new RuntimeIOException(exc.getMessage(), exc); - } - } - - private void closeFileStream() { - if (mFileStream == null) { - return; - } - try { - mFileStream.close(); - } catch (IOException exc) { - throw new RuntimeIOException(exc.getMessage(), exc); - } - mFileStream = null; - mFileBuffer = null; - } - - private static class Chunk - implements Comparable { - - private long mChunkStart; - private long mChunkEnd; - - Chunk(long start, long end) { - mChunkStart = start; - mChunkEnd = end; - } - - long getChunkStart() { - return mChunkStart; - } - - void setChunkStart(long value) { - mChunkStart = value; - } - - long getChunkEnd() { - return mChunkEnd; - } - - void setChunkEnd(long value) { - mChunkEnd = value; - } - - public int compareTo(Chunk chunk) { - int result = Long.signum(mChunkStart - chunk.mChunkStart); - if (result == 0) { - result = Long.signum(mChunkEnd - chunk.mChunkEnd); - } - return result; - } - } -} diff --git a/java/lib/edu/mit/broad/sam/BAMFileReader.java b/java/lib/edu/mit/broad/sam/BAMFileReader.java deleted file mode 100644 index 4e81fc017..000000000 --- a/java/lib/edu/mit/broad/sam/BAMFileReader.java +++ /dev/null @@ -1,317 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - - -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.BlockCompressedInputStream; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.StringLineReader; - -import java.io.DataInputStream; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** - * Internal class for reading and querying BAM files. - */ -class BAMFileReader - extends SAMFileReader.ReaderImplementation { - private boolean mIsSeekable = false; - private BinaryCodec mStream = null; - private final BlockCompressedInputStream mCompressedInputStream; - private SAMFileHeader mFileHeader = null; - private BAMFileIndex mFileIndex = null; - private long mFirstRecordPointer = 0; - private CloseableIterator mCurrentIterator = null; - private final boolean eagerDecode; - - - BAMFileReader(final InputStream stream, final boolean eagerDecode) - throws IOException { - mIsSeekable = false; - mCompressedInputStream = new BlockCompressedInputStream(stream); - mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); - this.eagerDecode = eagerDecode; - readHeader(null); - } - - BAMFileReader(final File file, final boolean eagerDecode) - throws IOException { - mIsSeekable = true; - mCompressedInputStream = new BlockCompressedInputStream(file); - mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream)); - this.eagerDecode = eagerDecode; - readHeader(file); - mFirstRecordPointer = mCompressedInputStream.getFilePointer(); - } - - void close() { - if (mStream != null) { - mStream.close(); - } - mStream = null; - mFileHeader = null; - mFileIndex = null; - } - - BAMFileIndex getFileIndex() { - return mFileIndex; - } - - void setFileIndex(final BAMFileIndex fileIndex) { - mFileIndex = fileIndex; - } - - SAMFileHeader getFileHeader() { - return mFileHeader; - } - - /** - * Currently this is ignored for BAM reading. Always do strict validation. - */ - void setValidationStringency(final SAMFileReader.ValidationStringency validationStringency) { - } - - CloseableIterator getIterator() { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (mIsSeekable) { - try { - mCompressedInputStream.seek(mFirstRecordPointer); - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - mCurrentIterator = new BAMFileIterator(); - return mCurrentIterator; - } - - CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { - if (mStream == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mCurrentIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - if (!mIsSeekable) { - throw new UnsupportedOperationException("Cannot query stream-based BAM file"); - } - if (mFileIndex == null) { - throw new IllegalStateException("No BAM file index is available"); - } - mCurrentIterator = new BAMFileIndexIterator(sequence, start, end, contained); - return mCurrentIterator; - } - - private void readHeader(final File file) - throws IOException { - - final byte[] buffer = new byte[4]; - mStream.readBytes(buffer); - if (!Arrays.equals(buffer, BAMFileConstants.BAM_MAGIC)) { - throw new IOException("Invalid BAM file header"); - } - - final int headerTextLength = mStream.readInt(); - final String textHeader = mStream.readString(headerTextLength); - mFileHeader = new SAMTextHeaderCodec().decode(new StringLineReader(textHeader), - file); - - final int sequenceCount = mStream.readInt(); - if (mFileHeader.getSequences().size() > 0) { - // It is allowed to have binary sequences but no text sequences, so only validate if both are present - if (sequenceCount != mFileHeader.getSequences().size()) { - throw new SAMFormatException("Number of sequences in text header (" + mFileHeader.getSequences().size() + - ") != number of sequences in binary header (" + sequenceCount + ") for file " + file); - } - for (int i = 0; i < sequenceCount; i++) { - final SAMSequenceRecord binarySequenceRecord = readSequenceRecord(file); - final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i); - if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) { - throw new SAMFormatException("For sequence " + i + ", text and binary have different names in file " + - file); - } - if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) { - throw new SAMFormatException("For sequence " + i + ", text and binary have different lengths in file " + - file); - } - } - } else { - // If only binary sequences are present, copy them into mFileHeader - final List sequences = new ArrayList(sequenceCount); - for (int i = 0; i < sequenceCount; i++) { - sequences.add(readSequenceRecord(file)); - } - mFileHeader.setSequences(sequences); - } - } - - private SAMSequenceRecord readSequenceRecord(final File file) { - final int nameLength = mStream.readInt(); - if (nameLength <= 1) { - throw new SAMFormatException("Invalid BAM file header: missing sequence name in file " + file); - } - final String sequenceName = mStream.readString(nameLength - 1); - // Skip the null terminator - mStream.readByte(); - final int sequenceLength = mStream.readInt(); - final SAMSequenceRecord record = new SAMSequenceRecord(sequenceName); - record.setSequenceLength(sequenceLength); - return record; - } - - private class BAMFileIterator - implements CloseableIterator { - - private SAMRecord mNextRecord = null; - private final BAMRecordCodec bamRecordCodec = new BAMRecordCodec(getFileHeader()); - - - BAMFileIterator() { - this(true); - } - - BAMFileIterator(final boolean advance) { - this.bamRecordCodec.setInputStream(BAMFileReader.this.mStream.getInputStream()); - - if (advance) { - advance(); - } - } - - public void close() { - if (this != mCurrentIterator) { - throw new IllegalStateException("Attempt to close non-current iterator"); - } - mCurrentIterator = null; - } - - public boolean hasNext() { - return (mNextRecord != null); - } - - public SAMRecord next() { - final SAMRecord result = mNextRecord; - advance(); - return result; - } - - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - void advance() { - try { - mNextRecord = getNextRecord(); - if (eagerDecode && mNextRecord != null) { - mNextRecord.eagerDecode(); - } - } catch (IOException exc) { - throw new RuntimeException(exc.getMessage(), exc); - } - } - - SAMRecord getNextRecord() - throws IOException { - return bamRecordCodec.decode(); - } - } - - private class BAMFileIndexIterator - extends BAMFileIterator { - - private long[] mFilePointers = null; - private int mFilePointerIndex = 0; - private long mFilePointerLimit = -1; - private int mReferenceIndex = -1; - private int mRegionStart = 0; - private int mRegionEnd = 0; - private boolean mReturnContained = false; - - - BAMFileIndexIterator(final String sequence, final int start, final int end, final boolean contained) { - super(false); // delay advance() until after construction - final SAMFileHeader fileHeader = getFileHeader(); - mReferenceIndex = fileHeader.getSequenceIndex(sequence); - if (mReferenceIndex != -1) { - final BAMFileIndex fileIndex = getFileIndex(); - mFilePointers = fileIndex.getSearchBins(mReferenceIndex, start, end); - } - mRegionStart = start; - mRegionEnd = (end <= 0) ? Integer.MAX_VALUE : end; - mReturnContained = contained; - advance(); - } - - SAMRecord getNextRecord() - throws IOException { - while (true) { - // Advance to next file block if necessary - while (mCompressedInputStream.getFilePointer() >= mFilePointerLimit) { - if (mFilePointers == null || - mFilePointerIndex >= mFilePointers.length) { - return null; - } - final long startOffset = mFilePointers[mFilePointerIndex++]; - final long endOffset = mFilePointers[mFilePointerIndex++]; - mCompressedInputStream.seek(startOffset); - mFilePointerLimit = endOffset; - } - // Pull next record from stream - final SAMRecord record = super.getNextRecord(); - if (record == null) { - return null; - } - // If beyond the end of this reference sequence, end iteration - final int referenceIndex = record.getReferenceIndex(); - if (referenceIndex != mReferenceIndex) { - if (referenceIndex < 0 || - referenceIndex > mReferenceIndex) { - mFilePointers = null; - return null; - } - // If before this reference sequence, continue - continue; - } - if (mRegionStart == 0 && mRegionEnd == Integer.MAX_VALUE) { - // Quick exit to avoid expensive alignment end calculation - return record; - } - final int alignmentStart = record.getAlignmentStart(); - final int alignmentEnd = record.getAlignmentEnd(); - if (alignmentStart > mRegionEnd) { - // If scanned beyond target region, end iteration - mFilePointers = null; - return null; - } - // Filter for overlap with region - if (mReturnContained) { - if (alignmentStart >= mRegionStart && alignmentEnd <= mRegionEnd) { - return record; - } - } else { - if (alignmentEnd >= mRegionStart && alignmentStart <= mRegionEnd) { - return record; - } - } - } - } - } -} diff --git a/java/lib/edu/mit/broad/sam/BAMFileWriter.java b/java/lib/edu/mit/broad/sam/BAMFileWriter.java deleted file mode 100644 index 6a7bf7d9b..000000000 --- a/java/lib/edu/mit/broad/sam/BAMFileWriter.java +++ /dev/null @@ -1,64 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.BlockCompressedOutputStream; - -import java.io.DataOutputStream; -import java.io.File; - -/** - * Concrete implementation of SAMFileWriter for writing gzipped BAM files. - */ -class BAMFileWriter extends SAMFileWriterImpl { - - private final BinaryCodec outputBinaryCodec; - private BAMRecordCodec bamRecordCodec = null; - - public BAMFileWriter(final File path) { - outputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(path))); - outputBinaryCodec.setOutputFileName(path.toString()); - } - - private void prepareToWriteAlignments() { - if (bamRecordCodec == null) { - bamRecordCodec = new BAMRecordCodec(getHeader()); - bamRecordCodec.setOutputStream(outputBinaryCodec.getOutputStream()); - } - } - - protected void writeAlignment(final SAMRecord alignment) { - prepareToWriteAlignments(); - bamRecordCodec.encode(alignment); - } - - protected void writeHeader(final String textHeader) { - outputBinaryCodec.writeBytes(BAMFileConstants.BAM_MAGIC); - - // calculate and write the length of the SAM file header text and the header text - outputBinaryCodec.writeString(textHeader, true, false); - - // write the sequences binarily. This is redundant with the text header - outputBinaryCodec.writeInt(getHeader().getSequences().size()); - for (final SAMSequenceRecord sequenceRecord: getHeader().getSequences()) { - outputBinaryCodec.writeString(sequenceRecord.getSequenceName(), true, true); - outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength()); - } - } - - protected void finish() { - outputBinaryCodec.close(); - } - - protected String getFilename() { - return outputBinaryCodec.getOutputFileName(); - } -} diff --git a/java/lib/edu/mit/broad/sam/BAMRecord.java b/java/lib/edu/mit/broad/sam/BAMRecord.java deleted file mode 100644 index 1ae5c0f3f..000000000 --- a/java/lib/edu/mit/broad/sam/BAMRecord.java +++ /dev/null @@ -1,280 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.StringUtil; - -import java.io.ByteArrayInputStream; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Set; - - -/** - * Wrapper class for binary BAM records. - * Delays unpacking all data binary until requested. - */ -class BAMRecord - extends SAMRecord -{ - private static final int READ_NAME_OFFSET = 0; - - private byte[] mRestOfBinaryData = null; - private int mReadLength = 0; - private final short mReadNameLength; - private final int mCigarLen; - private boolean mAttributesDecoded = false; - private boolean mCigarDecoded = false; - - /** - * If any of the properties set from mRestOfBinaryData have been overridden by calls to setters, - * this is set to true, indicating that mRestOfBinaryData cannot be used to write this record to disk. - */ - private boolean mBinaryDataStale; - - BAMRecord(final SAMFileHeader header, final int referenceID, final int coordinate, final short readNameLength, final short mappingQuality, - final int indexingBin, final int cigarLen, final int flags, final int readLen, final int mateReferenceID, final int mateCoordinate, final int insertSize, - final byte[] restOfData) { - setReferenceIndex(referenceID, header); - setAlignmentStart(coordinate); - mReadNameLength = readNameLength; - setMappingQuality(mappingQuality); - setIndexingBin(indexingBin); - mCigarLen = cigarLen; - setFlags(flags); - mReadLength = readLen; - setMateReferenceIndex(mateReferenceID, header); - setMateAlignmentStart(mateCoordinate); - setInferredInsertSize(insertSize); - mRestOfBinaryData = restOfData; - - // Set these to null in order to mark them as being candidates for lazy initialization. - // If this is not done, they will have non-null defaults. - super.setReadName(null); - super.setCigarString(null); - super.setReadBases(null); - super.setBaseQualities(null); - - // Mark the binary block as being valid for writing back out to disk - mBinaryDataStale = false; - } - - protected void eagerDecode() { - // Force all the lazily-initialized attributes to be decoded. - getReadName(); - getCigar(); - getReadBases(); - getBaseQualities(); - getAttributes(); - super.eagerDecode(); - mRestOfBinaryData = null; - } - - /** - * If this record has a valid binary representation of the variable-length portion of a binary record stored, - * return that byte array, otherwise return null. This will never be true for SAMRecords. It will be true - * for BAMRecords that have not been eagerDecoded(), and for which none of the data in the variable-length - * portion has been changed. - */ - @Override - public byte[] getVariableBinaryRepresentation() { - if (mBinaryDataStale) { - return null; - } - // This may have been set to null by eagerDecode() - return mRestOfBinaryData; - } - - /** - * Depending on the concrete implementation, the binary file size of attributes may be known without - * computing them all. - * - * @return binary file size of attribute, if known, else -1 - */ - @Override - public int getAttributesBinarySize() { - if (mBinaryDataStale || mRestOfBinaryData == null) { - return -1; - } - final int tagsOffset = readNameSize() + cigarSize() + basesSize() + qualsSize(); - return mRestOfBinaryData.length - tagsOffset; - } - - @Override - public void setReadName(final String value) { - super.setReadName(value); - mBinaryDataStale = true; - } - - @Override - public void setCigar(final Cigar cigar) { - super.setCigar(cigar); - mBinaryDataStale = true; - } - - @Override - public void setReadBases(final byte[] value) { - super.setReadBases(value); - mBinaryDataStale = true; - } - - @Override - public void setBaseQualities(final byte[] value) { - super.setBaseQualities(value); - mBinaryDataStale = true; - } - - @Override - public void setAttribute(final String key, final Object value) { - // populate all the attributes from the binary block before overwriting one - getAttributes(); - super.setAttribute(key, value); - mBinaryDataStale = true; - } - - /** - * Avoids decoding binary block to get read length - */ - @Override - public int getReadLength() { - return mReadLength; - } - - @Override - public String getReadName() { - String result = super.getReadName(); - if (mRestOfBinaryData != null && result == null) { - result = decodeReadName(); - super.setReadName(result); - } - return result; - } - - /** - * Do not include null terminator - */ - @Override - public int getReadNameLength() { - return mReadNameLength - 1; - } - - @Override - public Cigar getCigar() { - if (mRestOfBinaryData != null && !mCigarDecoded) { - final int cigarOffset = readNameSize(); - final ByteBuffer byteBuffer = ByteBuffer.wrap(mRestOfBinaryData, cigarOffset, cigarSize()); - byteBuffer.order(ByteOrder.LITTLE_ENDIAN); - super.setCigar(BinaryCigarCodec.getSingleton().decode(byteBuffer)); - mCigarDecoded = true; - } - return super.getCigar(); - } - - @Override - public int getCigarLength() { - return mCigarLen; - } - - @Override - public byte[] getReadBases() { - byte[] result = super.getReadBases(); - if (mRestOfBinaryData != null && result == null && mReadLength > 0) { - result = decodeReadBases(); - super.setReadBases(result); - } - return result; - } - - @Override - public byte[] getBaseQualities() { - byte[] ret = super.getBaseQualities(); - if (mRestOfBinaryData != null && ret == null && mReadLength > 0) { - ret = decodeBaseQualities(); - super.setBaseQualities(ret); - } - return ret; - } - - @Override - public Object getAttribute(final String key) { - if (!mAttributesDecoded) { - decodeAttributes(); - } - return super.getAttribute(key); - } - - @Override - public Set> getAttributes() { - if (!mAttributesDecoded) { - decodeAttributes(); - } - return super.getAttributes(); - } - - private void decodeAttributes() { - if (mAttributesDecoded) { - return; - } - mAttributesDecoded = true; - final Map attributes = new LinkedHashMap(); - final int tagsOffset = readNameSize() + cigarSize() + basesSize() + qualsSize(); - final int tagsSize = mRestOfBinaryData.length - tagsOffset; - final BinaryCodec byteBufferCodec = new BinaryCodec(new ByteArrayInputStream(mRestOfBinaryData, tagsOffset, tagsSize)); - new BinaryTagCodec(byteBufferCodec).readTags(attributes); - for (final Map.Entry entry : attributes.entrySet()) { - super.setAttribute(entry.getKey(), entry.getValue()); - } - } - - private byte[] decodeBaseQualities() { - if (mReadLength == 0) { - return null; - } - final int qualsOffset = readNameSize() + cigarSize() + basesSize(); - final byte[] ret = new byte[qualsSize()]; - System.arraycopy(mRestOfBinaryData, qualsOffset, ret, 0, qualsSize()); - return ret; - } - - private String decodeReadName() { - // Don't include terminating null - return StringUtil.bytesToString(mRestOfBinaryData, READ_NAME_OFFSET, mReadNameLength-1); - } - - private byte[] decodeReadBases() { - if (mReadLength == 0) { - return null; - } - final int basesOffset = readNameSize() + cigarSize(); - return SAMUtils.compressedBasesToBytes(mReadLength, mRestOfBinaryData, basesOffset); - } - - /* methods for computing size of variably-sizes elements */ - - private int readNameSize() { - return mReadNameLength; - } - - private int cigarSize() { - return mCigarLen * 4; - } - - private int basesSize() { - return (mReadLength + 1)/2; - } - - private int qualsSize() { - return mReadLength; - } -} diff --git a/java/lib/edu/mit/broad/sam/BAMRecordCodec.java b/java/lib/edu/mit/broad/sam/BAMRecordCodec.java deleted file mode 100644 index b73254b52..000000000 --- a/java/lib/edu/mit/broad/sam/BAMRecordCodec.java +++ /dev/null @@ -1,163 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.RuntimeEOFException; -import edu.mit.broad.sam.util.SortingCollection; - -import java.io.InputStream; -import java.io.OutputStream; -import java.util.Map; - -public class BAMRecordCodec implements SortingCollection.Codec { - private final BinaryCigarCodec cigarCodec = new BinaryCigarCodec(); - private final SAMFileHeader header; - private OutputStream os; - private InputStream is; - private BinaryCodec binaryCodec; - private BinaryTagCodec binaryTagCodec; - - public BAMRecordCodec(final SAMFileHeader header) { - this.header = header; - } - - public BAMRecordCodec clone() { - BAMRecordCodec other = new BAMRecordCodec(this.header); - return other; - } - - - /** Sets the output stream that records will be written to. */ - public void setOutputStream(final OutputStream os) { - this.os = os; - this.binaryCodec = new BinaryCodec(this.os); - this.binaryTagCodec = new BinaryTagCodec(this.binaryCodec); - } - - /** Sets the input stream that records will be read from. */ - public void setInputStream(final InputStream is) { - this.is = is; - this.binaryCodec = new BinaryCodec(this.is); - this.binaryTagCodec = new BinaryTagCodec(this.binaryCodec); - } - - /** - * Write object to OutputStream. - * The SAMRecord must have a header set into it so reference indices can be resolved. - * - * @param alignment what to write - */ - public void encode(final SAMRecord alignment) { - // Compute block size, as it is the first element of the file representation of SAMRecord - final int readLength = alignment.getReadLength(); - - final int cigarLength = alignment.getCigarLength(); - - int blockSize = BAMFileConstants.FIXED_BLOCK_SIZE + alignment.getReadNameLength() + 1 + // null terminated - cigarLength * 4 + - (readLength + 1) / 2 + // 2 bases per byte - readLength; - - final int attributesSize = alignment.getAttributesBinarySize(); - if (attributesSize != -1) { - blockSize += attributesSize; - } else { - if (alignment.getAttributes() != null) { - for (final Map.Entry attribute : alignment.getAttributes()) { - blockSize += (BinaryTagCodec.getTagSize(attribute.getValue())); - } - } - } - - int indexBin = 0; - if (alignment.getReferenceIndex(header) >= 0) { - if (alignment.getIndexingBin() != null) { - indexBin = alignment.getIndexingBin(); - } else { - indexBin = SAMUtils.reg2bin(alignment.getAlignmentStart() - 1, - alignment.getAlignmentEnd() - 1); - } - } - - // Blurt out the elements - this.binaryCodec.writeInt(blockSize); - this.binaryCodec.writeInt(alignment.getReferenceIndex(header)); - // 0-based!! - this.binaryCodec.writeInt(alignment.getAlignmentStart() - 1); - this.binaryCodec.writeUByte((short)(alignment.getReadNameLength() + 1)); - this.binaryCodec.writeUByte((short)alignment.getMappingQuality()); - this.binaryCodec.writeUShort(indexBin); - this.binaryCodec.writeUShort(cigarLength); - this.binaryCodec.writeUShort(alignment.getFlags()); - this.binaryCodec.writeInt(alignment.getReadLength()); - this.binaryCodec.writeInt(alignment.getMateReferenceIndex(header)); - this.binaryCodec.writeInt(alignment.getMateAlignmentStart() - 1); - this.binaryCodec.writeInt(alignment.getInferredInsertSize()); - final byte[] variableLengthBinaryBlock = alignment.getVariableBinaryRepresentation(); - if (variableLengthBinaryBlock != null) { - this.binaryCodec.writeBytes(variableLengthBinaryBlock); - } else { - this.binaryCodec.writeString(alignment.getReadName(), false, true); - final int[] binaryCigar = cigarCodec.encode(alignment.getCigar()); - for (final int cigarElement : binaryCigar) { - // Assumption that this will fit into an integer, despite the fact - // that it is specced as a uint. - this.binaryCodec.writeInt(cigarElement); - } - this.binaryCodec.writeBytes(SAMUtils.bytesToCompressedBases(alignment.getReadBases())); - this.binaryCodec.writeBytes(alignment.getBaseQualities()); - if (alignment.getAttributes() != null) { - for (final Map.Entry attribute : alignment.getAttributes()) { - this.binaryTagCodec.writeTag(attribute.getKey(), attribute.getValue()); - } - } - } - } - - /** - * Read the next record from the input stream and convert into a java object. - * - * @return null if no more records. Should throw exception if EOF is encountered in the middle of - * a record. - */ - public SAMRecord decode() { - int recordLength = 0; - try { - recordLength = this.binaryCodec.readInt(); - } - catch (RuntimeEOFException e) { - return null; - } - - if (recordLength < BAMFileConstants.FIXED_BLOCK_SIZE || - recordLength > BAMFileConstants.MAXIMUM_RECORD_LENGTH) { - throw new SAMFormatException("Invalid record length: " + recordLength); - } - - final int referenceID = this.binaryCodec.readInt(); - final int coordinate = this.binaryCodec.readInt() + 1; - final short readNameLength = this.binaryCodec.readUByte(); - final short mappingQuality = this.binaryCodec.readUByte(); - final int bin = this.binaryCodec.readUShort(); - final int cigarLen = this.binaryCodec.readUShort(); - final int flags = this.binaryCodec.readUShort(); - final int readLen = this.binaryCodec.readInt(); - final int mateReferenceID = this.binaryCodec.readInt(); - final int mateCoordinate = this.binaryCodec.readInt() + 1; - final int insertSize = this.binaryCodec.readInt(); - final byte[] restOfRecord = new byte[recordLength - BAMFileConstants.FIXED_BLOCK_SIZE]; - this.binaryCodec.readBytes(restOfRecord); - final BAMRecord ret = new BAMRecord(header, referenceID, coordinate, readNameLength, mappingQuality, - bin, cigarLen, flags, readLen, mateReferenceID, mateCoordinate, insertSize, restOfRecord); - ret.setHeader(header); - return ret; - } -} diff --git a/java/lib/edu/mit/broad/sam/BinaryCigarCodec.java b/java/lib/edu/mit/broad/sam/BinaryCigarCodec.java deleted file mode 100644 index 5455f6532..000000000 --- a/java/lib/edu/mit/broad/sam/BinaryCigarCodec.java +++ /dev/null @@ -1,68 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import java.nio.ByteBuffer; - -/** - * Converter between binary and text CIGAR representation. - */ -class BinaryCigarCodec { - private static final BinaryCigarCodec singleton = new BinaryCigarCodec(); - - /** - * It is not necssary to get the singleton but it is preferrable to use the same one - * over and over vs. creating a new object for each BAMRecord. - */ - static BinaryCigarCodec getSingleton() { - return singleton; - } - - int[] encode(final Cigar cigar) { - if (cigar.numCigarElements() == 0) { - return new int[0]; - } - - // Binary rep can be no longer than 1/2 of text rep - // Although this is documented as uint, I think lengths will never get that long, - // and it's a pain in Java. - final int[] binaryCigar = new int[cigar.numCigarElements()]; - int binaryCigarLength = 0; - for (int i = 0; i < cigar.numCigarElements(); ++i) { - final CigarElement cigarElement = cigar.getCigarElement(i); - final int op = CigarOperator.enumToBinary(cigarElement.getOperator()); - binaryCigar[binaryCigarLength++] = cigarElement.getLength() << 4 | op; - } - return binaryCigar; - } - - Cigar decode(final ByteBuffer binaryCigar) { - final Cigar ret = new Cigar(); - while (binaryCigar.hasRemaining()) { - final int cigarette = binaryCigar.getInt(); - ret.add(binaryCigarToCigarElement(cigarette)); - } - return ret; - } - - Cigar decode(final int[] binaryCigar) { - final Cigar ret = new Cigar(); - for (final int cigarette : binaryCigar) { - ret.add(binaryCigarToCigarElement(cigarette)); - } - return ret; - } - - private static CigarElement binaryCigarToCigarElement(final int cigarette) { - final int binaryOp = cigarette & 0xf; - final int length = cigarette >> 4; - return new CigarElement(length, CigarOperator.binaryToEnum(binaryOp)); - } -} diff --git a/java/lib/edu/mit/broad/sam/BinaryTagCodec.java b/java/lib/edu/mit/broad/sam/BinaryTagCodec.java deleted file mode 100644 index fbb8711c5..000000000 --- a/java/lib/edu/mit/broad/sam/BinaryTagCodec.java +++ /dev/null @@ -1,211 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.BinaryCodec; -import edu.mit.broad.sam.util.RuntimeEOFException; - -import java.util.Map; -import java.util.Collection; -import java.util.ArrayList; - -/** - * Parse & produce tag section of alignment record in BAM file. - */ -class BinaryTagCodec { - // Size of the fixed part of the binary representation of a tag, - // i.e. the number of bytes occupied by the tag name and tag type fields. - private static final int FIXED_TAG_SIZE = 3; - - private static final long MAX_INT = Integer.MAX_VALUE; - private static final long MAX_UINT = (MAX_INT + 1) * 2; - private static final long MAX_SHORT = Short.MAX_VALUE; - private static final long MAX_USHORT = (MAX_SHORT + 1) * 2; - private static final long MAX_BYTE = Byte.MAX_VALUE; - private static final long MAX_UBYTE = (MAX_BYTE + 1) * 2; - - final BinaryCodec binaryCodec; - - BinaryTagCodec(final BinaryCodec binaryCodec) { - this.binaryCodec = binaryCodec; - } - - private static int getBinaryValueSize(final Object attributeValue) { - switch (getTagValueType(attributeValue)) { - case 'Z': - return ((String)attributeValue).length() + 1; - case 'A': - return 1; - case 'I': - case 'i': - return 4; - case 's': - case 'S': - return 2; - case 'c': - case 'C': - return 1; - case 'f': - return 4; - case 'H': - final byte[] byteArray = (byte[])attributeValue; - return byteArray.length * 2 + 1; - default: - throw new IllegalArgumentException("When writing BAM, unrecognized tag type " + - attributeValue.getClass().getName()); - } - } - - static int getTagSize(final Object value) { - return FIXED_TAG_SIZE + getBinaryValueSize(value); - } - - static char getTagValueType(final Object value) { - if (value.getClass().equals(String.class)) { - return 'Z'; - } else if (value.getClass().equals(Character.class)) { - return 'A'; - } else if (value.getClass().equals(Integer.class)) { - return getIntegerType((Integer)value); - } else if (value.getClass().equals(Long.class)) { - return getIntegerType((Long)value); - } else if (value.getClass().equals(Float.class)) { - return 'f'; - } else if (value.getClass().isArray() && value.getClass().getComponentType().equals(Byte.class)) { - return 'H'; - } else { - throw new IllegalArgumentException("When writing BAM, unrecognized tag type " + - value.getClass().getName()); - } - } - - static private char getIntegerType(final long val) { - if (val > MAX_UINT) { - throw new IllegalArgumentException("Integer attribute value too large to be encoded in BAM"); - } - if (val > MAX_INT) { - return 'I'; - } - if (val > MAX_USHORT) { - return 'i'; - } - if (val > MAX_SHORT) { - return 'S'; - } - if (val > MAX_UBYTE) { - return 's'; - } - if (val > MAX_BYTE) { - return 'C'; - } - if (val >= Byte.MIN_VALUE) { - return 'c'; - } - if (val >= Short.MIN_VALUE) { - return 's'; - } - if (val >= Integer.MIN_VALUE) { - return 'i'; - } - throw new IllegalArgumentException("Integer attribute value too negative to be encoded in BAM"); - } - - void writeTag(final String key, final Object value) { - assert(key.length() == 2); - binaryCodec.writeString(key, false, false); - final char tagValueType = getTagValueType(value); - binaryCodec.writeByte(tagValueType); - - switch (tagValueType) { - case 'Z': - binaryCodec.writeString((String)value, false, true); - break; - case 'A': - binaryCodec.writeByte(((Character)value)); - break; - case 'I': - binaryCodec.writeUInt((Long)value); - break; - case 'i': - binaryCodec.writeInt((Integer)value); - break; - case 's': - binaryCodec.writeShort(((Integer)value).shortValue()); - break; - case 'S': - binaryCodec.writeUShort((Integer)value); - break; - case 'c': - binaryCodec.writeByte((Integer)value); - break; - case 'C': - binaryCodec.writeUByte(((Integer)value).shortValue()); - break; - case 'f': - binaryCodec.writeFloat((Float)value); - break; - case 'H': - final byte[] byteArray = (byte[])value; - binaryCodec.writeString(SAMUtils.bytesToHexString(byteArray), false, true); - break; - default: - throw new IllegalArgumentException("When writing BAM, unrecognized tag type " + - value.getClass().getName()); - } - } - - /** - * Reads tags from the binaryCodec passed in the ctor - * @param tagCollection tags are stored in this Map - */ - void readTags(final Map tagCollection) { - while (true) { - final String key; - try { - // Only way to know at end is when out of input - key = binaryCodec.readString(2); - } catch (RuntimeEOFException e) { - break; - } - final byte tagType = binaryCodec.readByte(); - final Object value = readValue(tagType); - tagCollection.put(key, value); - } - } - - private Object readValue(final byte tagType) { - switch (tagType) { - case 'Z': - return binaryCodec.readNullTerminatedString(); - case 'A': - return (char)binaryCodec.readByte(); - case 'I': - return binaryCodec.readUInt(); - case 'i': - return binaryCodec.readInt(); - case 's': - return (int)binaryCodec.readShort(); - case 'S': - return binaryCodec.readUShort(); - case 'c': - return (int)binaryCodec.readByte(); - case 'C': - return (int)binaryCodec.readUByte(); - case 'f': - return binaryCodec.readFloat(); - case 'H': - final String hexRep = binaryCodec.readNullTerminatedString(); - return SAMUtils.hexStringToBytes(hexRep); - default: - throw new SAMFormatException("Unrecognized tag type: " + (char)tagType); - } - } - -} diff --git a/java/lib/edu/mit/broad/sam/Cigar.java b/java/lib/edu/mit/broad/sam/Cigar.java deleted file mode 100644 index fa9852657..000000000 --- a/java/lib/edu/mit/broad/sam/Cigar.java +++ /dev/null @@ -1,93 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import java.util.List; -import java.util.ArrayList; -import java.util.Collections; - -/** - * A list of CigarElements, which describes how a read aligns with the reference. - * E.g. the Cigar string 10M1D25M means - * * match or mismatch for 10 bases - * * deletion of 1 base - * * match or mismatch for 25 bases - */ -public class Cigar { - private final List cigarElements = new ArrayList(); - - public Cigar() { - } - - public Cigar(final List cigarElements) { - this.cigarElements.addAll(cigarElements); - } - - public List getCigarElements() { - return Collections.unmodifiableList(cigarElements); - } - - public CigarElement getCigarElement(final int i) { - return cigarElements.get(i); - } - - public void add(final CigarElement cigarElement) { - cigarElements.add(cigarElement); - } - - public int numCigarElements() { - return cigarElements.size(); - } - - public int getReferenceLength() { - int length = 0; - for (CigarElement element : cigarElements) { - switch (element.getOperator()) { - case M: - case D: - case N: - length += element.getLength(); - } - } - return length; - } - - public int getPaddedReferenceLength() { - int length = 0; - for (CigarElement element : cigarElements) { - switch (element.getOperator()) { - case M: - case D: - case N: - case P: - length += element.getLength(); - } - } - return length; - } - - @Override - public boolean equals(final Object o) { - if (this == o) return true; - if (!(o instanceof Cigar)) return false; - - final Cigar cigar = (Cigar) o; - - if (cigarElements != null ? !cigarElements.equals(cigar.cigarElements) : cigar.cigarElements != null) - return false; - - return true; - } - - @Override - public int hashCode() { - return cigarElements != null ? cigarElements.hashCode() : 0; - } -} diff --git a/java/lib/edu/mit/broad/sam/CigarElement.java b/java/lib/edu/mit/broad/sam/CigarElement.java deleted file mode 100644 index eec99106b..000000000 --- a/java/lib/edu/mit/broad/sam/CigarElement.java +++ /dev/null @@ -1,52 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -/** - * One component of a cigar string. The component comprises the operator, and the number of bases to which - * the operator applies. - */ -public class CigarElement { - private final int length; - private final CigarOperator operator; - - public CigarElement(final int length, final CigarOperator operator) { - this.length = length; - this.operator = operator; - } - - public int getLength() { - return length; - } - - public CigarOperator getOperator() { - return operator; - } - - @Override - public boolean equals(final Object o) { - if (this == o) return true; - if (!(o instanceof CigarElement)) return false; - - final CigarElement that = (CigarElement) o; - - if (length != that.length) return false; - if (operator != that.operator) return false; - - return true; - } - - @Override - public int hashCode() { - int result = length; - result = 31 * result + (operator != null ? operator.hashCode() : 0); - return result; - } -} diff --git a/java/lib/edu/mit/broad/sam/CigarOperator.java b/java/lib/edu/mit/broad/sam/CigarOperator.java deleted file mode 100644 index 7445455e2..000000000 --- a/java/lib/edu/mit/broad/sam/CigarOperator.java +++ /dev/null @@ -1,113 +0,0 @@ -package edu.mit.broad.sam;/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -/** - * The operators that can appear in a cigar string. - */ -public enum CigarOperator { - M, - I, - D, - N, - S, - H, - P, - C; // I don't know what C means, but it is in the BAM spec - - // Readable synonyms of the above enums - public static final CigarOperator MATCH_OR_MISMATCH = M; - public static final CigarOperator INSERTION = I; - public static final CigarOperator DELETION = D; - public static final CigarOperator SKIPPED_REGION = N; - public static final CigarOperator SOFT_CLIP = S; - public static final CigarOperator HARD_CLIP = H; - public static final CigarOperator PADDING = P; - - // Representation of CigarOperator in BAM file - private static final byte OP_M = 0; - private static final byte OP_I = 1; - private static final byte OP_D = 2; - private static final byte OP_N = 3; - private static final byte OP_S = 4; - private static final byte OP_H = 5; - private static final byte OP_P = 6; - private static final byte OP_C = 7; - - - - public static CigarOperator characterToEnum(final int b) { - switch (b) { - case 'M': - return M; - case 'I': - return I; - case 'D': - return D; - case 'N': - return N; - case 'S': - return S; - case 'H': - return H; - case 'P': - return P; - case 'C': - return C; - default: - throw new IllegalArgumentException("Unrecognized CigarOperator: " + b); - } - } - - public static CigarOperator binaryToEnum(final int i) { - switch(i) { - case OP_M: - return M; - case OP_I: - return I; - case OP_D: - return D; - case OP_N: - return N; - case OP_S: - return S; - case OP_H: - return H; - case OP_P: - return P; - case OP_C: - return C; - default: - throw new IllegalArgumentException("Unrecognized CigarOperator: " + i); - } - } - - public static int enumToBinary(final CigarOperator e) { - switch(e) { - case M: - return OP_M; - case I: - return OP_I; - case D: - return OP_D; - case N: - return OP_N; - case S: - return OP_S; - case H: - return OP_H; - case P: - return OP_P; - case C: - return OP_C; - default: - throw new IllegalArgumentException("Unrecognized CigarOperator: " + e); - } - } -} diff --git a/java/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java b/java/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java deleted file mode 100644 index 7191cc14d..000000000 --- a/java/lib/edu/mit/broad/sam/NotPrimarySkippingIterator.java +++ /dev/null @@ -1,37 +0,0 @@ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.NonDestructiveIterator; - -/** - * Wrapper around SAMRecord iterator that skips over non-primary elements. - */ -public class NotPrimarySkippingIterator { - private final NonDestructiveIterator> it; - - public NotPrimarySkippingIterator(final CloseableIterator underlyingIt) { - it = new NonDestructiveIterator>(underlyingIt); - skipAnyNotprimary(); - } - - public boolean hasCurrent() { - return it.hasCurrent(); - } - - public SAMRecord getCurrent() { - assert(hasCurrent()); - return it.getCurrent(); - } - - public boolean advance() { - it.advance(); - skipAnyNotprimary(); - return hasCurrent(); - } - - private void skipAnyNotprimary() { - while (it.hasCurrent() && it.getCurrent().getNotPrimaryAlignmentFlag()) { - it.advance(); - } - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMFileHeader.java b/java/lib/edu/mit/broad/sam/SAMFileHeader.java deleted file mode 100644 index 95d39f120..000000000 --- a/java/lib/edu/mit/broad/sam/SAMFileHeader.java +++ /dev/null @@ -1,191 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - - -import java.util.*; - -/** - * Header information from a SAM file. - */ -public class SAMFileHeader -{ - public static final String VERSION_TAG = "VN"; - public static final String CURRENT_VERSION = "1.0"; - - public enum SortOrder { - - unsorted(null), - queryname(SAMRecordQueryNameComparator.class), - coordinate(SAMRecordCoordinateComparator.class); - - private Class comparator; - - SortOrder(final Class comparatorClass) { - this.comparator = comparatorClass; - } - - public Class getComparator() { - return comparator; - } - } - - public enum GroupOrder { - none, query, reference - } - - private final Map mAttributes = - new HashMap(); - private List mSequences = - new ArrayList(); - private List mReadGroups = - new ArrayList(); - private final List mProgramRecords = new ArrayList(); - private final Map mSequenceMap = - new HashMap(); - private final Map mReadGroupMap = - new HashMap(); - private Map mProgramRecordMap = new HashMap(); - - public SAMFileHeader() { - setAttribute(VERSION_TAG, CURRENT_VERSION); - } - - public String getVersion() { - return (String) getAttribute("VN"); - } - - public String getCreator() { - return (String) getAttribute("CR"); - } - - public Object getAttribute(final String key) { - return mAttributes.get(key); - } - - public Set> getAttributes() { - return mAttributes.entrySet(); - } - - public List getSequences() { - return mSequences; - } - - public List getReadGroups() { - return mReadGroups; - } - - public SAMSequenceRecord getSequence(final String name) { - return mSequenceMap.get(name); - } - - public SAMReadGroupRecord getReadGroup(final String name) { - return mReadGroupMap.get(name); - } - - public void setSequences(final List list) { - mSequences = list; - mSequenceMap.clear(); - int index = 0; - for (final SAMSequenceRecord record : list) { - record.setSequenceIndex(index++); - mSequenceMap.put(record.getSequenceName(), record); - } - } - - public SAMSequenceRecord getSequence(final int sequenceIndex) { - if (sequenceIndex < 0 || sequenceIndex >= mSequences.size()) { - return null; - } - return mSequences.get(sequenceIndex); - } - - public int getSequenceIndex(final String sequenceName) { - final SAMSequenceRecord record = mSequenceMap.get(sequenceName); - if (record == null) { - return -1; - } - return record.getSequenceIndex(); - } - - public void setAttribute(final String key, final String value) { - mAttributes.put(key, value); - } - - public void setReadGroups(final List readGroups) { - mReadGroups = readGroups; - mReadGroupMap.clear(); - for (final SAMReadGroupRecord readGroupRecord : readGroups) { - mReadGroupMap.put(readGroupRecord.getReadGroupId(), readGroupRecord); - } - } - - public List getProgramRecords() { - return Collections.unmodifiableList(mProgramRecords); - } - - public void addProgramRecord(final SAMProgramRecord programRecord) { - this.mProgramRecords.add(programRecord); - this.mProgramRecordMap.put(programRecord.getProgramGroupId(), programRecord); - } - - public SAMProgramRecord getProgramRecord(final String name) { - return this.mProgramRecordMap.get(name); - } - - public SortOrder getSortOrder() { - if (getAttribute("SO") == null) { - return SortOrder.unsorted; - } - return SortOrder.valueOf((String)getAttribute("SO")); - } - - public void setSortOrder(final SortOrder so) { - setAttribute("SO", so.name()); - } - - public GroupOrder getGroupOrder() { - if (getAttribute("GO") == null) { - return GroupOrder.none; - } - return GroupOrder.valueOf((String)getAttribute("GO")); - } - - public void setGroupOrder(final GroupOrder go) { - setAttribute("GO", go.name()); - } - - @Override - public boolean equals(final Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - final SAMFileHeader that = (SAMFileHeader) o; - - if (mAttributes != null ? !mAttributes.equals(that.mAttributes) : that.mAttributes != null) return false; - if (mProgramRecords != null ? !mProgramRecords.equals(that.mProgramRecords) : that.mProgramRecords != null) - return false; - if (mReadGroups != null ? !mReadGroups.equals(that.mReadGroups) : that.mReadGroups != null) return false; - if (mSequences != null ? !mSequences.equals(that.mSequences) : that.mSequences != null) return false; - - return true; - } - - @Override - public int hashCode() { - int result = mAttributes != null ? mAttributes.hashCode() : 0; - result = 31 * result + (mSequences != null ? mSequences.hashCode() : 0); - result = 31 * result + (mReadGroups != null ? mReadGroups.hashCode() : 0); - result = 31 * result + (mReadGroupMap != null ? mReadGroupMap.hashCode() : 0); - result = 31 * result + (mProgramRecords != null ? mProgramRecords.hashCode() : 0); - return result; - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMFileReader.java b/java/lib/edu/mit/broad/sam/SAMFileReader.java deleted file mode 100644 index 8c0e44919..000000000 --- a/java/lib/edu/mit/broad/sam/SAMFileReader.java +++ /dev/null @@ -1,213 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - - -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.RuntimeIOException; -import edu.mit.broad.sam.util.BlockCompressedInputStream; - -import java.io.*; - - -/** - * Class for reading and querying SAM/BAM files. - */ -public class SAMFileReader implements Iterable -{ - private boolean mIsBinary = false; - private BAMFileIndex mFileIndex = null; - private ReaderImplementation mReader = null; - - public enum ValidationStringency { - STRICT, // Do the right thing, throw an exception if something looks wrong - LENIENT, // Emit warnings but keep going if possible - SILENT; // Like LENIENT, only don't emit warning messages - - public static ValidationStringency DEFAULT_STRINGENCY = STRICT; - } - - /** - * Internal interface for SAM/BAM file reader implementations. - * Implemented as an abstract class to enforce better access control. - */ - static abstract class ReaderImplementation { - abstract SAMFileHeader getFileHeader(); - abstract CloseableIterator getIterator(); - abstract CloseableIterator query(String sequence, int start, int end, boolean contained); - abstract void close(); - // If true, emit warnings about format errors rather than throwing exceptions; - abstract void setValidationStringency(final ValidationStringency validationStringency); - } - - - public SAMFileReader(final InputStream stream) { - this(stream, false); - } - - public SAMFileReader(final File file) { - this(file, null, false); - } - - public SAMFileReader(final File file, final File indexFile) { - this(file, indexFile, false); - } - - /** - * Read a SAM or BAM file - * @param stream input SAM or BAM - * @param eagerDecode if true, decode SAM record entirely when reading it - */ - public SAMFileReader(final InputStream stream, final boolean eagerDecode) { - init(stream, eagerDecode); - } - - /** - * Read a SAM or BAM file, possibly with an index file if present - * @param file where to read from - * @param eagerDecode if true, decode SAM record entirely when reading it - */ - public SAMFileReader(final File file, final boolean eagerDecode) { - init(file, null, eagerDecode); - } - - /** - * Read a SAM or BAM file, possibly with an index file - * @param file where to read from - * @param indexFile location of index file, or null in order to use the default index file (if present) - * @param eagerDecode eagerDecode if true, decode SAM record entirely when reading it - */ - public SAMFileReader(final File file, final File indexFile, final boolean eagerDecode){ - init(file, indexFile, eagerDecode); - } - - public void close() { - if (mReader != null) { - mReader.close(); - } - if (mFileIndex != null) { - mFileIndex.close(); - } - mReader = null; - mFileIndex = null; - } - - public boolean isBinary() { - return mIsBinary; - } - - public boolean hasIndex() { - return (mFileIndex != null); - } - - public SAMFileHeader getFileHeader() { - return mReader.getFileHeader(); - } - - public void setValidationStringency(final ValidationStringency validationStringency) { - mReader.setValidationStringency(validationStringency); - } - - public CloseableIterator iterator() { - return mReader.getIterator(); - } - - public CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { - return mReader.query(sequence, start, end, contained); - } - - public CloseableIterator queryOverlapping(final String sequence, final int start, final int end) { - return query(sequence, start, end, false); - } - - public CloseableIterator queryContained(final String sequence, final int start, final int end) { - return query(sequence, start, end, true); - } - - private void init(final InputStream stream, final boolean eagerDecode) { - - try { - final BufferedInputStream bufferedStream = toBufferedStream(stream); - if (isBAMFile(bufferedStream)) { - mIsBinary = true; - mReader = new BAMFileReader(bufferedStream, eagerDecode); - } else if (isSAMFile(bufferedStream)) { - mIsBinary = false; - mReader = new SAMTextReader(bufferedStream); - } else { - throw new SAMFormatException("Unrecognized file format"); - } - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - private void init(final File file, File indexFile, final boolean eagerDecode) { - - try { - final BufferedInputStream bufferedStream = - new BufferedInputStream(new FileInputStream(file)); - if (isBAMFile(bufferedStream)) { - bufferedStream.close(); - mIsBinary = true; - final BAMFileReader reader = new BAMFileReader(file, eagerDecode); - mReader = reader; - if (indexFile == null) { - indexFile = findIndexFile(file); - } - if (indexFile != null) { - mFileIndex = new BAMFileIndex(indexFile); - reader.setFileIndex(mFileIndex); - } - } else if (isSAMFile(bufferedStream)) { - if (indexFile != null) { - bufferedStream.close(); - throw new RuntimeException("Cannot use index file with textual SAM file"); - } - mIsBinary = false; - mReader = new SAMTextReader(bufferedStream, file); - } else { - bufferedStream.close(); - throw new SAMFormatException("Unrecognized file format"); - } - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - private File findIndexFile(final File dataFile) { - final File indexFile = - new File(dataFile.getParent(), dataFile.getName() + ".bai"); - if (indexFile.exists()) { - return indexFile; - } else { - return null; - } - } - - private boolean isBAMFile(final InputStream stream) - throws IOException { - return BlockCompressedInputStream.isValidFile(stream); - } - - private boolean isSAMFile(final InputStream stream) { - // For now, assume every non-binary file is a SAM text file. - return true; - } - - private BufferedInputStream toBufferedStream(final InputStream stream) { - if (stream instanceof BufferedInputStream) { - return (BufferedInputStream) stream; - } else { - return new BufferedInputStream(stream); - } - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMFileWriter.java b/java/lib/edu/mit/broad/sam/SAMFileWriter.java deleted file mode 100644 index 2d57854b5..000000000 --- a/java/lib/edu/mit/broad/sam/SAMFileWriter.java +++ /dev/null @@ -1,23 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -/** - * Interface for SAMText and BAM file writers. Clients need not care which they write to, - * once the object is constructed. - */ -public interface SAMFileWriter { - void addAlignment(SAMRecord alignment); - - /** - * Must be called or file will likely be defective. - */ - void close(); -} diff --git a/java/lib/edu/mit/broad/sam/SAMFileWriterFactory.java b/java/lib/edu/mit/broad/sam/SAMFileWriterFactory.java deleted file mode 100644 index 3d7594855..000000000 --- a/java/lib/edu/mit/broad/sam/SAMFileWriterFactory.java +++ /dev/null @@ -1,64 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import java.io.File; - -/** - * Create a SAMFileWriter for writing SAM or BAM. - */ -public class SAMFileWriterFactory { - - /** - * Create a BAMFileWriter that is ready to receive SAMRecords - * @param header entire header. Sort order is determined by the sortOrder property of this arg - * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder - * @param outputFile where to write the output. - * @return - */ - public SAMFileWriter makeBAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile) { - final BAMFileWriter ret = new BAMFileWriter(outputFile); - ret.setSortOrder(header.getSortOrder(), presorted); - ret.setHeader(header); - return ret; - } - - /** - * Create a SAMTextWriter that is ready to receive SAMRecords - * @param header entire header. Sort order is determined by the sortOrder property of this arg - * @param presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder - * @param outputFile where to write the output. - * @return - */ - public SAMFileWriter makeSAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile) { - final SAMTextWriter ret = new SAMTextWriter(outputFile); - ret.setSortOrder(header.getSortOrder(), presorted); - ret.setHeader(header); - return ret; - } - - /** - * Create either a SAM or a BAM writer based on examination of the outputFile - * @param header entire header. Sort order is determined by the sortOrder property of this arg - * @param presorted presorted if true, SAMRecords must be added to the SAMFileWriter in order that agrees with header.sortOrder - * @param outputFile - * @return outputFile where to write the output. Must end with .sam or .bam - */ - public SAMFileWriter makeSAMOrBAMWriter(final SAMFileHeader header, final boolean presorted, final File outputFile) { - final String filename = outputFile.getName(); - if (filename.endsWith(".bam")) { - return makeBAMWriter(header, presorted, outputFile); - } - if (filename.endsWith(".sam")) { - return makeSAMWriter(header, presorted, outputFile); - } - throw new IllegalArgumentException("SAM/BAM file should end with .sam or .bam: " + outputFile); - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMFileWriterImpl.java b/java/lib/edu/mit/broad/sam/SAMFileWriterImpl.java deleted file mode 100644 index 78521af44..000000000 --- a/java/lib/edu/mit/broad/sam/SAMFileWriterImpl.java +++ /dev/null @@ -1,157 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.SortingCollection; - -import java.io.File; -import java.io.StringWriter; - -/** - * Base class for implementing SAM writer with any underlying format. - * Mostly this manages accumulation & sorting of SAMRecords when appropriate, - * and produces the text version of the header, since that seems to be a popular item - * in both text and binary file formats. - */ -abstract class SAMFileWriterImpl implements SAMFileWriter -{ - private static final int MAX_RECORDS_IN_RAM = 500000; - private SAMFileHeader.SortOrder sortOrder; - private SAMFileHeader header; - private SortingCollection alignmentSorter; - - // If true, records passed to addAlignment are already in the order specified by sortOrder - private boolean presorted; - - // These two fields are for validating presorted records. - private SAMRecord prevAlignment; - private SAMRecordComparator presortedComparator; - - /** - * Must be called before calling writeHeader(). SortOrder value in the header passed - * to writeHeader() is ignored. If setSortOrder is not called, default is SortOrder.unsorted - * @param sortOrder - */ - public void setSortOrder(final SAMFileHeader.SortOrder sortOrder, final boolean presorted) { - if (header != null) { - throw new IllegalStateException("Cannot call SAMFileWriterImpl.setSortOrder after setHeader for " + - getFilename()); - } - this.sortOrder = sortOrder; - this.presorted = presorted; - } - - /** - * Must be called before addAlignment. - * @param header - */ - public void setHeader(final SAMFileHeader header) - { - this.header = header; - if (sortOrder == null) { - sortOrder = SAMFileHeader.SortOrder.unsorted; - } - header.setSortOrder(sortOrder); - final StringWriter headerTextBuffer = new StringWriter(); - new SAMTextHeaderCodec().encode(headerTextBuffer, header); - final String headerText = headerTextBuffer.toString(); - - writeHeader(headerText); - - if (presorted) { - if (sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { - presorted = false; - } else { - presortedComparator = makeComparator(); - } - } else if (!sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { - alignmentSorter = SortingCollection.newInstance(SAMRecord.class, - new BAMRecordCodec(header), makeComparator(), MAX_RECORDS_IN_RAM); - } - } - - protected SAMFileHeader getHeader() { - return header; - } - - private SAMRecordComparator makeComparator() { - switch (sortOrder) { - case coordinate: - return new SAMRecordCoordinateComparator(header); - case queryname: - return new SAMRecordQueryNameComparator(); - case unsorted: - return null; - } - throw new IllegalStateException("sortOrder should not be null"); - } - - public void addAlignment(final SAMRecord alignment) - { - if (sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) { - if (!header.getGroupOrder().equals(SAMFileHeader.GroupOrder.none)) { - throw new UnsupportedOperationException("GroupOrder " + header.getGroupOrder() + " is not supported"); - } - writeAlignment(alignment); - } else if (presorted) { - assertPresorted(alignment); - writeAlignment(alignment); - } else { - alignmentSorter.add(alignment); - } - } - - private void assertPresorted(final SAMRecord alignment) { - if (prevAlignment != null) { - if (presortedComparator.fileOrderCompare(prevAlignment, alignment) > 0) { - throw new IllegalArgumentException("Alignments added out of order in SAMFileWriterImpl.addAlignment for " + - getFilename() + ". Sort order is " + this.sortOrder + ". Offending records are at [" - + prevAlignment.getReferenceName() + ":" + prevAlignment.getAlignmentStart() + "] and [" - + alignment.getReferenceName() + ":" + alignment.getAlignmentStart() + "]"); - } - } - prevAlignment = alignment; - } - - public final void close() - { - if (alignmentSorter != null) { - for (final SAMRecord alignment : alignmentSorter) { - writeAlignment(alignment); - } - alignmentSorter.cleanup(); - } - finish(); - } - - /** - * Writes the record to disk. Sort order has been taken care of by the time - * this method is called. - * @param alignment - */ - abstract protected void writeAlignment(SAMRecord alignment); - - /** - * Write the header to disk. Header object is available via getHeader(). - * @param textHeader for convenience if the implementation needs it. - */ - abstract protected void writeHeader(String textHeader); - - /** - * Do any required flushing here. - */ - abstract protected void finish(); - - /** - * For producing error messages. - * @return Output filename, or null if there isn't one. - */ - abstract protected String getFilename(); -} diff --git a/java/lib/edu/mit/broad/sam/SAMFormatException.java b/java/lib/edu/mit/broad/sam/SAMFormatException.java deleted file mode 100644 index f055d1075..000000000 --- a/java/lib/edu/mit/broad/sam/SAMFormatException.java +++ /dev/null @@ -1,30 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -/** - * Thrown when a SAM file being read (text or binary) looks bad. - */ -public class SAMFormatException extends RuntimeException { - public SAMFormatException() { - } - - public SAMFormatException(final String s) { - super(s); - } - - public SAMFormatException(final String s, final Throwable throwable) { - super(s, throwable); - } - - public SAMFormatException(final Throwable throwable) { - super(throwable); - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMLocusIterator.java b/java/lib/edu/mit/broad/sam/SAMLocusIterator.java deleted file mode 100644 index e494d389a..000000000 --- a/java/lib/edu/mit/broad/sam/SAMLocusIterator.java +++ /dev/null @@ -1,308 +0,0 @@ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.arachne.GenomeMask; - -import java.util.*; - -/** - * Iterator that traverses a SAM File, accumulating information on a per-locus basis - */ -public class SAMLocusIterator implements Iterable, CloseableIterator { - public static class LocusInfo { - protected final String chrom; - protected final int position; - protected final List bases = new ArrayList(100); - protected final List qualities = new ArrayList(100); - protected final List negativeStrandFlags = new ArrayList(100); - - LocusInfo(final String chrom, final int position) { - this.chrom = chrom; - this.position = position; - } - - public void add(final Byte readBase, final Byte baseQuality, final boolean strand) { - bases.add(readBase); - qualities.add(baseQuality); - negativeStrandFlags.add(strand); - } - - public String getChrom() { return chrom; } - public int getPosition() { return position; } - public List getBases() { return bases; } - public List getQualities() { return qualities; } - public List getNegativeStrandFlags() { return negativeStrandFlags; } - - public String getBasesAsString() { return bytesToString(bases); } - - private static String bytesToString(final List data) { - if (data == null || data.size() == 0) { - return ""; - } - - final char[] chars = new char[data.size()]; - for (int i = 0; i < data.size(); i++) { - chars[i] = (char) (data.get(i) & 0xFF); - } - return new String(chars); - } - } - - - - - private final CloseableIterator underlyingIterator; - private final NotPrimarySkippingIterator it; - private final LinkedList complete = new LinkedList(); - private final LinkedList accumulator = new LinkedList(); - - private boolean includeNonPfReads = false; - private boolean includeDuplicates = false; - private int qualityScoreCutoff = -Integer.MAX_VALUE; - - private GenomeMask mask; - private int lastContig = 0; - private int lastPosition = 0; - - private boolean finishedAlignedReads = false; - - - // this should probably take a SAM - public SAMLocusIterator(final CloseableIterator samIterator) { - this.underlyingIterator = samIterator; - this.it = new NotPrimarySkippingIterator(samIterator); - } - - public Iterator iterator() { - return this; - } - - public void close() { - this.underlyingIterator.close(); - } - - private boolean samHasMore() { - return !finishedAlignedReads && it.hasCurrent(); - } - public boolean hasNext() { - return ((complete.size() > 0) || (accumulator.size() > 0) || (samHasMore()) || hasRemainingMaskBases()); - } - - private boolean hasRemainingMaskBases() { - if (mask == null) return false; - - // if there are more contigs in the mask, by definition some of them must have - // marked bases otherwise if we're in the last contig, but we're not at the last marked position, - // there is also more in the mask - return (lastContig <= mask.getMaxContig() || - (lastContig == mask.getMaxContig() && lastPosition <= mask.get(lastContig).nextSetBit(lastPosition+1))); - } - - public LocusInfo next() { - - // if we don't have any completed entries to return, try and make some! - while(complete.size() == 0 && samHasMore()) { - final SAMRecord rec = it.getCurrent(); - final String cigar = rec.getCigarString(); - - // as soon as we hit our first non-aligned read, we can stop! - if (cigar.equals("*")) { - this.finishedAlignedReads = true; - continue; - } - - // skip dupe reads, if so requested - if (!isIncludeDuplicates() && rec.getDuplicateReadFlag()) { it.advance(); continue; } - - // skip non-PF reads, if so requested - if (!isIncludeNonPfReads() && rec.getReadFailsVendorQualityCheckFlag()) { it.advance(); continue; } - - // when we switch contigs, emit everything in the accumulator - if (accumulator.size() > 0 && !accumulator.getFirst().chrom.equals(rec.getReferenceName())) { - while (accumulator.size() > 0) { - popLocus(); - } - } - - // pop off things we're not going to accumulate more coverage at the locus in question - while(accumulator.size() > 0 && accumulator.getFirst().position < rec.getAlignmentStart()) { - popLocus(); - } - - // check that it's a non-gapped alignment for now! - // TODO: handle gapped and clipped alignments - if (!cigar.matches("[0-9]+M")) { - System.out.println("Cannot deal with clipped or gapped alignments. CIGAR="+cigar); - System.exit(1); - } - - // at this point, either the list is empty or the head should - // be the same position as the first base of the read - - // interpret the CIGAR string and add the base info - for(int j=0; j < rec.getReadBases().length; j++) { - // if the position is empty, initialize it - if (j > accumulator.size() - 1) { - accumulator.add(new LocusInfo(rec.getReferenceName(), rec.getAlignmentStart() + j)); - } - - // if the quality score cutoff is met, accumulate the base info - if (rec.getBaseQualities()[j] >= getQualityScoreCutoff()) { - accumulator.get(j).add(rec.getReadBases()[j], rec.getBaseQualities()[j], rec.getReadNegativeStrandFlag()); - } - } - - - it.advance(); - } - - // if we have nothing to return to the user, and we're at the end of the SAM iterator, - // push everything into the complete queue - if (complete.size() == 0 && !samHasMore()) { - while(accumulator.size() > 0) { - popLocus(); - } - } - - // if there are completed entries, return those - if (complete.size() > 0) { - return complete.removeFirst(); - } else { - - // In this case... we're past the last read from SAM so see if we can - // fill out any more (zero coverage) entries from the mask - LocusInfo zeroResult = null; - while (zeroResult == null && lastContig <= mask.getMaxContig()) { - final int nextbit = mask.get(lastContig).nextSetBit(lastPosition+1); - - // try the next contig - if (nextbit == -1) { - lastContig++; - lastPosition = 0; - } else { - lastPosition = nextbit; - zeroResult = new LocusInfo(contigToChrom[lastContig], lastPosition); - } - } - - return zeroResult; - } - } - - /** - * Pop the first entry from the LocusInfo accumulator into the complete queue. In addition, - * check the GenomeMask and if there are intervening mask positions between the last popped base and the one - * about to be popped, put those on the complete queue as well. - */ - private void popLocus() { - final LocusInfo li = accumulator.removeFirst(); - - // fill in any gaps based on our genome mask - final int liContig = chromToContig.get(li.getChrom()); - - // if we're not on the same contig, fill in the rest of the bits for the previous contig first... - if (lastContig < liContig) { - while (lastContig < liContig) { - int nextbit = 0; - - if (mask != null && mask.get(lastContig) != null) { - while (nextbit != -1) { - nextbit = mask.get(lastContig).nextSetBit(lastPosition + 1); - if (nextbit > -1) { - complete.addLast(new LocusInfo(contigToChrom[lastContig], nextbit)); - lastPosition = nextbit; - } - } - } - lastPosition=0; - lastContig++; - } - } - - // now that we're on the same contig, fill in any unfilled positions - // if we have some bits in the mask to fill in... - if (mask != null && mask.get(lastContig) != null && lastPosition + 1 < li.getPosition()) { - while (lastPosition + 1 < li.getPosition()) { - - final int nextbit = mask.get(lastContig).nextSetBit(lastPosition + 1); - - // if there are no more mask bits, or the next mask bit is - // at or after the current data, just continue on - if (nextbit == -1 || nextbit >= li.getPosition()) { break; } - - // otherwise, pop on the desired empty locus info - complete.addLast(new LocusInfo(contigToChrom[lastContig], nextbit)); - lastPosition = nextbit; - } - } - - // only add to the complete queue if it's in the mask (or we have no mask!) - if (mask == null || mask.get(chromToContig.get(li.getChrom()), li.getPosition())) { - complete.addLast(li); - } - - lastContig = liContig; - lastPosition = li.getPosition(); - - - } - - public void remove() { - throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); - } - - // -------------------------------------------------------------------------------------------- - // Helper methods below this point... - // -------------------------------------------------------------------------------------------- - - public void setGenomeMask(final GenomeMask mask) { this.mask = mask; } - public GenomeMask getGenomeMask() { return this.mask; } - - public boolean isIncludeNonPfReads() { return includeNonPfReads; } - public void setIncludeNonPfReads(final boolean includeNonPfReads) { this.includeNonPfReads = includeNonPfReads; } - - public boolean isIncludeDuplicates() { return includeDuplicates; } - public void setIncludeDuplicates(final boolean includeDuplicates) { this.includeDuplicates = includeDuplicates; } - - public int getQualityScoreCutoff() { return qualityScoreCutoff; } - public void setQualityScoreCutoff(final int qualityScoreCutoff) { this.qualityScoreCutoff = qualityScoreCutoff; } - - - // TODO: once we have a foundation method for access to reference data, this should all change - // to be based on that, rather than this strange mashup of contig and chrom - private static final Map chromToContig = new HashMap(); - { - for(int i=1; i<=22; i++) { - chromToContig.put("chr"+i, i); - } - chromToContig.put("chrM", 0); - chromToContig.put("chrX", 23); - chromToContig.put("chrY", 24); - chromToContig.put("chr1_random", 25); - chromToContig.put("chr2_random", 26); - chromToContig.put("chr3_random", 27); - chromToContig.put("chr4_random", 28); - chromToContig.put("chr5_random", 29); - chromToContig.put("chr6_random", 30); - chromToContig.put("chr7_random", 31); - chromToContig.put("chr8_random", 32); - chromToContig.put("chr9_random", 33); - chromToContig.put("chr10_random", 34); - chromToContig.put("chr11_random", 35); - chromToContig.put("chr13_random", 36); - chromToContig.put("chr15_random", 37); - chromToContig.put("chr16_random", 38); - chromToContig.put("chr17_random", 39); - chromToContig.put("chr18_random", 40); - chromToContig.put("chr19_random", 41); - chromToContig.put("chr21_random", 42); - chromToContig.put("chr22_random", 43); - chromToContig.put("chrX_random", 44); - } - - private static final String[] contigToChrom = new String[] { "chrM","chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY", - "chr1_random","chr2_random","chr3_random","chr4_random","chr5_random","chr6_random","chr7_random","chr8_random","chr9_random","chr10_random","chr11_random","chr13_random","chr15_random","chr16_random","chr17_random","chr18_random","chr19_random","chr21_random","chr22_random","chrX_random" }; - - - -} diff --git a/java/lib/edu/mit/broad/sam/SAMProgramRecord.java b/java/lib/edu/mit/broad/sam/SAMProgramRecord.java deleted file mode 100644 index d2597adb3..000000000 --- a/java/lib/edu/mit/broad/sam/SAMProgramRecord.java +++ /dev/null @@ -1,85 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -public class SAMProgramRecord { - public static final String PROGRAM_GROUP_ID_TAG = "ID"; - private static final String PROGRAM_VERSION_TAG = "VN"; - private static final String COMMAND_LINE_TAG = "CL"; - private final String mProgramGroupId; - private final Map mAttributes = new HashMap(); - - public SAMProgramRecord(final String programGroupId) { - this.mProgramGroupId = programGroupId; - } - - public String getProgramGroupId() { - return mProgramGroupId; - } - - public String getAttribute(final String key) { - return mAttributes.get(key); - } - - public void setAttribute(final String key, final String value) { - mAttributes.put(key, value); - } - - public Set> getAttributes() { - return mAttributes.entrySet(); - } - - public String getProgramVersion() { - return getAttribute(PROGRAM_VERSION_TAG); - } - - public void setProgramVersion(final String version) { - setAttribute(PROGRAM_VERSION_TAG, version); - } - - public String getCommandLine() { - return getAttribute(COMMAND_LINE_TAG); - } - - public void setCommandLine(final String commandLine) { - setAttribute(COMMAND_LINE_TAG, commandLine); - } - - /** - * @return true if this == that except for the program group ID, which is arbitrary - */ - public boolean equivalent(final SAMProgramRecord that) { - return mAttributes.equals(that.mAttributes); - } - - @Override - public boolean equals(final Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - final SAMProgramRecord that = (SAMProgramRecord) o; - - if (mAttributes != null ? !mAttributes.equals(that.mAttributes) : that.mAttributes != null) return false; - if (mProgramGroupId != null ? !mProgramGroupId.equals(that.mProgramGroupId) : that.mProgramGroupId != null) return false; - - return true; - } - - @Override - public int hashCode() { - int result = mProgramGroupId != null ? mProgramGroupId.hashCode() : 0; - result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0); - return result; - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMReadGroupRecord.java b/java/lib/edu/mit/broad/sam/SAMReadGroupRecord.java deleted file mode 100644 index 3bdf1f6bb..000000000 --- a/java/lib/edu/mit/broad/sam/SAMReadGroupRecord.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - - -import java.util.*; - -/** - * Header information about a read group. - */ -public class SAMReadGroupRecord -{ - private String mReadGroupId = null; - private final Map mAttributes = new HashMap(); - public static final String READ_GROUP_ID_TAG = "ID"; - public static final String READ_GROUP_SAMPLE_TAG = "SM"; - public static final String PREDICTED_MEDIAN_INSERT_SIZE_TAG = "PI"; - public static final String DATE_RUN_PRODUCED_TAG = "DT"; - - public SAMReadGroupRecord(final String id) { - mReadGroupId = id; - } - - public String getReadGroupId() { - return mReadGroupId; - } - - public String getSample() { - return (String) getAttribute("SM"); - } - - public void setSample(final String value) { - setAttribute("SM", value); - } - - public String getLibrary() { - return (String) getAttribute("LB"); - } - - public void setLibrary(final String value) { - setAttribute("LB", value); - } - - public Object getAttribute(final String key) { - return mAttributes.get(key); - } - - public void setAttribute(final String key, final Object value) { - mAttributes.put(key, value); - } - - public Set> getAttributes() { - return mAttributes.entrySet(); - } - - @Override - public boolean equals(final Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - final SAMReadGroupRecord that = (SAMReadGroupRecord) o; - - if (mAttributes != null ? !mAttributes.equals(that.mAttributes) : that.mAttributes != null) return false; - if (mReadGroupId != null ? !mReadGroupId.equals(that.mReadGroupId) : that.mReadGroupId != null) return false; - - return true; - } - - @Override - public int hashCode() { - int result = mReadGroupId != null ? mReadGroupId.hashCode() : 0; - result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0); - return result; - } -} - diff --git a/java/lib/edu/mit/broad/sam/SAMRecord.java b/java/lib/edu/mit/broad/sam/SAMRecord.java deleted file mode 100644 index ca603994d..000000000 --- a/java/lib/edu/mit/broad/sam/SAMRecord.java +++ /dev/null @@ -1,732 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - - -import edu.mit.broad.sam.util.StringUtil; - -import java.util.*; - -/** - * Java binding for a SAM file record. - */ -public class SAMRecord -{ - public static final int UNKNOWN_MAPPING_QUALITY = 255; - public static final int NO_MAPPING_QUALITY = 0; - public static final String NO_ALIGNMENT_REFERENCE_NAME = "*"; - public static final String NO_ALIGNMENT_CIGAR = "*"; - public static final int NO_ALIGNMENT_START = 0; - public static final byte[] NULL_SEQUENCE = "*".getBytes(); - public static final byte[] NULL_QUALS = "*".getBytes(); - private static final int READ_PAIRED_FLAG = 0x1; - private static final int PROPER_PAIR_FLAG = 0x2; - private static final int READ_UNMAPPED_FLAG = 0x4; - private static final int MATE_UNMAPPED_FLAG = 0x8; - private static final int READ_STRAND_FLAG = 0x10; - private static final int MATE_STRAND_FLAG = 0x20; - private static final int FIRST_OF_PAIR_FLAG = 0x40; - private static final int SECOND_OF_PAIR_FLAG = 0x80; - private static final int NOT_PRIMARY_ALIGNMENT_FLAG = 0x100; - private static final int READ_FAILS_VENDOR_QUALITY_CHECK_FLAG = 0x200; - private static final int DUPLICATE_READ_FLAG = 0x400; - - - private String mReadName = null; - private byte[] mReadBases = NULL_SEQUENCE; - private byte[] mBaseQualities = NULL_QUALS; - private String mReferenceName = NO_ALIGNMENT_REFERENCE_NAME; - private int mAlignmentStart = NO_ALIGNMENT_START; - private int mMappingQuality = NO_MAPPING_QUALITY; - private String mCigarString = NO_ALIGNMENT_CIGAR; - private Cigar mCigar = null; - private List mAlignmentBlocks = null; - private int mFlags = 0; - private String mMateReferenceName = NO_ALIGNMENT_REFERENCE_NAME; - private int mMateAlignmentStart = 0; - private int mInferredInsertSize = 0; - private Map mAttributes = null; - private Integer mReferenceIndex = null; - private Integer mMateReferenceIndex = null; - private Integer mIndexingBin = null; - - // Optional, but handy for looking of reference indices - private SAMFileHeader mHeader = null; - - - public SAMRecord() { - } - - public String getReadName() { - return mReadName; - } - - /** - * This method is preferred over getReadName().length(), because for BAMRecord - * it may be faster. - * @return length not including a null terminator - */ - public int getReadNameLength() { - return mReadName.length(); - } - - public void setReadName(final String value) { - mReadName = value; - } - - public String getReadString() { - return StringUtil.bytesToString(getReadBases()); - } - - public void setReadString(final String value) { - mReadBases = StringUtil.stringToBytes(value); - } - - // Read bases, as bytes - public byte[] getReadBases() { - return mReadBases; - } - - public void setReadBases(final byte[] value) { - mReadBases = value; - } - - /** - * This method is preferred over getReadBases().length, because for BAMRecord it may be faster. - * @return number of bases in the read - */ - public int getReadLength() { - return getReadBases().length; - } - - // Base qualities, encoded as a FASTQ string - public String getBaseQualityString() { - return SAMUtils.phredToFastq(getBaseQualities()); - } - - public void setBaseQualityString(final String value) { - setBaseQualities(SAMUtils.fastqToPhred(value)); - } - - public byte[] getBaseQualities() { - return mBaseQualities; - } - - public void setBaseQualities(final byte[] value) { - mBaseQualities = value; - } - - public String getReferenceName() { - return mReferenceName; - } - - public void setReferenceName(final String value) { - mReferenceName = value; - mReferenceIndex = null; - } - - public Integer getReferenceIndex(final SAMFileHeader header) { - if (mReferenceIndex == null) { - if (mReferenceName == null) { - mReferenceIndex = -1; - } else if (NO_ALIGNMENT_REFERENCE_NAME.equals(mReferenceName)) { - mReferenceIndex = -1; - } else { - mReferenceIndex = header.getSequenceIndex(mReferenceName); - } - } - return mReferenceIndex; - } - - public Integer getReferenceIndex() { - return getReferenceIndex(mHeader); - } - - - public void setReferenceIndex(final int referenceIndex, final SAMFileHeader header) { - mReferenceIndex = referenceIndex; - if (mReferenceIndex == -1) { - mReferenceName = NO_ALIGNMENT_REFERENCE_NAME; - } else { - mReferenceName = header.getSequence(referenceIndex).getSequenceName(); - } - } - - public void setReferenceIndex(final int referenceIndex) { - setReferenceIndex(referenceIndex, mHeader); - } - - - public String getMateReferenceName() { - return mMateReferenceName; - } - - public void setMateReferenceName(final String mateReferenceName) { - this.mMateReferenceName = mateReferenceName; - mMateReferenceIndex = null; - } - - public Integer getMateReferenceIndex(final SAMFileHeader header) { - if (mMateReferenceIndex == null) { - if (mMateReferenceName == null) { - mMateReferenceIndex = -1; - } else if (NO_ALIGNMENT_REFERENCE_NAME.equals(mMateReferenceName)){ - mMateReferenceIndex = -1; - } else { - mMateReferenceIndex = header.getSequenceIndex(mMateReferenceName); - } - } - return mMateReferenceIndex; - } - - public Integer getMateReferenceIndex() { - return getMateReferenceIndex(mHeader); - } - - public void setMateReferenceIndex(final int referenceIndex, final SAMFileHeader header) { - mMateReferenceIndex = referenceIndex; - if (mMateReferenceIndex == -1) { - mMateReferenceName = NO_ALIGNMENT_REFERENCE_NAME; - } else { - mMateReferenceName = header.getSequence(referenceIndex).getSequenceName(); - } - } - - public void setMateReferenceIndex(final int referenceIndex) { - setMateReferenceIndex(referenceIndex, mHeader); - } - - - public int getAlignmentStart() { - return mAlignmentStart; - } - - public void setAlignmentStart(final int value) { - mAlignmentStart = value; - } - - public int getAlignmentEnd() { - final byte[] readBases = getReadBases(); - if (mAlignmentStart == NO_ALIGNMENT_START || Arrays.equals(NULL_SEQUENCE, readBases) || readBases == null) { - return -1; - } - return mAlignmentStart + getCigar().getReferenceLength() - 1; - } - - /** - * Returns the alignment start adjusted for clipped bases. For example if the read - * has an alignment start of 100 but the first 4 bases were clipped (hard or soft clipped) - * then this method will return 96. - */ - public int getUnclippedStart() { - int pos = getAlignmentStart(); - - for (final CigarElement cig : getCigar().getCigarElements()) { - final CigarOperator op = cig.getOperator(); - if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) { - pos -= cig.getLength(); - } - else { - break; - } - } - - return pos; - } - - /** - * Returns the alignment end adjusted for clipped bases. For example if the read - * has an alignment end of 100 but the last 7 bases were clipped (hard or soft clipped) - * then this method will return 107. - */ - public int getUnclippedEnd() { - int pos = getAlignmentEnd(); - List cigs = getCigar().getCigarElements(); - for (int i=cigs.size() - 1; i>=0; --i) { - final CigarElement cig = cigs.get(i); - final CigarOperator op = cig.getOperator(); - - if (op == CigarOperator.SOFT_CLIP || op == CigarOperator.HARD_CLIP) { - pos += cig.getLength(); - } - else { - break; - } - } - - return pos; - } - - public void setAlignmentEnd(final int value) { - throw new UnsupportedOperationException("Not supported: setAlignmentEnd"); - } - - public int getMateAlignmentStart() { - return mMateAlignmentStart; - } - - public void setMateAlignmentStart(final int mateAlignmentStart) { - this.mMateAlignmentStart = mateAlignmentStart; - } - - public int getInferredInsertSize() { - return mInferredInsertSize; - } - - public void setInferredInsertSize(final int inferredInsertSize) { - this.mInferredInsertSize = inferredInsertSize; - } - - public int getMappingQuality() { - return mMappingQuality; - } - - public void setMappingQuality(final int value) { - mMappingQuality = value; - } - - public String getCigarString() { - if (mCigarString == null && getCigar() != null) { - mCigarString = TextCigarCodec.getSingleton().encode(getCigar()); - } - return mCigarString; - } - - public void setCigarString(final String value) { - mCigarString = value; - mCigar = null; - } - - public Cigar getCigar() { - if (mCigar == null && mCigarString != null) { - mCigar = TextCigarCodec.getSingleton().decode(mCigarString); - } - return mCigar; - } - - /** - * This method is preferred over getCigar().getNumElements(), because for BAMRecord it may be faster. - * @return number of cigar elements (number + operator) in the cigar string - */ - public int getCigarLength() { - return getCigar().numCigarElements(); - } - - public void setCigar(final Cigar cigar) { - this.mCigar = cigar; - mCigarString = null; - } - - public int getFlags() { - return mFlags; - } - - public void setFlags(final int value) { - mFlags = value; - } - - /** - * the read is paired in sequencing, no matter whether it is mapped in a pair - */ - public boolean getReadPairedFlag() { - return (mFlags & READ_PAIRED_FLAG) != 0; - } - - private void requireReadPaired() { - if (!getReadPairedFlag()) { - throw new IllegalStateException("Inappropriate call if not paired read"); - } - } - - /** - * the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) - */ - public boolean getProperPairFlag() { - requireReadPaired(); - return (mFlags & PROPER_PAIR_FLAG) != 0; - } - - /** - * the query sequence itself is unmapped - */ - public boolean getReadUnmappedFlag() { - return (mFlags & READ_UNMAPPED_FLAG) != 0; - } - - /** - * the mate is unmapped - */ - public boolean getMateUnmappedFlag() { - requireReadPaired(); - return (mFlags & MATE_UNMAPPED_FLAG) != 0; - } - - /** - * strand of the query (false for forward; true for reverse strand) - */ - public boolean getReadNegativeStrandFlag() { - return (mFlags & READ_STRAND_FLAG) != 0; - } - - /** - * strand of the mate (false for forward; true for reverse strand) - */ - public boolean getMateNegativeStrandFlag() { - requireReadPaired(); - return (mFlags & MATE_STRAND_FLAG) != 0; - } - - /** - * the read is the first read in a pair - */ - public boolean getFirstOfPairFlag() { - requireReadPaired(); - return (mFlags & FIRST_OF_PAIR_FLAG) != 0; - } - - /** - * the read is the second read in a pair - */ - public boolean getSecondOfPairFlag() { - requireReadPaired(); - return (mFlags & SECOND_OF_PAIR_FLAG) != 0; - } - - /** - * the alignment is not primary (a read having split hits may have multiple primary alignment records) - */ - public boolean getNotPrimaryAlignmentFlag() { - return (mFlags & NOT_PRIMARY_ALIGNMENT_FLAG) != 0; - } - - /** - * the read fails platform/vendor quality checks - */ - public boolean getReadFailsVendorQualityCheckFlag() { - return (mFlags & READ_FAILS_VENDOR_QUALITY_CHECK_FLAG) != 0; - } - - /** - * the read is either a PCR duplicate or an optical duplicate - */ - public boolean getDuplicateReadFlag() { - return (mFlags & DUPLICATE_READ_FLAG) != 0; - } - - /** - * the read is paired in sequencing, no matter whether it is mapped in a pair - */ - public void setReadPairedFlag(final boolean flag) { - setFlag(flag, READ_PAIRED_FLAG); - } - - /** - * the read is mapped in a proper pair (depends on the protocol, normally inferred during alignment) - */ - public void setProperPairFlag(final boolean flag) { - setFlag(flag, PROPER_PAIR_FLAG); - } - - /** - * the query sequence itself is unmapped - */ - public void setReadUmappedFlag(final boolean flag) { - setFlag(flag, READ_UNMAPPED_FLAG); - } - - /** - * the mate is unmapped - */ - public void setMateUnmappedFlag(final boolean flag) { - setFlag(flag, MATE_UNMAPPED_FLAG); - } - - /** - * strand of the query (false for forward; true for reverse strand) - */ - public void setReadNegativeStrandFlag(final boolean flag) { - setFlag(flag, READ_STRAND_FLAG); - } - - /** - * strand of the mate (false for forward; true for reverse strand) - */ - public void setMateNegativeStrandFlag(final boolean flag) { - setFlag(flag, MATE_STRAND_FLAG); - } - - /** - * the read is the first read in a pair - */ - public void setFirstOfPairFlag(final boolean flag) { - setFlag(flag, FIRST_OF_PAIR_FLAG); - } - - /** - * the read is the second read in a pair - */ - public void setSecondOfPairFlag(final boolean flag) { - setFlag(flag, SECOND_OF_PAIR_FLAG); - } - - /** - * the alignment is not primary (a read having split hits may have multiple primary alignment records) - */ - public void setNotPrimaryAlignmentFlag(final boolean flag) { - setFlag(flag, NOT_PRIMARY_ALIGNMENT_FLAG); - } - - /** - * the read fails platform/vendor quality checks - */ - public void setReadFailsVendorQualityCheckFlag(final boolean flag) { - setFlag(flag, READ_FAILS_VENDOR_QUALITY_CHECK_FLAG); - } - - /** - * the read is either a PCR duplicate or an optical duplicate - */ - public void setDuplicateReadFlag(final boolean flag) { - setFlag(flag, DUPLICATE_READ_FLAG); - } - - private void setFlag(final boolean flag, final int bit) { - if (flag) { - mFlags |= bit; - } else { - mFlags &= ~bit; - } - } - - public Object getAttribute(final String key) { - if (mAttributes == null) { - return null; - } - return mAttributes.get(key); - } - - public void setAttribute(final String key, final Object value) { - if (mAttributes == null) { - mAttributes = new LinkedHashMap(); - } - mAttributes.put(key, value); - } - - public Set> getAttributes() { - if (mAttributes == null) { - return null; - } - return mAttributes.entrySet(); - } - - public Integer getIndexingBin() { - return mIndexingBin; - } - - public void setIndexingBin(final Integer mIndexingBin) { - this.mIndexingBin = mIndexingBin; - } - - public SAMFileHeader getHeader() { - return mHeader; - } - - public void setHeader(final SAMFileHeader mHeader) { - this.mHeader = mHeader; - } - - /** - * If this record has a valid binary representation of the variable-length portion of a binary record stored, - * return that byte array, otherwise return null. This will never be true for SAMRecords. It will be true - * for BAMRecords that have not been eagerDecoded(), and for which none of the data in the variable-length - * portion has been changed. - */ - public byte[] getVariableBinaryRepresentation() { - return null; - } - - /** - * Depending on the concrete implementation, the binary file size of attributes may be known without - * computing them all. - * @return binary file size of attribute, if known, else -1 - */ - public int getAttributesBinarySize() { - return -1; - } - - public String format() { - final StringBuilder buffer = new StringBuilder(); - addField(buffer, getReadName(), null, null); - addField(buffer, getFlags(), null, null); - addField(buffer, getReferenceName(), null, "*"); - addField(buffer, getAlignmentStart(), 0, "*"); - addField(buffer, getMappingQuality(), 0, "0"); - addField(buffer, getCigarString(), null, "*"); - addField(buffer, getMateReferenceName(), null, "*"); - addField(buffer, getMateAlignmentStart(), 0, "*"); - addField(buffer, getInferredInsertSize(), 0, "*"); - addField(buffer, getReadString(), null, "*"); - addField(buffer, getBaseQualityString(), null, "*"); - if (mAttributes != null) { - for (final Map.Entry entry : getAttributes()) { - addField(buffer, formatTagValue(entry.getKey(), entry.getValue())); - } - } - return buffer.toString(); - } - - private void addField(final StringBuilder buffer, final Object value, final Object defaultValue, final String defaultString) { - if (safeEquals(value, defaultValue)) { - addField(buffer, defaultString); - } else if (value == null) { - addField(buffer, ""); - } else { - addField(buffer, value.toString()); - } - } - - private void addField(final StringBuilder buffer, final String field) { - if (buffer.length() > 0) { - buffer.append('\t'); - } - buffer.append(field); - } - - private String formatTagValue(final String key, final Object value) { - if (value == null || value instanceof String) { - return key + ":Z:" + value; - } else if (value instanceof Integer) { - return key + ":i:" + value; - } else if (value instanceof Character) { - return key + ":A:" + value; - } else if (value instanceof Float) { - return key + ":f:" + value; - } else if (value instanceof byte[]) { - return key + ":H:" + SAMUtils.bytesToHexString((byte[]) value); - } else { - throw new RuntimeException("Unexpected value type for key " + key + - ": " + value); - } - } - - private boolean safeEquals(final Object o1, final Object o2) { - if (o1 == o2) { - return true; - } else if (o1 == null || o2 == null) { - return false; - } else { - return o1.equals(o2); - } - } - - /** - * Force all lazily-initialized data members to be initialized. If a subclass overrides this method, - * typically it should also call super method. - */ - protected void eagerDecode() { - getCigar(); - getCigarString(); - } - - /** - * Returns blocks of the read sequence that have been aligned directly to the - * reference sequence. Note that clipped portions of the read and inserted and - * deleted bases (vs. the reference) are not represented in the alignment blocks. - */ - public List getAlignmentBlocks() { - if (this.mAlignmentBlocks != null) return this.mAlignmentBlocks; - - final Cigar cigar = getCigar(); - if (cigar == null) return Collections.emptyList(); - - - this.mAlignmentBlocks = new ArrayList(); - int readBase = 1; - int refBase = getAlignmentStart(); - - for (final CigarElement e : cigar.getCigarElements()) { - switch (e.getOperator()) { - case H : break; // ignore hard clips - case P : break; // ignore pads - case S : readBase += e.getLength(); break; // soft clip read bases - case N : refBase += e.getLength(); break; // reference skip - case D : refBase += e.getLength(); break; - case I : readBase += e.getLength(); break; - case M : - final int length = e.getLength(); - this.mAlignmentBlocks.add(new AlignmentBlock(readBase, refBase, length)); - readBase += length; - refBase += length; - break; - default : throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator()); - } - } - - return this.mAlignmentBlocks; - } - - @Override - public boolean equals(final Object o) { - if (this == o) return true; - if (!(o instanceof SAMRecord)) return false; - - final SAMRecord samRecord = (SAMRecord) o; - eagerDecode(); - samRecord.eagerDecode(); - - if (mAlignmentStart != samRecord.mAlignmentStart) return false; - if (mFlags != samRecord.mFlags) return false; - if (mInferredInsertSize != samRecord.mInferredInsertSize) return false; - if (mMappingQuality != samRecord.mMappingQuality) return false; - if (mMateAlignmentStart != samRecord.mMateAlignmentStart) return false; - if (mAttributes != null ? !mAttributes.equals(samRecord.mAttributes) : samRecord.mAttributes != null) - return false; - if (!Arrays.equals(mBaseQualities, samRecord.mBaseQualities)) return false; - if (mCigar != null ? !mCigar.equals(samRecord.mCigar) : samRecord.mCigar != null) - return false; - if (mIndexingBin != null ? !mIndexingBin.equals(samRecord.mIndexingBin) : samRecord.mIndexingBin != null) - return false; - if (mMateReferenceIndex != null ? !mMateReferenceIndex.equals(samRecord.mMateReferenceIndex) : samRecord.mMateReferenceIndex != null) - return false; - if (mMateReferenceName != null ? !mMateReferenceName.equals(samRecord.mMateReferenceName) : samRecord.mMateReferenceName != null) - return false; - if (!Arrays.equals(mReadBases, samRecord.mReadBases)) return false; - if (mReadName != null ? !mReadName.equals(samRecord.mReadName) : samRecord.mReadName != null) return false; - if (mReferenceIndex != null ? !mReferenceIndex.equals(samRecord.mReferenceIndex) : samRecord.mReferenceIndex != null) - return false; - if (mReferenceName != null ? !mReferenceName.equals(samRecord.mReferenceName) : samRecord.mReferenceName != null) - return false; - - return true; - } - - @Override - public int hashCode() { - eagerDecode(); - int result = mReadName != null ? mReadName.hashCode() : 0; - result = 31 * result + (mReadBases != null ? Arrays.hashCode(mReadBases) : 0); - result = 31 * result + (mBaseQualities != null ? Arrays.hashCode(mBaseQualities) : 0); - result = 31 * result + (mReferenceName != null ? mReferenceName.hashCode() : 0); - result = 31 * result + mAlignmentStart; - result = 31 * result + mMappingQuality; - result = 31 * result + (mCigarString != null ? mCigarString.hashCode() : 0); - result = 31 * result + mFlags; - result = 31 * result + (mMateReferenceName != null ? mMateReferenceName.hashCode() : 0); - result = 31 * result + mMateAlignmentStart; - result = 31 * result + mInferredInsertSize; - result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0); - result = 31 * result + (mReferenceIndex != null ? mReferenceIndex.hashCode() : 0); - result = 31 * result + (mMateReferenceIndex != null ? mMateReferenceIndex.hashCode() : 0); - result = 31 * result + (mIndexingBin != null ? mIndexingBin.hashCode() : 0); - return result; - } -} - diff --git a/java/lib/edu/mit/broad/sam/SAMRecordComparator.java b/java/lib/edu/mit/broad/sam/SAMRecordComparator.java deleted file mode 100644 index 0a2afd838..000000000 --- a/java/lib/edu/mit/broad/sam/SAMRecordComparator.java +++ /dev/null @@ -1,23 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import java.util.Comparator; - -public interface SAMRecordComparator extends Comparator { - - /** - * Less stringent compare method than the regular compare. If the two records - * are equal enough that their ordering in a sorted SAM file would be arbitrary, - * this method returns 0. - * @return - */ - public int fileOrderCompare(SAMRecord samRecord1, SAMRecord samRecord2); -} diff --git a/java/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java b/java/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java deleted file mode 100644 index e195d9708..000000000 --- a/java/lib/edu/mit/broad/sam/SAMRecordCoordinateComparator.java +++ /dev/null @@ -1,58 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -/** - * Comparator for sorting SAMRecords by coordinate. Note that the header is required because - * the order of sequences in the header defines the major sort order. - */ -public class SAMRecordCoordinateComparator implements SAMRecordComparator { - private final SAMFileHeader header; - public SAMRecordCoordinateComparator(final SAMFileHeader header) { - this.header = header; - } - public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) { - final int cmp = fileOrderCompare(samRecord1, samRecord2); - if (cmp != 0) { - return cmp; - } - if (samRecord1.getReadNegativeStrandFlag() == samRecord2.getReadNegativeStrandFlag()) { - return samRecord1.getReadName().compareTo(samRecord2.getReadName()); - } - else { - return (samRecord1.getReadNegativeStrandFlag()? 1: -1); - } - - - - } - - /** - * Less stringent compare method than the regular compare. If the two records - * are equal enough that their ordering in a sorted SAM file would be arbitrary, - * this method returns 0. If read is paired and unmapped, use the mate mapping to sort. - * - * @return - */ - public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { - int refIndex1 = samRecord1.getReferenceIndex(header); - int refIndex2 = samRecord2.getReferenceIndex(header); - if (refIndex1 == -1) { - return (refIndex2 == -1? 0: 1); - } else if (refIndex2 == -1) { - return -1; - } - int cmp = refIndex1 - refIndex2; - if (cmp != 0) { - return cmp; - } - return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart(); - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java b/java/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java deleted file mode 100644 index 3318488b1..000000000 --- a/java/lib/edu/mit/broad/sam/SAMRecordQueryNameComparator.java +++ /dev/null @@ -1,38 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -/** - * For "queryname" ordering of SAMRecords - */ -public class SAMRecordQueryNameComparator implements SAMRecordComparator { - - public int compare(final SAMRecord samRecord1, final SAMRecord samRecord2) { - final int cmp = fileOrderCompare(samRecord1, samRecord2); - if (cmp != 0) { - return cmp; - } - if (samRecord1.getReadNegativeStrandFlag() == samRecord2.getReadNegativeStrandFlag()) { - return 0; - } - return (samRecord1.getReadNegativeStrandFlag()? 1: -1); - } - - /** - * Less stringent compare method than the regular compare. If the two records - * are equal enough that their ordering in a sorted SAM file would be arbitrary, - * this method returns 0. - * - * @return - */ - public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { - return samRecord1.getReadName().compareTo(samRecord2.getReadName()); - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java b/java/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java deleted file mode 100644 index 6e6e2714f..000000000 --- a/java/lib/edu/mit/broad/sam/SAMRecordSetBuilder.java +++ /dev/null @@ -1,274 +0,0 @@ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.CoordMath; -import edu.mit.broad.sam.util.RuntimeIOException; - -import java.io.File; -import java.io.IOException; -import java.util.*; - -/** - * Factory class for creating SAMRecords for testing purposes. Various methods can be called - * to add new SAM records (or pairs of records) to a list which can then be returned at - * any point. The records must reference human chromosomes (excluding randoms etc.). - * - * Although this is a class for testing, it is in the src tree because it is included in the sam jarfile. - * - * @author Tim Fennell - */ -public class SAMRecordSetBuilder implements Iterable { - private static final String[] chroms = { - "chrM", "chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", - "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", - "chr21", "chr22", "chrX", "chrY" - }; - private static final byte[] BASES = {'A','C','G','T'}; - private static final String READ_GROUP_ID = "1"; - private static final String SAMPLE = "FREE_SAMPLE"; - private final Random random = new Random(); - - private SAMFileHeader header; - private Collection records; - - private final int readLength = 36 ; - - private SAMProgramRecord programRecord = null; - - - /** - * Constructs a new SAMRecordSetBuilder with all the data needed to keep the records - * sorted in coordinate order. - */ - public SAMRecordSetBuilder() { - this(true, SAMFileHeader.SortOrder.coordinate); - } - - public SAMRecordSetBuilder(final boolean sortForMe, final SAMFileHeader.SortOrder sortOrder) { - final List sequences = new ArrayList(); - for (final String chrom : chroms) { - sequences.add(new SAMSequenceRecord(chrom)); - } - - this.header = new SAMFileHeader(); - this.header.setSequences(sequences); - this.header.setSortOrder(sortOrder); - if (sortForMe) { - final SAMRecordComparator comparator; - if (sortOrder == SAMFileHeader.SortOrder.queryname) { - comparator = new SAMRecordQueryNameComparator(); - } else { - comparator = new SAMRecordCoordinateComparator(header); - } - this.records = new TreeSet(comparator); - } else { - this.records = new ArrayList(); - } - final SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(READ_GROUP_ID); - readGroupRecord.setSample(SAMPLE); - final List readGroups = new ArrayList(); - readGroups.add(readGroupRecord); - this.header.setReadGroups(readGroups); - } - - /** - * Adds the given program record to the header, and assigns the PG tag to any SAMRecords - * created after it has been added. May be called multiple times in order to assign different - * PG IDs to different SAMRecords. programRecord may be null to stop assignment of PG tag. - * It is up to the caller to ensure that program record IDs do not collide. - */ - public void setProgramRecord(SAMProgramRecord programRecord) { - this.programRecord = programRecord; - if (programRecord != null) { - this.header.addProgramRecord(programRecord); - } - } - - /** Returns the accumulated list of sam records. */ - public Collection getRecords() { return this.records; } - - /** Returns a CloseableIterator over the collection of SAMRecords. */ - public CloseableIterator iterator() { - return new CloseableIterator() { - private final Iterator iterator = records.iterator(); - public void close() { /** Do nothing. */ } - public boolean hasNext() { return this.iterator.hasNext(); } - public SAMRecord next() { return this.iterator.next(); } - public void remove() { this.iterator.remove(); } - }; - } - - /** - * Adds a skeletal fragment (non-PE) record to the set using the provided - * contig start and strand information. - */ - public void addFrag(final String name, final int contig, final int start, final boolean negativeStrand) { - final SAMRecord rec = new SAMRecord(); - rec.setReadName(name); - rec.setReferenceIndex(contig, this.header); - rec.setReferenceName(chroms[contig]); - rec.setAlignmentStart(start); - rec.setReadNegativeStrandFlag(negativeStrand); - rec.setCigarString(readLength + "M"); - rec.setMappingQuality(255); - rec.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); - if (programRecord != null) { - rec.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); - } - - fillInBasesAndQualities(rec); - this.records.add(rec); - } - - /** Adds an unmapped fragment read to the builder. */ - public void addUnmappedFragment(final String name) { - final SAMRecord rec = new SAMRecord(); - rec.setReadName(name); - rec.setReadUmappedFlag(true); - rec.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); - if (programRecord != null) { - rec.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); - } - fillInBasesAndQualities(rec); - this.records.add(rec); - } - - /** - * Adds a skeletal fragment (non-PE) record to the set using the provided - * contig start and strand information. The pair is assumed to be a well - * formed pair sitting on a single contig. - */ - public void addPair(final String name, final int contig, final int start1, final int start2) { - final SAMRecord end1 = new SAMRecord(); - final SAMRecord end2 = new SAMRecord(); - final boolean end1IsFirstOfPair = this.random.nextBoolean(); - - end1.setReadName(name); - end1.setReferenceIndex(contig, this.header); - end1.setAlignmentStart(start1); - end1.setReadNegativeStrandFlag(false); - end1.setCigarString(readLength + "M"); - end1.setMappingQuality(255); - end1.setReadPairedFlag(true); - end1.setProperPairFlag(true); - end1.setMateReferenceIndex(contig, this.header); - end1.setMateAlignmentStart(start2); - end1.setMateNegativeStrandFlag(true); - end1.setFirstOfPairFlag(end1IsFirstOfPair); - end1.setSecondOfPairFlag(!end1IsFirstOfPair); - end1.setInferredInsertSize((int) CoordMath.getLength(start1, CoordMath.getEnd(start2, this.readLength))); - end1.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); - if (programRecord != null) { - end1.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); - } - fillInBasesAndQualities(end1); - - end2.setReadName(name); - end2.setReferenceIndex(contig, this.header); - end2.setAlignmentStart(start2); - end2.setReadNegativeStrandFlag(true); - end2.setCigarString(readLength + "M"); - end2.setMappingQuality(255); - end2.setReadPairedFlag(true); - end2.setProperPairFlag(true); - end2.setMateReferenceIndex(contig, this.header); - end2.setMateAlignmentStart(start1); - end2.setMateNegativeStrandFlag(false); - end2.setFirstOfPairFlag(!end1IsFirstOfPair); - end2.setSecondOfPairFlag(end1IsFirstOfPair); - end2.setInferredInsertSize(end1.getInferredInsertSize()); - end2.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); - if (programRecord != null) { - end2.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); - } - fillInBasesAndQualities(end2); - - this.records.add(end1); - this.records.add(end2); - } - - /** Adds a pair with both ends unmapped to the builder. */ - public void addUnmappedPair(final String name) { - final SAMRecord end1 = new SAMRecord(); - final SAMRecord end2 = new SAMRecord(); - final boolean end1IsFirstOfPair = this.random.nextBoolean(); - - end1.setReadName(name); - end1.setReadPairedFlag(false); - end1.setReadUmappedFlag(true); - end1.setProperPairFlag(false); - end1.setFirstOfPairFlag(end1IsFirstOfPair); - end1.setSecondOfPairFlag(!end1IsFirstOfPair); - end1.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); - if (programRecord != null) { - end1.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); - } - fillInBasesAndQualities(end1); - - end2.setReadName(name); - end2.setReadPairedFlag(false); - end2.setReadUmappedFlag(true); - end2.setProperPairFlag(false); - end2.setFirstOfPairFlag(!end1IsFirstOfPair); - end2.setSecondOfPairFlag(end1IsFirstOfPair); - end2.setAttribute(SAMTag.RG.name(), READ_GROUP_ID); - if (programRecord != null) { - end2.setAttribute(SAMTag.PG.name(), programRecord.getProgramGroupId()); - } - fillInBasesAndQualities(end2); - - this.records.add(end1); - this.records.add(end2); - } - - /** - * Fills in bases and qualities with randomly generated data. - * Relies on the alignment start and end having been set to get read length. - */ - private void fillInBasesAndQualities(final SAMRecord rec) { - final int length = this.readLength; - final byte[] bases = new byte[length]; - final byte[] quals = new byte[length]; - - for (int i=0; i mAttributes = null; - public static final String SEQUENCE_NAME_TAG = "SN"; - public static final String SEQUENCE_LENGTH_TAG = "LN"; - public static final String MD5_TAG = "M5"; - public static final String ASSEMBLY_TAG = "AS"; - public static final String URI_TAG = "UR"; - public static final String SPECIES_TAG = "SP"; - - public SAMSequenceRecord(final String name) { - mSequenceName = name; - } - - public String getSequenceName() { - return mSequenceName; - } - - public int getSequenceLength() { - return mSequenceLength; - } - - public void setSequenceLength(final int value) { - mSequenceLength = value; - } - - public String getAssembly() { - return (String) getAttribute("AS"); - } - - public void setAssembly(final String value) { - setAttribute("AS", value); - } - - public String getSpecies() { - return (String) getAttribute("SP"); - } - - public void setSpecies(final String value) { - setAttribute("SP", value); - } - - public Object getAttribute(final String key) { - if (mAttributes == null) { - return null; - } - return mAttributes.get(key); - } - - public void setAttribute(final String key, final Object value) { - if (mAttributes == null) { - mAttributes = new HashMap(); - } - mAttributes.put(key, value); - } - - public Set> getAttributes() { - if (mAttributes == null) { - return null; - } - return mAttributes.entrySet(); - } - - // Private state used only by SAM implementation. - int getSequenceIndex() { - return mSequenceIndex; - } - - // Private state used only by SAM implementation. - void setSequenceIndex(final int value) { - mSequenceIndex = value; - } - - /** - * Looser comparison than equals(). If one SAMSequenceRecord has an attribute that the other does not - * have, that is not considered inequality. However, if they both have an attribute, but have different - * values for that atttribute, then they are considered unequal. This results in an intransitive equality test, - * i.e. a.isSameSequence(b) && b.isSameSequence(c) does not necessarily imply a.isSameSequence(c) - */ - public boolean isSameSequence(final SAMSequenceRecord that) { - if (this == that) return true; - if (that == null) return false; - - if (mSequenceIndex != that.mSequenceIndex) return false; - if (mSequenceLength != that.mSequenceLength) return false; - if (mSequenceName != null ? !mSequenceName.equals(that.mSequenceName) : that.mSequenceName != null) - return false; - // If one record has an optional attribute and the other does not, that is not considered inequality. - - if (mAttributes != null) { - for (final Map.Entry entry: getAttributes()) { - final Object thatAttribute = that.getAttribute(entry.getKey()); - if (thatAttribute != null && !entry.getValue().equals(thatAttribute)) { - return false; - } - } - } - - return true; - } - - @Override - public boolean equals(final Object o) { - if (this == o) return true; - if (!(o instanceof SAMSequenceRecord)) return false; - - final SAMSequenceRecord that = (SAMSequenceRecord) o; - - if (mSequenceIndex != that.mSequenceIndex) return false; - if (mSequenceLength != that.mSequenceLength) return false; - if (mAttributes != null ? !mAttributes.equals(that.mAttributes) : that.mAttributes != null) return false; - if (mSequenceName != null ? !mSequenceName.equals(that.mSequenceName) : that.mSequenceName != null) - return false; - - return true; - } - - @Override - public int hashCode() { - int result = mSequenceName != null ? mSequenceName.hashCode() : 0; - result = 31 * result + mSequenceIndex; - result = 31 * result + mSequenceLength; - result = 31 * result + (mAttributes != null ? mAttributes.hashCode() : 0); - return result; - } -} - diff --git a/java/lib/edu/mit/broad/sam/SAMTag.java b/java/lib/edu/mit/broad/sam/SAMTag.java deleted file mode 100644 index 5189782cc..000000000 --- a/java/lib/edu/mit/broad/sam/SAMTag.java +++ /dev/null @@ -1,16 +0,0 @@ -package edu.mit.broad.sam;/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ - -/** - * The standard tags defined in the SAM spec - */ -public enum SAMTag { - RG, LB, PU, PG, AS, SQ, MQ, NM, H0, H1, H2, UQ, PQ, NH, IH, HI, MD, CS, CQ, CM, R2, Q2, S2, CC, CP, SM, AM, MF -} diff --git a/java/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java b/java/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java deleted file mode 100644 index 202f5f5bf..000000000 --- a/java/lib/edu/mit/broad/sam/SAMTextHeaderCodec.java +++ /dev/null @@ -1,323 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.LineReader; -import edu.mit.broad.sam.util.RuntimeIOException; -import edu.mit.broad.sam.util.StringUtil; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.IOException; -import java.io.Writer; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -/** - * This is actually two classes in one (not sure if that is a good idea) -- a parser - * for a SAM text header, and a generator of SAM text header. - */ -public class SAMTextHeaderCodec { - private static final String HEADER_LINE_START = "@"; - - // These attributes are populated when parsing or generating - private SAMFileHeader mFileHeader; - - // These attributes are populated when parsing text - private String mCurrentLine; - private LineReader mReader; - private File mFile; - private List sequences; - private List readGroups; - - // These attributes are populated when generating text - private BufferedWriter writer; - - private static final String TAG_KEY_VALUE_SEPARATOR = ":"; - private static final String FIELD_SEPARATOR = "\t"; - - public SAMTextHeaderCodec() { - } - - /** - * Reads text and converts to a SAMFileHeader object. Note that one line past - * the header must be read in order to determine the end of the header. This line can be - * obtained after parseTextHeader() has returned by calling getCurrentLine() - * @param reader Where to get header text from. - * @param file Name of the input file, for error messages. May be null. - * @return complete header object. - */ - public SAMFileHeader decode(final LineReader reader, final File file) { - mFileHeader = new SAMFileHeader(); - mReader = reader; - mFile = file; - sequences = new ArrayList(); - readGroups = new ArrayList(); - - while (advanceLine() != null) { - if (!mCurrentLine.startsWith(HEADER_LINE_START)) { - break; - } - final ParsedHeaderLine parsedHeaderLine = new ParsedHeaderLine(mCurrentLine); - switch (parsedHeaderLine.getHeaderRecordType()) { - - case HD: - parseHDLine(parsedHeaderLine); - break; - case PG: - parsePGLine(parsedHeaderLine); - break; - case RG: - parseRGLine(parsedHeaderLine); - break; - case SQ: - parseSQLine(parsedHeaderLine); - break; - default: - throw new IllegalStateException("Unrecognized header record type: " + - parsedHeaderLine.getHeaderRecordType()); - } - } - mFileHeader.setSequences(sequences); - mFileHeader.setReadGroups(readGroups); - return mFileHeader; - } - - private String advanceLine() { - mCurrentLine = mReader.readLine(); - return mCurrentLine; - } - - private void parsePGLine(final ParsedHeaderLine parsedHeaderLine) { - assert(HeaderRecordType.PG.equals(parsedHeaderLine.getHeaderRecordType())); - parsedHeaderLine.requireTag(SAMProgramRecord.PROGRAM_GROUP_ID_TAG); - final SAMProgramRecord programRecord = new SAMProgramRecord(parsedHeaderLine.removeValue(SAMProgramRecord.PROGRAM_GROUP_ID_TAG)); - for (final Map.Entry entry : parsedHeaderLine.mKeyValuePairs.entrySet()) { - programRecord.setAttribute(entry.getKey(), entry.getValue()); - } - mFileHeader.addProgramRecord(programRecord); - } - - private void parseRGLine(final ParsedHeaderLine parsedHeaderLine) { - assert(HeaderRecordType.RG.equals(parsedHeaderLine.getHeaderRecordType())); - parsedHeaderLine.requireTag(SAMReadGroupRecord.READ_GROUP_ID_TAG); - parsedHeaderLine.requireTag(SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG); - final SAMReadGroupRecord samReadGroupRecord = new SAMReadGroupRecord(parsedHeaderLine.removeValue(SAMReadGroupRecord.READ_GROUP_ID_TAG)); - for (final Map.Entry entry : parsedHeaderLine.mKeyValuePairs.entrySet()) { - samReadGroupRecord.setAttribute(entry.getKey(), entry.getValue()); - } - - // Convert non-String attributes to the appropriate types - final String predictedMedianInsertSize = - (String)samReadGroupRecord.getAttribute(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG); - if (predictedMedianInsertSize != null) { - try { - samReadGroupRecord.setAttribute(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG, - Integer.parseInt(predictedMedianInsertSize)); - } catch (NumberFormatException e) { - throw new SAMFormatException(SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG + - " is not numeric: " + predictedMedianInsertSize, e); - } - } - -/* -TODO: Need an ISO 6801 date parser - String dateRunProduced = (String)samReadGroupRecord.getAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG); - if (dateRunProduced != null) { - try { - Date date = dateParser.parse(dateRunProduced); - samReadGroupRecord.setAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG, date); - } catch (ParseException e) { - throw new SAMFormatException(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG + " cannot be parsed as a date: " + - dateRunProduced, e); - } - } -*/ - - readGroups.add(samReadGroupRecord); - } - - private void parseSQLine(final ParsedHeaderLine parsedHeaderLine) { - assert(HeaderRecordType.SQ.equals(parsedHeaderLine.getHeaderRecordType())); - parsedHeaderLine.requireTag(SAMSequenceRecord.SEQUENCE_NAME_TAG); - parsedHeaderLine.requireTag(SAMSequenceRecord.SEQUENCE_LENGTH_TAG); - final SAMSequenceRecord samSequenceRecord = new SAMSequenceRecord(parsedHeaderLine.removeValue(SAMSequenceRecord.SEQUENCE_NAME_TAG)); - samSequenceRecord.setSequenceLength(Integer.parseInt(parsedHeaderLine.removeValue(SAMSequenceRecord.SEQUENCE_LENGTH_TAG))); - for (final Map.Entry entry : parsedHeaderLine.mKeyValuePairs.entrySet()) { - samSequenceRecord.setAttribute(entry.getKey(), entry.getValue()); - } - sequences.add(samSequenceRecord); - } - - private void parseHDLine(final ParsedHeaderLine parsedHeaderLine) { - assert(HeaderRecordType.HD.equals(parsedHeaderLine.getHeaderRecordType())); - parsedHeaderLine.requireTag(SAMFileHeader.VERSION_TAG); - for (final Map.Entry entry : parsedHeaderLine.mKeyValuePairs.entrySet()) { - mFileHeader.setAttribute(entry.getKey(), entry.getValue()); - } - } - - private RuntimeException reportErrorParsingLine(final String reason) { - String fileMessage = ""; - if (mFile != null) { - fileMessage = "File " + mFile + "; "; - } - return new SAMFormatException("Error parsing text SAM file. " + reason + "; " + fileMessage + - "Line " + mReader.getLineNumber() + "\nLine: " + mCurrentLine); - } - - private enum HeaderRecordType { - HD, SQ, RG, PG - } - - private class ParsedHeaderLine { - private final HeaderRecordType mHeaderRecordType; - private final Map mKeyValuePairs = new HashMap(); - - ParsedHeaderLine(final String line) { - assert(line.startsWith(HEADER_LINE_START)); - final String[] fields = line.split(FIELD_SEPARATOR); - try { - mHeaderRecordType = HeaderRecordType.valueOf(fields[0].substring(1)); - } catch (IllegalArgumentException e) { - throw reportErrorParsingLine("Unrecognized header record type"); - } - for (int i = 1; i < fields.length; ++i) { - final String[] keyAndValue = fields[i].split(TAG_KEY_VALUE_SEPARATOR, 2); - if (keyAndValue.length != 2) { - throw reportErrorParsingLine("Problem parsing " + HEADER_LINE_START + mHeaderRecordType + - " key:value pair"); - } - mKeyValuePairs.put(keyAndValue[0], keyAndValue[1]); - } - } - - void requireTag(final String tag) { - if (!mKeyValuePairs.containsKey(tag)) { - throw reportErrorParsingLine(HEADER_LINE_START + mHeaderRecordType + " line missing " + tag + " tag"); - } - } - - public HeaderRecordType getHeaderRecordType() { - return mHeaderRecordType; - } - - boolean containsKey(final String key) { - return mKeyValuePairs.containsKey(key); - } - - String getValue(final String key) { - return mKeyValuePairs.get(key); - } - - String removeValue(final String key) { - final String ret = mKeyValuePairs.get(key); - mKeyValuePairs.remove(key); - return ret; - } - - } - - /** - * After parsing the text header, this object has gobbled one line too many. Call this to get that line. - * @return the first non-header line, or null if there isn't one. - */ - public String getCurrentLine() { - return mCurrentLine; - } - - /** - * - * @param writer where to write the header text - * @param header object to be converted to text. - */ - public void encode(final Writer writer, final SAMFileHeader header) { - mFileHeader = header; - this.writer = new BufferedWriter(writer); - writeHDLine(); - for (final SAMSequenceRecord sequenceRecord: header.getSequences()) { - writeSQLine(sequenceRecord); - } - - for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { - writeRGLine(readGroup); - } - for (final SAMProgramRecord programRecord : header.getProgramRecords()) { - writePGLine(programRecord); - } - try { - this.writer.flush(); - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - private void println(final String s) { - try { - writer.append(s); - writer.append("\n"); - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - private void writePGLine(SAMProgramRecord programRecord) { - if (programRecord == null) { - return; - } - final String[] fields = new String[2 + programRecord.getAttributes().size()]; - fields[0] = HEADER_LINE_START + HeaderRecordType.PG; - fields[1] = SAMProgramRecord.PROGRAM_GROUP_ID_TAG + TAG_KEY_VALUE_SEPARATOR + programRecord.getProgramGroupId(); - int i = 2; - for (final Map.Entry entry: programRecord.getAttributes()) { - fields[i++] = entry.getKey() + TAG_KEY_VALUE_SEPARATOR + entry.getValue(); - } - println(StringUtil.join(FIELD_SEPARATOR, fields)); - } - - private void writeRGLine(final SAMReadGroupRecord readGroup) { - final String[] fields = new String[2 + readGroup.getAttributes().size()]; - fields[0] = HEADER_LINE_START + HeaderRecordType.RG; - fields[1] = SAMReadGroupRecord.READ_GROUP_ID_TAG + TAG_KEY_VALUE_SEPARATOR + readGroup.getReadGroupId(); - int i = 2; - for (final Map.Entry entry: readGroup.getAttributes()) { - fields[i++] = entry.getKey() + TAG_KEY_VALUE_SEPARATOR + entry.getValue().toString(); - } - println(StringUtil.join(FIELD_SEPARATOR, fields)); - } - - private void writeHDLine() { - final String[] fields = new String[1 + mFileHeader.getAttributes().size()]; - fields[0] = HEADER_LINE_START + HeaderRecordType.HD; - int i = 1; - for (final Map.Entry entry: mFileHeader.getAttributes()) { - fields[i++] = entry.getKey() + TAG_KEY_VALUE_SEPARATOR + entry.getValue().toString(); - } - println(StringUtil.join(FIELD_SEPARATOR, fields)); - } - - private void writeSQLine(final SAMSequenceRecord sequenceRecord) { - final int numAttributes =sequenceRecord.getAttributes() != null ? sequenceRecord.getAttributes().size() : 0; - final String[] fields = new String[3 + numAttributes]; - fields[0] = HEADER_LINE_START + HeaderRecordType.SQ; - fields[1] = SAMSequenceRecord.SEQUENCE_NAME_TAG + TAG_KEY_VALUE_SEPARATOR + sequenceRecord.getSequenceName(); - fields[2] = SAMSequenceRecord.SEQUENCE_LENGTH_TAG + TAG_KEY_VALUE_SEPARATOR + Integer.toString(sequenceRecord.getSequenceLength()); - int i = 3; - if (sequenceRecord.getAttributes() != null) { - for (final Map.Entry entry: sequenceRecord.getAttributes()) { - fields[i++] = entry.getKey() + TAG_KEY_VALUE_SEPARATOR + entry.getValue().toString(); - } - } - println(StringUtil.join(FIELD_SEPARATOR, fields)); - } - -} diff --git a/java/lib/edu/mit/broad/sam/SAMTextReader.java b/java/lib/edu/mit/broad/sam/SAMTextReader.java deleted file mode 100644 index 267f70461..000000000 --- a/java/lib/edu/mit/broad/sam/SAMTextReader.java +++ /dev/null @@ -1,336 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - - -import edu.mit.broad.sam.util.AsciiLineReader; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.StringUtil; - -import java.io.File; -import java.io.InputStream; -import java.util.Map; - -/** - * Internal class for reading SAM text files. - */ -class SAMTextReader - extends SAMFileReader.ReaderImplementation -{ - private static final int QNAME_COL = 0; - private static final int FLAG_COL = 1; - private static final int RNAME_COL = 2; - private static final int POS_COL = 3; - private static final int MAPQ_COL = 4; - private static final int CIGAR_COL = 5; - private static final int MRNM_COL = 6; - private static final int MPOS_COL = 7; - private static final int ISIZE_COL = 8; - private static final int SEQ_COL = 9; - private static final int QUAL_COL = 10; - - private static final int NUM_REQUIRED_FIELDS = 11; - - private AsciiLineReader mReader; - private SAMFileHeader mFileHeader = null; - private String mCurrentLine = null; - private RecordIterator mIterator = null; - private File mFile = null; - private final TextTagCodec tagCodec = new TextTagCodec(); - private SAMFileReader.ValidationStringency validationStringency = SAMFileReader.ValidationStringency.DEFAULT_STRINGENCY; - - SAMTextReader(final InputStream stream) { - mReader = new AsciiLineReader(stream); - readHeader(); - } - - SAMTextReader(final InputStream stream, final File file) { - this(stream); - mFile = file; - } - - void close() { - if (mReader != null) { - try { - mReader.close(); - } finally { - mReader = null; - } - } - } - - SAMFileHeader getFileHeader() { - return mFileHeader; - } - - public SAMFileReader.ValidationStringency getValidationStringency() { - return validationStringency; - } - - public void setValidationStringency(final SAMFileReader.ValidationStringency lenientValidation) { - this.validationStringency = lenientValidation; - } - - CloseableIterator getIterator() { - if (mReader == null) { - throw new IllegalStateException("File reader is closed"); - } - if (mIterator != null) { - throw new IllegalStateException("Iteration in progress"); - } - mIterator = new RecordIterator(); - return mIterator; - } - - CloseableIterator query(final String sequence, final int start, final int end, final boolean contained) { - throw new UnsupportedOperationException("Cannot query SAM text files"); - } - - private void readHeader() { - final SAMTextHeaderCodec headerCodec = new SAMTextHeaderCodec(); - mFileHeader = headerCodec.decode(mReader, mFile); - mCurrentLine = headerCodec.getCurrentLine(); - } - - private String advanceLine() { - mCurrentLine = mReader.readLine(); - return mCurrentLine; - } - - private String makeErrorString(final String reason) { - String fileMessage = ""; - if (mFile != null) { - fileMessage = "File " + mFile + "; "; - } - return "Error parsing text SAM file. " + reason + "; " + fileMessage + - "Line " + mReader.getLineNumber() + "\nLine: " + mCurrentLine; - } - - private RuntimeException reportFatalErrorParsingLine(final String reason) { - return new SAMFormatException(makeErrorString(reason)); - } - - private void reportErrorParsingLine(final String reason) { - final String errorMessage = makeErrorString(reason); - - if (validationStringency == SAMFileReader.ValidationStringency.STRICT) { - throw new SAMFormatException(errorMessage); - } else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) { - System.err.println("Ignoring SAM validation error due to lenient parsing:"); - System.err.println(errorMessage); - } - } - - private void reportErrorParsingLine(final Exception e) { - final String errorMessage = makeErrorString(e.getMessage()); - if (validationStringency == SAMFileReader.ValidationStringency.STRICT) { - throw new SAMFormatException(errorMessage); - } else if (validationStringency == SAMFileReader.ValidationStringency.LENIENT) { - System.err.println("Ignoring SAM validation error due to lenient parsing:"); - System.err.println(errorMessage); - } - } - - private class RecordIterator implements CloseableIterator { - - /** - * Allocate this once rather than for every line as a performance optimization. - * The size is arbitrary -- merely large enough to handle the maximum number - * of fields we might expect from a reasonable SAM file. - */ - private final String[] mFields = new String[10000]; - - private SAMRecord mCurrentRecord; - - private RecordIterator() { - assert(mReader != null); - if (mCurrentLine != null) { - parseLine(); - } - - } - - public void close() { - mCurrentRecord = null; - SAMTextReader.this.close(); - } - - public boolean hasNext() { - return mCurrentRecord != null; - } - - public SAMRecord next() { - if (!hasNext()) { - throw new IllegalStateException("Cannot call next() on exhausted iterator"); - } - final SAMRecord ret = mCurrentRecord; - mCurrentRecord = null; - advanceLine(); - if (mCurrentLine != null) { - parseLine(); - } - return ret; - } - - public void remove() { - throw new UnsupportedOperationException("Not supported: remove"); - } - - int parseInt(final String s, final String fieldName) { - final int ret; - try { - ret = Integer.parseInt(s); - } catch (NumberFormatException e) { - throw reportFatalErrorParsingLine("Non-numeric value in " + fieldName + " column"); - } - return ret; - } - - void validateReferenceName(final String rname, final String fieldName) { - if (fieldName.equals("MRNM") && rname.equals("=")) { - return; - } - if (getFileHeader().getSequences().size() != 0) { - if (getFileHeader().getSequence(rname) == null) { - reportErrorParsingLine(fieldName + " '" + rname + "' not found in any SQ record"); - } - } - } - - private void parseLine() { - final int numFields = StringUtil.split(mCurrentLine, mFields, '\t'); - if (numFields < NUM_REQUIRED_FIELDS) { - reportErrorParsingLine("Not enough fields"); - } - if (numFields == mFields.length) { - reportErrorParsingLine("Too many fields in SAM text record."); - } - for (int i = 0; i < numFields; ++i) { - if (mFields[i].length() == 0) { - reportErrorParsingLine("Empty field at position " + i + " (zero-based)"); - } - } - mCurrentRecord = new SAMRecord(); - mCurrentRecord.setReadName(mFields[QNAME_COL]); - - final int flags = parseInt(mFields[FLAG_COL], "FLAG"); - mCurrentRecord.setFlags(flags); - - final String rname = mFields[RNAME_COL]; - if (!rname.equals("*")) { - validateReferenceName(rname, "RNAME"); - mCurrentRecord.setReferenceName(rname); - } else if (!mCurrentRecord.getReadUnmappedFlag()) { - reportErrorParsingLine("RNAME is not specified but flags indicate mapped"); - } - - final int pos = parseInt(mFields[POS_COL], "POS"); - final int mapq = parseInt(mFields[MAPQ_COL], "MAPQ"); - final String cigar = mFields[CIGAR_COL]; - if (!SAMRecord.NO_ALIGNMENT_REFERENCE_NAME.equals(mCurrentRecord.getReferenceName())) { - if (pos == 0) { - reportErrorParsingLine("POS must be non-zero if RNAME is specified"); - } - if (!mCurrentRecord.getReadUnmappedFlag() && cigar.equals("*")) { - reportErrorParsingLine("CIGAR must not be '*' if RNAME is specified"); - } - } else { - if (pos != 0) { - reportErrorParsingLine("POS must be zero if RNAME is not specified"); - } - if (mapq != 0) { - reportErrorParsingLine("MAPQ must be zero if RNAME is not specified"); - } - if (!cigar.equals("*")) { - reportErrorParsingLine("CIGAR must be '*' if RNAME is not specified"); - } - } - mCurrentRecord.setAlignmentStart(pos); - mCurrentRecord.setMappingQuality(mapq); - mCurrentRecord.setCigarString(cigar); - - final String mateRName = mFields[MRNM_COL]; - if (mateRName.equals("*")) { - if (mCurrentRecord.getReadPairedFlag() && !mCurrentRecord.getMateUnmappedFlag()) { - reportErrorParsingLine("MRNM not specified but flags indicate mate mapped"); - } - } - else { - if (!mCurrentRecord.getReadPairedFlag()) { - reportErrorParsingLine("MRNM specified but flags indicate unpaired"); - } - if (mCurrentRecord.getMateUnmappedFlag()) { - reportErrorParsingLine("MRNM specified but flags indicate mate unmapped"); - } - - validateReferenceName(mateRName, "MRNM"); - if (mateRName.equals("=")) { - if (mCurrentRecord.getReferenceName() == null) { - reportErrorParsingLine("MRNM is '=', but RNAME is not set"); - } - mCurrentRecord.setMateReferenceName(mCurrentRecord.getReferenceName()); - } else { - mCurrentRecord.setMateReferenceName(mateRName); - } - } - - final int matePos = parseInt(mFields[MPOS_COL], "MPOS"); - final int isize = parseInt(mFields[ISIZE_COL], "ISIZE"); - if (!mCurrentRecord.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) { - if (matePos == 0) { - reportErrorParsingLine("MPOS must be non-zero if MRNM is specified"); - } - if (isize == 0 && mCurrentRecord.getReferenceName().equals(mCurrentRecord.getMateReferenceName())) { - reportErrorParsingLine("ISIZE must be non-zero if RNAME == MRNM"); - } - } else { - if (matePos != 0) { - reportErrorParsingLine("MPOS must be zero if MRNM is not specified"); - } - if (isize != 0) { - reportErrorParsingLine("ISIZE must be zero if MRNM is not specified"); - } - } - mCurrentRecord.setMateAlignmentStart(matePos); - mCurrentRecord.setInferredInsertSize(isize); - if (!mFields[SEQ_COL].equals("*")) { - mCurrentRecord.setReadString(mFields[SEQ_COL]); - } - if (!mFields[QUAL_COL].equals("*")) { - if (mCurrentRecord.getReadString() == null) { - reportErrorParsingLine("QUAL should not be specified if SEQ is not specified"); - } - if (mCurrentRecord.getReadString().length() != mFields[QUAL_COL].length()) { - reportErrorParsingLine("length(QUAL) != length(SEQ)"); - } - mCurrentRecord.setBaseQualityString(mFields[QUAL_COL]); - } - - for (int i = NUM_REQUIRED_FIELDS; i < numFields; ++i) { - parseTag(mFields[i]); - } - - } - - private void parseTag(final String tag) { - Map.Entry entry = null; - try { - entry = tagCodec.decode(tag); - } catch (SAMFormatException e) { - reportErrorParsingLine(e); - } - if (entry != null) { - mCurrentRecord.setAttribute(entry.getKey(), entry.getValue()); - } - } - } -} - diff --git a/java/lib/edu/mit/broad/sam/SAMTextWriter.java b/java/lib/edu/mit/broad/sam/SAMTextWriter.java deleted file mode 100644 index e3e8e6572..000000000 --- a/java/lib/edu/mit/broad/sam/SAMTextWriter.java +++ /dev/null @@ -1,121 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.AsciiWriter; -import edu.mit.broad.sam.util.RuntimeIOException; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.Writer; -import java.util.Map; - -class SAMTextWriter extends SAMFileWriterImpl { - private static final String FIELD_SEPARATOR = "\t"; - - private final Writer out; - private final File file; - private final TextTagCodec tagCodec = new TextTagCodec(); - - SAMTextWriter(final File file) { - try { - this.file = file; - this.out = new AsciiWriter(new FileOutputStream(file)); - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - /** - * Writes the record to disk. Sort order has been taken care of by the time - * this method is called. - * - * @param alignment - */ - protected void writeAlignment(final SAMRecord alignment) { - try { - out.write(alignment.getReadName()); - out.write(FIELD_SEPARATOR); - out.write(Integer.toString(alignment.getFlags())); - out.write(FIELD_SEPARATOR); - out.write(alignment.getReferenceName()); - out.write(FIELD_SEPARATOR); - out.write(Integer.toString(alignment.getAlignmentStart())); - out.write(FIELD_SEPARATOR); - out.write(Integer.toString(alignment.getMappingQuality())); - out.write(FIELD_SEPARATOR); - out.write(alignment.getCigarString()); - out.write(FIELD_SEPARATOR); - - // I think == is OK here. If not, it isn't an error, just less efficient storage - if (alignment.getReferenceName() == alignment.getMateReferenceName() && - SAMRecord.NO_ALIGNMENT_REFERENCE_NAME != alignment.getReferenceName()) { - out.write("="); - } else { - out.write(alignment.getMateReferenceName()); - } - out.write(FIELD_SEPARATOR); - out.write(Integer.toString(alignment.getMateAlignmentStart())); - out.write(FIELD_SEPARATOR); - out.write(Integer.toString(alignment.getInferredInsertSize())); - out.write(FIELD_SEPARATOR); - out.write(alignment.getReadString()); - out.write(FIELD_SEPARATOR); - out.write(alignment.getBaseQualityString()); - if (alignment.getAttributes() != null) { - for (final Map.Entry attribute : alignment.getAttributes()) { - out.write(FIELD_SEPARATOR); - out.write(tagCodec.encode(attribute.getKey(), attribute.getValue())); - } - } - out.write("\n"); - - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - /** - * Write the header to disk. Header object is available via getHeader(). - * - * @param textHeader for convenience if the implementation needs it. - */ - protected void writeHeader(final String textHeader) { - try { - out.write(textHeader); - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - /** - * Do any required flushing here. - */ - protected void finish() { - try { - out.close(); - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - /** - * For producing error messages. - * - * @return Output filename, or null if there isn't one. - */ - protected String getFilename() { - if (file == null) { - return null; - } - return file.getAbsolutePath(); - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMTools.java b/java/lib/edu/mit/broad/sam/SAMTools.java deleted file mode 100644 index 0a320ba84..000000000 --- a/java/lib/edu/mit/broad/sam/SAMTools.java +++ /dev/null @@ -1,106 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - - -import edu.mit.broad.sam.util.CloseableIterator; -import java.io.*; - - -/** - * Command line utility for manipulating SAM/BAM files. - */ -public class SAMTools -{ - private String mCommand = null; - private File mInputFile = null; - - - public static void main(final String[] args) - throws Exception { - final int status = new SAMTools().run(args); - if (status != 0) { - System.exit(status); - } - } - - private SAMTools() { - } - - private void usage() { - System.out.println(); - System.out.println("SAMTools version 0.1.0"); - System.out.println("Tools for manipulating SAM/BAM files"); - System.out.println(); - System.out.println("Usage: SAMTools "); - System.out.println(); - System.out.println("Commands:"); - System.out.println(" help"); - System.out.println(" view "); - System.out.println(); - } - - private boolean parseArguments(final String[] args) { - if (args.length == 0) { - usage(); - return true; - } - final String command = args[0]; - final int argpos = 1; - final int argcount = args.length - argpos; - if (command.equals("help")) { - usage(); - return true; - } else if (command.equals("view")) { - if (argcount != 1) { - usage(); - return false; - } - mInputFile = new File(args[1]); - if (!mInputFile.exists()) { - System.out.println("Input file not found: " + mInputFile); - return false; - } - } else { - System.out.println("Unrecognized command: " + command); - System.out.println(); - usage(); - return false; - } - mCommand = command; - return true; - } - - private int run(final String[] args) - throws Exception { - if (!parseArguments(args)) { - return 1; - } - if (mCommand == null) { - return 0; - } - if (mCommand.equals("view")) { - return runView(); - } - return 1; - } - - private int runView() { - final SAMFileReader reader = new SAMFileReader(mInputFile); - final CloseableIterator iterator = reader.iterator(); - while (iterator.hasNext()) { - final SAMRecord record = iterator.next(); - System.out.println(record.format()); - } - iterator.close(); - return 0; - } -} diff --git a/java/lib/edu/mit/broad/sam/SAMUtils.java b/java/lib/edu/mit/broad/sam/SAMUtils.java deleted file mode 100644 index c17ca773c..000000000 --- a/java/lib/edu/mit/broad/sam/SAMUtils.java +++ /dev/null @@ -1,269 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam; - - -/** - * Utilty methods. - */ -final class SAMUtils -{ - private static final byte COMPRESSED_EQUAL_LOW = 0; - private static final byte COMPRESSED_A_LOW = 1; - private static final byte COMPRESSED_C_LOW = 2; - private static final byte COMPRESSED_G_LOW = 4; - private static final byte COMPRESSED_T_LOW = 8; - private static final byte COMPRESSED_N_LOW = 15; - private static final byte COMPRESSED_EQUAL_HIGH = COMPRESSED_EQUAL_LOW << 4; - private static final byte COMPRESSED_A_HIGH = COMPRESSED_A_LOW << 4; - private static final byte COMPRESSED_C_HIGH = COMPRESSED_C_LOW << 4; - private static final byte COMPRESSED_G_HIGH = COMPRESSED_G_LOW << 4; - private static final byte COMPRESSED_T_HIGH = (byte)(COMPRESSED_T_LOW << 4); - private static final byte COMPRESSED_N_HIGH = (byte)(COMPRESSED_N_LOW << 4); - - private SAMUtils() { - } - - static int unpackInt16(final byte[] buffer, final int offset) { - return ((buffer[offset] & 0xFF) | - ((buffer[offset+1] & 0xFF) << 8)); - } - - static int unpackInt32(final byte[] buffer, final int offset) { - return ((buffer[offset] & 0xFF) | - ((buffer[offset+1] & 0xFF) << 8) | - ((buffer[offset+2] & 0xFF) << 16) | - ((buffer[offset+3] & 0xFF) << 24)); - } - - /** - * Convert from a byte array containing =AaCcGgTtNn, to a byte array half as long, - * with =, A, C, G, T converted to 0, 1, 2, 4, 8, 15 - * @param readBases - * @return - */ - static byte[] bytesToCompressedBases(final byte[] readBases) { - final byte[] compressedBases = new byte[(readBases.length + 1)/2]; - int i; - for (i = 1; i < readBases.length; i+=2) { - compressedBases[i/2] = (byte)(charToCompressedBaseHigh(readBases[i-1]) | - charToCompressedBaseLow(readBases[i])); - } - // Last nybble - if (i == readBases.length) { - compressedBases[i/2] = charToCompressedBaseHigh((char)readBases[i-1]); - } - return compressedBases; - } - - static byte[] compressedBasesToBytes(final int length, final byte[] compressedBases, final int compressedOffset) { - final byte[] ret = new byte[length]; - int i; - for (i = 1; i < length; i+=2) { - ret[i-1] = compressedBaseToByteHigh(compressedBases[i/2 + compressedOffset]); - ret[i] = compressedBaseToByteLow(compressedBases[i/2 + compressedOffset]); - } - // Last nybble - if (i == length) { - ret[i-1] = compressedBaseToByteHigh(compressedBases[i/2 + compressedOffset]); - } - return ret; - } - - /** - * - * @param base One of =AaCcGgTtNn - * @return nybble-encoded equivalent - */ - private static byte charToCompressedBaseLow(final int base) { - switch (base) { - case '=': - return COMPRESSED_EQUAL_LOW; - case 'a': - case 'A': - return COMPRESSED_A_LOW; - case 'c': - case 'C': - return COMPRESSED_C_LOW; - case 'g': - case 'G': - return COMPRESSED_G_LOW; - case 't': - case 'T': - return COMPRESSED_T_LOW; - case 'n': - case 'N': - case '.': - return COMPRESSED_N_LOW; - default: - throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); - } - } - - private static byte charToCompressedBaseHigh(final int base) { - switch (base) { - case '=': - return COMPRESSED_EQUAL_HIGH; - case 'a': - case 'A': - return COMPRESSED_A_HIGH; - case 'c': - case 'C': - return COMPRESSED_C_HIGH; - case 'g': - case 'G': - return COMPRESSED_G_HIGH; - case 't': - case 'T': - return COMPRESSED_T_HIGH; - case 'n': - case 'N': - case '.': - return COMPRESSED_N_HIGH; - default: - throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); - } - } - - /** - * - * @param base One of COMPRESSED_* - * @return one of ACGTN= - */ - private static byte compressedBaseToByteLow(final int base) { - switch (base & 0xf) { - case COMPRESSED_EQUAL_LOW: - return '='; - case COMPRESSED_A_LOW: - return 'A'; - case COMPRESSED_C_LOW: - return 'C'; - case COMPRESSED_G_LOW: - return 'G'; - case COMPRESSED_T_LOW: - return 'T'; - case COMPRESSED_N_LOW: - return 'N'; - default: - throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); - } - } - - private static byte compressedBaseToByteHigh(final int base) { - switch ((byte)(base & 0xf0)) { - case COMPRESSED_EQUAL_HIGH: - return '='; - case COMPRESSED_A_HIGH: - return 'A'; - case COMPRESSED_C_HIGH: - return 'C'; - case COMPRESSED_G_HIGH: - return 'G'; - case COMPRESSED_T_HIGH: - return 'T'; - case COMPRESSED_N_HIGH: - return 'N'; - default: - throw new IllegalArgumentException("Bad byte passed to charToCompressedBase: " + base); - } - } - - static String bytesToHexString(final byte[] data) { - final char[] chars = new char[2 * data.length]; - for (int i = 0; i < data.length; i++) { - final byte b = data[i]; - chars[2*i] = toHexDigit((b >> 4) & 0xF); - chars[2*i+1] = toHexDigit(b & 0xF); - } - return new String(chars); - } - - static byte[] hexStringToBytes(final String s) throws NumberFormatException { - if (s.length() % 2 != 0) { - throw new NumberFormatException("Hex representation of byte string does not have even number of hex chars: " + s); - } - final byte[] ret = new byte[s.length() / 2]; - for (int i = 0; i < ret.length; ++i) { - ret[i] = (byte) (fromHexDigit(s.charAt(i * 2)) << 4 + fromHexDigit(s.charAt(i * 2 + 1))); - } - return ret; - } - - static String phredToFastq(final byte[] data) { - if (data == null) { - return null; - } - return phredToFastq(data, 0, data.length); - } - - static String phredToFastq(final byte[] buffer, final int offset, final int length) { - final char[] chars = new char[length]; - for (int i = 0; i < length; i++) { - chars[i] = phredToFastq(buffer[offset+i] & 0xFF); - } - return new String(chars); - } - - static char phredToFastq(final int phredScore) { - if (phredScore < 0 || phredScore > 63) { - throw new IllegalArgumentException("Cannot encode phred score: " + phredScore); - } - return (char) (33 + phredScore); - } - - static byte[] fastqToPhred(final String fastq) { - if (fastq == null) { - return null; - } - final int length = fastq.length(); - final byte[] scores = new byte[length]; - for (int i = 0; i < length; i++) { - scores[i] = (byte) fastqToPhred(fastq.charAt(i)); - } - return scores; - } - - static int fastqToPhred(final char ch) { - if (ch < 33 || ch > 126) { - throw new IllegalArgumentException("Invalid fastq character: " + ch); - } - return (ch - 33); - } - - private static char toHexDigit(final int value) { - return (char) ((value < 10) ? ('0' + value) : ('A' + value - 10)); - } - - private static int fromHexDigit(final char c) throws NumberFormatException { - final int ret = Character.digit(c, 16); - if (ret == -1) { - throw new NumberFormatException("Not a valid hex digit: " + c); - } - return ret; - } - - /** - * calculate the bin given an alignment in [beg,end) - * Copied from SAM spec. - */ - static int reg2bin(final int beg, int end) - { - - --end; - - if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); - if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); - if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); - if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); - if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); - return 0; - } -} diff --git a/java/lib/edu/mit/broad/sam/TextCigarCodec.java b/java/lib/edu/mit/broad/sam/TextCigarCodec.java deleted file mode 100755 index a1abc2620..000000000 --- a/java/lib/edu/mit/broad/sam/TextCigarCodec.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - The Broad Institute - SOFTWARE COPYRIGHT NOTICE AGREEMENT - This software and its documentation are copyright 2009 by the - Broad Institute/Massachusetts Institute of Technology. All rights are - reserved. - - This software is supplied without any warranty or guaranteed support - whatsoever. Neither the Broad Institute nor MIT can be responsible for its - use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -/** - * Convert between string and internal CIGAR representations - */ -public class TextCigarCodec -{ - private static final byte ZERO_BYTE = "0".getBytes()[0]; - private static final byte NINE_BYTE = "9".getBytes()[0]; - - private static final TextCigarCodec singleton = new TextCigarCodec(); - - /** - * It is not necssary to get the singleton but it is preferrable to use the same one - * over and over vs. creating a new object for each BAMRecord. - */ - static TextCigarCodec getSingleton() { - return singleton; - } - - - /** - * Convert from interal CIGAR representation to String - */ - String encode(final Cigar cigar) { - if (cigar.numCigarElements() == 0) { - return SAMRecord.NO_ALIGNMENT_CIGAR; - } - final StringBuilder ret = new StringBuilder(); - for (final CigarElement cigarElement : cigar.getCigarElements()) { - ret.append(cigarElement.getLength()); - ret.append(cigarElement.getOperator()); - } - return ret.toString(); - } - - Cigar decode(final String textCigar) { - if (SAMRecord.NO_ALIGNMENT_CIGAR.equals(textCigar)) { - return new Cigar(); - } - final Cigar ret = new Cigar(); - final byte[] cigarBytes = textCigar.getBytes(); - for (int i = 0; i < cigarBytes.length; ++i) { - if (!isDigit(cigarBytes[i])) { - throw new IllegalArgumentException("Malformed CIGAR string: " + textCigar); - } - int length = (cigarBytes[i] - ZERO_BYTE); - for (++i; isDigit(cigarBytes[i]); ++i) { - length = (length * 10) + cigarBytes[i] - ZERO_BYTE; - } - final CigarOperator operator = CigarOperator.characterToEnum(cigarBytes[i]); - ret.add(new CigarElement(length, operator)); - } - return ret; - } - - private boolean isDigit(final byte c) { - return c >= ZERO_BYTE && c <= NINE_BYTE; - } - - - -} - -/******************************************************************/ -/**************************[END OF TextCigarCodec.java]*************************/ -/******************************************************************/ diff --git a/java/lib/edu/mit/broad/sam/TextTagCodec.java b/java/lib/edu/mit/broad/sam/TextTagCodec.java deleted file mode 100644 index 69fd53b1f..000000000 --- a/java/lib/edu/mit/broad/sam/TextTagCodec.java +++ /dev/null @@ -1,96 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam; - -import edu.mit.broad.sam.util.StringUtil; - -import java.util.Map; - -class TextTagCodec { - private static final int NUM_TAG_FIELDS = 3; - - /** - * This is really a local variable of decode(), but allocated here to reduce allocations. - */ - private final String[] fields = new String[NUM_TAG_FIELDS]; - - String encode(final String key, Object value) { - final StringBuilder sb = new StringBuilder(key); - sb.append(':'); - char tagType = BinaryTagCodec.getTagValueType(value); - switch (tagType) { - case 'c': - case 'C': - case 's': - case 'S': - case 'I': - tagType = 'i'; - } - if (tagType == 'H') { - value = SAMUtils.bytesToHexString((byte[])value); - } - sb.append(tagType); - sb.append(':'); - sb.append(value.toString()); - return sb.toString(); - } - - Map.Entry decode(final String tag) { - final int numFields = StringUtil.split(tag, fields, ':'); - if (numFields != TextTagCodec.NUM_TAG_FIELDS) { - throw new SAMFormatException("Not enough fields in tag '" + tag + "'"); - } - final String key = fields[0]; - final String type = fields[1]; - final String stringVal = fields[2]; - final Object val; - if (type.equals("Z")) { - val = stringVal; - } else if (type.equals("A")) { - if (stringVal.length() != 1) { - throw new SAMFormatException("Tag of type A should have a single-character value"); - } - val = stringVal.charAt(0); - } else if (type.equals("i")) { - try { - val = new Integer(stringVal); - } catch (NumberFormatException e) { - throw new SAMFormatException("Tag of type i should have signed decimal value"); - } - } else if (type.equals("f")) { - try { - val = new Float(stringVal); - } catch (NumberFormatException e) { - throw new SAMFormatException("Tag of type f should have single-precision floating point value"); - } - } else if (type.equals("H")) { - try { - val = SAMUtils.hexStringToBytes(stringVal); - } catch (NumberFormatException e) { - throw new SAMFormatException("Tag of type H should have valid hex string with even number of digits"); - } - } else { - throw new SAMFormatException("Unrecognized tag type: " + type); - } - return new Map.Entry() { - public String getKey() { - return key; - } - - public Object getValue() { - return val; - } - - public Object setValue(final Object o) { - throw new UnsupportedOperationException(); - } - }; - } -} diff --git a/java/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java b/java/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java deleted file mode 100644 index 99a3917ff..000000000 --- a/java/lib/edu/mit/broad/sam/apps/AccumulateCoverage.java +++ /dev/null @@ -1,132 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.apps; - -import edu.mit.broad.sam.SAMFileReader; -import edu.mit.broad.sam.SAMLocusIterator; -import edu.mit.broad.sam.SAMFileHeader; - -import java.io.File; -import java.io.IOException; -import java.io.Writer; -import java.io.FileWriter; -import java.util.List; - -public class AccumulateCoverage { - - public static void main(final String[] argv) throws Exception { - if (argv.length != 1) { - System.err.println("ERROR: Incorrect number of arguments"); - usage(); - System.exit(1); - } - final AccumulateCoverage ac = new AccumulateCoverage(argv[0]); - } - - private static void usage() { - System.err.println("USAGE: AccumulateCoverage "); - } - - - - public AccumulateCoverage(final String samFile) throws IOException { - final long startTime = System.currentTimeMillis(); - final Writer writer = new FileWriter("/Users/kcibul/projects/sam/acccov.out"); - - final SAMFileReader samReader = new SAMFileReader(new File(samFile)); - - // ensure the file is sorted -//TODO: is the SAM reader implementation broken? - if (samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { - System.out.println("SAM Files must be coordinate-sorted, this is " + samReader.getFileHeader().getSortOrder()); - System.exit(1); - } - - final SAMLocusIterator sli = new SAMLocusIterator(samReader.iterator()); - - for (final SAMLocusIterator.LocusInfo li : sli) { - - String chrom = li.getChrom().substring(3); - if (chrom.equals("M")) { chrom = "0"; } - if (chrom.equals("X")) { chrom = "23"; } - if (chrom.equals("Y")) { chrom = "24"; } - - final StringBuilder sb = new StringBuilder(); - sb.append(chrom) - .append(":") - .append(li.getPosition()-1) - .append(" ") - .append(li.getBases().size()) - .append("\n"); - - writer.write(sb.toString()); - //System.out.print(sb); - -// // TODO: zero based or 1 based? -// System.out.print(li.chrom + "\t" + (li.position-1) + "\t" + li.bases.size() + "\t"); -// -// // TODO: print and capitalize by strand (like pileup) -// System.out.print(bytesToString(li.bases)); -// System.out.print("\t"); -// System.out.print(phredToFastq(li.qualities)); -// System.out.print("\n"); - } - - - writer.flush(); - writer.close(); - final long elapsed = System.currentTimeMillis() - startTime; - - System.out.println("Completed in " + elapsed + "ms"); - } - - - static String bytesToString(final List data) { - if (data == null || data.size() == 0) { - return null; - } - - final char[] chars = new char[data.size()]; - for (int i = 0; i < data.size(); i++) { - chars[i] = (char) (data.get(i) & 0xFF); - } - return new String(chars); - } - - - static String phredToFastq(final List data) { - final byte[] arrData = new byte[data.size()]; - for(int i=0; i< data.size(); i++) { arrData[i] = data.get(i); } - return phredToFastq(arrData); - } - - static String phredToFastq(final byte[] data) { - if (data == null) { - return null; - } - return phredToFastq(data, 0, data.length); - } - - static String phredToFastq(final byte[] buffer, final int offset, final int length) { - final char[] chars = new char[length]; - for (int i = 0; i < length; i++) { - chars[i] = phredToFastq(buffer[offset+i] & 0xFF); - } - return new String(chars); - } - - static char phredToFastq(final int phredScore) { - if (phredScore < 0 || phredScore > 63) { - throw new IllegalArgumentException("Cannot encode phred score: " + phredScore); - } - return (char) (33 + phredScore); - } - -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/sam/apps/CompareSAMs.java b/java/lib/edu/mit/broad/sam/apps/CompareSAMs.java deleted file mode 100644 index 8b0ca1b57..000000000 --- a/java/lib/edu/mit/broad/sam/apps/CompareSAMs.java +++ /dev/null @@ -1,486 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.apps; - -import edu.mit.broad.sam.*; - -import java.io.File; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class CompareSAMs { - public static void main(final String[] argv) { - if (argv.length != 2) { - System.err.println("ERROR: Incorrect number of arguments"); - usage(); - System.exit(1); - } - final CompareSAMs compareSAMs = new CompareSAMs(argv); - if (!compareSAMs.areEqual()) { - System.exit(1); - } - } - - private static void usage() { - System.err.println("USAGE: CompareSAMS "); - } - - private final String[] samFiles; - private final SAMFileReader[] samReaders = new SAMFileReader[2]; - private boolean sequenceDictionariesDiffer; - private int mappingsMatch = 0; - private int unmappedBoth = 0; - private int unmappedLeft = 0; - private int unmappedRight = 0; - private int mappingsDiffer = 0; - private int missingLeft = 0; - private int missingRight = 0; - private boolean areEqual; - - public CompareSAMs(final String[] samFiles) { - this.samFiles = samFiles; - for (int i = 0; i < samFiles.length; ++i) { - samReaders[i] = new SAMFileReader(new File(samFiles[i])); - } - areEqual = compareHeaders(); - areEqual = compareAlignments() && areEqual; - printReport(); - if (!areEqual) { - System.out.println("SAM files differ."); - } else { - System.out.println("SAM files match."); - } - } - - private void printReport() { - System.out.println("Match\t" + mappingsMatch); - System.out.println("Differ\t" + mappingsDiffer); - System.out.println("Unmapped_both\t" + unmappedBoth); - System.out.println("Unmapped_left\t" + unmappedLeft); - System.out.println("Unmapped_right\t" + unmappedRight); - System.out.println("Missing_left\t" + missingLeft); - System.out.println("Missing_right\t" + missingRight); - } - - private boolean compareAlignments() { - if (!compareValues(samReaders[0].getFileHeader().getSortOrder(), samReaders[1].getFileHeader().getSortOrder(), - "Sort Order")) { - System.out.println("Cannot compare alignments if sort orders differ."); - return false; - } - switch (samReaders[0].getFileHeader().getSortOrder()) { - case coordinate: - if (sequenceDictionariesDiffer) { - System.out.println("Cannot compare coordinate-sorted SAM files because sequence dictionaries differ."); - return false; - } - return compareCoordinateSortedAlignments(); - case queryname: - return compareQueryNameSortedAlignments(); - case unsorted: - return compareUnsortedAlignments(); - default: - // unreachable - assert(false); - return false; - } - } - - - private boolean compareCoordinateSortedAlignments() { - final NotPrimarySkippingIterator itLeft = - new NotPrimarySkippingIterator(samReaders[0].iterator()); - final NotPrimarySkippingIterator itRight = - new NotPrimarySkippingIterator(samReaders[1].iterator()); - - // Save any reads which haven't been matched during in-order scan. - final Map leftUnmatched = new HashMap(); - final Map rightUnmatched = new HashMap(); - - boolean ret = true; - - while (itLeft.hasCurrent()) { - if (!itRight.hasCurrent()) { - // Exhausted right side. See if any of the remaining left reads match - // any of the saved right reads. - for( ; itLeft.hasCurrent(); itLeft.advance()) { - final SAMRecord left = itLeft.getCurrent(); - final SAMRecord right = rightUnmatched.remove(left.getReadName()); - if (right == null) { - ++missingRight; - } else { - tallyAlignmentRecords(left, right); - } - } - break; - } - // Don't assume stability of order beyond the coordinate. Therefore grab all the - // reads from the left that has the same coordinate. - final SAMRecord left = itLeft.getCurrent(); - final Map leftCurrentCoordinate = new HashMap(); - leftCurrentCoordinate.put(left.getReadName(), left); - while (itLeft.advance()) { - final SAMRecord nextLeft = itLeft.getCurrent(); - if (compareAlignmentCoordinates(left, nextLeft) == 0) { - leftCurrentCoordinate.put(nextLeft.getReadName(), nextLeft); - } else { - break; - } - } - // Advance the right iterator until it is >= the left reads that have just been grabbed - while (itRight.hasCurrent() && compareAlignmentCoordinates(left, itRight.getCurrent()) > 0) { - final SAMRecord right = itRight.getCurrent(); - rightUnmatched.put(right.getReadName(), right); - itRight.advance(); - } - // For each right read that has the same coordinate as the current left reads, - // see if there is a matching left read. If so, process and discard. If not, - // save the right read for later. - for (;itRight.hasCurrent() && compareAlignmentCoordinates(left, itRight.getCurrent()) == 0; itRight.advance()) { - final SAMRecord right = itRight.getCurrent(); - final SAMRecord matchingLeft = leftCurrentCoordinate.remove(right.getReadName()); - if (matchingLeft != null) { - ret = tallyAlignmentRecords(matchingLeft, right) && ret; - } else { - rightUnmatched.put(right.getReadName(), right); - } - } - - // Anything left in leftCurrentCoordinate has not been matched - for (final SAMRecord samRecord : leftCurrentCoordinate.values()) { - leftUnmatched.put(samRecord.getReadName(), samRecord); - } - } - // The left iterator has been exhausted. See if any of the remaining right reads - // match any of the saved left reads. - for( ; itRight.hasCurrent(); itRight.advance()) { - final SAMRecord right = itRight.getCurrent(); - final SAMRecord left = leftUnmatched.remove(right.getReadName()); - if (left != null) { - tallyAlignmentRecords(left, right); - } else { - ++missingLeft; - } - } - - // Look up reads that were unmatched from left, and see if they are in rightUnmatched. - // If found, remove from rightUnmatched and tally. - for (final Map.Entry leftEntry : leftUnmatched.entrySet()) { - final String readName = leftEntry.getKey(); - final SAMRecord left = leftEntry.getValue(); - final SAMRecord right = rightUnmatched.remove(readName); - if (right == null) { - ++missingRight; - continue; - } - tallyAlignmentRecords(left, right); - } - - // Any elements remaining in rightUnmatched are guaranteed not to be in leftUnmatched. - missingLeft += rightUnmatched.size(); - - if (ret) { - if (missingLeft > 0 || missingRight > 0 || mappingsDiffer > 0 || unmappedLeft > 0 || unmappedRight > 0) { - ret = false; - } - } - return ret; - } - - private int compareAlignmentCoordinates(final SAMRecord left, final SAMRecord right) { - final String leftReferenceName = left.getReferenceName(); - final String rightReferenceName = right.getReferenceName(); - if (leftReferenceName == null && rightReferenceName == null) { - return 0; - } else if (leftReferenceName == null) { - return 1; - } else if (rightReferenceName == null) { - return -1; - } - final int leftReferenceIndex = samReaders[0].getFileHeader().getSequenceIndex(leftReferenceName); - final int rightReferenceIndex = samReaders[0].getFileHeader().getSequenceIndex(rightReferenceName); - assert(leftReferenceIndex >= 0); - assert(rightReferenceIndex >= 0); - if (leftReferenceIndex != rightReferenceIndex) { - return leftReferenceIndex - rightReferenceIndex; - } - return left.getAlignmentStart() - right.getAlignmentStart(); - } - - private boolean compareQueryNameSortedAlignments() { - final NotPrimarySkippingIterator it1 = new NotPrimarySkippingIterator(samReaders[0].iterator()); - final NotPrimarySkippingIterator it2 = new NotPrimarySkippingIterator(samReaders[1].iterator()); - - boolean ret = true; - while (it1.hasCurrent()) { - if (!it2.hasCurrent()) { - missingRight += countRemaining(it1); - return false; - } - final int cmp = it1.getCurrent().getReadName().compareTo(it2.getCurrent().getReadName()); - if (cmp < 0) { - ++missingRight; - it1.advance(); - ret = false; - } else if (cmp > 0) { - ++missingLeft; - it2.advance(); - ret = false; - } else { - if (!tallyAlignmentRecords(it1.getCurrent(), it2.getCurrent())) { - ret = false; - } - it1.advance(); - it2.advance(); - } - } - if (it2.hasCurrent()) { - missingLeft += countRemaining(it2); - return false; - } - return ret; - } - - private boolean compareUnsortedAlignments() { - final NotPrimarySkippingIterator it1 = new NotPrimarySkippingIterator(samReaders[0].iterator()); - final NotPrimarySkippingIterator it2 = new NotPrimarySkippingIterator(samReaders[1].iterator()); - boolean ret = true; - for (; it1.hasCurrent(); it1.advance(), it2.advance()) { - if (!it2.hasCurrent()) { - missingRight += countRemaining(it1); - return false; - } - final SAMRecord s1 = it1.getCurrent(); - final SAMRecord s2 = it2.getCurrent(); - if (!compareValues(s1.getReadName(), s2.getReadName(), "Read names")) { - System.out.println("Read names cease agreeing in unsorted SAM files . Comparison aborting."); - } - ret = tallyAlignmentRecords(s1, s2) && ret; - } - - if (it2.hasCurrent()) { - missingLeft += countRemaining(it2); - return false; - } - return ret; - } - - private int countRemaining(final NotPrimarySkippingIterator it) { - int i; - for (i = 0; it.hasCurrent(); ++i) { - it.advance(); - } - return i; - } - - private boolean tallyAlignmentRecords(final SAMRecord s1, final SAMRecord s2) { - assert (s1.getReadName().equals(s2.getReadName())); - if (s1.getReadUnmappedFlag() && s2.getReadUnmappedFlag()) { - ++unmappedBoth; - return true; - } - if (s1.getReadUnmappedFlag()) { - ++unmappedLeft; - return false; - } - if (s2.getReadUnmappedFlag()) { - ++unmappedRight; - return false; - } - final boolean ret = (s1.getReferenceName().equals(s2.getReferenceName()) && - s1.getAlignmentStart() == s2.getAlignmentStart() && - s1.getReadNegativeStrandFlag() == s1.getReadNegativeStrandFlag()); - if (!ret) { - ++mappingsDiffer; - } else { - ++mappingsMatch; - } - return ret; - } - - - private boolean compareHeaders() { - final SAMFileHeader h1 = samReaders[0].getFileHeader(); - final SAMFileHeader h2 = samReaders[1].getFileHeader(); - boolean ret = compareValues(h1.getVersion(), h2.getVersion(), "File format version"); - ret = compareValues(h1.getCreator(), h2.getCreator(), "File creator") && ret; - ret = compareValues(h1.getAttribute("SO"), h2.getAttribute("SO"), "Sort order") && ret; - if (!compareSequenceDictionaries(h1, h2)) { - ret = false; - sequenceDictionariesDiffer = true; - } - ret = compareReadGroups(h1, h2) && ret; - ret = compareProgramRecords(h1, h2) && ret; - return ret; - } - - private boolean compareProgramRecords(final SAMFileHeader h1, final SAMFileHeader h2) { - final List l1 = h1.getProgramRecords(); - final List l2 = h2.getProgramRecords(); - if (!compareValues(l1.size(), l2.size(), "Number of read groups")) { - return false; - } - boolean ret = true; - for (int i = 0; i < l1.size(); ++i) { - ret = compareProgramRecord(l1.get(i), l2.get(i)) && ret; - } - return ret; - } - - private boolean compareProgramRecord(final SAMProgramRecord programRecord1, final SAMProgramRecord programRecord2) { - if (programRecord1 == null && programRecord2 == null) { - return true; - } - if (programRecord1 == null) { - reportDifference("null", programRecord2.getProgramGroupId(), "Program Record"); - return false; - } - if (programRecord2 == null) { - reportDifference(programRecord1.getProgramGroupId(), "null", "Program Record"); - return false; - } - boolean ret = compareValues(programRecord1.getProgramGroupId(), programRecord2.getProgramGroupId(), - "Program Name"); - final String[] attributes = {"VN", "CL"}; - for (final String attribute: attributes) { - ret = compareValues(programRecord1.getAttribute(attribute), programRecord2.getAttribute(attribute), - attribute + " Program Record attribute") && ret; - } - return ret; - } - - private boolean compareReadGroups(final SAMFileHeader h1, final SAMFileHeader h2) { - final List l1 = h1.getReadGroups(); - final List l2 = h2.getReadGroups(); - if (!compareValues(l1.size(), l2.size(), "Number of read groups")) { - return false; - } - boolean ret = true; - for (int i = 0; i < l1.size(); ++i) { - ret = compareReadGroup(l1.get(i), l2.get(i)) && ret; - } - return ret; - } - - private boolean compareReadGroup(final SAMReadGroupRecord samReadGroupRecord1, final SAMReadGroupRecord samReadGroupRecord2) { - boolean ret = compareValues(samReadGroupRecord1.getReadGroupId(), samReadGroupRecord2.getReadGroupId(), - "Read Group ID"); - ret = compareValues(samReadGroupRecord1.getSample(), samReadGroupRecord2.getSample(), - "Sample for read group " + samReadGroupRecord1.getReadGroupId()) && ret; - ret = compareValues(samReadGroupRecord1.getLibrary(), samReadGroupRecord2.getLibrary(), - "Library for read group " + samReadGroupRecord1.getReadGroupId()) && ret; - final String[] attributes = {"DS", "PU", "PI", "CN", "DT", "PL"}; - for (final String attribute : attributes) { - ret = compareValues(samReadGroupRecord1.getAttribute(attribute), samReadGroupRecord2.getAttribute(attribute), - attribute + " for read group " + samReadGroupRecord1.getReadGroupId()) && ret; - } - return ret; - } - - private boolean compareSequenceDictionaries(final SAMFileHeader h1, final SAMFileHeader h2) { - final List s1 = h1.getSequences(); - final List s2 = h2.getSequences(); - if (s1.size() != s2.size()) { - reportDifference(s1.size(), s2.size(), "Length of sequence dictionaries"); - return false; - } - boolean ret = true; - for (int i = 0; i < s1.size(); ++i) { - ret = compareSequenceRecord(s1.get(i), s2.get(i), i+1) && ret; - } - return ret; - } - - private boolean compareSequenceRecord(final SAMSequenceRecord sequenceRecord1, final SAMSequenceRecord sequenceRecord2, final int which) { - if (!sequenceRecord1.getSequenceName().equals(sequenceRecord2.getSequenceName())) { - reportDifference(sequenceRecord1.getSequenceName(), sequenceRecord2.getSequenceName(), - "Name of sequence record " + which); - return false; - } - boolean ret = compareValues(sequenceRecord1.getSequenceLength(), sequenceRecord2.getSequenceLength(), "Length of sequence " + - sequenceRecord1.getSequenceName()); - ret = compareValues(sequenceRecord1.getSpecies(), sequenceRecord2.getSpecies(), "Species of sequence " + - sequenceRecord1.getSequenceName()) && ret; - ret = compareValues(sequenceRecord1.getAssembly(), sequenceRecord2.getAssembly(), "Assembly of sequence " + - sequenceRecord1.getSequenceName()) && ret; - ret = compareValues(sequenceRecord1.getAttribute("M5"), sequenceRecord2.getAttribute("M5"), "MD5 of sequence " + - sequenceRecord1.getSequenceName()) && ret; - ret = compareValues(sequenceRecord1.getAttribute("UR"), sequenceRecord2.getAttribute("UR"), "URI of sequence " + - sequenceRecord1.getSequenceName()) && ret; - return ret; - } - - private boolean compareValues(final T v1, final T v2, final String label) { - if (v1 == null) { - if (v2 == null) { - return true; - } - reportDifference(v1, v2, label); - return false; - } - if (v2 == null) { - reportDifference(v1, v2, label); - return false; - } - if (!v1.equals(v2)) { - reportDifference(v1, v2, label); - return false; - } - return true; - } - - private void reportDifference(final String s1, final String s2, final String label) { - System.out.println(label + " differs."); - System.out.println(samFiles[0] + ": " + s1); - System.out.println(samFiles[1] + ": " + s2); - } - private void reportDifference(Object o1, Object o2, final String label) { - if (o1 == null) { - o1 = "null"; - } - if (o2 == null) { - o2 = "null"; - } - reportDifference(o1.toString(), o2.toString(), label); - } - - public int getMappingsMatch() { - return mappingsMatch; - } - - public int getUnmappedBoth() { - return unmappedBoth; - } - - public int getUnmappedLeft() { - return unmappedLeft; - } - - public int getUnmappedRight() { - return unmappedRight; - } - - public int getMappingsDiffer() { - return mappingsDiffer; - } - - public int getMissingLeft() { - return missingLeft; - } - - public int getMissingRight() { - return missingRight; - } - - public boolean areEqual() { - return areEqual; - } -} diff --git a/java/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java deleted file mode 100644 index 9265d539c..000000000 --- a/java/lib/edu/mit/broad/sam/apps/allelecaller/AbstractAlleleCaller.java +++ /dev/null @@ -1,166 +0,0 @@ -package edu.mit.broad.sam.apps.allelecaller; - -import edu.mit.broad.sam.SAMLocusIterator; -import edu.mit.broad.arachne.FastbReader; - -import java.io.IOException; -import java.io.BufferedWriter; -import java.io.File; -import java.util.SortedSet; -import java.util.List; - -/** - * Base class for AlleleCallers. Handles efficient access to the reference, output of data to a - * standard file format, and application of priors - */ -public abstract class AbstractAlleleCaller { - // writer for output - private final BufferedWriter writer; - - // for providing access to reference data - // TODO: replace with standard mechanism when defined/implemented - private final FastbReader fastbReader; - private String cachedChromName; - private String cachedChrom; - - public AbstractAlleleCaller(final File fastbReference, final BufferedWriter writer) throws IOException { - this.writer = writer; - this.fastbReader = new FastbReader(fastbReference); - } - - - /** - * emit allele calls to the writer specified in the constructor - * - * @param li Locus to call - */ - public void callAlleles(final SAMLocusIterator.LocusInfo li) throws IOException { - - // TODO: replace with standard mechanism when defined/implemented (making use of SAM Header) - // make sure we have access to reference chrom information - if (!li.getChrom().equals(cachedChromName)) { - final int contig = translateChromToContig(li.getChrom()); - cachedChrom = null; // CRITICAL -- to allow for GC - cachedChrom = fastbReader.readSequence(contig); - cachedChromName = li.getChrom(); - } - - final char ref = cachedChrom.charAt(li.getPosition() - 1); - - - // delegate to the specific implementation - final SortedSet likelihoods = call(ref, li.getBasesAsString(), li.getQualities()); - - - final GenotypeTheory bestTheory = likelihoods.first(); - GenotypeTheory nextBestTheory = null; - GenotypeTheory refTheory = null; - final String refString = new String(new char[]{ref,ref}); - final DiploidGenotype refGenotype = DiploidGenotype.valueOf(refString); - - - final StringBuilder theoryString = new StringBuilder(); - int k=0; - for(final GenotypeTheory t : likelihoods) { - if (k == 1) { nextBestTheory = t; } - if (t.getGenotype() == refGenotype) { refTheory = t; } - - theoryString.append(t.getGenotype()) - .append(":") - .append(String.format("%.2f",t.getLikelihood())) - .append(" "); - k++; - } - - final double btnb = bestTheory.getLikelihood() - nextBestTheory.getLikelihood(); - final double btr = bestTheory.getLikelihood() - refTheory.getLikelihood(); - - final DiploidGenotype gt = likelihoods.first().getGenotype(); - - final String type; - if (!gt.isHet() && gt.getAllele1() == ref) { - type = "homozygous"; - } else if (!gt.isHet() && gt.getAllele1() != ref) { - type = "homozygous-SNP"; - } else { - type = "heterozygous-SNP"; - } - - final String bases = li.getBasesAsString(); - int a = 0,c = 0,g = 0,t = 0; - for(int i=0; i call(char ref, String bases, List quals); - - - /** - * Apply a general population-based prior to the likelihood: - *
      - *
    • ref is .999
    • - *
    • het is 10^-3
    • - *
    • homozygous, non-reference is 10^-5
    • - * - * @param ref reference allele - * @param allele1 first allele of the genotype - * @param allele2 second allele of the genotype - * @return prior, given the reference and genotype alleles - */ - protected double getPrior(final char ref, final DiploidGenotype gt) { - final double prior; - if (gt.isHom() && gt.getAllele1() == ref) { - prior = 0.999; // reference - } else { - if (gt.getAllele1() != ref && gt.getAllele2() != ref) { - prior = 0.00001; // neither base is reference - } else { - prior = 0.001; // het, one base is reference - } - } - return prior; - } - - // -------------------------------------------------------------------------------------------- - // Helper methods below this point... - // -------------------------------------------------------------------------------------------- - - - private final String[] chroms = new String[]{"chrM","chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY","chr1_random","chr2_random","chr3_random","chr4_random","chr5_random","chr6_random","chr7_random","chr8_random","chr9_random","chr10_random","chr11_random","chr13_random","chr15_random","chr16_random","chr17_random","chr18_random","chr19_random","chr21_random","chr22_random","chrX_random"}; - private int translateChromToContig(final String chrom) { - for(int i=0; i "); - } - - - private SAMFileReader getSamReader(final File samFile) { - final SAMFileReader samReader = new SAMFileReader(samFile); - - // ensure the file is sorted - if (samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) { - System.out.println("SAM Files must be coordinate-sorted, this is " + samReader.getFileHeader().getSortOrder()); - System.exit(1); - } - - return samReader; - } - -} \ No newline at end of file diff --git a/java/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java deleted file mode 100644 index d259a6075..000000000 --- a/java/lib/edu/mit/broad/sam/apps/allelecaller/DiploidGenotype.java +++ /dev/null @@ -1,27 +0,0 @@ -package edu.mit.broad.sam.apps.allelecaller; - -public enum DiploidGenotype { - AA('A','A'), - AC('A','C'), - AG('A','G'), - AT('A','T'), - CC('C','C'), - CG('C','G'), - CT('C','T'), - GG('G','G'), - GT('G','T'), - TT('T','T'); - - private final char allele1; - private final char allele2; - - private DiploidGenotype(final char allele1, final char allele2) { - this.allele1 = allele1; - this.allele2 = allele2; - } - - public char getAllele1() { return allele1; } - public char getAllele2() { return allele2; } - public boolean isHet() { return this.allele1 != this.allele2; } - public boolean isHom() { return this.allele1 == this.allele2; } -} diff --git a/java/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java deleted file mode 100644 index 7a77d4524..000000000 --- a/java/lib/edu/mit/broad/sam/apps/allelecaller/FlatQualityAlleleCaller.java +++ /dev/null @@ -1,74 +0,0 @@ -package edu.mit.broad.sam.apps.allelecaller; - -import java.io.IOException; -import java.io.BufferedWriter; -import java.io.File; -import java.util.*; -import static java.lang.Math.*; - - -/** - * Bayesian-based allele caller using flat qualities and a 1e-3 error rate, based on CRD algorithm - */ -public class FlatQualityAlleleCaller extends AbstractAlleleCaller { - - public FlatQualityAlleleCaller(final File fastbReference, final BufferedWriter writer) throws IOException { - super(fastbReference, writer); - } - - - protected SortedSet call(final char ref, final String bases, final List quals) { - final float eps = 1e-3f; - - // count up the base by nucleotide and put them into a map - final int depth = bases.length(); - int a = 0,c = 0,g = 0,t = 0; - for(int i=0; i< bases.length(); i++) { - if (bases.charAt(i) == 'A') { a++; } - else if (bases.charAt(i) == 'C') { c++; } - else if (bases.charAt(i) == 'G') { g++; } - else if (bases.charAt(i) == 'T') { t++; } - else { throw new RuntimeException("Unknown Base " + bases.charAt(i)); } - } - - final Map counts = new HashMap(); - counts.put('A', a); - counts.put('C', c); - counts.put('G', g); - counts.put('T', t); - - - // for each of the 10 theories, calculate the likelihood - final SortedSet results = new TreeSet(); - for(final DiploidGenotype theory : DiploidGenotype.values()) { - final double likelihood; - final char allele1 = theory.getAllele1(); - final char allele2 = theory.getAllele2(); - - if (!theory.isHet()) { - likelihood = log10(1-eps)*counts.get(allele1) + log10(eps)*(depth - counts.get(allele1)); - } else { - final int major_allele_counts; - final int minor_allele_counts; - if (counts.get(allele1) > counts.get(allele2)) { - major_allele_counts = counts.get(allele1); - minor_allele_counts = counts.get(allele2); - } else { - major_allele_counts = counts.get(allele2); - minor_allele_counts = counts.get(allele1); - } - - likelihood = log10(0.5 - (eps/2.0) )*major_allele_counts + - log10(0.5 - (eps/2.0) )*minor_allele_counts + - log10(eps)*(depth - major_allele_counts - minor_allele_counts); - } - - final double prior = getPrior(ref, theory); - results.add(new GenotypeTheory(theory, likelihood + log10(prior))); - } - - - return results; - - } -} diff --git a/java/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java deleted file mode 100644 index 709e1c439..000000000 --- a/java/lib/edu/mit/broad/sam/apps/allelecaller/GenotypeTheory.java +++ /dev/null @@ -1,46 +0,0 @@ -package edu.mit.broad.sam.apps.allelecaller; - -/** - * Datastructure to hold a single genotype along with a likelihood. - */ -public class GenotypeTheory implements Comparable { - private DiploidGenotype genotype; - private double likelihood; - - public GenotypeTheory(final DiploidGenotype genotype, final double likelihood) { - this.genotype = genotype; - this.likelihood = likelihood; - } - - public DiploidGenotype getGenotype() { - return genotype; - } - - public void setGenotype(final DiploidGenotype genotype) { - this.genotype = genotype; - } - - public double getLikelihood() { - return likelihood; - } - - public void setLikelihood(final double likelihood) { - this.likelihood = likelihood; - } - - /** - * Genotype Theories are sorted first by descending likelihood (ie - * the GenotypeTheory with biggest likelihood comes first). Ties are - * broken by lexical sorting of the genotypes themselves - * - */ - public int compareTo(final GenotypeTheory other) { - if (this.getLikelihood() == other.getLikelihood()) { - return this.getGenotype().compareTo(other.getGenotype()); - } else if (this.getLikelihood() > other.getLikelihood()) { - return -1; - } else { - return 1; - } - } -} diff --git a/java/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java b/java/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java deleted file mode 100644 index 23b310bd2..000000000 --- a/java/lib/edu/mit/broad/sam/apps/allelecaller/QualityScoreAlleleCaller.java +++ /dev/null @@ -1,80 +0,0 @@ -package edu.mit.broad.sam.apps.allelecaller; - -import java.util.*; -import static java.lang.Math.log10; -import static java.lang.Math.pow; -import java.io.BufferedWriter; -import java.io.IOException; -import java.io.File; - -/** - * Bayesian-based allele caller using quality scores, based on CRD algorithm - */ -public class QualityScoreAlleleCaller extends AbstractAlleleCaller { - - public QualityScoreAlleleCaller(final File fastbReference, final BufferedWriter writer) throws IOException { - super(fastbReference, writer); - } - - protected SortedSet call(final char ref, final String bases, final List quals) { - - // for each of the 10 theories, calculate the likelihood using quality scores - final SortedSet results = new TreeSet(); - for(final DiploidGenotype theory : DiploidGenotype.values()) { - double likelihood = 0; - - for(int i=0; i 0) - { - ++lineNumber; - return StringUtil.bytesToString(lineBuffer, 0, linePosition); - } else - { - return null; - } - } - } - - - final byte b = buffer[nextChar++]; - if (b == LINEFEED || b == CARRIAGE_RETURN) - { - - if (includeTerminators) - { - lineBuffer[linePosition++] = b; - if (b == CARRIAGE_RETURN && peek() == LINEFEED) - { - lineBuffer[linePosition++] = b; - nextChar++; // <= to account for the '\n' we just ate - } - } - else { - if (b == CARRIAGE_RETURN && peek() == LINEFEED) - { - nextChar++; // <= skip the trailing \n in case of \r\n termination - } - - } - ++lineNumber; - return StringUtil.bytesToString(lineBuffer, 0, linePosition); - } else - { - // Expand line buffer size if neccessary. Reservce at least 2 characters - // for potential line-terminators in return string - - if (linePosition > (lineBuffer.length - 3)) - { - final byte[] temp = new byte[lineBuffer.length + 100]; - System.arraycopy(lineBuffer, 0, temp, 0, lineBuffer.length); - lineBuffer = temp; - } - - lineBuffer[linePosition++] = b; - } - } - } - - public int getLineNumber() { - return lineNumber; - } - - /** - * Peek ahead one character, filling from the underlying stream if neccessary. - * - * @return - * @throws java.io.IOException - */ - private byte peek(){ - // Refill buffer if neccessary - if (nextChar == nChars) - { - fill(); - if (nextChar == nChars) - { - // eof reached. - return 0; - } - } - return buffer[nextChar]; - - } - - private void fill() { - try { - nChars = is.read(buffer); - nextChar = 0; - } catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - public void close() { - try { - is.close(); - } catch (IOException e) { - // Ignore exception - } - } -} - diff --git a/java/lib/edu/mit/broad/sam/util/AsciiWriter.java b/java/lib/edu/mit/broad/sam/util/AsciiWriter.java deleted file mode 100644 index 8395cf84d..000000000 --- a/java/lib/edu/mit/broad/sam/util/AsciiWriter.java +++ /dev/null @@ -1,55 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -import java.io.IOException; -import java.io.OutputStream; -import java.io.Writer; - -/** - * Fast (I hope) Writer that converts char to byte merely by casting, rather than charset conversion. - */ -public class AsciiWriter extends Writer { - - private final OutputStream os; - // Buffer size has not been tuned. - private final byte[] buffer = new byte[10000]; - private int numBytes; - - public AsciiWriter(final OutputStream os) { - this.os = os; - numBytes = 0; - } - - public void close() throws IOException { - flush(); - os.close(); - } - - public void flush() throws IOException { - os.write(buffer, 0, numBytes); - numBytes = 0; - os.flush(); - } - - public void write(final char[] chars, int offset, int length) throws IOException { - while (length > 0) { - final int charsToConvert = Math.min(length, buffer.length - numBytes); - StringUtil.charsToBytes(chars, offset, charsToConvert, buffer, numBytes); - numBytes += charsToConvert; - offset += charsToConvert; - length -= charsToConvert; - if (numBytes == buffer.length) { - os.write(buffer, 0, numBytes); - numBytes = 0; - } - } - } -} diff --git a/java/lib/edu/mit/broad/sam/util/BinaryCodec.java b/java/lib/edu/mit/broad/sam/util/BinaryCodec.java deleted file mode 100644 index 18191a257..000000000 --- a/java/lib/edu/mit/broad/sam/util/BinaryCodec.java +++ /dev/null @@ -1,478 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.ByteOrder; - -/** - * @author Dave Tefft - */ -public class BinaryCodec { - - //Outstream to write to - private OutputStream outputStream; - //If a file or filename was given it will be stored here - private String outputFileName; - - //Input stream to read from - private InputStream inputStream; - //If a file or filename was give to read from it will be stored here - private String inputFileName; - - /* - Mode that the BinaryCodec is in. It is either writing to a binary file or reading from. - This is set to true if it is writing to a binary file - Right now we don't support reading and writing to the same file with the same BinaryCodec instance - */ - private boolean isWriting; - - private ByteBuffer byteBuffer; - - //Byte order used for the Picard project - private static final ByteOrder LITTLE_ENDIAN = ByteOrder.LITTLE_ENDIAN; - private static final byte NULL_BYTE[] = {0}; - - private static final long MAX_UBYTE = (Byte.MAX_VALUE + 1) * 2; - private static final long MAX_USHORT = (Short.MAX_VALUE + 1) * 2; - private static final long MAX_UINT = ((long)Integer.MAX_VALUE + 1) * 2; - - // We never serialize more than this much at a time. - private static final int MAX_BYTE_BUFFER = 8; - - ////////////////////////////////////////////////// - // Constructors // - ////////////////////////////////////////////////// - - /** - * Constructs BinaryCodec from a file and set it's mode to writing or not - * - * @param file file to be written to or read from - * @param writing whether the file is being written to - */ - public BinaryCodec(final File file, final boolean writing) { - try { - this.isWriting = writing; - if (this.isWriting) { - this.outputStream = new FileOutputStream(file); - this.outputFileName = file.getName(); - } else { - this.inputStream = new FileInputStream(file); - this.inputFileName = file.getName(); - } - } catch (FileNotFoundException e) { - throw new RuntimeIOException("File not found: " + file, e); - } - initByteBuffer(); - } - - /** - * Constructs BinaryCodec from a file name and set it's mode to writing or not - * - * @param fileName name of the file to be written to or read from - * @param writing writing whether the file is being written to - */ - public BinaryCodec(final String fileName, final boolean writing) { - this(new File(fileName), writing); - } - - /** - * Constructs BinaryCodec from an output stream - * - * @param outputStream Stream to write to, since it's an output stream we know that isWriting - * should be set to true - */ - public BinaryCodec(final OutputStream outputStream) { - isWriting = true; - this.outputStream = outputStream; - initByteBuffer(); - } - - /** - * Constructs BinaryCodec from an input stream - * - * @param inputStream Stream to read from, since we are reading isWriting is set to false - */ - public BinaryCodec(final InputStream inputStream) { - isWriting = false; - this.inputStream = inputStream; - initByteBuffer(); - } - - /** - * Shared among ctors - */ - private void initByteBuffer() { - byteBuffer = ByteBuffer.allocate(MAX_BYTE_BUFFER); - byteBuffer.order(LITTLE_ENDIAN); - } - - ////////////////////////////////////////////////// - // Writing methods // - ////////////////////////////////////////////////// - - - /** - * Write whatever has been put into the byte buffer - * @param numBytes -- how much to write. Note that in case of writing an unsigned value, - * more bytes were put into the ByteBuffer than will get written out. - */ - private void writeByteBuffer(final int numBytes) { - assert(numBytes <= byteBuffer.limit()); - writeBytes(byteBuffer.array(), 0, numBytes); - } - - /** - * Writes a byte to the output buffer - * - * @param bite byte array to write - */ - public void writeByte(final byte bite) { - byteBuffer.clear(); - byteBuffer.put(bite); - writeByteBuffer(1); - } - - public void writeByte(final int b) { - writeByte((byte)b); - } - - /** - * Writes a byte array to the output buffer - * - * @param bytes byte array to write - */ - public void writeBytes(final byte[] bytes) { - writeBytes(bytes, 0, bytes.length); - } - - public void writeBytes(final byte[] bytes, final int startOffset, final int numBytes) { - if (!isWriting) { - throw new IllegalStateException("Calling write method on BinaryCodec open for read."); - } - try { - outputStream.write(bytes, startOffset, numBytes); - } catch (IOException e) { - throw new RuntimeIOException(constructErrorMessage("Write error"), e); - } - } - - /** - * Write an int to the output stream - * - * @param value int to write - */ - public void writeInt(final int value) { - byteBuffer.clear(); - byteBuffer.putInt(value); - writeByteBuffer(4); - } - - /** - * Write a double to the output stream - * - * @param value double to write - */ - public void writeDouble(final double value) { - byteBuffer.clear(); - byteBuffer.putDouble(value); - writeByteBuffer(8); - } - - /** - * Write a long to the output stream - * - * @param value long to write - */ - public void writeLong(final long value) { - byteBuffer.clear(); - byteBuffer.putLong(value); - writeByteBuffer(8); - } - - - public void writeShort(final short value) { - byteBuffer.clear(); - byteBuffer.putShort(value); - writeByteBuffer(2); - } - - /** - * Write a float to the output stream - * - * @param value float to write - */ - public void writeFloat(final float value) { - byteBuffer.clear(); - byteBuffer.putFloat(value); - writeByteBuffer(4); - } - - /** - * Writes a string to the buffer - * - * @param value string to write to buffer - * @param writeLength prefix the string with the length as an int - * @param appendNull add a null byte to the end of the string - */ - public void writeString(final String value, final boolean writeLength, final boolean appendNull) { - if (writeLength) { - int lengthToWrite = value.length(); - if (appendNull) lengthToWrite++; - writeInt(lengthToWrite); - } - - //Actually writes the string to a buffer - writeString(value); - - if (appendNull) writeBytes(NULL_BYTE); - - } - - - /** - * Write a string to the buffer - * - * @param value string to write - */ - private void writeString(final String value) { - writeBytes(StringUtil.stringToBytes(value)); - } - - // NOTE: The unsigned methods all have little-endianness built into them. - public void writeUByte(final short val) { - if (val < 0) { - throw new IllegalArgumentException("Negative value (" + val + ") passed to unsigned writing method."); - } - if (val > MAX_UBYTE) { - throw new IllegalArgumentException("Value (" + val + ") to large to be written as ubyte."); - } - byteBuffer.clear(); - byteBuffer.putShort(val); - writeByteBuffer(1); - } - - public void writeUShort(final int val) { - if (val < 0) { - throw new IllegalArgumentException("Negative value (" + val + ") passed to unsigned writing method."); - } - if (val > MAX_USHORT) { - throw new IllegalArgumentException("Value (" + val + ") to large to be written as ushort."); - } - byteBuffer.clear(); - byteBuffer.putInt(val); - writeByteBuffer(2); - } - - public void writeUInt(final long val) { - if (val < 0) { - throw new IllegalArgumentException("Negative value (" + val + ") passed to unsigned writing method."); - } - if (val > MAX_UINT) { - throw new IllegalArgumentException("Value (" + val + ") to large to be written as uint."); - } - byteBuffer.clear(); - byteBuffer.putLong(val); - writeByteBuffer(4); - } - - ////////////////////////////////////////////////// - // Reading methods // - ////////////////////////////////////////////////// - - /** - * Read a byte array off the input stream - * - * @return number of bytes read - */ - public void readBytes(final byte[] buffer) { - readBytes(buffer, 0, buffer.length); - } - - public void readBytes(final byte[] buffer, final int offset, final int length) { - final int numRead = readBytesOrFewer(buffer, offset, length); - if (numRead < length) { - throw new RuntimeEOFException(constructErrorMessage("Premature EOF")); - } - } - - public int readBytesOrFewer(final byte[] buffer, final int offset, final int length) { - if (isWriting) { - throw new IllegalStateException("Calling read method on BinaryCodec open for write."); - } - try { - return inputStream.read(buffer, offset, length); - } catch (IOException e) { - throw new RuntimeIOException(constructErrorMessage("Read error"), e); - } - } - - public byte readByte() { - readByteBuffer(1); - byteBuffer.flip(); - return byteBuffer.get(); - } - - /** - * Read a string off the input stream - * - * @param length length of string to read - * @return String read from stream - */ - public String readString(final int length) { - final byte[] buffer = new byte[length]; - readBytes(buffer); - - return StringUtil.bytesToString(buffer); - } - - public String readNullTerminatedString() { - return StringUtil.readNullTerminatedString(this); - } - - private void readByteBuffer(final int numBytes) { - assert(numBytes <= byteBuffer.capacity()); - readBytes(byteBuffer.array(), 0, numBytes); - byteBuffer.limit(byteBuffer.capacity()); - byteBuffer.position(numBytes); - } - - /** - * Read an int off the input stream - * - * @return int from input stream - */ - public int readInt() { - readByteBuffer(4); - byteBuffer.flip(); - return byteBuffer.getInt(); - } - - /** - * Reads a double off the input stream - * - * @return double - */ - public double readDouble() { - readByteBuffer(8); - byteBuffer.flip(); - return byteBuffer.getDouble(); - } - - /** - * Reads a long off the input stream - * - * @return long - */ - public long readLong() { - readByteBuffer(8); - byteBuffer.flip(); - return byteBuffer.getLong(); - } - - public short readShort() { - readByteBuffer(2); - byteBuffer.flip(); - return byteBuffer.getShort(); - } - - /** - * Reads a float off the input stream - * - * @return float - */ - public float readFloat() { - readByteBuffer(4); - byteBuffer.flip(); - return byteBuffer.getFloat(); - } - - public short readUByte() { - readByteBuffer(1); - byteBuffer.put((byte)0); - byteBuffer.flip(); - return byteBuffer.getShort(); - } - - public int readUShort() { - readByteBuffer(2); - byteBuffer.putShort((short)0); - byteBuffer.flip(); - return byteBuffer.getInt(); - } - - public long readUInt() { - readByteBuffer(4); - byteBuffer.putInt(0); - byteBuffer.flip(); - return byteBuffer.getLong(); - } - - /** - * Close the appropriate stream - */ - public void close() { - try { - if (this.isWriting) this.outputStream.close(); - else this.inputStream.close(); - } catch (IOException e) { - throw new RuntimeIOException(e.getMessage(), e); - } - } - - private String constructErrorMessage(final String msg) { - final StringBuilder sb = new StringBuilder(msg); - sb.append("; BinaryCodec in "); - sb.append(isWriting? "write": "read"); - sb.append("mode; "); - final String filename = isWriting? outputFileName: inputFileName; - if (filename != null) { - sb.append("file: "); - sb.append(filename); - } else { - sb.append("streamed file (filename not available)"); - } - return sb.toString(); - } - - ////////////////////////////////////////////////// - // Some getters // - ////////////////////////////////////////////////// - - - public String getInputFileName() { - return inputFileName; - } - - public String getOutputFileName() { - return outputFileName; - } - - public void setOutputFileName(final String outputFileName) { - this.outputFileName = outputFileName; - } - - public void setInputFileName(final String inputFileName) { - this.inputFileName = inputFileName; - } - - public boolean isWriting() { - return isWriting; - } - - public OutputStream getOutputStream() { - return outputStream; - } - - public InputStream getInputStream() { - return inputStream; - } -} diff --git a/java/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java b/java/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java deleted file mode 100755 index 626e5c17c..000000000 --- a/java/lib/edu/mit/broad/sam/util/BlockCompressedInputStream.java +++ /dev/null @@ -1,258 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam.util; - - -import java.io.*; -import java.util.zip.GZIPInputStream; - -/* - * Utility class for reading BGZF block compressed files. - */ -public class BlockCompressedInputStream - extends InputStream -{ - - private InputStream mStream = null; - private RandomAccessFile mFile = null; - private byte[] mFileBuffer = null; - private byte[] mCurrentBlock = null; - private int mCurrentOffset = 0; - private long mBlockAddress = 0; - private int mLastBlockLength = 0; - - - public BlockCompressedInputStream(final InputStream stream) { - mStream = toBufferedStream(stream); - mFile = null; - } - - public BlockCompressedInputStream(final File file) - throws IOException { - mFile = new RandomAccessFile(file, "r"); - mStream = null; - } - - public int available() - throws IOException { - if (mCurrentBlock == null || mCurrentOffset == mCurrentBlock.length) { - readBlock(); - } - if (mCurrentBlock == null) { - return 0; - } - return mCurrentBlock.length - mCurrentOffset; - } - - public void close() - throws IOException { - if (mFile != null) { - mFile.close(); - mFile = null; - } else if (mStream != null) { - mStream.close(); - mStream = null; - } - // Encourage garbage collection - mFileBuffer = null; - mCurrentBlock = null; - } - - public int read() - throws IOException { - return (available() > 0) ? mCurrentBlock[mCurrentOffset++] : -1; - } - - public int read(final byte[] buffer) - throws IOException { - return read(buffer, 0, buffer.length); - } - - public int read(final byte[] buffer, int offset, int length) - throws IOException { - int bytesRead = 0; - while (length > 0) { - final int available = available(); - if (available == 0) { - break; - } - final int copyLength = Math.min(length, available); - System.arraycopy(mCurrentBlock, mCurrentOffset, buffer, offset, copyLength); - mCurrentOffset += copyLength; - offset += copyLength; - length -= copyLength; - bytesRead += copyLength; - } - return bytesRead; - } - - public void seek(final long pos) - throws IOException { - // Note: pos is a special virtual file pointer, not an actual byte offset - if (mFile == null) { - throw new IOException("Cannot seek on stream based file"); - } - // Decode virtual file pointer - // Upper 48 bits is the byte offset into the compressed stream of a block. - // Lower 16 bits is the byte offset into the uncompressed stream inside the block. - final long compressedOffset = pos >> 16; - final int uncompressedOffset = (int) (pos & 0xFFFF); - mFile.seek(compressedOffset); - mBlockAddress = compressedOffset; - mLastBlockLength = 0; - readBlock(); - if (uncompressedOffset >= available()) { - throw new IOException("Invalid file pointer: " + pos); - } - mCurrentOffset = uncompressedOffset; - } - - public long getFilePointer() { - return ((mBlockAddress << 16) | mCurrentOffset); - } - - public static boolean isValidFile(final InputStream stream) - throws IOException { - if (!stream.markSupported()) { - throw new RuntimeException("Cannot test non-buffered stream"); - } - stream.mark(BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - final byte[] buffer = new byte[BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; - final int count = readBytes(stream, buffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - stream.reset(); - if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { - return false; - } - return isValidBlockHeader(buffer); - } - - private static boolean isValidBlockHeader(final byte[] buffer) { - return (buffer[0] == BlockCompressedStreamConstants.GZIP_ID1 && - (buffer[1] & 0xFF) == BlockCompressedStreamConstants.GZIP_ID2 && - (buffer[3] & BlockCompressedStreamConstants.GZIP_FLG) != 0 && - buffer[10] == BlockCompressedStreamConstants.GZIP_XLEN && - buffer[12] == BlockCompressedStreamConstants.BGZF_ID1 && - buffer[13] == BlockCompressedStreamConstants.BGZF_ID2); - } - - private void readBlock() - throws IOException { - - if (mFileBuffer == null) { - mFileBuffer = new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE]; - } - int count = readBytes(mFileBuffer, 0, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH); - if (count == 0) { - return; - } - if (count != BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH) { - throw new IOException("Premature end of file"); - } - final int blockLength = unpackInt16(mFileBuffer, BlockCompressedStreamConstants.BLOCK_LENGTH_OFFSET) + 1; - if (blockLength < BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH || blockLength > mFileBuffer.length) { - throw new IOException("Unexpected compressed block length: " + blockLength); - } - final int remaining = blockLength - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH; - count = readBytes(mFileBuffer, BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH, remaining); - if (count != remaining) { - throw new IOException("Premature end of file"); - } - inflateBlock(mFileBuffer, blockLength); - mCurrentOffset = 0; - mBlockAddress += mLastBlockLength; - mLastBlockLength = blockLength; - } - - private void inflateBlock(final byte[] compressedBlock, final int compressedLength) - throws IOException { - final int uncompressedLength = unpackInt32(compressedBlock, compressedLength-4); - byte[] buffer = mCurrentBlock; - mCurrentBlock = null; - if (buffer == null || buffer.length != uncompressedLength) { - buffer = new byte[uncompressedLength]; - } - final GZIPInputStream gzipStream = - new GZIPInputStream(new ByteArrayInputStream(compressedBlock, 0, compressedLength)); - try { - final int count = readBytes(gzipStream, buffer, 0, buffer.length); - if (count != buffer.length) { - throw new IOException("Block inflate failed"); - } - // Note: available() does not return zero here. - // The only safe way to test is to try to read a byte. - if (gzipStream.read() != -1) { - throw new IOException("Block inflate failed"); - } - } finally { - gzipStream.close(); - } - mCurrentBlock = buffer; - } - - private int readBytes(final byte[] buffer, final int offset, final int length) - throws IOException { - if (mFile != null) { - return readBytes(mFile, buffer, offset, length); - } else if (mStream != null) { - return readBytes(mStream, buffer, offset, length); - } else { - return 0; - } - } - - private static int readBytes(final RandomAccessFile file, final byte[] buffer, final int offset, final int length) - throws IOException { - int bytesRead = 0; - while (bytesRead < length) { - final int count = file.read(buffer, offset + bytesRead, length - bytesRead); - if (count <= 0) { - break; - } - bytesRead += count; - } - return bytesRead; - } - - private static int readBytes(final InputStream stream, final byte[] buffer, final int offset, final int length) - throws IOException { - int bytesRead = 0; - while (bytesRead < length) { - final int count = stream.read(buffer, offset + bytesRead, length - bytesRead); - if (count <= 0) { - break; - } - bytesRead += count; - } - return bytesRead; - } - - private BufferedInputStream toBufferedStream(final InputStream stream) { - if (stream instanceof BufferedInputStream) { - return (BufferedInputStream) stream; - } else { - return new BufferedInputStream(stream); - } - } - - private int unpackInt16(final byte[] buffer, final int offset) { - return ((buffer[offset] & 0xFF) | - ((buffer[offset+1] & 0xFF) << 8)); - } - - private int unpackInt32(final byte[] buffer, final int offset) { - return ((buffer[offset] & 0xFF) | - ((buffer[offset+1] & 0xFF) << 8) | - ((buffer[offset+2] & 0xFF) << 16) | - ((buffer[offset+3] & 0xFF) << 24)); - } -} - - diff --git a/java/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java b/java/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java deleted file mode 100644 index 11b775b88..000000000 --- a/java/lib/edu/mit/broad/sam/util/BlockCompressedOutputStream.java +++ /dev/null @@ -1,177 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStream; -import java.util.zip.CRC32; -import java.util.zip.Deflater; - -/** - * Writer for a file that is a series of gzip blocks. The caller just treats it as an - * OutputStream, and under the covers a gzip block is written when the amount of uncompressed as-yet-unwritten - * bytes reaches a threshold. Note that the flush() method should not be called by client - * unless you know what you're doing, because it forces a gzip block to be written even if the - * number of buffered bytes has not reached threshold. close(), on the other hand, must be called - * when done writing in order to force the last gzip block to be written. - */ -public class BlockCompressedOutputStream - extends OutputStream -{ - private final BinaryCodec codec; - private final byte[] uncompressedBuffer = new byte[BlockCompressedStreamConstants.DEFAULT_UNCOMPRESSED_BLOCK_SIZE]; - private int numUncompressedBytes = 0; - private final byte[] compressedBuffer = - new byte[BlockCompressedStreamConstants.MAX_COMPRESSED_BLOCK_SIZE - - BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH]; - private final Deflater deflater = new Deflater(BlockCompressedStreamConstants.GZIP_CM_DEFLATE, true); - private final CRC32 crc32 = new CRC32(); - private final byte[] singleByteArray = new byte[1]; - - private int numberOfThrottleBacks = 0; - - public BlockCompressedOutputStream(final String filename) { - codec = new BinaryCodec(filename, true); - } - - public BlockCompressedOutputStream(final File file) { - codec = new BinaryCodec(file, true); - } - - @Override - public void write(final byte[] bytes) throws IOException { - write(bytes, 0, bytes.length); - } - - @Override - public void write(final byte[] bytes, int startIndex, int numBytes) throws IOException { - assert(numUncompressedBytes < uncompressedBuffer.length); - while (numBytes > 0) { - final int bytesToWrite = Math.min(uncompressedBuffer.length - numUncompressedBytes, numBytes); - System.arraycopy(bytes, startIndex, uncompressedBuffer, numUncompressedBytes, bytesToWrite); - numUncompressedBytes += bytesToWrite; - startIndex += bytesToWrite; - numBytes -= bytesToWrite; - assert(numBytes >= 0); - if (numUncompressedBytes == uncompressedBuffer.length) { - deflateBlock(); - } - } - } - - /** - * WARNING: flush() affects the output format, because it causes the current contents of uncompressedBuffer - * to be compressed and written, even if it isn't full. Unless you know what you're doing, don't call flush(). - * Instead, call close(), which will flush any unwritten data before closing the underlying stream. - * - */ - @Override - public void flush() throws IOException { - while (numUncompressedBytes > 0) { - deflateBlock(); - } - codec.getOutputStream().flush(); - } - - /** - * close() must be called in order to flush any remaining buffered bytes. - * - */ - @Override - public void close() throws IOException { - flush(); - if (numberOfThrottleBacks > 0) { - System.err.println("In BlockCompressedOutputStream, had to throttle back " + numberOfThrottleBacks + - " times for file " + codec.getOutputFileName()); - } - codec.close(); - } - - public void write(final int i) throws IOException { - singleByteArray[0] = (byte)i; - write(singleByteArray); - } - - /** - * Attempt to write the data in uncompressedBuffer to the underlying file in a gzip block. - * If the entire uncompressedBuffer does not fit in the maximum allowed size, reduce the amount - * of data to be compressed, and slide the excess down in uncompressedBuffer so it can be picked - * up in the next deflate event. - * @return size of gzip block that was written. - */ - private int deflateBlock() { - if (numUncompressedBytes == 0) { - return 0; - } - int bytesToCompress = numUncompressedBytes; - while (true) { - // Compress the input - deflater.reset(); - deflater.setInput(uncompressedBuffer, 0, bytesToCompress); - deflater.finish(); - final int compressedSize = deflater.deflate(compressedBuffer, 0, compressedBuffer.length); - - // If it didn't all fit in compressedBuffer.length, reduce the amount to - // be compressed and try again. - if (deflater.getBytesRead() < bytesToCompress) { - bytesToCompress -= BlockCompressedStreamConstants.UNCOMPRESSED_THROTTLE_AMOUNT; - ++numberOfThrottleBacks; - assert(bytesToCompress > 0); - continue; - } - // Data compressed small enough, so write it out. - crc32.reset(); - crc32.update(uncompressedBuffer, 0, bytesToCompress); - - final int totalBlockSize = writeGzipBlock(compressedSize, bytesToCompress, crc32.getValue()); - assert(bytesToCompress <= numUncompressedBytes); - - // Clear out from uncompressedBuffer the data that was written - if (bytesToCompress == numUncompressedBytes) { - numUncompressedBytes = 0; - } else { - System.arraycopy(uncompressedBuffer, bytesToCompress, uncompressedBuffer, 0, - numUncompressedBytes - bytesToCompress); - numUncompressedBytes -= bytesToCompress; - } - return totalBlockSize; - } - // unreachable - } - - /** - * Writes the entire gzip block, assuming the compressed data is stored in compressedBuffer - * @return size of gzip block that was written. - */ - private int writeGzipBlock(final int compressedSize, final int uncompressedSize, final long crc) { - // Init gzip header - codec.writeByte(BlockCompressedStreamConstants.GZIP_ID1); - codec.writeByte(BlockCompressedStreamConstants.GZIP_ID2); - codec.writeByte(BlockCompressedStreamConstants.GZIP_CM_DEFLATE); - codec.writeByte(BlockCompressedStreamConstants.GZIP_FLG); - codec.writeInt(0); // Modification time - codec.writeByte(BlockCompressedStreamConstants.GZIP_XFL); - codec.writeByte(BlockCompressedStreamConstants.GZIP_OS_UNKNOWN); - codec.writeShort(BlockCompressedStreamConstants.GZIP_XLEN); - codec.writeByte(BlockCompressedStreamConstants.BGZF_ID1); - codec.writeByte(BlockCompressedStreamConstants.BGZF_ID2); - codec.writeShort(BlockCompressedStreamConstants.BGZF_LEN); - final int totalBlockSize = compressedSize + BlockCompressedStreamConstants.BLOCK_HEADER_LENGTH + - BlockCompressedStreamConstants.BLOCK_FOOTER_LENGTH; - - // I don't know why we store block size - 1, but that is what the spec says - codec.writeShort((short)(totalBlockSize - 1)); - codec.writeBytes(compressedBuffer, 0, compressedSize); - codec.writeInt((int)crc); - codec.writeInt(uncompressedSize); - return totalBlockSize; - } -} diff --git a/java/lib/edu/mit/broad/sam/util/BlockCompressedStreamConstants.java b/java/lib/edu/mit/broad/sam/util/BlockCompressedStreamConstants.java deleted file mode 100644 index 7a5ffc0ce..000000000 --- a/java/lib/edu/mit/broad/sam/util/BlockCompressedStreamConstants.java +++ /dev/null @@ -1,63 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -/** - * Constants shared by BlockCompressed{Input,Output}Stream classes - */ -public class BlockCompressedStreamConstants { - // Number of bytes in the gzip block before the deflated data. - // This is not the standard header size, because we include one optional subfield, - // but it is the standard for us. - public static final int BLOCK_HEADER_LENGTH = 18; - - // Location in the gzip block of the total block size (actually total block size - 1) - public static final int BLOCK_LENGTH_OFFSET = 16; - - // Number of bytes that follow the deflated data - public static final int BLOCK_FOOTER_LENGTH = 8; - - // We require that a compressed block (including header and footer, be <= this) - public static final int MAX_COMPRESSED_BLOCK_SIZE = 64 * 1024; - - // Push out a gzip block when this many uncompressed bytes have been accumulated. - public static final int DEFAULT_UNCOMPRESSED_BLOCK_SIZE = 64 * 1024; - - // If after compressing a block, the compressed block is found to be > - // MAX_COMPRESSED_BLOCK_SIZE, including overhead, then throttle back bytes to - // be compressed by this amount and try again. - public static final int UNCOMPRESSED_THROTTLE_AMOUNT = 1024; - - // Magic numbers - public static final byte GZIP_ID1 = 31; - public static final int GZIP_ID2 = 139; - - // FEXTRA flag means there are optional fields - public static final int GZIP_FLG = 4; - - // extra flags - public static final int GZIP_XFL = 0; - - // length of extra subfield - public static final short GZIP_XLEN = 6; - - // The deflate compression, which is customarily used by gzip - public static final byte GZIP_CM_DEFLATE = 8; - - // We don't care about OS because we're not doing line terminator translation - public static final int GZIP_OS_UNKNOWN = 255; - - // The subfield ID - public static final byte BGZF_ID1 = 66; - public static final byte BGZF_ID2 = 67; - - // subfield length in bytes - public static final byte BGZF_LEN = 2; -} diff --git a/java/lib/edu/mit/broad/sam/util/CloseableIterator.java b/java/lib/edu/mit/broad/sam/util/CloseableIterator.java deleted file mode 100755 index 1c4612769..000000000 --- a/java/lib/edu/mit/broad/sam/util/CloseableIterator.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * The Broad Institute - * SOFTWARE COPYRIGHT NOTICE AGREEMENT - * This software and its documentation are copyright 2008 by the - * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. - * - * This software is supplied without any warranty or guaranteed support whatsoever. - * Neither the Broad Institute nor MIT can be responsible for its use, misuse, - * or functionality. - */ -package edu.mit.broad.sam.util; - -import java.util.Iterator; - -/** - * This interface is used by iterators that use releasable resources during iteration. - * - * The consumer of a CloseableIterator should ensure that the close() method is always called, - * for example by putting such a call in a finally block. Two conventions should be followed - * by all implementors of CloseableIterator: - * 1) The close() method should be idempotent. Calling close() twice should have no effect. - * 2) When hasNext() returns false, the iterator implementation should automatically close itself. - * The latter makes it somewhat safer for consumers to use the for loop syntax for iteration: - * for (Type obj : getCloseableIterator()) { ... } - * - * We do not inherit from java.io.Closeable because IOExceptions are a pain to deal with. - */ -public interface CloseableIterator - extends Iterator { - - public void close(); -} diff --git a/java/lib/edu/mit/broad/sam/util/CoordMath.java b/java/lib/edu/mit/broad/sam/util/CoordMath.java deleted file mode 100644 index 61eed6d01..000000000 --- a/java/lib/edu/mit/broad/sam/util/CoordMath.java +++ /dev/null @@ -1,75 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -public class CoordMath { - - public static long getLength(final long start, final long end) { - return (end - start) + 1; - } - - public static long getStart(final long end, final long length) { - return end - length + 1; - } - - public static long getEnd(final long start, final long length) { - return start + length - 1; - } - - /** - * Offsets are meant to exclude the 'offset' number of bases - */ - public static long getStartFromOffset(final long offset, final long length) { - return offset + 1; - } - - public static long getEndFromOffset(final long offset, final long length) { - return length - offset; - } - - public static long getLengthFromOffsets(final long startOffset, final long endOffset, final long length) { - return getLength(getStartFromOffset(startOffset, length), - getEndFromOffset(endOffset, length)); - } - - /** - * Gets a sub-sequence from a java.lang.String (which is zero based) using one based - * sequence coordinated. The base at the end coordinate will be included. - * - * @param sequence The String of base pairs - * @param begin The one based start coordinate - * @param end The one based end coordinate - * @return The subsequence specified - */ - public static String getSubsequence(final String sequence, final int begin, final int end) { - return sequence.substring(begin-1, end); - } - - /** - * Checks to see if the two sets of coordinates have any overlap. - */ - public static boolean overlaps(final long start, final long end, final long start2, final long end2) { - return (start2 >= start && start2 <= end) || (end2 >=start && end2 <= end) || - encloses(start2, end2, start, end); - } - - /** Returns true if the "inner" coords and totally enclosed by the "outer" coords. */ - public static boolean encloses(final long outerStart, final long outerEnd, final long innerStart, final long innerEnd) { - return innerStart >= outerStart && innerEnd <= outerEnd; - } - - /** - * Determines the amount of overlap between two coordinate ranges. Assumes that the two ranges - * actually do overlap and therefore may produce strange results when they do not! - */ - public static long getOverlap(final long start, final long end, final long start2, final long end2) { - return getLength(Math.max(start, start2), Math.min(end, end2)); - } -} diff --git a/java/lib/edu/mit/broad/sam/util/LineReader.java b/java/lib/edu/mit/broad/sam/util/LineReader.java deleted file mode 100644 index 237444e78..000000000 --- a/java/lib/edu/mit/broad/sam/util/LineReader.java +++ /dev/null @@ -1,33 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -/** - * Interface allows for implementations that read lines from a String, an ASCII file, or somewhere else. - */ -public interface LineReader { - - /** - * Read a line and remove the line terminator - */ - String readLine(); - - /** - * Read a line and optionally include the line terminator - * @param includeTerminators - * @return - */ - String readLine(boolean includeTerminators); - - /** - * @return 1-based number of line most recently read - */ - int getLineNumber(); -} diff --git a/java/lib/edu/mit/broad/sam/util/NonDestructiveIterator.java b/java/lib/edu/mit/broad/sam/util/NonDestructiveIterator.java deleted file mode 100644 index 3490b31e3..000000000 --- a/java/lib/edu/mit/broad/sam/util/NonDestructiveIterator.java +++ /dev/null @@ -1,48 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -import java.util.Iterator; - -/** - * PeekIterator is a better class to use than this. - * @param - * @param - */ -public class NonDestructiveIterator> { - private T current = null; - private final ITERATOR underlyingIterator; - - public NonDestructiveIterator(final ITERATOR underlyingIterator) { - this.underlyingIterator = underlyingIterator; - advance(); - } - - public T getCurrent() { - return current; - } - - public ITERATOR getUnderlyingIterator() { - return underlyingIterator; - } - - public boolean advance() { - if (this.underlyingIterator.hasNext()) { - current = this.underlyingIterator.next(); - } else { - current = null; - } - return hasCurrent(); - } - - public boolean hasCurrent() { - return getCurrent() != null; - } -} diff --git a/java/lib/edu/mit/broad/sam/util/PeekIterator.java b/java/lib/edu/mit/broad/sam/util/PeekIterator.java deleted file mode 100644 index 6346a10a3..000000000 --- a/java/lib/edu/mit/broad/sam/util/PeekIterator.java +++ /dev/null @@ -1,49 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -import java.util.Iterator; - -public class PeekIterator implements Iterator { - Iterator underlyingIterator; - T peekedElement = null; - - public PeekIterator(final Iterator underlyingIterator) { - this.underlyingIterator = underlyingIterator; - } - - public boolean hasNext() { - return peekedElement != null || underlyingIterator.hasNext(); - } - - public T next() { - if (peekedElement != null) { - final T ret = peekedElement; - peekedElement = null; - return ret; - } - return underlyingIterator.next(); - } - - public T peek() { - if (peekedElement == null) { - peekedElement = underlyingIterator.next(); - } - return peekedElement; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - - public Iterator getUnderlyingIterator() { - return underlyingIterator; - } -} diff --git a/java/lib/edu/mit/broad/sam/util/RuntimeEOFException.java b/java/lib/edu/mit/broad/sam/util/RuntimeEOFException.java deleted file mode 100644 index ff99358f3..000000000 --- a/java/lib/edu/mit/broad/sam/util/RuntimeEOFException.java +++ /dev/null @@ -1,27 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -public class RuntimeEOFException extends RuntimeException { - public RuntimeEOFException() { - } - - public RuntimeEOFException(final String s) { - super(s); - } - - public RuntimeEOFException(final String s, final Throwable throwable) { - super(s, throwable); - } - - public RuntimeEOFException(final Throwable throwable) { - super(throwable); - } -} diff --git a/java/lib/edu/mit/broad/sam/util/RuntimeIOException.java b/java/lib/edu/mit/broad/sam/util/RuntimeIOException.java deleted file mode 100644 index b6e51bcfb..000000000 --- a/java/lib/edu/mit/broad/sam/util/RuntimeIOException.java +++ /dev/null @@ -1,27 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2008 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -public class RuntimeIOException extends RuntimeException { - public RuntimeIOException() { - } - - public RuntimeIOException(final String s) { - super(s); - } - - public RuntimeIOException(final String s, final Throwable throwable) { - super(s, throwable); - } - - public RuntimeIOException(final Throwable throwable) { - super(throwable); - } -} diff --git a/java/lib/edu/mit/broad/sam/util/SortingCollection.java b/java/lib/edu/mit/broad/sam/util/SortingCollection.java deleted file mode 100644 index b501a08b3..000000000 --- a/java/lib/edu/mit/broad/sam/util/SortingCollection.java +++ /dev/null @@ -1,369 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -import java.io.*; -import java.lang.reflect.Array; -import java.util.*; - -/** - * Collection to which many records can be added. After all records are added, the collection can be - * iterated, and the records will be returned in order defined by the comparator. Records may be spilled - * to a temporary directory if there are more records added than will fit in memory. As a result of this, - * the objects returned may not be identical to the objects added to the collection, but they should be - * equal as determined by the codec used to write them to disk and read them back. - */ -public class SortingCollection - implements Iterable { - - /** - * Client must implement this class, which defines the way in which records are written to and - * read from file. - */ - public interface Codec { - /** - * Where to write encoded output - * @param os - */ - void setOutputStream(OutputStream os); - - /** - * Where to read encoded input from - * @param is - */ - void setInputStream(InputStream is); - /** - * Write object to output stream - * @param val what to write - */ - void encode(T val); - - /** - * Read the next record from the input stream and convert into a java object. - * @return null if no more records. Should throw exception if EOF is encountered in the middle of - * a record. - */ - T decode(); - - /** - * Must return a cloned copy of the codec that can be used independently of - * the original instance. - */ - Codec clone(); - } - - /** - * Where files of sorted records go. - */ - private final File tmpDir; - private final SortingCollection.Codec codec; - private final Comparator comparator; - private final int maxRecordsInRam; - private int numRecordsInRam = 0; - private T[] ramRecords; - private boolean iterationStarted = false; - private boolean cleanedUp = false; - - /** - * List of files in tmpDir containing sorted records - */ - private final List files = new ArrayList(); - - /** - * Prepare to accumulate records to be sorted - * @param componentType Class of the record to be sorted. Necessary because of Java generic lameness. - * @param codec For writing records to file and reading them back into RAM - * @param comparator Defines output sort order - * @param maxRecordsInRam - * @param tmpDir Where to write files of records that will not fit in RAM - */ - private SortingCollection(final Class componentType, final SortingCollection.Codec codec, - final Comparator comparator, final int maxRecordsInRam, final File tmpDir) { - if (maxRecordsInRam <= 0) { - throw new IllegalArgumentException("maxRecordsInRam must be > 0"); - } - this.tmpDir = tmpDir; - this.codec = codec; - this.comparator = comparator; - this.maxRecordsInRam = maxRecordsInRam; - this.ramRecords = (T[])Array.newInstance(componentType, maxRecordsInRam); - } - - public void add(final T rec) { - if (iterationStarted) { - throw new IllegalStateException("Cannot add after calling iterator()"); - } - if (numRecordsInRam == maxRecordsInRam) { - spillToDisk(); - } - ramRecords[numRecordsInRam++] = rec; - } - - /** - * Sort the records in memory, write them to a file, and clear the buffer of records in memory. - */ - private void spillToDisk() { - try { - Arrays.sort(this.ramRecords, 0, this.numRecordsInRam, this.comparator); - final File f = File.createTempFile("sortingcollection.", ".tmp", this.tmpDir); - OutputStream os = null; - try { - os = new BufferedOutputStream(new FileOutputStream(f)); - this.codec.setOutputStream(os); - f.deleteOnExit(); - for (int i = 0; i < this.numRecordsInRam; ++i) { - this.codec.encode(ramRecords[i]); - // Facilitate GC - this.ramRecords[i] = null; - } - - os.flush(); - } - finally { - if (os != null) { - os.close(); - } - } - - this.numRecordsInRam = 0; - this.files.add(f); - - } - catch (IOException e) { - throw new RuntimeIOException(e); - } - } - - /** - * Prepare to iterate through the records in order. This method may be called more than once, - * but add() may not be called after this method has been called. - */ - public CloseableIterator iterator() { - if (this.cleanedUp) { - throw new IllegalStateException("Cannot call iterator() after cleanup() was called."); - } - - this.iterationStarted = true; - if (this.files.isEmpty()) { - return new InMemoryIterator(); - } - - if (this.numRecordsInRam > 0) { - spillToDisk(); - } - - // Facilitate GC - this.ramRecords = null; - return new MergingIterator(); - } - - /** - * Delete any temporary files. After this method is called, iterator() may not be called. - */ - public void cleanup() { - this.iterationStarted = true; - this.cleanedUp = true; - - for (final File f : this.files) { - f.delete(); - } - } - - /** - * Syntactic sugar around the ctor, to save some typing of type parameters - * - * @param componentType Class of the record to be sorted. Necessary because of Java generic lameness. - * @param codec For writing records to file and reading them back into RAM - * @param comparator Defines output sort order - * @param maxRecordsInRAM - * @param tmpDir Where to write files of records that will not fit in RAM - */ - public static SortingCollection newInstance(final Class componentType, - final SortingCollection.Codec codec, - final Comparator comparator, - final int maxRecordsInRAM, - final File tmpDir) { - return new SortingCollection(componentType, codec, comparator, maxRecordsInRAM, tmpDir); - - } - - public static SortingCollection newInstance(final Class componentType, - final SortingCollection.Codec codec, - final Comparator comparator, - final int maxRecordsInRAM) { - - final File tmpDir = new File(System.getProperty("java.io.tmpdir")); - return new SortingCollection(componentType, codec, comparator, maxRecordsInRAM, tmpDir); - } - - /** - * For iteration when number of records added is less than the threshold for spilling to disk. - */ - class InMemoryIterator implements CloseableIterator { - private int iterationIndex = 0; - - InMemoryIterator() { - Arrays.sort(SortingCollection.this.ramRecords, - 0, - SortingCollection.this.numRecordsInRam, - SortingCollection.this.comparator); - } - - public void close() { - // nothing to do - } - - public boolean hasNext() { - return this.iterationIndex < SortingCollection.this.numRecordsInRam; - } - - public T next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - return SortingCollection.this.ramRecords[iterationIndex++]; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - } - - /** - * For iteration when spilling to disk has occurred. - * Each file is has records in sort order within the file. - * This iterator automatically closes when it iterates to the end, but if not iterating - * to the end it is a good idea to call close(). - * - * Algorithm: MergingIterator maintains a PriorityQueue of PeekFileRecordIterators. - * Each PeekFileRecordIterator iterates through a file in which the records are sorted. - * The comparator for PeekFileRecordIterator used by the PriorityQueue peeks at the next record from - * the file, so the first element in the PriorityQueue is the file that has the next record to be emitted. - * In order to get the next record, the first PeekFileRecordIterator in the PriorityQueue is popped, - * the record is obtained from that iterator, and then if that iterator is not empty, it is pushed back into - * the PriorityQueue. Because it now has a different record as its next element, it may go into another - * location in the PriorityQueue - */ - class MergingIterator implements CloseableIterator { - private final PriorityQueue priorityQueue; - - MergingIterator() { - this.priorityQueue = new PriorityQueue(SortingCollection.this.files.size(), - new PeekFileRecordIteratorComparator()); - for (final File f : SortingCollection.this.files) { - final FileRecordIterator it = new FileRecordIterator(f); - if (it.hasNext()) { - this.priorityQueue.offer(new PeekFileRecordIterator(it)); - } - else { - it.close(); - } - } - } - - public boolean hasNext() { - return !this.priorityQueue.isEmpty(); - } - - public T next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - - final PeekFileRecordIterator fileIterator = priorityQueue.poll(); - final T ret = fileIterator.next(); - if (fileIterator.hasNext()) { - this.priorityQueue.offer(fileIterator); - } - else { - ((CloseableIterator)fileIterator.getUnderlyingIterator()).close(); - } - - return ret; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - - public void close() { - while (!this.priorityQueue.isEmpty()) { - final PeekFileRecordIterator it = this.priorityQueue.poll(); - ((CloseableIterator)it.getUnderlyingIterator()).close(); - } - } - } - - /** - * Read a file of records in format defined by the codec - */ - class FileRecordIterator implements CloseableIterator { - private final File file; - private final FileInputStream is; - private final Codec codec; - private T currentRecord = null; - - FileRecordIterator(final File file) { - this.file = file; - try { - this.is = new FileInputStream(file); - this.codec = SortingCollection.this.codec.clone(); - this.codec.setInputStream(this.is); - advance(); - } - catch (FileNotFoundException e) { - throw new RuntimeIOException(e); - } - } - - public boolean hasNext() { - return this.currentRecord != null; - } - - public T next() { - if (!hasNext()) { - throw new NoSuchElementException(); - } - final T ret = this.currentRecord; - advance(); - return ret; - } - - public void remove() { - throw new UnsupportedOperationException(); - } - - private void advance() { - this.currentRecord = this.codec.decode(); - } - - public void close() { - try { this.is.close(); } - catch (IOException e) { } - } - } - - - /** - * Just a typedef - */ - class PeekFileRecordIterator extends PeekIterator { - PeekFileRecordIterator(final Iterator underlyingIterator) { - super(underlyingIterator); - } - } - - class PeekFileRecordIteratorComparator implements Comparator { - - public int compare(final PeekFileRecordIterator peekFileRecordIterator, final PeekFileRecordIterator peekFileRecordIterator1) { - return comparator.compare(peekFileRecordIterator.peek(), peekFileRecordIterator1.peek()); - } - } -} diff --git a/java/lib/edu/mit/broad/sam/util/StringLineReader.java b/java/lib/edu/mit/broad/sam/util/StringLineReader.java deleted file mode 100644 index 8bcaf5447..000000000 --- a/java/lib/edu/mit/broad/sam/util/StringLineReader.java +++ /dev/null @@ -1,65 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -/** - * Implementation of LineReader that gets its input from a String. No charset conversion - * is necessary because the String is in unicode. Handles CR, LF or CRLF line termination, - * but if asked to return the line terminator, it always comes back as LF. - */ -public class StringLineReader implements LineReader { - - private final String theString; - private int curPos = 0; - private int lineNumber = 0; - - public StringLineReader(final String s) { - // Simplify later processing by replacing crlf with just lf, and replacing solo cr with lf - this.theString = s.replaceAll("\r\n", "\n").replaceAll("\r", "\n"); - } - - /** - * Read a line and remove the line terminator - */ - public String readLine() { - return readLine(false); - } - - /** - * Read a line and optionally include the line terminator - * - * @param includeTerminators - * @return - */ - public String readLine(final boolean includeTerminators) { - if (curPos == theString.length()) { - return null; - } - final int nextLfIndex = theString.indexOf('\n', curPos); - if (nextLfIndex == -1) { - final int startPos = curPos; - curPos = theString.length(); - ++lineNumber; - return theString.substring(startPos); - } - final int startPos = curPos; - final int endPos = nextLfIndex + (includeTerminators? 1: 0); - curPos = nextLfIndex + 1; - ++lineNumber; - return theString.substring(startPos, endPos); - } - - /** - * @return 1-based number of line most recently read - */ - public int getLineNumber() { - return lineNumber; - } -} diff --git a/java/lib/edu/mit/broad/sam/util/StringUtil.java b/java/lib/edu/mit/broad/sam/util/StringUtil.java deleted file mode 100644 index b4ab47522..000000000 --- a/java/lib/edu/mit/broad/sam/util/StringUtil.java +++ /dev/null @@ -1,136 +0,0 @@ -/* -* The Broad Institute -* SOFTWARE COPYRIGHT NOTICE AGREEMENT -* This software and its documentation are copyright 2009 by the -* Broad Institute/Massachusetts Institute of Technology. All rights are reserved. -* -* This software is supplied without any warranty or guaranteed support whatsoever. Neither -* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality. -*/ -package edu.mit.broad.sam.util; - -public class StringUtil { - /** - * - * @param separator String to interject between each string in strings arg - * @param strings List of strings to be joined. - * @return String that concatenates each item of strings arg, with separator btw each of them. - */ - public static String join(final String separator, final String[] strings) { - if (strings.length == 0) { - return ""; - } - final StringBuilder ret = new StringBuilder(strings[0]); - for (int i = 1; i < strings.length; ++i) { - ret.append(separator); - ret.append(strings[i]); - } - return ret.toString(); - } - - /** - * Split the string into tokesn separated by the given delimiter. Profiling has - * revealed that the standard string.split() method typically takes > 1/2 - * the total time when used for parsing ascii files. - * - * @param aString the string to split - * @param tokens an array to hold the parsed tokens - * @param delim character that delimits tokens - * @return the number of tokens parsed - */ - public static int split(final String aString, final String[] tokens, final char delim) { - - final int maxTokens = tokens.length; - int nTokens = 0; - int start = 0; - int end = aString.indexOf(delim); - if(end < 0) { - tokens[nTokens++] = aString; - return nTokens; - } - while ((end > 0) && (nTokens < maxTokens)) - { - tokens[nTokens++] = aString.substring(start, end); - start = end + 1; - end = aString.indexOf(delim, start); - - } - // Add the trailing string, if there is room and if it is not empty. - if (nTokens < maxTokens) - { - final String trailingString = aString.substring(start); - if (trailingString.length() > 0) - { - tokens[nTokens++] = trailingString; - } - } - return nTokens; - } - - //////////////////////////////////////////////////////////////////// - // The following methods all convert btw bytes and Strings, without - // using the Java character set mechanism. - //////////////////////////////////////////////////////////////////// - - public static String bytesToString(final byte[] data) { - if (data == null) { - return null; - } - return bytesToString(data, 0, data.length); - } - - @SuppressWarnings("deprecation") - public static String bytesToString(final byte[] buffer, final int offset, final int length) { -/* - The non-deprecated way, that requires allocating char[] - final char[] charBuffer = new char[length]; - for (int i = 0; i < length; ++i) { - charBuffer[i] = (char)buffer[i+offset]; - } - return new String(charBuffer); -*/ - return new String(buffer, 0, offset, length); - } - - @SuppressWarnings("deprecation") - public static byte[] stringToBytes(final String s) { -/* - The non-deprecated way, that requires allocating char[] - final byte[] byteBuffer = new byte[s.length()]; - final char[] charBuffer = s.toCharArray(); - for (int i = 0; i < charBuffer.length; ++i) { - byteBuffer[i] = (byte)(charBuffer[i] & 0xff); - } - return byteBuffer; -*/ - final byte[] byteBuffer = new byte[s.length()]; - s.getBytes(0, byteBuffer.length, byteBuffer, 0); - return byteBuffer; - } - - // This method might more appropriately live in BinaryCodec, but all the byte <=> char conversion - // should be in the same place. - public static String readNullTerminatedString(final BinaryCodec binaryCodec) { - final StringBuilder ret = new StringBuilder(); - for (byte b = binaryCodec.readByte(); b != 0; b = binaryCodec.readByte()) { - ret.append((char)(b & 0xff)); - } - return ret.toString(); - } - - /** - * Convert chars to bytes merely by casting - * @param chars input chars - * @param charOffset where to start converting from chars array - * @param length how many chars to convert - * @param bytes where to put the converted output - * @param byteOffset where to start writing the converted output. - */ - public static void charsToBytes(final char[] chars, final int charOffset, final int length, - final byte[] bytes, final int byteOffset) { - for (int i = 0; i < length; ++i) { - bytes[byteOffset + i] = (byte)chars[charOffset + i]; - } - } - -} diff --git a/java/jars/functionalj.jar b/java/lib/functionalj.jar similarity index 100% rename from java/jars/functionalj.jar rename to java/lib/functionalj.jar diff --git a/java/lib/picard.jar b/java/lib/picard.jar new file mode 100644 index 000000000..9390564f1 Binary files /dev/null and b/java/lib/picard.jar differ diff --git a/java/lib/sam-1.0.jar b/java/lib/sam-1.0.jar new file mode 100644 index 000000000..aec063f44 Binary files /dev/null and b/java/lib/sam-1.0.jar differ diff --git a/java/src/edu/mit/broad/sting/ValidateSAM.java b/java/src/edu/mit/broad/sting/ValidateSAM.java index 237b838a6..841d88ccc 100755 --- a/java/src/edu/mit/broad/sting/ValidateSAM.java +++ b/java/src/edu/mit/broad/sting/ValidateSAM.java @@ -1,8 +1,8 @@ package edu.mit.broad.sting; -import edu.mit.broad.sam.*; -import edu.mit.broad.sam.SAMFileReader.ValidationStringency; -import edu.mit.broad.sam.util.CloseableIterator; +import net.sf.samtools.*; +import net.sf.samtools.SAMFileReader.ValidationStringency; +import net.sf.samtools.util.CloseableIterator; import edu.mit.broad.picard.cmdline.CommandLineProgram; import edu.mit.broad.picard.cmdline.Usage; import edu.mit.broad.picard.cmdline.Option; @@ -102,4 +102,4 @@ public class ValidateSAM extends CommandLineProgram { return samReader; } -} \ No newline at end of file +} diff --git a/java/src/edu/mit/broad/sting/atk/AnalysisTK.java b/java/src/edu/mit/broad/sting/atk/AnalysisTK.java index b4b71c89f..4751385bc 100644 --- a/java/src/edu/mit/broad/sting/atk/AnalysisTK.java +++ b/java/src/edu/mit/broad/sting/atk/AnalysisTK.java @@ -1,6 +1,6 @@ package edu.mit.broad.sting.atk; -import edu.mit.broad.sam.SAMFileReader.ValidationStringency; +import net.sf.samtools.SAMFileReader.ValidationStringency; import edu.mit.broad.picard.cmdline.CommandLineProgram; import edu.mit.broad.picard.cmdline.Usage; import edu.mit.broad.picard.cmdline.Option; diff --git a/java/src/edu/mit/broad/sting/atk/LocusContext.java b/java/src/edu/mit/broad/sting/atk/LocusContext.java index 12c246365..b8b9c23e2 100755 --- a/java/src/edu/mit/broad/sting/atk/LocusContext.java +++ b/java/src/edu/mit/broad/sting/atk/LocusContext.java @@ -1,6 +1,6 @@ package edu.mit.broad.sting.atk; -import edu.mit.broad.sam.SAMRecord; +import net.sf.samtools.SAMRecord; import java.util.List; diff --git a/java/src/edu/mit/broad/sting/atk/LocusIterator.java b/java/src/edu/mit/broad/sting/atk/LocusIterator.java index 2f6fa516e..ff306ab03 100755 --- a/java/src/edu/mit/broad/sting/atk/LocusIterator.java +++ b/java/src/edu/mit/broad/sting/atk/LocusIterator.java @@ -1,7 +1,7 @@ package edu.mit.broad.sting.atk; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.SAMRecord; +import net.sf.samtools.util.CloseableIterator; +import net.sf.samtools.SAMRecord; import edu.mit.broad.sting.utils.PushbackIterator; import edu.mit.broad.sting.utils.Utils; import edu.mit.broad.sting.utils.Predicate; diff --git a/java/src/edu/mit/broad/sting/atk/PrepareROD.java b/java/src/edu/mit/broad/sting/atk/PrepareROD.java index 74c677d04..aa379f12d 100644 --- a/java/src/edu/mit/broad/sting/atk/PrepareROD.java +++ b/java/src/edu/mit/broad/sting/atk/PrepareROD.java @@ -1,7 +1,7 @@ package edu.mit.broad.sting.atk; -import edu.mit.broad.sam.SAMFileReader.ValidationStringency; -import edu.mit.broad.sam.SAMSequenceRecord; +import net.sf.samtools.SAMFileReader.ValidationStringency; +import net.sf.samtools.SAMSequenceRecord; import edu.mit.broad.picard.cmdline.CommandLineProgram; import edu.mit.broad.picard.cmdline.Usage; import edu.mit.broad.picard.cmdline.Option; @@ -89,4 +89,4 @@ public class PrepareROD extends CommandLineProgram { return 0; } -} \ No newline at end of file +} diff --git a/java/src/edu/mit/broad/sting/atk/ReadWalker.java b/java/src/edu/mit/broad/sting/atk/ReadWalker.java index 6f6fa915b..13374a4e8 100755 --- a/java/src/edu/mit/broad/sting/atk/ReadWalker.java +++ b/java/src/edu/mit/broad/sting/atk/ReadWalker.java @@ -1,6 +1,6 @@ package edu.mit.broad.sting.atk; -import edu.mit.broad.sam.SAMRecord; +import net.sf.samtools.SAMRecord; import edu.mit.broad.sting.atk.LocusContext; /** @@ -25,4 +25,4 @@ public interface ReadWalker { ReduceType reduce(MapType value, ReduceType sum); void onTraveralDone(); -} \ No newline at end of file +} diff --git a/java/src/edu/mit/broad/sting/atk/TraversalEngine.java b/java/src/edu/mit/broad/sting/atk/TraversalEngine.java index ae1444763..c5ecc4f4b 100755 --- a/java/src/edu/mit/broad/sting/atk/TraversalEngine.java +++ b/java/src/edu/mit/broad/sting/atk/TraversalEngine.java @@ -1,9 +1,9 @@ package edu.mit.broad.sting.atk; -import edu.mit.broad.sam.*; -import edu.mit.broad.sam.SAMFileReader.ValidationStringency; -import edu.mit.broad.sam.util.CloseableIterator; -import edu.mit.broad.sam.util.RuntimeIOException; +import net.sf.samtools.*; +import net.sf.samtools.SAMFileReader.ValidationStringency; +import net.sf.samtools.util.CloseableIterator; +import net.sf.samtools.util.RuntimeIOException; import edu.mit.broad.picard.filter.SamRecordFilter; import edu.mit.broad.picard.filter.FilteringIterator; import edu.mit.broad.picard.reference.ReferenceSequenceFile; diff --git a/java/src/edu/mit/broad/sting/atk/modules/BaseQualityHistoWalker.java b/java/src/edu/mit/broad/sting/atk/modules/BaseQualityHistoWalker.java index 6c8c8c982..969550c77 100755 --- a/java/src/edu/mit/broad/sting/atk/modules/BaseQualityHistoWalker.java +++ b/java/src/edu/mit/broad/sting/atk/modules/BaseQualityHistoWalker.java @@ -1,6 +1,6 @@ package edu.mit.broad.sting.atk.modules; -import edu.mit.broad.sam.SAMRecord; +import net.sf.samtools.SAMRecord; import edu.mit.broad.sting.atk.ReadWalker; import edu.mit.broad.sting.atk.LocusContext; @@ -56,4 +56,4 @@ public class BaseQualityHistoWalker implements ReadWalker { System.out.printf("%3d : %10d%n", i, this.qualCounts[i]); } } -} \ No newline at end of file +} diff --git a/java/src/edu/mit/broad/sting/atk/modules/BasicLociWalker.java b/java/src/edu/mit/broad/sting/atk/modules/BasicLociWalker.java index daf38d3dc..3dbd143c3 100755 --- a/java/src/edu/mit/broad/sting/atk/modules/BasicLociWalker.java +++ b/java/src/edu/mit/broad/sting/atk/modules/BasicLociWalker.java @@ -3,7 +3,7 @@ package edu.mit.broad.sting.atk.modules; import edu.mit.broad.sting.atk.LocusWalker; import edu.mit.broad.sting.atk.LocusIterator; import edu.mit.broad.sting.utils.ReferenceOrderedDatum; -import edu.mit.broad.sam.SAMRecord; +import net.sf.samtools.SAMRecord; import java.util.List; @@ -34,4 +34,4 @@ public abstract class BasicLociWalker implements LocusWalke public abstract ReduceType reduceInit(); public abstract ReduceType reduce(MapType value, ReduceType sum); -} \ No newline at end of file +} diff --git a/java/src/edu/mit/broad/sting/atk/modules/BasicReadWalker.java b/java/src/edu/mit/broad/sting/atk/modules/BasicReadWalker.java index 6606b190a..d78651edf 100755 --- a/java/src/edu/mit/broad/sting/atk/modules/BasicReadWalker.java +++ b/java/src/edu/mit/broad/sting/atk/modules/BasicReadWalker.java @@ -1,6 +1,6 @@ package edu.mit.broad.sting.atk.modules; -import edu.mit.broad.sam.SAMRecord; +import net.sf.samtools.SAMRecord; import edu.mit.broad.sting.atk.LocusContext; import edu.mit.broad.sting.atk.ReadWalker; @@ -28,4 +28,4 @@ public abstract class BasicReadWalker implements ReadWalker public abstract MapType map(LocusContext context, SAMRecord read); public abstract ReduceType reduceInit(); public abstract ReduceType reduce(MapType value, ReduceType sum); -} \ No newline at end of file +} diff --git a/java/src/edu/mit/broad/sting/atk/modules/CountReadsWalker.java b/java/src/edu/mit/broad/sting/atk/modules/CountReadsWalker.java index 1ac0b38cf..7699529b7 100755 --- a/java/src/edu/mit/broad/sting/atk/modules/CountReadsWalker.java +++ b/java/src/edu/mit/broad/sting/atk/modules/CountReadsWalker.java @@ -1,6 +1,6 @@ package edu.mit.broad.sting.atk.modules; -import edu.mit.broad.sam.SAMRecord; +import net.sf.samtools.SAMRecord; import edu.mit.broad.sting.atk.LocusContext; public class CountReadsWalker extends BasicReadWalker { diff --git a/java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java b/java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java index f67e818c2..6c4b8fb0a 100644 --- a/java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java +++ b/java/src/edu/mit/broad/sting/atk/modules/PileupWalker.java @@ -5,7 +5,7 @@ import edu.mit.broad.sting.atk.LocusIterator; import edu.mit.broad.sting.utils.ReferenceOrderedDatum; import edu.mit.broad.sting.utils.rodDbSNP; import edu.mit.broad.sting.utils.Utils; -import edu.mit.broad.sam.SAMRecord; +import net.sf.samtools.SAMRecord; import java.util.List; @@ -86,4 +86,4 @@ public class PileupWalker implements LocusWalker { public void onTraveralDone() { } -} \ No newline at end of file +} diff --git a/java/src/edu/mit/broad/sting/atk/modules/PrintReadsWalker.java b/java/src/edu/mit/broad/sting/atk/modules/PrintReadsWalker.java index 4cacbd3ab..85eec90f9 100755 --- a/java/src/edu/mit/broad/sting/atk/modules/PrintReadsWalker.java +++ b/java/src/edu/mit/broad/sting/atk/modules/PrintReadsWalker.java @@ -1,6 +1,6 @@ package edu.mit.broad.sting.atk.modules; -import edu.mit.broad.sam.SAMRecord; +import net.sf.samtools.SAMRecord; import edu.mit.broad.sting.atk.LocusContext; public class PrintReadsWalker extends BasicReadWalker { @@ -14,4 +14,4 @@ public class PrintReadsWalker extends BasicReadWalker { public Integer reduce(Integer value, Integer sum) { return value + sum; } -} \ No newline at end of file +} diff --git a/java/src/edu/mit/broad/sting/utils/ReferenceIterator.java b/java/src/edu/mit/broad/sting/utils/ReferenceIterator.java index c5b28c8f9..a9c80f044 100755 --- a/java/src/edu/mit/broad/sting/utils/ReferenceIterator.java +++ b/java/src/edu/mit/broad/sting/utils/ReferenceIterator.java @@ -2,7 +2,7 @@ package edu.mit.broad.sting.utils; import edu.mit.broad.picard.reference.ReferenceSequenceFile; import edu.mit.broad.picard.reference.ReferenceSequence; -import edu.mit.broad.sam.util.StringUtil; +import net.sf.samtools.util.StringUtil; import java.util.Iterator; import java.util.NoSuchElementException; diff --git a/java/src/edu/mit/broad/sting/utils/Utils.java b/java/src/edu/mit/broad/sting/utils/Utils.java index da378e8d2..da2637f59 100755 --- a/java/src/edu/mit/broad/sting/utils/Utils.java +++ b/java/src/edu/mit/broad/sting/utils/Utils.java @@ -1,7 +1,7 @@ package edu.mit.broad.sting.utils; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.SAMSequenceRecord; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMSequenceRecord; import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; import edu.mit.broad.picard.reference.ReferenceSequence; import edu.mit.broad.picard.reference.ReferenceSequenceFile; diff --git a/java/src/edu/mit/broad/sting/utils/rodDbSNP.java b/java/src/edu/mit/broad/sting/utils/rodDbSNP.java index a2c5f8de4..74b03ad8e 100644 --- a/java/src/edu/mit/broad/sting/utils/rodDbSNP.java +++ b/java/src/edu/mit/broad/sting/utils/rodDbSNP.java @@ -1,7 +1,7 @@ package edu.mit.broad.sting.utils; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.util.CloseableIterator; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; import edu.mit.broad.picard.util.TabbedTextFileParser; import edu.mit.broad.picard.util.SequenceUtil; @@ -157,4 +157,4 @@ public class rodDbSNP extends ReferenceOrderedDatum { throw e; } } -} \ No newline at end of file +} diff --git a/java/src/edu/mit/broad/sting/utils/rodGFF.java b/java/src/edu/mit/broad/sting/utils/rodGFF.java index bff564ab2..0eab78300 100644 --- a/java/src/edu/mit/broad/sting/utils/rodGFF.java +++ b/java/src/edu/mit/broad/sting/utils/rodGFF.java @@ -1,7 +1,7 @@ package edu.mit.broad.sting.utils; -import edu.mit.broad.sam.SAMRecord; -import edu.mit.broad.sam.util.CloseableIterator; +import net.sf.samtools.SAMRecord; +import net.sf.samtools.util.CloseableIterator; import edu.mit.broad.picard.util.TabbedTextFileParser; import java.io.File; @@ -116,4 +116,4 @@ public class rodGFF extends ReferenceOrderedDatum { HashMap attributes = null; setValues(contig, source, feature, start, stop, score, strand, frame, attributes); } -} \ No newline at end of file +}