Clean up 3rd party dependencies.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@27 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
hanna 2009-03-10 19:34:00 +00:00
parent e4bde58353
commit ea0a826f8f
221 changed files with 58 additions and 31995 deletions

View File

@@ -3,10 +3,10 @@
simple build file
</description>
<!-- set global properties for this build -->
<property name="src" location=""/>
<property name="build" location="out/production/AnalysisTK"/>
<property name="src" location="src"/>
<property name="build" location="build"/>
<property name="dist" location="dist"/>
<property name="jars" location="jars/functionalj.jar"/>
<property name="lib" location="lib"/>
<target name="init">
<!-- Create the time stamp -->
@@ -18,16 +18,31 @@
<target name="compile" depends="init"
description="compile the source " >
<!-- Compile the java code from ${src} into ${build} -->
<javac srcdir="${src}" destdir="${build}" classpath="${jars}"/>
<javac srcdir="${src}" destdir="${build}" >
<classpath>
<fileset dir="lib">
<include name="*.jar" />
</fileset>
</classpath>
</javac>
</target>
<target name="dist" depends="compile"
description="generate the distribution" >
<!-- Create the distribution directory -->
<mkdir dir="${dist}/lib"/>
<mkdir dir="${dist}"/>
<!-- Put everything in ${build} into the MyProject-${DSTAMP}.jar file -->
<jar jarfile="${dist}/AnalysisTK-${DSTAMP}.jar" basedir="${build}"/>
<jar jarfile="${dist}/AnalysisTK.jar" basedir="${build}">
<manifest>
<attribute name="Class-Path" value="functionalj.jar picard.jar sam-1.0.jar" />
<attribute name="Main-Class" value="edu.mit.broad.sting.atk.AnalysisTK" />
</manifest>
</jar>
<copy todir="${dist}">
<fileset dir="${lib}" includes="*.jar" />
</copy>
</target>
<target name="clean"

View File

@@ -1,242 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.arachne;
/**
* This class represents an arachne LookAlign alignment (or other related data structures).
*/
public class Alignment {
private static final char TAB = '\t';
private int mASequenceId;
private int mASequenceLength;
private int mAStart;
private int mAEnd;
private int mBSequenceId;
private int mBSequenceLength;
private int mBStart;
private int mBEnd;
private char mOrientation;
private int[] mAlignmentBlocks;
public Alignment() {
}
public int getASequenceId() {
return mASequenceId;
}
public void setASequenceId(int value) {
mASequenceId = value;
}
public int getASequenceLength() {
return mASequenceLength;
}
public void setASequenceLength(int value) {
mASequenceLength = value;
}
public int getAStart() {
return mAStart;
}
public void setAStart(int value) {
mAStart = value;
}
public int getAEnd() {
return mAEnd;
}
public void setAEnd(int value) {
mAEnd = value;
}
public int getBSequenceId() {
return mBSequenceId;
}
public void setBSequenceId(int value) {
mBSequenceId = value;
}
public int getBSequenceLength() {
return mBSequenceLength;
}
public void setBSequenceLength(int value) {
mBSequenceLength = value;
}
public int getBStart() {
return mBStart;
}
public void setBStart(int value) {
mBStart = value;
}
public int getBEnd() {
return mBEnd;
}
public void setBEnd(int value) {
mBEnd = value;
}
public char getOrientation() {
return mOrientation;
}
public void setOrientation(char value) {
mOrientation = value;
}
public int[] getAlignmentBlocks() {
return mAlignmentBlocks;
}
public void setAlignmentBlocks(int[] value) {
mAlignmentBlocks = value;
}
public static Alignment parse(String text) {
if (text == null) {
return null;
}
String[] fields = text.trim().split("\t");
if (fields.length == 0) {
return null;
}
if (!fields[0].equals("QUERY")) {
throw new IllegalArgumentException("Invalid alignment: " + text);
}
if (fields.length < 14) {
throw new IllegalArgumentException("Invalid alignment: " + text);
}
int seqAId = parseIntField(fields[1]);
int seqAStart = parseIntField(fields[2]);
int seqAEnd = parseIntField(fields[3]);
int seqALength = parseIntField(fields[4]);
int orientation = parseIntField(fields[5]);
int seqBId = parseIntField(fields[6]);
int seqBStart = parseIntField(fields[7]);
int seqBEnd = parseIntField(fields[8]);
int seqBLength = parseIntField(fields[9]);
int blockCount = parseIntField(fields[10]);
if (seqAStart < 0 || seqAEnd <= 0 || seqALength <= 0 ||
seqAStart >= seqALength || seqAEnd > seqALength || seqAStart >= seqAEnd) {
throw new IllegalArgumentException("Invalid alignment: " + text);
}
if (seqBStart < 0 || seqBEnd <= 0 || seqBLength <= 0 ||
seqBStart >= seqBLength || seqBEnd > seqBLength || seqBStart >= seqBEnd) {
throw new IllegalArgumentException("Invalid alignment: " + text);
}
if (orientation < 0 || orientation > 1) {
throw new IllegalArgumentException("Invalid alignment: " + text);
}
if (fields.length != (11 + 3*blockCount)) {
throw new IllegalArgumentException("Invalid alignment: " + text);
}
int[] alignmentBlocks = new int[3*blockCount];
for (int i = 0; i < 3*blockCount; i++) {
alignmentBlocks[i] = parseIntField(fields[11 + i]);
}
Alignment alignment = new Alignment();
alignment.setASequenceId(seqAId);
alignment.setASequenceLength(seqALength);
alignment.setAStart(seqAStart+1);
alignment.setAEnd(seqAEnd);
alignment.setBSequenceId(seqBId);
alignment.setBSequenceLength(seqBLength);
alignment.setBStart(seqBStart+1);
alignment.setBEnd(seqBEnd);
alignment.setOrientation((orientation == 0) ? '+' : '-');
alignment.setAlignmentBlocks(alignmentBlocks);
return alignment;
}
private static int parseIntField(String text) {
try {
return Integer.parseInt(text);
} catch (NumberFormatException exc) {
throw new IllegalArgumentException("Illegal alignment field: " + text);
}
}
public String arachneFormat() {
StringBuilder builder = new StringBuilder();
builder.append("QUERY");
builder.append(TAB);
builder.append(mASequenceId);
builder.append(TAB);
builder.append(mAStart-1); // zero based
builder.append(TAB);
builder.append(mAEnd);
builder.append(TAB);
builder.append(mASequenceLength);
builder.append(TAB);
builder.append(mOrientation == '+' ? 0 : 1);
builder.append(TAB);
builder.append(mBSequenceId);
builder.append(TAB);
builder.append(mBStart-1); // zero based
builder.append(TAB);
builder.append(mBEnd);
builder.append(TAB);
builder.append(mBSequenceLength);
builder.append(TAB);
builder.append(mAlignmentBlocks.length / 3);
for (int i = 0; i < mAlignmentBlocks.length; i++) {
builder.append(TAB);
builder.append(mAlignmentBlocks[i]);
}
return builder.toString();
}
public String format() {
StringBuilder builder = new StringBuilder();
builder.append("Alignment");
builder.append(' ');
builder.append(mASequenceId);
builder.append(' ');
builder.append(mAStart);
builder.append(' ');
builder.append(mAEnd);
builder.append(' ');
builder.append(mOrientation);
builder.append(' ');
builder.append(mBSequenceId);
builder.append(' ');
builder.append(mBStart);
builder.append(' ');
builder.append(mBEnd);
builder.append(' ');
builder.append(mAlignmentBlocks.length / 3);
for (int i = 0; i < mAlignmentBlocks.length; i++) {
builder.append(' ');
builder.append(mAlignmentBlocks[i]);
}
return builder.toString();
}
}

View File

@@ -1,132 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.arachne;
import java.io.*;
/**
 * Utility to convert fastb to fasta files.
 * More importantly, can be used to extract a subset of the reads.
 *
 * Output is written to stdout in fasta format, one record per read, using the
 * read's zero-based fastb index as the fasta record identifier.
 */
public class Fastb2Fasta {

    private boolean mVerbose = false;       // accepted on the command line; currently unused below
    private boolean mDebug = false;         // accepted on the command line; currently unused below
    private String mInputPath = null;       // required positional fastb file path
    private String mIdListFilePath = null;  // optional file listing read ids to extract

    public static void main(String[] args)
        throws Exception {
        new Fastb2Fasta().run(args);
    }

    /** Prints command line usage to stdout. */
    private void usage() {
        System.out.println("Usage: Fastb2Fasta ... <fastb-file>");
        System.out.println(" -idlist <file-of-read-ids>");
        System.out.println(" -verbose");
        System.out.println(" -debug");
    }

    /**
     * Parses command line arguments into fields.
     *
     * @return true on success; false (after printing usage) on invalid arguments
     */
    private boolean parseArguments(String[] args) {
        int argpos = 0;
        int argsleft = 0;
        while (argpos < args.length) {
            argsleft = args.length - argpos;
            String arg = args[argpos];
            if (arg.equals("-idlist") && argsleft > 1) {
                argpos++;
                mIdListFilePath = args[argpos++];
            } else if (arg.equals("-verbose")) {
                argpos++;
                mVerbose = true;
            } else if (arg.equals("-debug")) {
                argpos++;
                mDebug = true;
            } else if (arg.startsWith("-")) {
                usage();
                return false;
            } else {
                break;
            }
        }
        argsleft = args.length - argpos;
        // Exactly one positional argument (the fastb file) is required.
        if (argsleft != 1) {
            usage();
            return false;
        }
        mInputPath = args[argpos];
        return true;
    }

    /**
     * Main driver: dumps either the reads named in the id list file,
     * or every read in the fastb file, to stdout in fasta format.
     */
    private void run(String[] args)
        throws Exception {
        if (!parseArguments(args)) {
            System.exit(1);
        }
        FastbReader fastbReader = new FastbReader(new File(mInputPath));
        try {
            if (mIdListFilePath != null) {
                LineNumberReader reader = new LineNumberReader(new FileReader(mIdListFilePath));
                // Close the id list even on error paths (the original code only
                // closed it after a clean read to EOF, leaking it on exceptions).
                try {
                    while (true) {
                        String line = reader.readLine();
                        if (line == null) {
                            break;
                        }
                        Integer id = parseReadId(line);
                        if (id == null) {
                            continue;  // blank line or comment
                        }
                        if (id < 0 || id >= fastbReader.getSequenceCount()) {
                            System.out.println("ERROR: Illegal sequence id: " + id);
                            System.exit(1);
                        }
                        String sequence = fastbReader.readSequence(id);
                        System.out.println(">" + id);
                        System.out.println(sequence);
                    }
                } finally {
                    reader.close();
                }
            } else {
                // No id list: dump every read, numbering sequentially from zero.
                int id = 0;
                while (fastbReader.hasNext()) {
                    String sequence = fastbReader.next();
                    System.out.println(">" + id);
                    System.out.println(sequence);
                    id++;
                }
            }
        } finally {
            fastbReader.close();
        }
    }

    /**
     * Parses one line of the id list file.
     * Blank lines and '#' comment lines are skipped (returns null).
     * Exits the process on a malformed id.
     */
    private Integer parseReadId(String line) {
        String text = line.trim();
        if (text.length() == 0 || text.charAt(0) == '#') {
            return null;
        }
        String token = text.split("\\s+")[0];
        Integer id = null;
        try {
            id = Integer.valueOf(token);
        } catch (NumberFormatException exc) {
            System.out.println("ERROR: Invalid sequence id: " + token);
            System.exit(1);
        }
        return id;
    }
}

View File

@@ -1,220 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.arachne;
import edu.mit.broad.sam.util.CloseableIterator;
import java.io.*;
/**
 * Reader for arachne Fastb files.
 *
 * Provides both sequential iteration over all sequences and random access to a
 * sequence by its zero-based index. Bases are returned as ACGT strings.
 * Not thread-safe: a single underlying RandomAccessFile is shared by all reads.
 */
public class FastbReader
    implements CloseableIterator<String> {

    // Notes on fastb file format
    //
    // Fastb files contain the serialized contents of an arachne vecbasevector,
    // which is a typedef for mastervec<basevector, unsigned int>.
    // The serialization of mastervec objects starts with a 24 byte mv_file_control_block,
    // followed by N variable length segments (one for each element of the mastervec vector),
    // followed by an offset table containing N 8-byte file offsets to the N variable length
    // segments, followed by N fixed length data segments, one for each vector element.
    // Thus, reading a single element of the mastervec vector requires reading from three
    // separate places in the file (the offset table, the variable length section and the
    // fixed length section).
    //
    // The mastervec file header is 24 bytes arranged as follows:
    // n             4-byte signed(?) integer (number of entries)
    // c1            1-byte unsigned bit mask (see below)
    // reserved      1-byte unused
    // sizeX         1-byte unsigned, sizeof first template parameter (16 for fastb files)
    // sizeA         1-byte unsigned, sizeof second template parameter (4 for fastb files)
    // offsets_start 8-byte signed(?) integer, file offset of offset table
    // static_start  8-byte signed(?) integer, file offset of static data (fixed size section)
    //
    // For fastb files, the fixed size section contains 4 bytes for each object, which is the
    // unsigned(?) count of the number of bases in this entry.
    // For fastb files, the variable length section contains a bit vector with two bits per base.
    // The bases are encoded as follows: A = 0, C = 1, G = 2, T = 3.
    //
    // For fastb files, in the file header N is the number of entries in the fastb file.
    // c1 is unused/unimplemented except that the two low-order bits should be 0x01, indicating
    // that we are using the single-file representation. There is also apparently a three-file
    // representation that looks the same except that the offset table and static (fixed length)
    // table are in separate files named <basename>.offsets and <basename>.static.
    // The sizeX should be 16 for fastb files and sizeA should be 4.
    //
    // Note that in fastb files, the sequences are not identified by name or id, only by index
    // (zero based) into the mastervec object. There is no representation for bases other than
    // ACGT (i.e. Ns cannot be encoded).

    // 2-bit base code to character, indexed by the encoded value.
    private static final char[] BASES = { 'A', 'C', 'G', 'T' };

    private File mFile;
    private RandomAccessFile mRandomFile;
    private int mEntryCount;            // N, number of sequences in the file
    private long mOffsetTableOffset;    // file offset of the 8-byte-per-entry offset table
    private long mLengthTableOffset;    // file offset of the 4-byte-per-entry base-count table
    private int mCurrentPosition;       // iteration cursor for hasNext()/next()
    private byte[] mIOBuffer = new byte[8];  // scratch buffer for table reads

    public FastbReader(File file)
        throws IOException {
        mFile = file;
        mRandomFile = new RandomAccessFile(mFile, "r");
        readHeader();
    }

    /** Returns the number of sequences in the file. */
    public int getSequenceCount() {
        return mEntryCount;
    }

    public boolean hasNext() {
        return (mCurrentPosition < mEntryCount);
    }

    /**
     * Returns the next sequence in iteration order.
     * IOExceptions are wrapped in RuntimeException to satisfy the Iterator contract.
     */
    public String next() {
        if (!hasNext()) {
            throw new IllegalStateException("Iterator exhausted");
        }
        try {
            return readSequence(mCurrentPosition);
        } catch (IOException exc) {
            throw new RuntimeException(exc.getMessage(), exc);
        }
    }

    public void remove() {
        throw new UnsupportedOperationException("Not supported: remove");
    }

    /** Closes the underlying file; safe to call more than once. */
    public void close() {
        if (mRandomFile != null) {
            mEntryCount = 0;
            mCurrentPosition = 0;
            try {
                mRandomFile.close();
            } catch (IOException exc) {
                throw new RuntimeException(exc.getMessage(), exc);
            } finally {
                mRandomFile = null;
            }
        }
    }

    /**
     * Random-access read of the nth sequence (zero based).
     * Also repositions the sequential iteration cursor to n+1.
     *
     * @throws IllegalStateException if the reader has been closed
     * @throws IndexOutOfBoundsException if n is out of range
     */
    public String readSequence(int n)
        throws IOException {
        if (mRandomFile == null) {
            throw new IllegalStateException("Reader is closed");
        }
        if (n < 0 || n >= mEntryCount) {
            throw new IndexOutOfBoundsException("Illegal index: " + n);
        }
        long offset = getEntryOffset(n);
        int length = getEntryBaseCount(n);
        String result = readBases(offset, length);
        mCurrentPosition = n+1;
        return result;
    }

    /** Reads and validates the 24-byte mastervec file control block. */
    private void readHeader()
        throws IOException {
        byte[] fileControlBlock = new byte[24];
        mRandomFile.readFully(fileControlBlock, 0, 24);
        int word2 = deserializeInt(fileControlBlock, 4);
        int nFiles = word2 & 0x3;          // low bits of c1: 1 = single-file representation
        int sizeX = (word2 >> 16) & 0xFF;
        int sizeA = (word2 >> 24) & 0xFF;
        if (nFiles != 1) {
            throw new RuntimeException(mFile + ": Invalid file header: nFiles = " + nFiles);
        }
        if (sizeX != 16) {
            throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeX);
        }
        if (sizeA != 4) {
            // Fixed: this diagnostic previously mislabeled the field as sizeX.
            throw new RuntimeException(mFile + ": Invalid file header: sizeA = " + sizeA);
        }
        mEntryCount = deserializeInt(fileControlBlock, 0);
        mOffsetTableOffset = deserializeLong(fileControlBlock, 8);
        mLengthTableOffset = deserializeLong(fileControlBlock, 16);
    }

    /** Returns the file offset of entry n's packed base data. */
    private long getEntryOffset(int n)
        throws IOException {
        mRandomFile.seek(mOffsetTableOffset + 8 * n);
        mRandomFile.readFully(mIOBuffer, 0, 8);
        return deserializeLong(mIOBuffer, 0);
    }

    /** Returns the number of bases in entry n (from the fixed-length table). */
    private int getEntryBaseCount(int n)
        throws IOException {
        mRandomFile.seek(mLengthTableOffset + 4 * n);
        mRandomFile.readFully(mIOBuffer, 0, 4);
        return deserializeInt(mIOBuffer, 0);
    }

    /**
     * Reads baseCount bases packed two-bits-per-base starting at fileOffset
     * and decodes them (low-order bits first within each byte) to an ACGT string.
     */
    private String readBases(long fileOffset, int baseCount)
        throws IOException {
        int byteCount = (baseCount + 3) / 4;  // 4 bases per byte, rounded up
        byte[] data = new byte[byteCount];
        mRandomFile.seek(fileOffset);
        mRandomFile.readFully(data, 0, byteCount);
        int baseIndex = 0;
        int dataIndex = 0;
        char[] baseBuffer = new char[baseCount];
        while (baseIndex < baseCount) {
            int b = data[dataIndex++];
            int count = Math.min(4, baseCount - baseIndex);  // last byte may be partial
            for (int i = 0; i < count; i++) {
                baseBuffer[baseIndex++] = BASES[b & 0x3];
                b = b >> 2;
            }
        }
        return new String(baseBuffer);
    }

    /** Little-endian 32-bit read from buffer. */
    private int deserializeInt(byte[] buffer, int offset) {
        int byte1 = buffer[offset] & 0xFF;
        int byte2 = buffer[offset+1] & 0xFF;
        int byte3 = buffer[offset+2] & 0xFF;
        int byte4 = buffer[offset+3] & 0xFF;
        return (byte1 | (byte2 << 8) | (byte3 << 16) | (byte4 << 24));
    }

    /** Little-endian 64-bit read from buffer. */
    private long deserializeLong(byte[] buffer, int offset) {
        long int1 = deserializeInt(buffer, offset) & 0xFFFFFFFFL;
        long int2 = deserializeInt(buffer, offset+4) & 0xFFFFFFFFL;
        return (int1 | (int2 << 32));
    }

    // Stub for interactive use (see also Fastb2Fasta)
    public static void main(String[] args)
        throws Exception {
        FastbReader reader = new FastbReader(new File(args[0]));
        int readId = 0;
        while (reader.hasNext()) {
            System.out.println(">" + readId);
            System.out.println(reader.next());
            readId++;
        }
        reader.close();
    }
}

View File

@@ -1,83 +0,0 @@
package edu.mit.broad.arachne;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.BitSet;
import java.util.SortedMap;
import java.util.TreeMap;
/**
 * Utility class to read in a set of contig-based genomic intervals in zero-based end inclusive
 * form and store them efficiently in memory as a 1-based bit-mask, one BitSet per contig.
 */
public class GenomeMask {

    // if memory usage becomes a problem... this could be changed to a SparseBitSet
    // http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html
    private SortedMap<Integer, BitSet> mContigBits = new TreeMap<Integer, BitSet>();

    /**
     * Loads a mask from a file of space-separated "contig start end" records,
     * where start/end are zero-based and end-inclusive.
     */
    public GenomeMask(File maskFile) throws IOException {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(maskFile));
            for (String record = reader.readLine(); record != null; record = reader.readLine()) {
                String[] tokens = record.split(" ");
                int contigId = Integer.parseInt(tokens[0]);
                // convert the coordinates from 0-based, end inclusive to
                // 1-based, end inclusive
                int firstPos = Integer.parseInt(tokens[1]) + 1;
                int lastPos = Integer.parseInt(tokens[2]) + 1;
                BitSet contigBits = mContigBits.get(contigId);
                if (contigBits == null) {
                    contigBits = new BitSet(lastPos);
                    mContigBits.put(contigId, contigBits);
                }
                contigBits.set(firstPos, lastPos + 1); // BitSet.set upper bound is exclusive
            }
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * This ctor is useful if initializing a GenomeMask externally.
     */
    public GenomeMask() {
    }

    /** Returns true iff the given 1-based position on the given contig is masked. */
    public boolean get(int contig, int position) {
        BitSet bits = mContigBits.get(contig);
        if (bits == null) {
            return false;
        }
        return bits.get(position);
    }

    /** Returns the BitSet for the given contig, or null if none exists. */
    public BitSet get(int contig) {
        return mContigBits.get(contig);
    }

    /**
     * Get an existing BitSet for the given contig, or create one if not already present. This is
     * useful when initializing a GenomeMask from an external source.
     * @param contig which BitSet
     * @param numBits if there was not already a BitSet for this contig, one is created and initialized to this size.
     * @return the BitSet for the given contig, creating one if necessary
     */
    public BitSet getOrCreate(int contig, int numBits) {
        BitSet existing = mContigBits.get(contig);
        if (existing != null) {
            return existing;
        }
        BitSet created = new BitSet(numBits);
        mContigBits.put(contig, created);
        return created;
    }

    /** Returns the largest contig id present in the mask. */
    public int getMaxContig() {
        return mContigBits.lastKey();
    }
}

View File

@@ -1,136 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.arachne;
import edu.mit.broad.sam.util.CloseableIterator;
import java.io.*;
/**
 * Reader for arachne LookAlign text format alignment files.
 * Supports filtering of the input by genomic locus.
 *
 * Implemented as a one-alignment-lookahead iterator: hasNext() parses ahead
 * and next() hands out the buffered alignment.
 */
public class LookAlignReader
    implements CloseableIterator<Alignment> {

    private LineNumberReader mReader = null;
    // Lookahead buffer backing hasNext()/next().
    private Alignment mLookahead = null;
    // Optional B-sequence locus filter; disabled while mBSequenceId < 0.
    private int mBSequenceId = -1;
    private int mBStart = 0;
    private int mBEnd = 0;

    public LookAlignReader(File file)
        throws IOException {
        this(new FileReader(file));
    }

    public LookAlignReader(Reader reader) {
        mReader = (reader instanceof LineNumberReader)
            ? (LineNumberReader) reader
            : new LineNumberReader(reader);
    }

    /** Restricts iteration to alignments on this B sequence (negative disables filtering). */
    public void setBSequenceId(int value) {
        mBSequenceId = value;
    }

    /** Lower bound (1-based) of the B-sequence window; 0 means unbounded. */
    public void setBStart(int value) {
        mBStart = value;
    }

    /** Upper bound (1-based) of the B-sequence window; 0 means unbounded. */
    public void setBEnd(int value) {
        mBEnd = value;
    }

    public boolean hasNext() {
        if (mLookahead == null) {
            try {
                mLookahead = nextAlignment();
            } catch (IOException exc) {
                throw new RuntimeException(exc.getMessage(), exc);
            }
        }
        return (mLookahead != null);
    }

    public Alignment next() {
        if (!hasNext()) {
            throw new IllegalStateException("Iterator exhausted");
        }
        Alignment current = mLookahead;
        try {
            // Refill the lookahead so a later hasNext() is cheap.
            mLookahead = nextAlignment();
        } catch (IOException exc) {
            throw new RuntimeException(exc.getMessage(), exc);
        }
        return current;
    }

    public void remove() {
        throw new UnsupportedOperationException("Not supported: remove");
    }

    /** Closes the underlying reader; safe to call more than once. */
    public void close() {
        if (mReader == null) {
            return;
        }
        try {
            mReader.close();
        } catch (IOException exc) {
            throw new RuntimeException(exc.getMessage(), exc);
        }
        mReader = null;
    }

    /**
     * Reads forward to the next QUERY line passing the filters,
     * closing the reader and returning null at end of input.
     */
    private Alignment nextAlignment()
        throws IOException {
        if (mReader == null) {
            return null;
        }
        for (String line = mReader.readLine(); line != null; line = mReader.readLine()) {
            if (!line.startsWith("QUERY")) {
                continue;  // skip non-alignment lines
            }
            Alignment alignment = Alignment.parse(line);
            if (matchesFilters(alignment)) {
                return alignment;
            }
        }
        close();
        return null;
    }

    /** Returns true iff the alignment overlaps the configured B-sequence window. */
    private boolean matchesFilters(Alignment alignment) {
        if (mBSequenceId < 0) {
            return true;  // filtering disabled
        }
        if (alignment.getBSequenceId() != mBSequenceId) {
            return false;
        }
        boolean endsBeforeWindow = (mBStart > 0 && alignment.getBEnd() < mBStart);
        boolean startsAfterWindow = (mBEnd > 0 && alignment.getBStart() > mBEnd);
        return !(endsBeforeWindow || startsAfterWindow);
    }
}

View File

@@ -1,437 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv;
import edu.mit.broad.arachne.Alignment;
import edu.mit.broad.arachne.LookAlignReader;
import java.io.*;
import java.util.*;
/**
 * Utility class to do data reduction on CNV data.
 *
 * Currently implements one action, "alignmentCoverage", which bins read starts
 * and per-base read depth over a chromosome window from LookAlign alignments.
 */
public class AnalyzeCnvs {

    public static void main(String[] args)
        throws Exception {
        new AnalyzeCnvs().run(args);
    }

    /** Prints command line usage to stdout. */
    private void usage() {
        System.out.println("Usage: AnalyzeCnvs ...");
        System.out.println(" -action <action>");
        System.out.println(" -alignments <alignment-file> or -");
        System.out.println(" -alignmentList <alignment-fofn>");
        System.out.println(" -chromosome <chrN>");
        System.out.println(" -start <start-coordinate>");
        System.out.println(" -end <end-coordinate>");
        System.out.println(" -bestAlignments");
        System.out.println(" -mismatchThreshold <n>");
        System.out.println(" -binsize <n>");
        System.out.println(" -output <coverage|all>");
        System.out.println(" -verbose");
        System.out.println(" -debug");
    }

    /**
     * Parses command line arguments into fields.
     *
     * @return true on success; false (after printing usage) on invalid arguments
     */
    private boolean parseArguments(String[] args) {
        int argpos = 0;
        int argsleft = 0;
        while (argpos < args.length) {
            argsleft = args.length - argpos;
            String arg = args[argpos];
            if (arg.equals("-action") && argsleft > 1) {
                argpos++;
                mAction = args[argpos++];
            } else if (arg.equals("-alignments") && argsleft > 1) {
                argpos++;
                mAlignmentFilePath = args[argpos++];
            } else if (arg.equals("-alignmentList") && argsleft > 1) {
                argpos++;
                mAlignmentListFilePath = args[argpos++];
            } else if (arg.equals("-chromosome") && argsleft > 1) {
                argpos++;
                mChromosome = args[argpos++];
            } else if (arg.equals("-start") && argsleft > 1) {
                argpos++;
                mStartPosition = new Integer(args[argpos++]);
            } else if (arg.equals("-end") && argsleft > 1) {
                argpos++;
                mEndPosition = new Integer(args[argpos++]);
            } else if (arg.equals("-verbose")) {
                argpos++;
                mVerbose = true;
            } else if (arg.equals("-mismatchThreshold") && argsleft > 1) {
                argpos++;
                mMismatchThreshold = new Integer(args[argpos++]);
            } else if (arg.equals("-bestAlignments")) {
                argpos++;
                mReturnBestHits = true;
            } else if (arg.equals("-binsize") && argsleft > 1) {
                argpos++;
                mBinSize = Integer.parseInt(args[argpos++]);
            } else if (arg.equals("-output") && argsleft > 1) {
                argpos++;
                mOutputColumns = args[argpos++];
            } else if (arg.equals("-debug")) {
                argpos++;
                mDebug = true;
            } else if (arg.startsWith("-")) {
                usage();
                return false;
            } else {
                break;
            }
        }
        argsleft = args.length - argpos;
        // No positional arguments are accepted.
        if (argsleft != 0) {
            usage();
            return false;
        }
        return true;
    }

    /** Dispatches to the requested action (defaults to alignmentCoverage). */
    private void run(String[] args)
        throws Exception {
        if (!parseArguments(args)) {
            System.exit(1);
        }
        if (mAction == null) {
            mAction = "alignmentCoverage";
        }
        if (mAction.equals("alignmentCoverage")) {
            mainAlignmentCoverage();
        } else {
            System.out.println("Unknown action: " + mAction);
            usage();
            System.exit(1);
        }
    }

    /** Validates coverage-specific arguments and runs the coverage analysis. */
    private void mainAlignmentCoverage()
        throws IOException {
        if (mStartPosition == null || mEndPosition == null) {
            usage();
            System.exit(1);
        } else if (mStartPosition <= 0 || mEndPosition <= 0 || mStartPosition > mEndPosition) {
            System.out.println("Invalid start/end positions: " + mStartPosition + " " + mEndPosition);
            usage();
            System.exit(1);
        }
        mSequenceId = chromosomeToSequenceId(mChromosome);
        if (mSequenceId < 0) {
            System.out.println("Invalid chromosome: " + mChromosome);
            usage();
            System.exit(1);
        }
        if (mBinSize <= 0) {
            System.out.println("Invalid bin size: " + mBinSize);
            usage();
            System.exit(1);
        }
        runAlignmentCoverage();
    }

    /** Allocates per-bin counters, processes all alignment files, prints results. */
    private void runAlignmentCoverage()
        throws IOException {
        int length = (mEndPosition - mStartPosition + 1);
        if (length <= 0) {
            throw new RuntimeException("Invalid start/end positions");
        }
        int binSize = mBinSize;
        int binCount = (length + binSize - 1) / binSize;  // round up to cover the window
        int[] readStarts = new int[binCount];
        int[] readDepths = new int[binCount];
        List<String> alignmentFiles = getAlignmentFiles();
        for (String path : alignmentFiles) {
            processAlignmentFile(path, readStarts, readDepths);
        }
        printStats(readStarts, readDepths);
    }

    /**
     * Returns the list of alignment files to process: either the contents of
     * the -alignmentList file (skipping blanks and '#' comments) or the single
     * -alignments path.
     */
    private List<String> getAlignmentFiles()
        throws IOException {
        List<String> fileList = new ArrayList<String>();
        if (mAlignmentListFilePath != null) {
            LineNumberReader reader = new LineNumberReader(new FileReader(mAlignmentListFilePath));
            while (true) {
                String line = reader.readLine();
                if (line == null) {
                    reader.close();
                    break;
                }
                String path = line.trim();
                if (path.length() == 0 || path.startsWith("#")) {
                    continue;
                }
                fileList.add(path);
            }
        } else if (mAlignmentFilePath != null) {
            fileList.add(mAlignmentFilePath);
        }
        return fileList;
    }

    /** Streams one alignment file ("-" means stdin) into the bin counters. */
    private void processAlignmentFile(String path, int[] readStarts, int[] readDepths)
        throws IOException {
        LookAlignReader reader = null;
        if (path == null || path.equals("-")) {
            reader = new LookAlignReader(new InputStreamReader(System.in));
        } else {
            reader = new LookAlignReader(new File(path));
        }
        while (true) {
            Alignment alignment = getNextAlignment(reader);
            if (alignment == null) {
                reader.close();
                break;
            }
            processAlignment(alignment, readStarts, readDepths);
        }
    }

    /**
     * Accumulates one alignment into the counters: one read-start count for the
     * bin containing its B start, and one depth count per aligned B base.
     */
    private void processAlignment(Alignment alignment,
                                  int[] readStarts,
                                  int[] readDepths) {
        if (readStarts != null) {
            int baseOffset = alignment.getBStart() - mStartPosition;
            int binIndex = baseOffset / mBinSize;
            if (binIndex >= 0 && binIndex < readStarts.length) {
                readStarts[binIndex]++;
            }
        }
        if (readDepths != null) {
            int baseOffset = alignment.getBStart() - mStartPosition;
            int[] alignmentBlocks = alignment.getAlignmentBlocks();
            for (int i = 0; i < alignmentBlocks.length; i += 3) {
                int gap = alignmentBlocks[i];
                int duration = alignmentBlocks[i+1];
                if (gap > 0) {
                    // Gap in B sequence (genome)
                    // Negative gaps are gaps in A sequence (read)
                    baseOffset += gap;
                }
                for (int j = 0; j < duration; j++) {
                    int binIndex = baseOffset / mBinSize;
                    if (binIndex >= 0 && binIndex < readDepths.length) {
                        readDepths[binIndex]++;
                    }
                    baseOffset++;
                }
            }
        }
    }

    /**
     * Returns the next alignment passing the filters. In -bestAlignments mode,
     * groups consecutive alignments by A sequence id (input must be sorted by
     * A sequence) and returns only a group's unique best-scoring alignment.
     */
    private Alignment getNextAlignment(LookAlignReader reader)
        throws IOException {
        if (!mReturnBestHits) {
            while (reader.hasNext()) {
                Alignment alignment = reader.next();
                if (passesAlignmentFilters(alignment)) {
                    return alignment;
                }
            }
            return null;
        }
        while (true) {
            // Seed the group with any alignment held over from the previous call.
            Alignment seed = mPendingAlignment;
            mPendingAlignment = null;
            if (seed == null && reader.hasNext()) {
                seed = reader.next();
            }
            if (seed == null) {
                return null;
            }
            List<Alignment> secondaryHits = null;
            while (reader.hasNext()) {
                Alignment alignment = reader.next();
                if (alignment.getASequenceId() != seed.getASequenceId()) {
                    if (alignment.getASequenceId() < seed.getASequenceId()) {
                        throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format());
                    }
                    // First alignment of the next group; save it for the next call.
                    mPendingAlignment = alignment;
                    break;
                }
                if (secondaryHits == null) {
                    secondaryHits = new ArrayList<Alignment>();
                }
                secondaryHits.add(alignment);
            }
            if (secondaryHits == null) {
                if (!passesAlignmentFilters(seed)) {
                    continue;
                }
                return seed;
            }
            secondaryHits.add(seed);
            Alignment result = getUniqueBestAlignment(secondaryHits);
            if (result != null && passesAlignmentFilters(result)) {
                return result;
            }
        }
    }

    /**
     * Returns the alignment with strictly fewest mismatches, or null if the
     * minimum is tied (no unique best).
     */
    private Alignment getUniqueBestAlignment(List<Alignment> alignments) {
        int bestMismatches = 0;
        List<Alignment> best = new ArrayList<Alignment>();
        for (Alignment a : alignments) {
            int mismatches = getAlignmentMismatches(a);
            // Branches must be mutually exclusive: the previous code fell through
            // from the isEmpty() case into the equality case, adding the first
            // alignment twice and making every group whose first alignment was
            // the unique best report "no unique best" (returned null).
            if (best.isEmpty()) {
                best.add(a);
                bestMismatches = mismatches;
            } else if (mismatches == bestMismatches) {
                best.add(a);
            } else if (mismatches < bestMismatches) {
                best.clear();
                best.add(a);
                bestMismatches = mismatches;
            }
        }
        if (best.size() != 1) {
            return null;
        }
        return best.get(0);
    }

    /** Applies the mismatch-threshold and locus-window filters. */
    private boolean passesAlignmentFilters(Alignment alignment) {
        if (mMismatchThreshold != null) {
            if (getAlignmentMismatches(alignment) > mMismatchThreshold) {
                return false;
            }
        }
        if (mSequenceId != null) {
            if (alignment.getBSequenceId() != mSequenceId) {
                return false;
            }
        }
        if (mStartPosition != null) {
            if (alignment.getBEnd() < mStartPosition) {
                return false;
            }
        }
        if (mEndPosition != null) {
            if (alignment.getBStart() > mEndPosition) {
                return false;
            }
        }
        return true;
    }

    /**
     * Scores an alignment as total mismatches plus total gap bases
     * (summed |gap| over all blocks).
     */
    private int getAlignmentMismatches(Alignment alignment) {
        int mismatches = 0;
        int[] blocks = alignment.getAlignmentBlocks();
        for (int i = 0; i < blocks.length; i += 3) {
            int gap = blocks[i];
            int duration = blocks[i+1];
            int mm = blocks[i+2];
            if (mm > duration) {
                throw new RuntimeException("Invalid alignment? : " + alignment.format());
            }
            mismatches += Math.abs(gap);
            mismatches += mm;
        }
        return mismatches;
    }

    /** Prints per-bin statistics; "-output coverage" emits bare coverage values only. */
    private void printStats(int[] readStarts, int[] readDepths) {
        if (mOutputColumns != null && mOutputColumns.equals("coverage")) {
            // No headers, just coverage
            for (int i = 0; i < readDepths.length; i++) {
                String line = "";
                if (mBinSize == 1) {
                    line += readDepths[i];
                } else {
                    line += (readDepths[i] / (double) mBinSize);
                }
                System.out.println(line);
            }
        } else {
            System.out.println("Position" + "\t" + "Starts" + "\t" + "Coverage");
            for (int i = 0; i < readDepths.length; i++) {
                String line = "";
                int position = mStartPosition + i*mBinSize;
                line += position + "\t" + readStarts[i] + "\t";
                if (mBinSize == 1) {
                    line += readDepths[i];
                } else {
                    line += (readDepths[i] / (double) mBinSize);
                }
                System.out.println(line);
            }
        }
    }

    /**
     * Maps a chromosome name ("chrN", "N", "M", "X", "Y") to an arachne
     * sequence id (M=0, X=23, Y=24); returns -1 if unrecognized.
     */
    private int chromosomeToSequenceId(String text) {
        if (text == null || text.length() == 0) {
            return -1;
        }
        if (text.matches("\\d+")) {
            return Integer.parseInt(text);
        }
        if (text.startsWith("chr") && text.length() > 3) {
            text = text.substring(3);
        }
        if (text.matches("\\d+") && !text.startsWith("0")) {
            return Integer.parseInt(text);
        }
        if (text.equals("M")) {
            return 0;
        } else if (text.equals("X")) {
            return 23;
        } else if (text.equals("Y")) {
            return 24;
        } else {
            return -1;
        }
    }

    // Command-line state.
    private boolean mDebug = false;
    private boolean mVerbose = false;
    private String mAction = null;
    private String mAlignmentFilePath = null;
    private String mAlignmentListFilePath = null;
    private String mChromosome = null;
    private Integer mStartPosition = null;   // 1-based, inclusive
    private Integer mEndPosition = null;     // 1-based, inclusive
    private Integer mSequenceId = null;      // derived from mChromosome
    private boolean mReturnBestHits = false;
    private Integer mMismatchThreshold = null;
    private int mBinSize = 1;
    private String mOutputColumns = null;
    // Holdover alignment between best-hit groups (see getNextAlignment).
    private Alignment mPendingAlignment = null;
}

View File

@ -1,283 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv;
import edu.mit.broad.arachne.Alignment;
import edu.mit.broad.arachne.LookAlignReader;
import java.io.*;
import java.util.*;
/**
* Utility to count alignments (rather than gathering).
*/
public class CountAlignments {

    public static void main(String[] args)
        throws Exception {
        new CountAlignments().run(args);
    }

    /** Prints command-line help to standard output. */
    private void usage() {
        System.out.println("Usage: CountAlignments ...");
        System.out.println(" -alignments <alignment-file> (- for stdin)");
        System.out.println(" -chromosome <chromosome>");
        System.out.println(" -start <start>");
        System.out.println(" -end <end>");
        System.out.println(" -bestAlignments");
        System.out.println(" -mismatchThreshold <n>");
        System.out.println(" -verbose");
        System.out.println(" -debug");
    }

    /**
     * Parses command-line arguments into fields.
     * Returns false (after printing usage) on any unrecognized or extra argument.
     */
    private boolean parseArguments(String[] args) {
        int argpos = 0;
        int argsleft = 0;
        while (argpos < args.length) {
            argsleft = args.length - argpos;
            String arg = args[argpos];
            if (arg.equals("-alignments") && argsleft > 1) {
                argpos++;
                mAlignmentFilePath = args[argpos++];
            } else if (arg.equals("-mismatchThreshold") && argsleft > 1) {
                argpos++;
                // Integer.valueOf instead of the deprecated new Integer(...).
                mMismatchThreshold = Integer.valueOf(args[argpos++]);
            } else if (arg.equals("-bestAlignments")) {
                argpos++;
                mReturnBestHits = true;
            } else if (arg.equals("-chromosome") && argsleft > 1) {
                argpos++;
                String chromosome = args[argpos++];
                mSequenceId = chromosomeToSequenceId(chromosome);
                if (mSequenceId < 0) {
                    System.out.println("Invalid chromosome: " + chromosome);
                    return false;
                }
            } else if (arg.equals("-start") && argsleft > 1) {
                argpos++;
                mStartPosition = Integer.valueOf(args[argpos++]);
            } else if (arg.equals("-end") && argsleft > 1) {
                argpos++;
                mEndPosition = Integer.valueOf(args[argpos++]);
            } else if (arg.equals("-verbose")) {
                argpos++;
                mVerbose = true;
            } else if (arg.equals("-debug")) {
                argpos++;
                mDebug = true;
            } else if (arg.startsWith("-")) {
                usage();
                return false;
            } else {
                break;
            }
        }
        argsleft = args.length - argpos;
        if (argsleft != 0) {
            usage();
            return false;
        }
        return true;
    }

    /** Counts alignments/bases and prints "<file> <alignments> <bases>". */
    private void run(String[] args)
        throws Exception {
        if (!parseArguments(args)) {
            System.exit(1);
        }
        long[] counts = countAlignments(mAlignmentFilePath);
        String line = counts[0] + " " + counts[1];
        if (mAlignmentFilePath != null) {
            line = mAlignmentFilePath + " " + line;
        }
        System.out.println(line);
    }

    /**
     * Streams the alignment file (or stdin for null/"-"), applying the
     * configured filters, and returns {alignmentCount, basesCovered}.
     */
    private long[] countAlignments(String path)
        throws IOException {
        long alignmentCount = 0;
        long baseCount = 0;
        LookAlignReader reader = null;
        if (path == null || path.equals("-")) {
            reader = new LookAlignReader(new InputStreamReader(System.in));
        } else {
            reader = new LookAlignReader(new File(path));
        }
        try {
            while (true) {
                Alignment alignment = getNextAlignment(reader);
                if (alignment == null) {
                    break;
                }
                if (mMismatchThreshold != null) {
                    if (getAlignmentMismatches(alignment) > mMismatchThreshold) {
                        continue;
                    }
                }
                if (mSequenceId != null) {
                    if (alignment.getBSequenceId() != mSequenceId) {
                        continue;
                    }
                }
                if (mStartPosition != null) {
                    if (alignment.getBEnd() < mStartPosition) {
                        continue;
                    }
                }
                if (mEndPosition != null) {
                    if (alignment.getBStart() > mEndPosition) {
                        continue;
                    }
                }
                alignmentCount++;
                baseCount += getBaseCount(alignment);
            }
        } finally {
            // Close even on exception (previously closed only at end-of-stream).
            reader.close();
        }
        long[] result = { alignmentCount, baseCount };
        return result;
    }

    /**
     * Returns the next alignment to process, or null at end of input.
     * In -bestAlignments mode, groups alignments sharing an A sequence id
     * (input must be sorted by A sequence) and returns only the unique
     * best hit of each group, skipping groups whose best score is tied.
     */
    private Alignment getNextAlignment(LookAlignReader reader)
        throws IOException {
        if (!mReturnBestHits) {
            if (!reader.hasNext()) {
                return null;
            }
            return reader.next();
        }
        while (true) {
            // Seed the group with the alignment read ahead on the last call, if any.
            Alignment seed = mPendingAlignment;
            mPendingAlignment = null;
            if (seed == null && reader.hasNext()) {
                seed = reader.next();
            }
            if (seed == null) {
                return null;
            }
            List<Alignment> secondaryHits = null;
            while (reader.hasNext()) {
                Alignment alignment = reader.next();
                if (alignment.getASequenceId() != seed.getASequenceId()) {
                    if (alignment.getASequenceId() < seed.getASequenceId()) {
                        throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format());
                    }
                    // First alignment of the next group: save for the next call.
                    mPendingAlignment = alignment;
                    break;
                }
                if (secondaryHits == null) {
                    secondaryHits = new ArrayList<Alignment>();
                }
                secondaryHits.add(alignment);
            }
            if (secondaryHits == null) {
                // Single-hit group: the seed is trivially the unique best.
                return seed;
            }
            secondaryHits.add(seed);
            Alignment result = getUniqueBestAlignment(secondaryHits);
            if (result != null) {
                return result;
            }
        }
    }

    /**
     * Returns the single alignment with the fewest mismatches, or null if
     * the minimum score is shared by more than one alignment.
     */
    private Alignment getUniqueBestAlignment(List<Alignment> alignments) {
        int bestMismatches = 0;
        List<Alignment> best = new ArrayList<Alignment>();
        for (Alignment a : alignments) {
            int mismatches = getAlignmentMismatches(a);
            if (best.isEmpty()) {
                best.add(a);
                bestMismatches = mismatches;
            } else if (mismatches == bestMismatches) {
                // BUG FIX: this branch used to be a separate 'if', so the first
                // alignment was added twice and a group whose unique best hit
                // came first could never return it (best.size() was always > 1).
                best.add(a);
            } else if (mismatches < bestMismatches) {
                best.clear();
                best.add(a);
                bestMismatches = mismatches;
            }
        }
        if (best.size() != 1) {
            return null;
        }
        return best.get(0);
    }

    /**
     * Sums absolute gap lengths plus mismatching bases over the alignment's
     * (gap, length, mismatches) block triples.
     */
    private int getAlignmentMismatches(Alignment alignment) {
        int mismatches = 0;
        int[] blocks = alignment.getAlignmentBlocks();
        for (int i = 0; i < blocks.length; i += 3) {
            int gap = blocks[i];
            int duration = blocks[i+1];
            int mm = blocks[i+2];
            // A block cannot contain more mismatches than bases.
            if (mm > duration) {
                throw new RuntimeException("Invalid alignment? : " + alignment.format());
            }
            mismatches += Math.abs(gap);
            mismatches += mm;
        }
        return mismatches;
    }

    // Return the number of reference bases covered by this alignment.
    private int getBaseCount(Alignment alignment) {
        int count = 0;
        int[] blocks = alignment.getAlignmentBlocks();
        for (int i = 0; i < blocks.length; i += 3) {
            // int gap = blocks[i];
            int duration = blocks[i+1];
            // int mm = blocks[i+2];
            count += duration;
        }
        return count;
    }

    /**
     * Maps a chromosome name ("7", "chr7", "chrX", "M", ...) to the numeric
     * sequence id used in alignments: 0 for M, 23 for X, 24 for Y, the
     * number itself for numeric names; -1 if unrecognized.
     */
    private int chromosomeToSequenceId(String text) {
        if (text == null || text.length() == 0) {
            return -1;
        }
        if (text.matches("\\d+")) {
            return Integer.parseInt(text);
        }
        if (text.startsWith("chr") && text.length() > 3) {
            text = text.substring(3);
        }
        if (text.matches("\\d+") && !text.startsWith("0")) {
            return Integer.parseInt(text);
        }
        if (text.equals("M")) {
            return 0;
        } else if (text.equals("X")) {
            return 23;
        } else if (text.equals("Y")) {
            return 24;
        } else {
            return -1;
        }
    }

    // ---- Configuration and scan state (set by parseArguments) ----
    private boolean mDebug = false;              // -debug
    private boolean mVerbose = false;            // -verbose
    private String mAlignmentFilePath = null;    // -alignments ("-" or null = stdin)
    private boolean mReturnBestHits = false;     // -bestAlignments
    private Integer mMismatchThreshold = null;   // -mismatchThreshold (null = unlimited)
    private Integer mSequenceId = null;          // derived from -chromosome
    private Integer mStartPosition = null;       // -start (null = no lower bound)
    private Integer mEndPosition = null;         // -end (null = no upper bound)
    private Alignment mPendingAlignment = null;  // read-ahead buffer for best-hit grouping
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,399 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv;
import edu.mit.broad.arachne.Alignment;
import edu.mit.broad.arachne.LookAlignReader;
import java.io.*;
import java.util.*;
/**
* Utility program to gather CNV alignments from LookAlign files in an I/O efficient manner.
*/
public class GatherAlignments {

    public static void main(String[] args)
        throws Exception {
        new GatherAlignments().run(args);
    }

    /** Prints command-line help to standard output. */
    private void usage() {
        System.out.println("Usage: GatherAlignments ...");
        System.out.println(" -cnpList <cnp-file>");
        System.out.println(" -sampleId <sample-id>");
        System.out.println(" -inputFileList <fofn>");
        System.out.println(" -outputDirectory <dir>");
        System.out.println(" -padding <n-bases>");
        System.out.println(" -bestAlignments");
        System.out.println(" -verbose");
        System.out.println(" -debug");
    }

    /**
     * Parses command-line arguments into fields.
     * Returns false (after printing usage) on any unrecognized or extra argument.
     */
    private boolean parseArguments(String[] args) {
        int argpos = 0;
        int argsleft = 0;
        while (argpos < args.length) {
            argsleft = args.length - argpos;
            String arg = args[argpos];
            if (arg.equals("-cnpList") && argsleft > 1) {
                argpos++;
                mCnpListPath = args[argpos++];
            } else if (arg.equals("-sampleId") && argsleft > 1) {
                argpos++;
                mSampleId = args[argpos++];
            } else if (arg.equals("-inputFileList") && argsleft > 1) {
                argpos++;
                mInputFileListPath = args[argpos++];
            } else if (arg.equals("-outputDirectory") && argsleft > 1) {
                argpos++;
                mOutputDirectory = args[argpos++];
            } else if (arg.equals("-padding") && argsleft > 1) {
                argpos++;
                mCnpRegionPadding = Integer.parseInt(args[argpos++]);
            } else if (arg.equals("-bestAlignments")) {
                argpos++;
                mReturnBestHits = true;
            } else if (arg.equals("-verbose")) {
                argpos++;
                mVerbose = true;
            } else if (arg.equals("-debug")) {
                argpos++;
                mDebug = true;
            } else if (arg.startsWith("-")) {
                usage();
                return false;
            } else {
                break;
            }
        }
        argsleft = args.length - argpos;
        if (argsleft != 0) {
            usage();
            return false;
        }
        return true;
    }

    /** Parses the inputs, then scans each alignment file against the CNP map. */
    private void run(String[] args)
        throws Exception {
        if (!parseArguments(args)) {
            System.exit(1);
        }
        // Locals renamed: the m-prefix is reserved for fields in this file.
        List<File> inputFiles = parseInputFiles(mInputFileListPath);
        Map<Integer, List<CnpRegion>> cnpMap = parseCnpFile(mCnpListPath);
        for (File inputFile : inputFiles) {
            scanInputFile(inputFile, cnpMap);
        }
    }

    /** Reads a file-of-file-names; blank lines and #-comments are skipped. */
    private List<File> parseInputFiles(String path)
        throws IOException {
        List<File> fileList = new ArrayList<File>();
        LineNumberReader reader = new LineNumberReader(new FileReader(path));
        try {
            while (true) {
                String line = reader.readLine();
                if (line == null) {
                    break;
                }
                line = line.trim();
                if (line.length() == 0 || line.startsWith("#")) {
                    continue;
                }
                String[] fields = line.split("\\s+");
                fileList.add(new File(fields[0]));
            }
        } finally {
            // Close even if an exception is thrown mid-file (was leaked before).
            reader.close();
        }
        return fileList;
    }

    /**
     * Parses the CNP list (CNPID CHROMOSOME START END per line, header row
     * and #-comments skipped, commas in numbers allowed) into a map from
     * sequence id to the CNP regions on that sequence.  Regions are widened
     * by mCnpRegionPadding bases on each side, clamped at position 1.
     */
    private Map<Integer, List<CnpRegion>> parseCnpFile(String path)
        throws IOException {
        Map<Integer, List<CnpRegion>> cnpMap = new HashMap<Integer, List<CnpRegion>>();
        LineNumberReader reader = new LineNumberReader(new FileReader(path));
        try {
            while (true) {
                String line = reader.readLine();
                if (line == null) {
                    break;
                }
                line = line.trim();
                if (line.length() == 0 || line.startsWith("#")) {
                    continue;
                }
                String[] fields = line.split("\\s+");
                if (fields.length != 4) {
                    throw new RuntimeException("Invalid CNP line: " + line);
                }
                if (fields[0].equalsIgnoreCase("CNPID")) {
                    // Header row.
                    continue;
                }
                String cnpId = fields[0];
                String chromosome = fields[1];
                int start = Integer.parseInt(fields[2].replaceAll(",", ""));
                int end = Integer.parseInt(fields[3].replaceAll(",", ""));
                int sequenceId = chromosomeToSequenceId(chromosome);
                if (sequenceId < 0) {
                    throw new RuntimeException("Unrecognized chromosome: " + chromosome);
                }
                if (mCnpRegionPadding > 0) {
                    start = Math.max(1, start - mCnpRegionPadding);
                    end = end + mCnpRegionPadding;
                }
                CnpRegion cnp = new CnpRegion(cnpId, sequenceId, start, end);
                List<CnpRegion> cnpList = cnpMap.get(sequenceId);
                if (cnpList == null) {
                    cnpList = new ArrayList<CnpRegion>();
                    cnpMap.put(sequenceId, cnpList);
                }
                cnpList.add(cnp);
            }
        } finally {
            reader.close();
        }
        return cnpMap;
    }

    /**
     * Maps a chromosome name ("7", "chr7", "chrX", "M", ...) to the numeric
     * sequence id used in alignments: 0 for M, 23 for X, 24 for Y, the
     * number itself for numeric names; -1 if unrecognized.
     */
    private int chromosomeToSequenceId(String text) {
        if (text == null || text.length() == 0) {
            return -1;
        }
        if (text.matches("\\d+")) {
            return Integer.parseInt(text);
        }
        if (text.startsWith("chr") && text.length() > 3) {
            text = text.substring(3);
        }
        if (text.matches("\\d+") && !text.startsWith("0")) {
            return Integer.parseInt(text);
        }
        if (text.equals("M")) {
            return 0;
        } else if (text.equals("X")) {
            return 23;
        } else if (text.equals("Y")) {
            return 24;
        } else {
            return -1;
        }
    }

    /**
     * Streams one alignment file, buffering every alignment that overlaps a
     * CNP region, then flushes all buffered alignments to disk.
     */
    private void scanInputFile(File inputFile,
                               Map<Integer, List<CnpRegion>> cnpMap)
        throws IOException {
        LookAlignReader reader = new LookAlignReader(inputFile);
        try {
            while (true) {
                Alignment alignment = getNextAlignment(reader);
                if (alignment == null) {
                    break;
                }
                List<CnpRegion> cnpList = cnpMap.get(alignment.getBSequenceId());
                if (cnpList == null) {
                    continue;
                }
                for (CnpRegion cnp : cnpList) {
                    if (overlaps(cnp, alignment)) {
                        saveCnpAlignment(cnp, alignment, inputFile);
                    }
                }
            }
        } finally {
            // Close even if an exception is thrown mid-file (was leaked before).
            reader.close();
        }
        flushCnpAlignments(inputFile);
    }

    /**
     * Returns the next alignment to process, or null at end of input.
     * In -bestAlignments mode, groups alignments sharing an A sequence id
     * (input must be sorted by A sequence) and returns only the unique
     * best hit of each group, skipping groups whose best score is tied.
     */
    private Alignment getNextAlignment(LookAlignReader reader)
        throws IOException {
        if (!mReturnBestHits) {
            if (reader.hasNext()) {
                return reader.next();
            } else {
                return null;
            }
        }
        while (true) {
            // Seed the group with the alignment read ahead on the last call, if any.
            Alignment seed = mPendingAlignment;
            mPendingAlignment = null;
            if (seed == null && reader.hasNext()) {
                seed = reader.next();
            }
            if (seed == null) {
                return null;
            }
            List<Alignment> secondaryHits = null;
            while (reader.hasNext()) {
                Alignment alignment = reader.next();
                if (alignment.getASequenceId() != seed.getASequenceId()) {
                    if (alignment.getASequenceId() < seed.getASequenceId()) {
                        throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format());
                    }
                    // First alignment of the next group: save for the next call.
                    mPendingAlignment = alignment;
                    break;
                }
                if (secondaryHits == null) {
                    secondaryHits = new ArrayList<Alignment>();
                }
                secondaryHits.add(alignment);
            }
            if (secondaryHits == null) {
                // Single-hit group: the seed is trivially the unique best.
                return seed;
            }
            secondaryHits.add(seed);
            Alignment result = getUniqueBestAlignment(secondaryHits);
            if (result != null) {
                return result;
            }
        }
    }

    /**
     * Returns the single alignment with the fewest mismatches, or null if
     * the minimum score is shared by more than one alignment.
     */
    private Alignment getUniqueBestAlignment(List<Alignment> alignments) {
        int bestMismatches = 0;
        List<Alignment> best = new ArrayList<Alignment>();
        for (Alignment a : alignments) {
            int mismatches = getAlignmentMismatches(a);
            if (best.isEmpty()) {
                best.add(a);
                bestMismatches = mismatches;
            } else if (mismatches == bestMismatches) {
                // BUG FIX: this branch used to be a separate 'if', so the first
                // alignment was added twice and a group whose unique best hit
                // came first could never return it (best.size() was always > 1).
                best.add(a);
            } else if (mismatches < bestMismatches) {
                best.clear();
                best.add(a);
                bestMismatches = mismatches;
            }
        }
        if (best.size() != 1) {
            return null;
        }
        return best.get(0);
    }

    /**
     * Sums absolute gap lengths plus mismatching bases over the alignment's
     * (gap, length, mismatches) block triples.
     */
    private int getAlignmentMismatches(Alignment alignment) {
        int mismatches = 0;
        int[] blocks = alignment.getAlignmentBlocks();
        for (int i = 0; i < blocks.length; i += 3) {
            int gap = blocks[i];
            int duration = blocks[i+1];
            int mm = blocks[i+2];
            // A block cannot contain more mismatches than bases.
            if (mm > duration) {
                throw new RuntimeException("Invalid alignment? : " + alignment.format());
            }
            mismatches += Math.abs(gap);
            mismatches += mm;
        }
        return mismatches;
    }

    /** True if the alignment's B-side interval intersects the CNP region. */
    private boolean overlaps(CnpRegion cnp, Alignment alignment) {
        return (cnp.getSequenceId() == alignment.getBSequenceId() &&
                cnp.getStart() <= alignment.getBEnd() &&
                cnp.getEnd() >= alignment.getBStart());
    }

    /**
     * Buffers an overlapping alignment under its CNP id, flushing all
     * buffers to disk first when the in-memory limit has been exceeded.
     */
    private void saveCnpAlignment(CnpRegion cnp, Alignment alignment, File inputFile)
        throws IOException {
        if (mCnpAlignmentCount > mCnpAlignmentLimit) {
            flushCnpAlignments(inputFile);
        }
        String cnpId = cnp.getCnpId();
        List<Alignment> alignmentList = mCnpAlignmentMap.get(cnpId);
        if (alignmentList == null) {
            alignmentList = new ArrayList<Alignment>();
            mCnpAlignmentMap.put(cnpId, alignmentList);
        }
        alignmentList.add(alignment);
        mCnpAlignmentCount++;
    }

    /** Appends every buffered alignment list to its output file and empties the buffers. */
    private void flushCnpAlignments(File inputFile)
        throws IOException {
        while (!mCnpAlignmentMap.isEmpty()) {
            String cnpId = mCnpAlignmentMap.keySet().iterator().next();
            List<Alignment> alignmentList = mCnpAlignmentMap.get(cnpId);
            writeAlignments(cnpId, mSampleId, alignmentList, inputFile);
            mCnpAlignmentMap.remove(cnpId);
            mCnpAlignmentCount -= alignmentList.size();
        }
        if (mCnpAlignmentCount != 0) {
            throw new RuntimeException("Unsynchronized alignment count");
        }
    }

    /**
     * Appends the alignments for one CNP (and optional sample) to
     * outputDir/cnpId[_sampleId]/input-file-name, creating the directory
     * on first use.
     */
    private void writeAlignments(String cnpId, String sampleId, List<Alignment> alignmentList, File inputFile)
        throws IOException {
        File outputDir = new File(".");
        if (mOutputDirectory != null) {
            outputDir = new File(mOutputDirectory);
        }
        String cnpSample = cnpId;
        if (sampleId != null) {
            cnpSample = cnpSample + "_" + sampleId;
        }
        File cnpSampleDir = new File(outputDir, cnpSample);
        if (!cnpSampleDir.exists()) {
            if (!cnpSampleDir.mkdir()) {
                throw new RuntimeException("Failed to create directory " + cnpSampleDir);
            }
        }
        String fileName = inputFile.getName();
        File alignmentFile = new File(cnpSampleDir, fileName);
        PrintWriter writer = new PrintWriter(new FileWriter(alignmentFile, true));
        try {
            for (Alignment alignment : alignmentList) {
                writer.println(alignment.arachneFormat());
            }
            writer.flush();
        } finally {
            // Close even if formatting throws (was leaked before).
            writer.close();
        }
    }

    private GatherAlignments() {
    }

    /** Immutable CNP region: an id plus a closed interval on one sequence. */
    private static class CnpRegion {
        private CnpRegion(String cnpId, int sequenceId, int start, int end) {
            mCnpId = cnpId;
            mSequenceId = sequenceId;
            mStart = start;
            mEnd = end;
        }
        public String getCnpId() { return mCnpId; }
        public int getSequenceId() { return mSequenceId; }
        public int getStart() { return mStart; }
        public int getEnd() { return mEnd; }
        private String mCnpId;
        private int mSequenceId;
        private int mStart;
        private int mEnd;
    }

    // ---- Configuration and buffering state (set by parseArguments) ----
    private boolean mDebug = false;              // -debug
    private boolean mVerbose = false;            // -verbose
    private boolean mReturnBestHits = false;     // -bestAlignments
    private String mCnpListPath = null;          // -cnpList
    private String mSampleId = null;             // -sampleId (optional)
    private String mInputFileListPath = null;    // -inputFileList
    private String mOutputDirectory = null;      // -outputDirectory (default ".")
    private int mCnpRegionPadding = 0;           // -padding, bases added to each side
    private Alignment mPendingAlignment = null;  // read-ahead buffer for best-hit grouping
    private int mCnpAlignmentCount = 0;          // total alignments currently buffered
    private int mCnpAlignmentLimit = 1000000;    // buffered-alignment flush threshold
    private Map<String, List<Alignment>> mCnpAlignmentMap = new LinkedHashMap<String, List<Alignment>>();
}

File diff suppressed because it is too large Load Diff

View File

@ -1,151 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv.kmer;
import edu.mit.broad.dcp.DistributedAlgorithm;
import edu.mit.broad.cnv.util.SequenceIterator;
import java.io.*;
import java.util.*;
/**
* Distributed algorithm for counting unique kmers.
*/
public class DistributedKMerCounter
    extends DistributedAlgorithm
{
    // ---- Configuration and scan results ----
    private boolean mDebug = false;                    // extra diagnostic output
    private boolean mVerbose = false;                  // progress output
    private int mK = 0;                                // kmer length
    private List<File> mInputFiles = null;             // fasta files to scan
    private List<String> mSequenceList = null;         // sequence names, in scan order
    private List<Integer> mSequenceOffsetList = null;  // start base index of each sequence

    public DistributedKMerCounter() {
    }

    public boolean getDebug() {
        return mDebug;
    }

    public void setDebug(boolean value) {
        mDebug = value;
    }

    public boolean getVerbose() {
        return mVerbose;
    }

    public void setVerbose(boolean value) {
        mVerbose = value;
    }

    public int getK() {
        return mK;
    }

    public void setK(int value) {
        mK = value;
    }

    public List<File> getInputFiles() {
        return mInputFiles;
    }

    public void setInputFiles(List<File> value) {
        mInputFiles = value;
    }

    /** Runs the distributed algorithm, then merges the results. */
    public void run()
        throws Exception {
        super.run();
        finish();
    }

    /** Dispatches initialization to the master or worker variant. */
    protected void init()
        throws Exception {
        if (getWorkerId() == MASTER) {
            initMaster();
        } else {
            initWorker();
        }
    }

    /** Master-side initialization: index the input sequences. */
    private void initMaster()
        throws IOException {
        // Tasks to be amortized
        report("Scanning sequences ...");
        scanSequences();
        report("Scan complete.");
    }

    private void initWorker() {
        // Tasks to be amortized
    }

    protected void start() {
        // scan genome, divide into chromosomes and optionally segments, distribute calls
    }

    private void finish() {
        // merge individual files, write out final results
    }

    /**
     * Walks every input fasta file, recording for each sequence its name
     * and the global base index at which its first base will appear.
     */
    private void scanSequences()
        throws IOException {
        List<String> sequenceList = new ArrayList<String>();
        List<Integer> sequenceOffsetList = new ArrayList<Integer>();
        SequenceIterator seqIterator = new SequenceIterator(getInputFiles());
        while (true) {
            String seqName = seqIterator.getNextSequence();
            if (seqName == null) {
                break;
            }
            // +1: the iterator's base index points at the last base consumed.
            int baseIndex = seqIterator.getBaseIndex() + 1;
            sequenceList.add(seqName);
            sequenceOffsetList.add(baseIndex);
        }
        mSequenceList = sequenceList;
        mSequenceOffsetList = sequenceOffsetList;
    }

    // Currently not used
    /**
     * Loads sequence names and lengths from a whitespace-delimited
     * "name length" file and computes cumulative start offsets.
     */
    private void loadGenomeOffsets(File file)
        throws IOException {
        List<String> sequenceList = new ArrayList<String>();
        List<Integer> sequenceOffsetList = new ArrayList<Integer>();
        int baseIndex = 0;
        LineNumberReader reader = new LineNumberReader(new FileReader(file));
        try {
            while (true) {
                String line = reader.readLine();
                if (line == null) {
                    break;
                }
                String text = line.trim();
                if (text.length() == 0 || text.startsWith("#")) {
                    continue;
                }
                String[] fields = text.split("\\s+");
                if (fields.length != 2) {
                    throw new RuntimeException("Invalid input line: " + line);
                }
                int length = Integer.parseInt(fields[1]);
                sequenceList.add(fields[0]);
                sequenceOffsetList.add(baseIndex);
                baseIndex += length;
            }
        } finally {
            // BUG FIX: the reader was never closed, leaking a file handle.
            reader.close();
        }
        mSequenceList = sequenceList;
        mSequenceOffsetList = sequenceOffsetList;
    }
}

View File

@ -1,184 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv.util;
import java.io.*;
import java.util.*;
/**
* Utility class for transforming between a linear base index
* and a chromsome + position coordinate system.
*/
public class GenomeBaseIndex {
    private List<String> mSequenceNames = null;  // sequence names in genome order
    private int[] mLengths = null;               // length of each sequence, in bases
    private long[] mOffsets = null;              // base index of each sequence's first base

    private GenomeBaseIndex() {
    }

    /** Reads an index from a file of "name length" lines. */
    public static GenomeBaseIndex read(File file)
        throws IOException {
        Reader reader = new BufferedReader(new FileReader(file));
        try {
            return read(reader);
        } finally {
            reader.close();
        }
    }

    // The input is just a list of space-delimited sequence name and length.
    public static GenomeBaseIndex read(Reader reader)
        throws IOException {
        List<String> names = new ArrayList<String>();
        List<Integer> lengthList = new ArrayList<Integer>();
        BufferedReader in = new BufferedReader(reader);
        for (String line = in.readLine(); line != null; line = in.readLine()) {
            String text = line.trim();
            // Skip blank lines and comments.
            if (text.length() == 0 || text.startsWith("#")) {
                continue;
            }
            String[] fields = text.split("\\s+");
            if (fields.length < 2) {
                throw new RuntimeException("Invalid input line: " + line);
            }
            int seqLength = Integer.parseInt(fields[1]);
            if (seqLength <= 0) {
                throw new RuntimeException("Invalid sequence length: " + seqLength);
            }
            names.add(fields[0]);
            lengthList.add(seqLength);
        }
        // Convert to arrays and accumulate the running start offsets.
        int count = lengthList.size();
        int[] lengths = new int[count];
        long[] offsets = new long[count];
        long runningOffset = 0;
        for (int i = 0; i < count; i++) {
            lengths[i] = lengthList.get(i);
            offsets[i] = runningOffset;
            runningOffset += lengths[i];
        }
        GenomeBaseIndex gbi = new GenomeBaseIndex();
        gbi.mSequenceNames = names;
        gbi.mLengths = lengths;
        gbi.mOffsets = offsets;
        return gbi;
    }

    /** Sequence names in genome order. */
    public List<String> getSequenceNames() {
        return mSequenceNames;
    }

    /** True if the index knows the given sequence name. */
    public boolean contains(String seqName) {
        return getSequenceIndex(seqName) >= 0;
    }

    /** Base index of the sequence's first base, or -1 if unknown. */
    public long getFirstIndex(String seqName) {
        int which = getSequenceIndex(seqName);
        return (which < 0) ? -1 : mOffsets[which];
    }

    /** Base index of the sequence's last base, or -1 if unknown. */
    public long getLastIndex(String seqName) {
        int which = getSequenceIndex(seqName);
        return (which < 0) ? -1 : (mOffsets[which] + mLengths[which] - 1);
    }

    /** Length of the sequence in bases, or 0 if unknown. */
    public int getSequenceLength(String seqName) {
        int which = getSequenceIndex(seqName);
        return (which < 0) ? 0 : mLengths[which];
    }

    /**
     * Maps (sequence, 1-based position) to a linear base index.
     * Zero or negative position means the sequence's last base.
     * Returns -1 for unknown sequences or positions past the end.
     */
    public long getBaseIndex(String seqName, int position) {
        int which = getSequenceIndex(seqName);
        if (which < 0 || position > mLengths[which]) {
            return -1;
        }
        if (position < 1) {
            // Zero or negative position means last base index
            position = mLengths[which];
        }
        return mOffsets[which] + position - 1;
    }

    /** Name of the sequence containing the base index, or null. */
    public String getSequenceName(long baseIndex) {
        int which = getSequenceIndex(baseIndex);
        return (which < 0) ? null : mSequenceNames.get(which);
    }

    /** 1-based position of the base index within its sequence, or 0. */
    public int getPosition(long baseIndex) {
        if (baseIndex < 0) {
            // Catch common sign-extension error when packing indexes as ints.
            throw new IllegalArgumentException("Invalid base index: " + baseIndex);
        }
        int which = getSequenceIndex(baseIndex);
        if (which < 0) {
            return 0;
        }
        return (int) (baseIndex - mOffsets[which] + 1);
    }

    // Same as getSequenceName, but treat the argument as an unsigned int.
    // This is useful for manipulating/storing indexes for the human
    // genome as 4-byte unsigned ints.
    public String getSequenceNameUnsigned(int baseIndex) {
        return getSequenceName(baseIndex & 0xFFFFFFFFL);
    }

    // Same as getPosition, but treat the argument as an unsigned int.
    // This is useful for manipulating/storing indexes for the human
    // genome as 4-byte unsigned ints.
    public int getPositionUnsigned(int baseIndex) {
        return getPosition(baseIndex & 0xFFFFFFFFL);
    }

    /** Ordinal of a sequence name in genome order, or -1. */
    private int getSequenceIndex(String seqName) {
        return mSequenceNames.indexOf(seqName);
    }

    /** Ordinal of the sequence owning the base index, or -1. */
    private int getSequenceIndex(long baseIndex) {
        if (baseIndex < 0) {
            return -1;
        }
        long end = 0;
        for (int i = 0; i < mLengths.length; i++) {
            end += mLengths[i];
            if (baseIndex < end) {
                return i;
            }
        }
        return -1;
    }
}

View File

@ -1,167 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv.util;
import java.io.*;
import java.util.*;
/**
* Utility class for transforming between a chromsome + position
* coordinate system and a binned coordinate system where each
* chromosome (separately) is divided into fixed sized bins,
* ragged on the right/upper end.
*/
public class GenomeBinIndex {
    private int mBinSize;                // bin width, in bases
    private List<String> mSequenceNames; // sequence names in genome order
    private int[] mSequenceLengths;      // length of each sequence, in bases
    private int[] mBinOffsets;           // global index of each sequence's first bin

    /**
     * Divides each sequence of the given base index (separately) into
     * fixed-size bins; the final bin of a sequence may be short.
     *
     * @param gbi     source of sequence names and lengths
     * @param binSize bin width in bases, must be positive
     * @throws IllegalArgumentException if binSize is not positive
     * @throws RuntimeException if the total bin count overflows an int
     */
    public GenomeBinIndex(GenomeBaseIndex gbi, int binSize) {
        if (binSize <= 0) {
            throw new IllegalArgumentException("Illegal bin size: " + binSize);
        }
        mBinSize = binSize;
        mSequenceNames = new ArrayList<String>(gbi.getSequenceNames());
        int count = mSequenceNames.size();
        mSequenceLengths = new int[count];
        mBinOffsets = new int[count];
        long binOffset = 0; // long to detect overflow
        for (int i = 0; i < count; i++) {
            int length = gbi.getSequenceLength(mSequenceNames.get(i));
            int binCount = (length + binSize - 1) / binSize; // ceil(length / binSize)
            mSequenceLengths[i] = length;
            mBinOffsets[i] = (int) binOffset;
            binOffset += binCount;
        }
        if (binOffset > Integer.MAX_VALUE) {
            // Check for integer overflow.
            // This will happen, e.g., with the human genome and a bin size of 1.
            throw new RuntimeException("Binsize too small: " + binSize);
        }
    }

    /** Bin width, in bases. */
    public int getBinSize() {
        return mBinSize;
    }

    /**
     * Maps (sequence, 1-based position) to a global bin index.
     * Zero or negative position means the sequence's last bin.
     * Returns -1 for unknown sequences or positions past the end.
     */
    public int getBinIndex(String seqName, int position) {
        int index = getSequenceIndex(seqName);
        if (index < 0) {
            return -1;
        }
        if (position > mSequenceLengths[index]) {
            return -1;
        }
        if (position < 1) {
            position = mSequenceLengths[index];
        }
        int bin = (position - 1) / mBinSize;
        return (mBinOffsets[index] + bin);
    }

    /** Name of the sequence containing the bin, or null if out of range. */
    public String getSequenceName(int binIndex) {
        int index = getSequenceIndex(binIndex);
        if (index < 0) {
            return null;
        }
        return mSequenceNames.get(index);
    }

    /** First (1-based) position covered by the bin, or -1 if out of range. */
    public int getStartPosition(int binIndex) {
        int index = getSequenceIndex(binIndex);
        if (index < 0) {
            return -1;
        }
        int bin = binIndex - mBinOffsets[index];
        return (bin * mBinSize + 1);
    }

    /**
     * Last (1-based) position covered by the bin, clamped to the sequence
     * length for the ragged final bin; -1 if out of range.
     */
    public int getEndPosition(int binIndex) {
        int index = getSequenceIndex(binIndex);
        if (index < 0) {
            return -1;
        }
        int bin = binIndex - mBinOffsets[index];
        int position = (bin+1) * mBinSize;
        position = Math.min(position, mSequenceLengths[index]);
        return position;
    }

    /** Sequence names in genome order. */
    public List<String> getSequenceNames() {
        return mSequenceNames;
    }

    /** Global index of the sequence's first bin, or -1 if unknown. */
    public int getFirstBin(String seqName) {
        return getBinIndex(seqName, 1);
    }

    /** Global index of the sequence's last bin, or -1 if unknown. */
    public int getLastBin(String seqName) {
        // Position zero is interpreted as "last base" by getBinIndex.
        return getBinIndex(seqName, 0);
    }

    /** Total number of bins across all sequences. */
    public int getBinCount() {
        if (mBinOffsets.length == 0) {
            return 0;
        }
        int lastIndex = mBinOffsets.length - 1;
        int count = mBinOffsets[lastIndex];
        count += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize;
        return count;
    }

    /** Number of bins in one sequence, or -1 if the sequence is unknown. */
    public int getBinCount(String seqName) {
        int index = getSequenceIndex(seqName);
        if (index < 0) {
            return -1;
        }
        return ((mSequenceLengths[index] + mBinSize - 1) / mBinSize);
    }

    /** Sequence length in bases, or 0 if the sequence is unknown. */
    public int getSequenceLength(String seqName) {
        int index = getSequenceIndex(seqName);
        if (index < 0) {
            return 0;
        }
        return mSequenceLengths[index];
    }

    /** Ordinal of a sequence name in genome order, or -1. */
    private int getSequenceIndex(String seqName) {
        for (int i = 0; i < mSequenceNames.size(); i++) {
            if (mSequenceNames.get(i).equals(seqName)) {
                return i;
            }
        }
        return -1;
    }

    /** Ordinal of the sequence owning the global bin index, or -1. */
    private int getSequenceIndex(int binIndex) {
        if (binIndex < 0) {
            return -1;
        }
        // Find the first sequence whose bins start after binIndex.
        for (int i = 1; i < mBinOffsets.length; i++) {
            if (mBinOffsets[i] > binIndex) {
                return i-1;
            }
        }
        // Otherwise the index belongs to the last sequence, if in range.
        int lastIndex = mBinOffsets.length-1;
        int lastBinIndex = mBinOffsets[lastIndex];
        lastBinIndex += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize;
        // NOTE(review): lastBinIndex here is one PAST the last valid bin
        // (offset + count), so '<=' accepts an out-of-range index; '<'
        // looks intended — confirm against callers before changing.
        if (binIndex <= lastBinIndex) {
            return lastIndex;
        }
        return -1;
    }
}

View File

@ -1,145 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv.util;
import java.io.*;
import java.util.*;
/**
* Utility class for iterating over fasta files.
* Also maintains an unsigned base index over the file set.
*/
public class SequenceIterator
{
private List<File> mInputFiles = null;
private int mInputFileIndex = 0;
private int mBaseIndex = -1;
private LineNumberReader mCurrentReader = null;
private String mNextSequence = null;
private String mLineBuffer = null;
private int mLineBufferIndex = 0;
public SequenceIterator(File inputFile) {
mInputFiles = new ArrayList<File>();
mInputFiles.add(inputFile);
}
public SequenceIterator(List<File> inputFiles) {
mInputFiles = inputFiles;
}
public void close() {
if (mCurrentReader != null) {
try {
mCurrentReader.close();
} catch (IOException exc) {
throw new RuntimeException("Error closing reader: " + exc.getMessage(),
exc);
}
}
mCurrentReader = null;
mInputFiles = null;
mInputFileIndex = 0;
mBaseIndex = -1;
mNextSequence = null;
mLineBuffer = null;
mLineBufferIndex = 0;
}
public String getNextSequence()
throws IOException {
while (mNextSequence == null) {
if (mLineBuffer != null) {
incrementBaseIndex(mLineBuffer.length() - mLineBufferIndex);
mLineBuffer = null;
mLineBufferIndex = 0;
}
if (mCurrentReader == null) {
mCurrentReader = getNextReader();
if (mCurrentReader == null) {
return null;
}
}
String line = mCurrentReader.readLine();
if (line == null) {
mCurrentReader.close();
mCurrentReader = null;
continue;
}
if (line.startsWith(">")) {
String[] tokens = line.substring(1).trim().split("\\s+");
mNextSequence = tokens[0];
} else {
incrementBaseIndex(line.length());
}
}
String result = mNextSequence;
mNextSequence = null;
return result;
}
public char getNextBase()
throws IOException {
if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) {
if (mCurrentReader == null) {
return 0;
}
if (mNextSequence != null) {
return 0;
}
String line = mCurrentReader.readLine();
if (line == null) {
mLineBuffer = null;
mLineBufferIndex = 0;
mCurrentReader.close();
mCurrentReader = null;
return 0;
}
if (line.startsWith(">")) {
String[] tokens = line.substring(1).trim().split("\\s+");
mNextSequence = tokens[0];
mLineBuffer = null;
mLineBufferIndex = 0;
return 0;
}
mLineBuffer = line.toUpperCase();
mLineBufferIndex = 0;
}
char result = mLineBuffer.charAt(mLineBufferIndex++);
incrementBaseIndex(1);
return result;
}
public int getBaseIndex() {
return mBaseIndex;
}
private LineNumberReader getNextReader()
throws IOException {
if (mInputFileIndex >= mInputFiles.size()) {
return null;
}
File file = mInputFiles.get(mInputFileIndex++);
return new LineNumberReader(new FileReader(file));
}
/**
 * Advances the running base index by the given amount.
 *
 * The index is a 32-bit counter starting at -1 ("no base read yet"); this
 * guards against silent wrap-around past Integer.MAX_VALUE. The previous
 * check only detected overflow after the counter had already wrapped
 * negative, so the first overflow went unnoticed.
 *
 * @param amount Number of bases consumed (assumed non-negative).
 * @throws RuntimeException If the index would exceed 32-bit range.
 */
private void incrementBaseIndex(int amount) {
    // Compute in 64 bits so the comparison itself cannot overflow.
    if ((long) mBaseIndex + (long) amount > (long) Integer.MAX_VALUE) {
        throw new RuntimeException("Base index: 32-bit overflow");
    }
    mBaseIndex += amount;
}
}

View File

@ -1,18 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2007 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.dcp;
/**
 * Lifecycle states for a distributed call message:
 * PENDING until a worker accepts the message,
 * PROCESSING while the accepting worker is executing it.
 * (Completed messages are removed from the queue rather than given a state.)
 */
public enum CallStatus
{
    PENDING,
    PROCESSING
}

View File

@ -1,309 +0,0 @@
/**
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2006 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.dcp;
import java.io.*;
/**
 * Utility class to run system commands synchronously and return the output.
 *
 * The interface supports the typical case where you want to return a modest
 * amount of information from the command's standard output or standard error
 * as a string. The caller can override this behavior, however, and provide
 * alternative output destinations if necessary.
 *
 * If setMergeOutput() is true, then this class will attempt to interleave
 * the standard output and standard error streams of the command into one
 * stream (standard output). This may not produce exactly the same results
 * as having the operating system interleave the output, but works well for
 * simple executables that do not heavily intermix stdout and stderr.
 *
 * A typical invocation is:
 * <pre>
 * CommandRunner runner = new CommandRunner();
 * int status = runner.runCommand("ls");
 * if (status == 0) {
 *     System.out.print(runner.getStandardOutput());
 * }
 * </pre>
 *
 * @author Bob Handsaker
 */
public class CommandRunner {

    private boolean mMergeOutput = false;
    private Writer mStandardOutputDestination = null;
    private Writer mStandardErrorDestination = null;
    private String mStandardOutputString = null;
    private String mStandardErrorString = null;

    /**
     * Default constructor.
     */
    public CommandRunner() {
    }

    /**
     * Get the standard output from the last command as a string.
     *
     * If no command has been run or an explicit output destination
     * was set, then this method returns null.
     */
    public String getStandardOutputString() {
        return mStandardOutputString;
    }

    /**
     * Get the standard error from the last command as a string.
     *
     * If no command has been run or an explicit output destination
     * was set, then this method returns null.
     */
    public String getStandardErrorString() {
        return mStandardErrorString;
    }

    /**
     * If true, the command's standard error stream will be interleaved
     * with the command's standard output stream. The standard error
     * stream destination will not be used.
     */
    public boolean getMergeOutput() {
        return mMergeOutput;
    }

    /**
     * If true, the command's standard error stream will be interleaved
     * with the command's standard output stream.
     */
    public void setMergeOutput(boolean value) {
        mMergeOutput = value;
    }

    /**
     * The destination for the command's standard output stream.
     * If null, the standard output will be captured in a string.
     */
    public Writer getStandardOutputDestination() {
        return mStandardOutputDestination;
    }

    /**
     * The destination for the command's standard output stream.
     * If set to null, the standard output will be captured in a string.
     */
    public void setStandardOutputDestination(Writer writer) {
        mStandardOutputDestination = writer;
    }

    /**
     * The destination for the command's standard error stream.
     * If null, the standard error will be captured in a string.
     */
    public Writer getStandardErrorDestination() {
        return mStandardErrorDestination;
    }

    /**
     * The destination for the command's standard error stream.
     * If set to null, the standard error will be captured in a string.
     */
    public void setStandardErrorDestination(Writer writer) {
        mStandardErrorDestination = writer;
    }

    /**
     * Run a command string as a system command.
     *
     * Returns the exit status of the command.
     *
     * When this method is called, the standard output string
     * and standard error string are updated if no alternative output
     * destinations have been set.
     *
     * This method throws a RuntimeException if running the command fails
     * (for example, if there are not enough system resources to spawn
     * the process).
     *
     * @param command The command string to run.
     * @return Command exit status.
     * @throws RuntimeException If command execution fails.
     */
    public int runCommand(String command)
        throws RuntimeException {
        return runCommand(command.split(" "), null, null);
    }

    /**
     * Run a command string as a system command.
     *
     * Returns the exit status of the command.
     *
     * When this method is called, the standard output string
     * and standard error string are updated if no alternative output
     * destinations have been set.
     *
     * This method throws a RuntimeException if running the command fails
     * (for example, if there are not enough system resources to spawn
     * the process).
     *
     * @param command The command string to run.
     * @param environment The command environment (or null to inherit).
     * @param workingDirectory The working directory (or null to inherit).
     * @return Command exit status.
     * @throws RuntimeException If command execution fails.
     */
    public int runCommand(String command, String[] environment, File workingDirectory)
        throws RuntimeException {
        return runCommand(command.split(" "), environment, workingDirectory);
    }

    /**
     * Run a command string as a system command.
     *
     * Returns the exit status of the command.
     *
     * When this method is called, the standard output string
     * and standard error string are updated if no alternative output
     * destinations have been set.
     *
     * This method throws a RuntimeException if running the command fails
     * (for example, if there are not enough system resources to spawn
     * the process).
     *
     * @param command The command to run (as a array of arguments).
     * @param environment The command environment (or null to inherit).
     * @param workingDirectory The working directory (or null to inherit).
     * @return Command exit status.
     * @throws RuntimeException If command execution fails.
     */
    public int runCommand(String[] command, String[] environment, File workingDirectory)
        throws RuntimeException {
        Writer stdout = mStandardOutputDestination;
        Writer stderr = mStandardErrorDestination;
        if (stdout == null) {
            stdout = new StringWriter();
        }
        if (mMergeOutput) {
            stderr = stdout;
        } else if (stderr == null) {
            stderr = new StringWriter();
        }
        mStandardOutputString = null;
        mStandardErrorString = null;
        int commandStatus = 0;
        try {
            Process process =
                Runtime.getRuntime().exec(command, environment, workingDirectory);
            // Drain stdout/stderr asynchronously so the child cannot block
            // on a full pipe before waitFor() returns.
            StreamHandler stdoutHandler =
                new StreamHandler(process.getInputStream(), stdout);
            StreamHandler stderrHandler =
                new StreamHandler(process.getErrorStream(), stderr);
            commandStatus = process.waitFor();
            // Wait for the streams to drain.
            stdoutHandler.join();
            stderrHandler.join();
        } catch (InterruptedException exc) {
            // Restore the interrupt status before reporting failure so the
            // caller's thread still sees the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException("Command execution failed: " +
                                       exc.getMessage(),
                                       exc);
        } catch (Exception exc) {
            throw new RuntimeException("Command execution failed: " +
                                       exc.getMessage(),
                                       exc);
        }
        if (mStandardOutputDestination == null) {
            mStandardOutputString = stdout.toString();
        }
        if (mStandardErrorDestination == null && !mMergeOutput) {
            mStandardErrorString = stderr.toString();
        }
        return commandStatus;
    }

    /**
     * Internal class to asynchronously read from the standard output
     * and standard error streams of the command being executed.
     *
     * If you do not handle command output asynchronously, then execution
     * of a command may block in some environments if the program produces
     * too much output. In this case, the call to run the process will
     * never complete.
     */
    private static class StreamHandler extends Thread {

        private final InputStream mInput;
        private final Writer mOutput;

        /**
         * Constructor.
         * Create an instance of this class, which is an asynchronous
         * thread that will consume input from the given input stream
         * and send the output to the given output destination.
         *
         * @param input The input stream to read.
         * @param output The output destination.
         */
        StreamHandler(InputStream input, Writer output) {
            mInput = input;
            mOutput = output;
            start();
        }

        /**
         * Standard thread run method.
         * Pipe input from the input source to the output destination
         * until there is no more input left.
         *
         * If an IOException occurs, the thread will make sure all
         * available output has been flushed to the destination and
         * then terminate. The IOException is not propagated.
         */
        public void run() {
            char[] buffer = new char[4096];
            Reader reader =
                new InputStreamReader(new BufferedInputStream(mInput));
            Writer writer = mOutput;
            try {
                while (true) {
                    int count = reader.read(buffer);
                    if (count <= 0) {
                        break;
                    }
                    if (writer != null) {
                        // Serialize writes: stderr may share this writer
                        // with stdout when output is merged.
                        synchronized (writer) {
                            writer.write(buffer, 0, count);
                        }
                    }
                }
            } catch (IOException ignore) {
                // Ignore IO exceptions
            } finally {
                try {
                    reader.close();
                } catch (Exception ignore) {
                }
                try {
                    mOutput.flush();
                } catch (Exception ignore) {
                }
            }
        }
    }
}

View File

@ -1,618 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2007 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.dcp;
import edu.mit.broad.dcp.message.*;
import java.io.*;
import java.util.*;
import java.lang.reflect.Method;
import java.net.InetAddress;
import java.net.ServerSocket;
import java.rmi.registry.*;
/**
 * Experimental.
 *
 * Base class for algorithms that distribute work across multiple processes.
 * The master process registers a DistributedCallServer in an RMI registry,
 * launches workers (via LSF bsub, or a single in-process worker when the
 * worker count is zero), and workers poll the server for queued method
 * calls which are dispatched reflectively against the algorithm instance.
 */
public abstract class DistributedAlgorithm
    implements Serializable
{
    /** Receiver worker id meaning "any available worker". */
    public static final Integer ANY = 0;
    /** Worker/process id of the master process. */
    public static final Integer MASTER = 1;

    public DistributedAlgorithm() {
    }

    public String getServerHost() {
        return mServerHost;
    }

    public void setServerHost(String value) {
        mServerHost = value;
    }

    public int getServerPort() {
        return mServerPort;
    }

    public void setServerPort(int value) {
        mServerPort = value;
    }

    /** Returns the configured algorithm name, defaulting to the simple class name. */
    public String getAlgorithmName() {
        if (mAlgorithmName != null) {
            return mAlgorithmName;
        } else {
            return getClassName();
        }
    }

    public void setAlgorithmName(String value) {
        mAlgorithmName = value;
    }

    public int getMaximumWorkerCount() {
        return mMaximumWorkerCount;
    }

    public void setMaximumWorkerCount(int value) {
        mMaximumWorkerCount = value;
    }

    /**
     * Name of LSF queue to use for workers.
     */
    public String getLsfQueue() {
        return mLsfQueue;
    }

    public void setLsfQueue(String value) {
        mLsfQueue = value;
    }

    /**
     * Directory to hold lsf log files.
     */
    public String getLsfLogDirectory() {
        return mLsfLogDirectory;
    }

    public void setLsfLogDirectory(String value) {
        mLsfLogDirectory = value;
    }

    public boolean getEnableGcLogging() {
        return mEnableGcLogging;
    }

    public void setEnableGcLogging(boolean value) {
        mEnableGcLogging = value;
    }

    public Integer getWorkerId() {
        return mWorkerId;
    }

    public Integer getProcessId() {
        return mProcessId;
    }

    /** Subclass initialization hook; runs in both master and worker processes. */
    protected void init()
        throws Exception {
    }

    /** Subclass entry point; invoked only in the master process. */
    protected abstract void start()
        throws Exception;

    /**
     * Runs the algorithm as the master process: starts the RMI call server,
     * launches workers, invokes start(), then waits for the call queue to drain.
     *
     * @throws IllegalStateException If the algorithm is already running.
     */
    public void run()
        throws Exception {
        if (mIsRunning) {
            throw new IllegalStateException("Algorithm is already running");
        }
        mIsRunning = true;
        mWorkerId = MASTER;
        mProcessId = MASTER;
        try {
            startDistributedServer();
            init();
            startWorkerThread();
            startWorkers();
            start();
            waitForCompletion();
        } finally {
            // TBD: More cleanup (shutdown threads, etc.)
            stopDistributedServer();
            mIsRunning = false;
        }
    }

    /**
     * Runs the algorithm as a worker process: connects back to the master's
     * call server and processes messages until the server disconnects.
     */
    void runWorker(int workerId, int processId)
        throws Exception {
        if (mIsRunning) {
            throw new IllegalStateException("Algorithm is already running");
        }
        mIsRunning = true;
        mWorkerId = workerId;
        mProcessId = processId;
        try {
            if (openDistributedServer() == null) {
                report("Server " + mServerHost + ":" + mServerPort + " not responding");
                return;
            }
            init();
            startWorkerThread();
            mWorkerThread.join();
        } finally {
            closeDistributedServer();
            mIsRunning = false;
        }
    }

    /**
     * Launches the configured number of workers via LSF, or a single
     * in-process worker when the maximum worker count is zero or negative.
     */
    private void startWorkers() {
        int workerCount = getMaximumWorkerCount();
        if (workerCount <= 0) {
            // Use single process execution for testing/debugging.
            new InProcessWorker().start();
            return;
        }
        if (workerCount > 1000) {
            throw new RuntimeException("Excessive worker count: " + workerCount);
        }
        for (int i = 0; i < workerCount; i++) {
            Integer workerId = (MASTER + i + 1);
            Integer processId = workerId; // for now
            startWorker(workerId, processId);
        }
    }

    /**
     * Creates a local RMI registry on a free port and binds the call server
     * into it, recording the server host/port for workers.
     */
    private void startDistributedServer() {
        try {
            // Create a server socket to allocate a unique port.
            // There is a window of vulnerability where the port
            // can get reused, but in practice this works ok.
            String serverHost = getCurrentHost();
            ServerSocket socket = new ServerSocket(0);
            int serverPort = socket.getLocalPort();
            socket.close();
            Registry registry = LocateRegistry.createRegistry(serverPort);
            DistributedCallServer server = new DistributedCallServer();
            server.setAlgorithm(this);
            registry.bind("DistributedCallService", server);
            mServerHost = serverHost;
            mServerPort = serverPort;
            mDistributedCallServer = server;
            mDistributedCallService = server;
        } catch (Exception exc) {
            throw wrapException(exc);
        }
    }

    /** Unbinds and stops the call server started by startDistributedServer(). */
    private void stopDistributedServer() {
        if (mDistributedCallServer != null) {
            try {
                Registry registry = LocateRegistry.getRegistry(mServerPort);
                registry.unbind("DistributedCallService");
                mDistributedCallServer.stop();
            } catch (Exception exc) {
                throw wrapException(exc);
            }
        }
        mDistributedCallService = null;
        mDistributedCallServer = null;
    }

    /**
     * Looks up the master's call server over RMI.
     * Returns null (without throwing) if the server is no longer bound.
     */
    private DistributedCallService openDistributedServer() {
        mDistributedCallService = null;
        try {
            String url = "rmi://" + getServerHost() + ":" + getServerPort() + "/DistributedCallService";
            DistributedCallService server =
                (DistributedCallService) java.rmi.Naming.lookup(url);
            mDistributedCallService = server;
        } catch (java.rmi.NotBoundException exc) {
            // Server has exited
        } catch (Exception exc) {
            throw wrapException(exc);
        }
        return mDistributedCallService;
    }

    private void closeDistributedServer() {
        mDistributedCallService = null;
    }

    /**
     * Submits one worker process to LSF via bsub, propagating the JVM heap
     * setting, optional GC logging, and a per-worker log file through the
     * environment.
     */
    private void startWorker(Integer workerId, Integer processId) {
        String logFile = "worker_" + processId + "_%J.bsub";
        if (mLsfLogDirectory != null) {
            logFile = mLsfLogDirectory + "/" + logFile;
        }
        List<String> command = new ArrayList<String>();
        command.add("bsub");
        command.add("-o");
        command.add(logFile);
        if (mLsfQueue != null) {
            command.add("-q");
            command.add(mLsfQueue);
        }
        command.add("runDistributedWorker");
        command.add("-serverHost");
        command.add(getServerHost());
        command.add("-serverPort");
        command.add(Integer.toString(getServerPort()));
        command.add("-workerId");
        command.add(Integer.toString(workerId));
        command.add("-processId");
        command.add(Integer.toString(processId));
        // Pass our -Xmx setting along to all workers.
        Map<String, String> environment =
            new LinkedHashMap<String, String>(System.getenv());
        long maxMemory = Runtime.getRuntime().maxMemory();
        long maxKbytes = maxMemory / 1024;
        String memJavaOpt = "-Xmx" + maxKbytes + "K";
        // Enable GC logging if requested
        String gcJavaOpt = null;
        if (mEnableGcLogging) {
            String gcLogFile = "worker_" + processId + ".gc.log";
            if (mLsfLogDirectory != null) {
                gcLogFile = mLsfLogDirectory + "/" + gcLogFile;
            }
            gcJavaOpt = "-Xloggc:" + gcLogFile;
        }
        // Note: an existing JAVAOPTS setting takes precedence; the memory
        // and GC options are only applied when JAVAOPTS is unset.
        String javaOpts = environment.get("JAVAOPTS");
        if (javaOpts == null) {
            javaOpts = memJavaOpt;
            if (gcJavaOpt != null) {
                javaOpts = javaOpts + " " + gcJavaOpt;
            }
            environment.put("JAVAOPTS", javaOpts);
        }
        // Log output ourselves (rather than waiting for bsub).
        String workerLogFile = "worker_" + processId + ".log";
        if (mLsfLogDirectory != null) {
            workerLogFile = mLsfLogDirectory + "/" + workerLogFile;
        }
        environment.put("DA_LOG_FILE", workerLogFile);
        CommandRunner runner = new CommandRunner();
        Writer output = new LsfOutputFilter();
        runner.setStandardOutputDestination(output);
        runner.setStandardErrorDestination(output);
        String[] commandArray = command.toArray(new String[command.size()]);
        String[] environmentArray = createEnvironmentArray(environment);
        int status = runner.runCommand(commandArray, environmentArray, null);
        if (status != 0) {
            throw new RuntimeException("Error starting worker: " + status);
        }
    }

    /** Flattens an environment map into the "KEY=value" array form used by exec. */
    private String[] createEnvironmentArray(Map<String, String> map) {
        if (map == null) {
            return null;
        }
        int index = 0;
        String[] array = new String[map.size()];
        for (Map.Entry<String, String> entry : map.entrySet()) {
            array[index++] = entry.getKey() + "=" + entry.getValue();
        }
        return array;
    }

    private String getCurrentHost() {
        try {
            return InetAddress.getLocalHost().getCanonicalHostName();
        } catch (Exception exc) {
            throw wrapException(exc);
        }
    }

    /** Polls (once per second) until the call server's message queue is empty. */
    private void waitForCompletion() {
        DistributedCallServer server = mDistributedCallServer;
        while (true) {
            if (server.isQueueEmpty()) {
                break;
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException exc) {
                // ignore
            }
        }
    }

    /** Queues a call to the named method for any available worker. */
    protected void callDistributed(String methodName, Object... methodArgs) {
        callDistributed(null, methodName, methodArgs);
    }

    /**
     * Queues a call to the named method for the given worker
     * (null or ANY means any available worker).
     */
    protected void callDistributed(Integer workerId, String methodName, Object... methodArgs) {
        if (workerId == null) {
            workerId = ANY;
        }
        try {
            DistributedCallMessage message = new DistributedCallMessage();
            message.setSenderWorkerId(getWorkerId());
            message.setSenderProcessId(getProcessId());
            message.setReceiverWorkerId(workerId);
            message.setMethodName(methodName);
            message.setMethodArgs(methodArgs);
            mDistributedCallService.writeMessage(message);
        } catch (Throwable exc) {
            throw wrapException(exc);
        }
    }

    /** Reflectively invokes the named method on this instance. */
    private void callMethod(String methodName, Object[] methodArgs) {
        try {
            Object target = this;
            Class<?> targetClass = target.getClass();
            Method targetMethod = findMethod(targetClass, methodName);
            if (targetMethod == null) {
                throw new RuntimeException("Cannot find target method: " + methodName);
            }
            targetMethod.invoke(target, methodArgs);
        } catch (Throwable exc) {
            throw wrapException(exc);
        }
    }

    /**
     * Finds the uniquely named declared method, or null if absent.
     * Dispatch is by name only, so overloads are rejected as ambiguous.
     */
    private Method findMethod(Class<?> clazz, String methodName) throws Exception {
        Method result = null;
        Method[] methods = clazz.getDeclaredMethods();
        for (int i = 0; i < methods.length; i++) {
            if (methods[i].getName().equals(methodName)) {
                if (result != null) {
                    throw new RuntimeException("Duplicate method name: " + methodName);
                }
                result = methods[i];
            }
        }
        return result;
    }

    /** Returns the throwable as a RuntimeException, wrapping (with cause) if needed. */
    private RuntimeException wrapException(Throwable exception) {
        if (exception instanceof RuntimeException) {
            return (RuntimeException) exception;
        } else {
            return new RuntimeException(exception.getMessage(), exception);
        }
    }

    private void startWorkerThread() {
        if (mWorkerThread != null) {
            throw new IllegalStateException("WorkerThread is running");
        }
        mWorkerThread = new WorkerThread();
        mWorkerThread.start();
    }

    private void stopWorkerThread() {
        if (mWorkerThread == null) {
            // Fixed: the old message wrongly said "WorkerThread is running".
            throw new IllegalStateException("WorkerThread is not running");
        }
        mWorkerThread.stopThread();
    }

    /**
     * Background thread that polls the call server for messages addressed
     * to this worker and executes them.
     */
    private class WorkerThread extends Thread {

        WorkerThread() {
            setDaemon(true);
        }

        public void run() {
            try {
                DistributedCallService service = mDistributedCallService;
                while (true) {
                    if (isInterrupted()) {
                        System.out.println("#DBG: Worker isInterrupted");
                        throw new InterruptedException();
                    }
                    DistributedCallMessage message =
                        service.acceptMessage(getWorkerId(), getProcessId());
                    if (message == null) {
                        Thread.sleep(1000);
                    } else {
                        processMessage(message);
                    }
                }
            } catch (InterruptedException exc) {
                // Interruption terminates this thread.
                // System.out.println("#DBG: Worker caught InterruptedException");
            } catch (Throwable exc) {
                if (isDisconnectException(exc)) {
                    report("Server disconnected");
                } else {
                    reportError("Exception in WorkerThread: " + exc.getMessage(), exc);
                    System.exit(1);
                }
            }
            report("WorkerThread terminated");
        }

        void stopThread() {
            // System.out.println("#DBG: About to interrupt worker...");
            interrupt();
            // System.out.println("#DBG: Joining worker...");
            try {
                join();
            } catch (InterruptedException exc) {
                // ignore
            }
        }

        /** Recognizes RMI failures that indicate a normal server shutdown. */
        private boolean isDisconnectException(Throwable exc) {
            if (exc instanceof java.rmi.ConnectException) {
                return true;
            } else if (exc instanceof java.rmi.NoSuchObjectException) {
                return true;
            } else if (exc instanceof java.rmi.UnmarshalException &&
                       exc.getCause() != null &&
                       exc.getCause() instanceof EOFException) {
                return true;
            } else {
                return false;
            }
        }
    }

    /** Validates and dispatches one accepted message, always completing it. */
    private void processMessage(DistributedCallMessage message) {
        try {
            Integer workerId = message.getReceiverWorkerId();
            if (workerId == null || !workerId.equals(getWorkerId())) {
                reportError("Invalid worker ID in message: " + message);
                return;
            }
            callMethod(message.getMethodName(), message.getMethodArgs());
        } catch (Throwable exc) {
            reportError("Exception running message: " + message, exc);
        } finally {
            completeMessage(message);
        }
    }

    /** Acknowledges a processed message so the server removes it from the queue. */
    private void completeMessage(DistributedCallMessage message) {
        try {
            DistributedCallService service = mDistributedCallService;
            service.completeMessage(getWorkerId(), getProcessId(), message.getCallId());
        } catch (Throwable exc) {
            reportError("Exception completing message: " + message, exc);
        }
    }

    /** Logs an informational message tagged with this worker's identity. */
    protected void report(String message) {
        String identity =
            getAlgorithmName() + " " +
            getWorkerId() + "/" + getProcessId();
        System.out.println("# " + identity + " : " + message);
    }

    protected void reportError(String message) {
        reportError(message, null);
    }

    /** Logs an error (and optional stack trace) tagged with this worker's identity. */
    protected void reportError(String message, Throwable exception) {
        String identity =
            getAlgorithmName() + " " +
            getWorkerId() + "/" + getProcessId();
        System.out.println("Error" +
                           " [" + identity + "]" +
                           ": " + message);
        if (exception != null) {
            System.out.println(" with exception: " + exception.getMessage());
            exception.printStackTrace(System.out);
        }
    }

    /** Returns the unqualified (simple) name of the concrete class. */
    private String getClassName() {
        String name = getClass().getName();
        return name.substring(name.lastIndexOf('.')+1);
    }

    public String toString() {
        StringBuilder builder = new StringBuilder();
        builder.append("DistributedAlgorithm");
        builder.append("(");
        builder.append("" + getAlgorithmName());
        builder.append(",");
        builder.append("" + getWorkerId());
        builder.append(",");
        builder.append("" + getProcessId());
        builder.append(",");
        builder.append("" + getMaximumWorkerCount());
        builder.append(",");
        builder.append("" + getLsfQueue());
        builder.append(",");
        builder.append("" + mIsRunning);
        builder.append(")");
        return builder.toString();
    }

    // This class is used only during in-process execution/testing/debugging.
    private class InProcessWorker extends Thread {

        InProcessWorker() {
            setDaemon(true);
        }

        public void run() {
            report("InProcessWorker starting");
            try {
                String serverAddress = getServerHost() + ":" + getServerPort();
                String url = "rmi://" + serverAddress + "/DistributedCallService";
                DistributedCallService server =
                    (DistributedCallService) java.rmi.Naming.lookup(url);
                DistributedAlgorithm algorithm = server.getAlgorithm();
                algorithm.setServerHost(getServerHost());
                algorithm.setServerPort(getServerPort());
                algorithm.runWorker(2, 1);
            } catch (Throwable exc) {
                reportError("Exception in InProcessWorker: " + exc.getMessage(), exc);
                System.exit(1);
            }
            report("InProcessWorker terminated");
        }
    }

    /**
     * Writer that prefixes each output line with "# " so bsub output is
     * distinguishable in the master's log.
     */
    private static class LsfOutputFilter
        extends FilterWriter {

        private boolean mAtLineStart = true;

        LsfOutputFilter() {
            super(new PrintWriter(System.out, true));
        }

        public void write(int ch)
            throws IOException {
            if (mAtLineStart) {
                out.write("# ");
                mAtLineStart = false;
            }
            out.write(ch);
            mAtLineStart = (ch == '\n');
        }

        public void write(String s, int off, int len)
            throws IOException {
            write(s.toCharArray(), off, len);
        }

        public void write(char[] a, int off, int len)
            throws IOException {
            for (int i = 0; i < len; i++) {
                write(a[off+i]);
            }
        }
    }

    private transient int mMaximumWorkerCount = 0;
    private transient String mLsfQueue = null;
    private transient String mLsfLogDirectory = null;
    private transient boolean mEnableGcLogging = false;
    private transient boolean mIsRunning = false;
    private transient int mWorkerId = 0;
    private transient int mProcessId = 0;
    private transient WorkerThread mWorkerThread = null;
    private transient String mAlgorithmName = null;
    private transient String mServerHost = null;
    private transient int mServerPort = 0;
    private transient DistributedCallService mDistributedCallService = null;
    private transient DistributedCallServer mDistributedCallServer = null;
}

View File

@ -1,134 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2007 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.dcp;
import java.util.*;
/**
 * Command line driver for distributed worker invocation.
 *
 * Connects to the master's DistributedCallService over RMI, retrieves the
 * algorithm instance, and runs it as a worker with the given ids.
 */
public class DistributedAlgorithmWorker
{
    /** Program entry point: parse arguments and run one worker. */
    public static void main(String[] args)
        throws Exception {
        new DistributedAlgorithmWorker().run(args);
    }

    private void run(String[] args)
        throws Exception {
        if (!parseArguments(args)) {
            System.exit(1);
        }
        System.out.println("# DistributedAlgorithmWorker");
        System.out.println("# Started at " + new Date());
        runDistributedWorker();
        System.out.println("# Ended at " + new Date());
    }

    /**
     * Parses command line options into fields.
     *
     * @return false (after printing usage) on an unrecognized option or
     *         leftover positional arguments, true otherwise.
     */
    private boolean parseArguments(String[] args) {
        int argpos = 0;
        int argsleft = 0;
        while (argpos < args.length) {
            argsleft = args.length - argpos;
            String arg = args[argpos];
            if (arg.equals("-serverHost") && argsleft > 1) {
                argpos++;
                mServerHost = args[argpos++];
            } else if (arg.equals("-serverPort") && argsleft > 1) {
                argpos++;
                mServerPort = Integer.parseInt(args[argpos++]);
            } else if (arg.equals("-workerId") && argsleft > 1) {
                argpos++;
                // Integer.valueOf replaces the deprecated new Integer(String).
                mWorkerId = Integer.valueOf(args[argpos++]);
            } else if (arg.equals("-processId") && argsleft > 1) {
                argpos++;
                mProcessId = Integer.valueOf(args[argpos++]);
            } else if (arg.equals("-debug")) {
                argpos++;
                mDebug = true;
            } else if (arg.equals("-verbose")) {
                argpos++;
                mVerbose = true;
            } else if (arg.startsWith("-")) {
                usage();
                return false;
            } else {
                break;
            }
        }
        argsleft = args.length - argpos;
        if (argsleft != 0) {
            usage();
            return false;
        }
        return true;
    }

    /** Prints command line usage (previously named the wrong class). */
    private void usage() {
        System.out.println("Usage: DistributedAlgorithmWorker ...");
        System.out.println("  -serverHost <hostname>");
        System.out.println("  -serverPort <port>");
        System.out.println("  -workerId <id>");
        System.out.println("  -processId <id>");
        System.out.println("  -verbose");
        System.out.println("  -debug");
    }

    /**
     * Looks up the master's call service over RMI and runs the retrieved
     * algorithm as a worker. Reports and returns (rather than failing) if
     * the server is not responding.
     */
    private void runDistributedWorker()
        throws Exception {
        DistributedAlgorithm algorithm = null;
        String serverAddress = getServerHost() + ":" + getServerPort();
        try {
            String url = "rmi://" + serverAddress + "/DistributedCallService";
            DistributedCallService server =
                (DistributedCallService) java.rmi.Naming.lookup(url);
            algorithm = server.getAlgorithm();
        } catch (java.rmi.ConnectException exc) {
            System.out.println("# Server " + serverAddress + " not responding.");
            return;
        }
        algorithm.setServerHost(getServerHost());
        algorithm.setServerPort(getServerPort());
        algorithm.runWorker(getWorkerId(), getProcessId());
    }

    private Integer getWorkerId() {
        return mWorkerId;
    }

    private Integer getProcessId() {
        return mProcessId;
    }

    private String getServerHost() {
        return mServerHost;
    }

    private int getServerPort() {
        return mServerPort;
    }

    private boolean mDebug = false;
    private boolean mVerbose = false;
    private String mServerHost = null;
    private int mServerPort = 0;
    private Integer mWorkerId = null;
    private Integer mProcessId = null;
}

View File

@ -1,133 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.dcp;
import edu.mit.broad.dcp.message.*;
import java.rmi.server.UnicastRemoteObject;
import java.util.*;
/**
 * RMI server that holds the queue of pending distributed call messages.
 *
 * Workers poll acceptMessage() for work and acknowledge with
 * completeMessage(); the master polls isQueueEmpty() to detect completion.
 * All queue access is synchronized on the queue itself.
 */
public class DistributedCallServer
    extends UnicastRemoteObject
    implements DistributedCallService
{
    public DistributedCallServer()
        throws java.rmi.RemoteException {
    }

    // Sets the algorithm instance handed out to workers.
    public void setAlgorithm(DistributedAlgorithm algorithm) {
        mAlgorithm = algorithm;
    }

    public DistributedAlgorithm getAlgorithm() {
        return mAlgorithm;
    }

    /**
     * Enqueues a call message (marking it PENDING and assigning a call id)
     * and returns the new call id. Messages for "any worker" (receiver id 0)
     * go to the back of the queue; messages addressed to a specific worker
     * jump to the front.
     */
    public long writeMessage(DistributedCallMessage message) {
        message.setCallStatus(CallStatus.PENDING);
        message.setCallId(generateCallId());
        if (message.getReceiverWorkerId().equals(0)) {
            synchronized (mMessageQueue) {
                mMessageQueue.addLast(message);
            }
        } else {
            synchronized (mMessageQueue) {
                mMessageQueue.addFirst(message);
            }
        }
        return message.getCallId();
    }

    /**
     * Hands the next eligible PENDING message to the calling worker, marking
     * it PROCESSING and stamping the receiver ids, or returns null when no
     * work is available. Broadcast messages (receiver id 0) are never given
     * to the master (worker id 1).
     */
    public DistributedCallMessage acceptMessage(int workerId, int processId) {
        if (workerId <= 0) {
            throw new IllegalArgumentException("Invalid worker ID: " + workerId);
        }
        if (processId <= 0) {
            throw new IllegalArgumentException("Invalid process ID: " + processId);
        }
        synchronized (mMessageQueue) {
            Iterator<DistributedCallMessage> iterator = mMessageQueue.iterator();
            while (iterator.hasNext()) {
                DistributedCallMessage message = iterator.next();
                if (message.getCallStatus() != CallStatus.PENDING) {
                    continue;
                }
                int receiverId = message.getReceiverWorkerId();
                if (receiverId == workerId ||
                    (receiverId == 0 && workerId > 1)) {
                    message.setCallStatus(CallStatus.PROCESSING);
                    message.setReceiverWorkerId(workerId);
                    message.setReceiverProcessId(processId);
                    return message;
                }
            }
        }
        return null;
    }

    /**
     * Acknowledges completion of a call, removing it from the queue.
     * Validates that the call exists, is PROCESSING, and was assigned to
     * the calling worker/process; throws otherwise.
     */
    public void completeMessage(int workerId, int processId, long callId) {
        if (workerId <= 0) {
            throw new IllegalArgumentException("Invalid worker ID: " + workerId);
        }
        if (processId <= 0) {
            throw new IllegalArgumentException("Invalid process ID: " + processId);
        }
        if (callId <= 0) {
            throw new IllegalArgumentException("Invalid call ID: " + callId);
        }
        synchronized (mMessageQueue) {
            Iterator<DistributedCallMessage> iterator = mMessageQueue.iterator();
            while (iterator.hasNext()) {
                DistributedCallMessage message = iterator.next();
                if (message.getCallId().longValue() == callId) {
                    if (message.getCallStatus() != CallStatus.PROCESSING) {
                        throw new IllegalStateException("Call #" + callId + " not in state PROCESSING");
                    }
                    if (!message.getReceiverWorkerId().equals(workerId)) {
                        throw new IllegalStateException("Call #" + callId + " assigned to worker " + message.getReceiverWorkerId() + " not worker " + workerId);
                    }
                    if (!message.getReceiverProcessId().equals(processId)) {
                        throw new IllegalStateException("Call #" + callId + " assigned to process " + message.getReceiverProcessId() + " not process " + processId);
                    }
                    iterator.remove();
                    return;
                }
            }
        }
        throw new IllegalArgumentException("Unrecognized call ID " + callId);
    }

    // True when no messages remain (pending or processing).
    public boolean isQueueEmpty() {
        synchronized (mMessageQueue) {
            return mMessageQueue.isEmpty();
        }
    }

    // Unexports this object from the RMI runtime, letting in-flight calls finish.
    public void stop() {
        try {
            UnicastRemoteObject.unexportObject(this, false);
        } catch (java.rmi.NoSuchObjectException exc) {
            throw new RuntimeException("Exception unexporting object: " + exc.getMessage(),
                                       exc);
        }
    }

    // Call ids are unique, positive, and monotonically increasing.
    private synchronized long generateCallId() {
        return ++mCallIdGenerator;
    }

    private long mCallIdGenerator = 0;
    private DistributedAlgorithm mAlgorithm = null;
    private LinkedList<DistributedCallMessage> mMessageQueue =
        new LinkedList<DistributedCallMessage>();
}

View File

@ -1,25 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.dcp;
import edu.mit.broad.dcp.message.*;
/**
 * Remote (RMI) interface to the distributed call message queue served by
 * the master process.
 */
public interface DistributedCallService
    extends java.rmi.Remote
{
    /** Returns the algorithm instance the worker should run. */
    public DistributedAlgorithm getAlgorithm()
        throws java.rmi.RemoteException;

    /** Enqueues a call message; returns the assigned call id. */
    public long writeMessage(DistributedCallMessage message)
        throws java.rmi.RemoteException;

    /** Returns the next message for this worker, or null when none is available. */
    public DistributedCallMessage acceptMessage(int workerId, int processId)
        throws java.rmi.RemoteException;

    /** Acknowledges completion of an accepted call, removing it from the queue. */
    public void completeMessage(int workerId, int processId, long callId)
        throws java.rmi.RemoteException;
}

View File

@ -1,90 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2007 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.dcp.message;
import edu.mit.broad.dcp.CallStatus;
/**
 * Message describing a single distributed method call: the routing fields
 * inherited from DistributedMessage plus the call id, call status, target
 * method name, and method arguments.  Fields are public (mirroring the
 * base class); bean-style accessors are provided as well.
 */
public class DistributedCallMessage
    extends DistributedMessage
{
    public Long mCallId;
    public CallStatus mCallStatus;
    public String mMethodName;
    public Object[] mMethodArgs;

    public DistributedCallMessage() {
    }

    /** Unique identifier of this call (assigned by the queue). */
    public Long getCallId() { return mCallId; }

    public void setCallId(Long value) { mCallId = value; }

    /** Current lifecycle state of the call. */
    public CallStatus getCallStatus() { return mCallStatus; }

    public void setCallStatus(CallStatus value) { mCallStatus = value; }

    /** Name of the method to invoke on the algorithm. */
    public String getMethodName() { return mMethodName; }

    public void setMethodName(String value) { mMethodName = value; }

    /** Arguments to pass to the method (may be null). */
    public Object[] getMethodArgs() { return mMethodArgs; }

    public void setMethodArgs(Object[] value) { mMethodArgs = value; }

    /**
     * Renders all fields as
     * DistributedCallMessage(sw,sp,rw,rp,callId,status,method,args)
     * where args is either "null" or a bracketed, comma-separated list.
     */
    public String toString() {
        StringBuilder sb = new StringBuilder("DistributedCallMessage");
        sb.append("(");
        sb.append(getSenderWorkerId()).append(",");
        sb.append(getSenderProcessId()).append(",");
        sb.append(getReceiverWorkerId()).append(",");
        sb.append(getReceiverProcessId()).append(",");
        sb.append(mCallId).append(",");
        sb.append(mCallStatus).append(",");
        sb.append(mMethodName).append(",");
        if (mMethodArgs == null) {
            // StringBuilder.append(Object) renders null as "null",
            // matching the original string concatenation.
            sb.append(mMethodArgs);
        } else {
            sb.append("[");
            for (int i = 0; i < mMethodArgs.length; i++) {
                if (i > 0) {
                    sb.append(",");
                }
                sb.append(mMethodArgs[i]);
            }
            sb.append("]");
        }
        sb.append(")");
        return sb.toString();
    }
}

View File

@ -1,54 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2007 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.dcp.message;
/**
 * Base class for messages exchanged in the distributed call system.
 * Carries only routing information: sender and receiver, each identified
 * by a worker id and a process id.  Fields are public; bean-style
 * accessors are provided as well.  All fields start out null.
 */
public class DistributedMessage
{
    public Integer mSenderWorkerId;
    public Integer mSenderProcessId;
    public Integer mReceiverWorkerId;
    public Integer mReceiverProcessId;

    public DistributedMessage() {
    }

    /** Worker id of the message's originator. */
    public Integer getSenderWorkerId() { return mSenderWorkerId; }

    public void setSenderWorkerId(Integer value) { mSenderWorkerId = value; }

    /** Process id of the message's originator. */
    public Integer getSenderProcessId() { return mSenderProcessId; }

    public void setSenderProcessId(Integer value) { mSenderProcessId = value; }

    /** Worker id of the intended recipient. */
    public Integer getReceiverWorkerId() { return mReceiverWorkerId; }

    public void setReceiverWorkerId(Integer value) { mReceiverWorkerId = value; }

    /** Process id of the intended recipient. */
    public Integer getReceiverProcessId() { return mReceiverProcessId; }

    public void setReceiverProcessId(Integer value) { mReceiverProcessId = value; }
}

View File

@ -1,27 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard;
/**
* Basic Picard runtime exception that, for now, does nothing much
*
* @author Kathleen Tibbetts
*/
/**
 * Basic Picard runtime exception.  Adds no behavior of its own; it exists
 * so Picard code can throw and catch a library-specific unchecked type.
 */
public class PicardException extends RuntimeException {

    /** @param message the detail message */
    public PicardException(String message) {
        super(message);
    }

    /**
     * @param message   the detail message
     * @param throwable the underlying cause, preserved for stack traces
     */
    public PicardException(String message, Throwable throwable) {
        super(message, throwable);
    }
}

View File

@ -1,97 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.aligner;
import edu.mit.broad.picard.io.IoUtil;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.List;
/**
* Abstract base class for use by <code>Aligner</code> implementations. Provides a constructor and
* accessors for common inputs and outputs.
*
* @author Kathleen Tibbetts
*/
/**
 * Abstract base class for use by {@code Aligner} implementations.  Provides a
 * constructor and accessors for the inputs and outputs common to all aligners.
 * Instances are immutable apart from whatever state subclasses add.
 */
public abstract class AbstractBaseAligner implements Aligner {
    private final Stringency stringency;        // the stringency of the alignment
    private final File readsBamFile;            // the BAM file containing the read data
    private final String outputPrefix;          // the directory and file name prefix for outputs
    private final String referenceFileDir;      // the directory where the reference file can be found
    private final int[] clipPoints;             // the clip points to use (always length 4)
    private final Integer expectedInsertSize;   // expected insert size; null for non-paired-end lanes
    private final Integer readsToAlign;         // the number of reads to align (all if null)
    private final boolean pairedReads;          // whether this is a paired-end run
    private final int readLength;
    // Parameters specific to the Aligner implementation being used
    private final Map<String, String> customParametersMap;

    /**
     * Constructor that sets every parameter.
     *
     * @param stringency          the stringency of the alignment
     * @param readsBamFile        the BAM file containing the reads; must exist and be readable
     * @param outputPrefix        the directory and filename prefix for output
     * @param referenceFileDir    the directory where the reference file is located
     * @param clipPoints          the clip points; either null (treated as four zeros) or exactly 4 entries
     * @param expectedInsertSize  the expected insert size (null for non-PE lanes)
     * @param readsToAlign        the number of reads to align (null for all)
     * @param customParametersMap parameters specific to the Aligner implementation
     * @param pairedReads         whether this is a paired-end run
     * @param readLength          the read length
     * @throws IllegalArgumentException if clipPoints is non-null and not of length 4
     */
    public AbstractBaseAligner(Stringency stringency, File readsBamFile, String outputPrefix,
                               String referenceFileDir, int[] clipPoints, Integer expectedInsertSize,
                               Integer readsToAlign, Map<String, String> customParametersMap,
                               boolean pairedReads, int readLength) {
        // First, a little validation
        if (clipPoints != null && clipPoints.length != 4) {
            throw new IllegalArgumentException("Length of clipPoints array argument must be 4.");
        }
        IoUtil.assertFileIsReadable(readsBamFile);
        this.stringency = stringency;
        this.readsBamFile = readsBamFile;
        this.outputPrefix = outputPrefix;
        this.referenceFileDir = referenceFileDir;
        // Defensive copy so later mutation of the caller's array cannot change our state.
        this.clipPoints = clipPoints != null ? clipPoints.clone() : new int[4];
        this.expectedInsertSize = expectedInsertSize;
        this.readsToAlign = readsToAlign;
        this.customParametersMap = customParametersMap;
        this.pairedReads = pairedReads;
        this.readLength = readLength;
    }

    /**
     * Utility method for deleting a list of files, to be used by the
     * cleanup method of sub-classes.  Failures to delete are ignored.
     *
     * @param files the list of files to delete
     */
    protected final void deleteFiles(List<File> files) {
        for (File f : files) {
            f.delete();
        }
    }

    // Accessors
    protected final Stringency getStringency() { return stringency; }
    protected final File getReadsBamFile() { return readsBamFile; }
    protected final String getOutputPrefix() { return outputPrefix; }
    protected final String getReferenceFileDir() { return referenceFileDir; }
    // NOTE: returns the internal array (not a copy) to preserve the original contract.
    protected final int[] getClipPoints() { return clipPoints; }
    protected final Integer getExpectedInsertSize() { return expectedInsertSize; }
    protected final Integer getReadsToAlign() { return readsToAlign; }
    protected final Map<String, String> getCustomParametersMap() { return customParametersMap; }
    protected final boolean isPairedReads() { return pairedReads; }
    protected final int getReadLength() { return readLength; }
}

View File

@ -1,45 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.aligner;
/**
* API for aligners. Clients must call these methods in order, as each depends on
* the previous one, but they may call them multiple times and need not call them all.
* This allows steps to be rerun and also lets the caller review intermediate files
* when troubleshooting.
*
* @author Kathleen Tibbetts
*/
/**
 * API for aligners. Clients must call these methods in order, as each depends on
 * the previous one, but they may call them multiple times and need not call them all.
 * This allows steps to be rerun and also lets the caller review intermediate files
 * when troubleshooting.
 *
 * @author Kathleen Tibbetts
 */
public interface Aligner {

    /** Alignment stringency settings. */
    enum Stringency { low, high }

    /**
     * Prepares all the necessary inputs for the alignment process from a BAM file of read data.
     */
    void prepareInputs();

    /**
     * Does the alignment and produces output in the underlying form of the aligner.
     */
    void align();

    /**
     * Converts the output of the aligner to BAM format.
     */
    void prepareOutput();

    /**
     * Cleans up intermediate files (the files created by and for the underlying aligner by the
     * prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file.
     */
    void cleanup();
}

View File

@ -1,319 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.aligner.maq;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.util.BinaryCodec;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.filter.*;
import edu.mit.broad.picard.util.PeekableIterator;
import edu.mit.broad.picard.util.Log;
import edu.mit.broad.picard.sam.ReservedTagConstants;
import java.io.File;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Arrays;
/**
* Class to take unmapped reads in BAM file format and create Maq binary fastq format file(s) --
* one or two of them, depending on whether it's a paired-end read. This relies on the unmapped
* BAM file having all paired reads together in order.
*/
public class BamToBfqWriter {
    private final File bamFile;                 // source BAM of unmapped reads
    private final String outputPrefix;          // directory + file-name prefix for the .bfq outputs
    private boolean pairedReads = false;        // true for paired-end data (two bfq files per chunk)
    private int wrote = 0;                      // count of bfq records written so far
    private int increment = 1;                  // keep every increment-th record (for even sampling)
    private int chunk = 0;                      // max records per output file; 0 means no chunking
    private BinaryCodec codec1;                 // writer for the read-one bfq file
    private BinaryCodec codec2;                 // writer for the read-two bfq file (paired runs only)
    private final Log log = Log.getInstance(BamToBfqWriter.class);

    /**
     * Constructor
     *
     * @param bamFile       the BAM file to read from
     * @param outputPrefix  the directory and file prefix for the binary fastq files
     * @param total         the total number of records that should be written, drawn evenly
     *                      from throughout the file (null for all).
     * @param chunk         the maximum number of records that should be written to any one file
     * @param pairedReads   whether these reads are from a paired-end run
     */
    public BamToBfqWriter(File bamFile, String outputPrefix, Integer total, Integer chunk, boolean pairedReads) {
        this.bamFile = bamFile;
        this.outputPrefix = outputPrefix;
        this.pairedReads = pairedReads;
        if (total != null) {
            // Sample evenly across the file: write one record out of every
            // floor(writable / total).
            double writeable = (double)countWritableRecords();
            this.increment = (int)Math.floor(writeable/total.doubleValue());
        }
        if (chunk != null) {
            this.chunk = chunk;
        }
    }

    /**
     * Constructor
     *
     * @param bamFile       the BAM file to read from
     * @param outputPrefix  the directory and file prefix for the binary fastq files
     * @param pairedReads   whether these reads are from a paired-end run
     */
    public BamToBfqWriter(File bamFile, String outputPrefix, boolean pairedReads) {
        this(bamFile, outputPrefix, null, null, pairedReads);
    }

    /**
     * Writes the binary fastq file(s) to the output directory.
     * NOTE(review): the SAMFileReader opened here is never explicitly closed.
     */
    public void writeBfqFiles() {
        Iterator<SAMRecord> iterator = (new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator();
        // Filter out noise reads and reads that fail the quality filter
        TagFilter tagFilter = new TagFilter(ReservedTagConstants.XN, 1);
        FailsVendorReadQualityFilter qualityFilter = new FailsVendorReadQualityFilter();
        if (!pairedReads) {
            writeSingleEndBfqs(iterator, Arrays.asList(tagFilter, qualityFilter));
            codec1.close();
        }
        else {
            writePairedEndBfqs(iterator, tagFilter, qualityFilter);
            codec1.close();
            codec2.close();
        }
        log.info("Wrote " + wrote + " bfq records.");
    }

    /**
     * Path for writing bfqs for paired-end reads.  Assumes mates are adjacent
     * in the input BAM (see class javadoc).
     *
     * @param iterator      the iterator with the SAM Records to write
     * @param tagFilter     the filter for noise reads
     * @param qualityFilter the filter for PF reads
     */
    private void writePairedEndBfqs(Iterator<SAMRecord> iterator, TagFilter tagFilter,
                                    FailsVendorReadQualityFilter qualityFilter) {
        // Open the codecs for writing
        int fileIndex = 0;
        initializeNextBfqFiles(fileIndex++);
        int records = 0;
        while (iterator.hasNext()) {
            SAMRecord first = iterator.next();
            if (!iterator.hasNext()) {
                throw new PicardException("Mismatched number of records in " + this.bamFile.getAbsolutePath());
            }
            SAMRecord second = iterator.next();
            // Mates must share a read name and differ in first-of-pair flag.
            if (!second.getReadName().equals(first.getReadName()) ||
                first.getFirstOfPairFlag() == second.getFirstOfPairFlag()) {
                throw new PicardException("Unmatched read pairs in " + this.bamFile.getAbsolutePath() +
                        ": " + first.getReadName() + ", " + second.getReadName() + ".");
            }
            // If both are noise reads, filter them out
            if (tagFilter.filterOut(first) && tagFilter.filterOut(second)) {
                // skip it
            }
            // If either fails to pass filter, then exclude them as well
            else if (qualityFilter.filterOut(first) || qualityFilter.filterOut(second)) {
                // skip it
            }
            // Otherwise, write them out
            else {
                records++;
                if (records % increment == 0) {
                    // Suffix read names Maq-style (#0/1 and #0/2) and route each
                    // mate to the bfq file matching its first-of-pair flag.
                    first.setReadName(first.getReadName() + "#0/1");
                    writeFastqRecord(first.getFirstOfPairFlag() ? codec1 : codec2, first);
                    second.setReadName(second.getReadName() + "#0/2");
                    writeFastqRecord(second.getFirstOfPairFlag() ? codec1 : codec2, second);
                    wrote++;
                    if (wrote % 1000000 == 0) {
                        log.info(wrote + " records written.");
                    }
                    // Roll over to the next pair of output files when a chunk fills up.
                    if (chunk > 0 && wrote % chunk == 0) {
                        initializeNextBfqFiles(fileIndex++);
                    }
                }
            }
        }
    }

    /**
     * Path for writing bfqs for single-end reads
     *
     * @param iterator the iterator with the SAM Records to write
     * @param filters  the list of filters to be applied
     */
    private void writeSingleEndBfqs(Iterator<SAMRecord> iterator, List<SamRecordFilter> filters) {
        // Open the codecs for writing
        int fileIndex = 0;
        initializeNextBfqFiles(fileIndex++);
        int records = 0;
        FilteringIterator it = new FilteringIterator(iterator, new AggregateFilter(filters));
        while (it.hasNext()) {
            SAMRecord record = it.next();
            records++;
            if (records % increment == 0) {
                writeFastqRecord(codec1, record);
                wrote++;
                if (wrote % 1000000 == 0) {
                    log.info(wrote + " records processed.");
                }
                if (chunk > 0 && wrote % chunk == 0) {
                    initializeNextBfqFiles(fileIndex++);
                }
            }
        }
    }

    /**
     * Closes the open bfq file(s), if any, and opens the new one(s)
     *
     * @param fileIndex the index (counter) of the files to write
     */
    private void initializeNextBfqFiles(int fileIndex) {
        // Close the codecs if they were writing before
        if (codec1 != null) {
            codec1.close();
            if (pairedReads) {
                codec2.close();
            }
        }
        // Open new file, using the fileIndex.
        File bfq1 = getOutputFile(this.outputPrefix , 1, fileIndex);
        codec1 = new BinaryCodec(IoUtil.openFileForWriting(bfq1));
        log.info("Now writing to file " + bfq1.getAbsolutePath());
        if (pairedReads) {
            File bfq2 = getOutputFile(this.outputPrefix , 2, fileIndex);
            codec2 = new BinaryCodec(IoUtil.openFileForWriting(bfq2));
            log.info("Now writing to file " + bfq2.getAbsolutePath());
        }
    }

    /**
     * Writes out a SAMRecord in Maq fastq format: name (null-terminated,
     * length-prefixed), then sequence length, then one byte per base with
     * the 2-bit base in the high bits and the (Phred, capped) quality in
     * the low 6 bits.
     *
     * @param codec the codec to write to
     * @param rec   the SAMRecord to write
     */
    private void writeFastqRecord(BinaryCodec codec, SAMRecord rec) {
        // Writes the length of the read name and then the name (null-terminated)
        codec.writeString(rec.getReadName(), true, true);
        char seqs[] = rec.getReadString().toCharArray();
        char quals[] = rec.getBaseQualityString().toCharArray();
        // Write the length of the sequence
        codec.writeInt(seqs.length);
        // Calculate and write the sequence and qualities
        byte seqsAndQuals[] = new byte[seqs.length];
        for (int i = 0; i < seqs.length; i++) {
            // Convert from FASTQ ASCII (offset 33) and cap at 63, the largest
            // value that fits in the low 6 bits.
            int quality = Math.min(quals[i]-33, 63);
            int base;
            switch(seqs[i]) {
                case 'A':
                case 'a':
                    base = 0;
                    break;
                case 'C':
                case 'c':
                    base = 1;
                    break;
                case 'G':
                case 'g':
                    base = 2;
                    break;
                case 'T':
                case 't':
                    base = 3;
                    break;
                case 'N':
                case 'n':
                case '.':
                    // Ambiguous bases are encoded as 'A' with quality zero.
                    base = 0;
                    quality = 0;
                    break;
                default:
                    throw new PicardException("Unknown base when writing bfq file: " + seqs[i]);
            }
            seqsAndQuals[i] = (byte) (base << 6 | quality);
        }
        codec.writeBytes(seqsAndQuals);
    }

    /**
     * Counts the records that would survive the noise/quality filters, using
     * the same skip rules as the write paths.  Used to compute the sampling
     * increment when a total was requested.
     */
    private int countWritableRecords() {
        int count = 0;
        PeekableIterator<SAMRecord> it = new PeekableIterator<SAMRecord>((new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator());
        if (!this.pairedReads) {
            // Filter out noise reads and reads that fail the quality filter
            List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
            filters.add(new TagFilter(ReservedTagConstants.XN, 1));
            filters.add(new FailsVendorReadQualityFilter());
            FilteringIterator itr = new FilteringIterator(it, new AggregateFilter(filters));
            while (itr.hasNext()) {
                itr.next();
                count++;
            }
        }
        else {
            while (it.hasNext()) {
                SAMRecord first = it.next();
                // NOTE(review): assumes an even number of records; unlike
                // writePairedEndBfqs there is no hasNext() check before this
                // second next() -- confirm behavior for odd-length inputs.
                SAMRecord second = it.next();
                // If both are noise reads, filter them out
                if (first.getAttribute(ReservedTagConstants.XN) != null &&
                    second.getAttribute(ReservedTagConstants.XN) != null) {
                    // skip it
                }
                // If either fails to pass filter, then exclude them as well
                else if (first.getReadFailsVendorQualityCheckFlag() || second.getReadFailsVendorQualityCheckFlag() ) {
                    // skip it
                }
                // Otherwise, write them out
                else {
                    count++;
                }
            }
        }
        it.close();
        return count;
    }

    /**
     * Constructs the name for the output file and returns the file
     *
     * @param outputPrefix the directory and file prefix for the output bfq file
     * @param read         whether this is the file for the first or second read
     * @param index        the chunk index of the file
     * @return a new File object for the bfq file.
     */
    private File getOutputFile(String outputPrefix, int read, int index) {
        File result = new File(outputPrefix + "." + index + "." + read + ".bfq");
        IoUtil.assertFileIsWritable(result);
        return result;
    }
}

View File

@ -1,357 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.aligner.maq;
import edu.mit.broad.sam.*;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.sam.util.BinaryCodec;
import edu.mit.broad.sam.util.StringUtil;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.util.SamPairUtil;
import java.io.File;
import java.io.BufferedInputStream;
import java.util.*;
/**
* Reads a Maq map file and returns an an iterator of SAMRecords and a populated header
*
* IMPORTANT! Even though the reads in the map file are in coordinate order, this iterator
* will not necessarily return them in that order. For paired reads, both will be
* returned only after *both* records have been seen.
*
* @author Kathleen Tibbetts
*/
public class MapFileIterator implements CloseableIterator<SAMRecord> {
    // Values of the Maq per-record "flag" byte handled here -- assumed to
    // follow the Maq map-file format; confirm against the Maq documentation.
    public static final int MATE_UNMAPPED_FLAG = 64;
    public static final int READ_UNMAPPED_FLAG = 192;
    private static final int READ_NAME_LENGTH = 36;    // fixed-width, null-padded name field
    private static final int MAP_FORMAT = -1;          // magic number identifying the map format
    private static final int MAX_READ_LENGTH = 128;
    private static final byte ACGT[] = {'A', 'C', 'G', 'T'};   // 2-bit base code -> base character
    public static final String PROGRAM_RECORD = "0";
    private long recordCount = 0L;                     // records promised by the current map file
    private int recordsRead = 0;                       // records consumed from the current map file
    private BinaryCodec mapCodec;                      // reader over the current map file
    private final SAMFileHeader header;
    private final boolean pairedReads;
    private final boolean jumpingLibrary;
    // Records ready to be returned by next()
    private final List<SAMRecord> next = new ArrayList<SAMRecord>();
    // Reads whose mate has not yet been seen, keyed by read name
    private final Map<String, SAMRecord> pending = new HashMap<String, SAMRecord>();
    private final List<File> mapFiles = new LinkedList<File>();

    /**
     * Constructor. Opens the map file, reads the record count and header from it,
     * creates the SAMFileHeader, and queues up the first read.
     *
     * @param commandLine    The command line used to invoke Maq (for the header)
     * @param pairedReads    Whether this is a paired-end run
     * @param jumpingLibrary Whether this is a jumping library (affects proper-pair logic)
     * @param mapFile        The Maq map file(s) to read; at least one is required
     */
    public MapFileIterator(String commandLine, boolean pairedReads, boolean jumpingLibrary, File... mapFile) {
        if (mapFile.length == 0) {
            throw new IllegalArgumentException("At least one map file must be provided.");
        }
        mapFiles.addAll(Arrays.asList(mapFile));
        this.pairedReads = pairedReads;
        this.jumpingLibrary = jumpingLibrary;
        header = new SAMFileHeader();
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        SAMProgramRecord program = new SAMProgramRecord(PROGRAM_RECORD);
        program.setProgramVersion(MaqConstants.getProgramVersion());
        program.setCommandLine(commandLine);
        header.addProgramRecord(program);
        queueNextMapFile();
    }

    /**
     * Queues up the next map file: closes the previous codec, validates the
     * format magic, loads the reference-sequence dictionary, reads the record
     * count, and primes the first record.
     *
     * @return true if there's another map file to iterate over
     */
    private boolean queueNextMapFile() {
        // Close the old file
        if (mapCodec != null) {
            mapCodec.close();
        }
        // If there are no more map files, return false
        if (mapFiles.size() == 0) {
            return false;
        }
        // Otherwise, open the next file and reset the recordsRead count
        mapCodec = new BinaryCodec(new BufferedInputStream(IoUtil.openFileForReading(mapFiles.remove(0))));
        int format = mapCodec.readInt();
        if (format != MAP_FORMAT) {
            mapCodec.close();
            throw new PicardException("Unrecognized Maq map file format: " + format);
        }
        recordsRead = 0;
        // Read the sequences out of the map file and set them on the header
        int sequenceCount = mapCodec.readInt();
        List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>();
        for (int i = 0; i < sequenceCount; i++) {
            int length = mapCodec.readInt();
            // Read the sequence name, trimming off the null terminator
            sequences.add(new SAMSequenceRecord(mapCodec.readString(length).substring(0, length-1)));
        }
        // Only the first map file populates the header's sequence dictionary.
        if (header.getSequences() == null || header.getSequences().size() == 0) {
            header.setSequences(sequences);
        }
        else {
            // TODO: Check that the sequences match and throw an exception if they don't
        }
        recordCount = mapCodec.readLong();
        readNext();
        return true;
    }

    /**
     * Closes the BinaryCodec reading the map file
     */
    public void close() {
        mapCodec.close();
    }

    /**
     * @return true if the iteration has more elements
     */
    public boolean hasNext() {
        return next.size() > 0;
    }

    /**
     * @return the next SAMRecord in the iteration
     * @throws NoSuchElementException if this is called when hasNext() returns false
     */
    public SAMRecord next() {
        if (!hasNext()) {
            throw new NoSuchElementException("No more elements in this iteration");
        }
        SAMRecord result = next.remove(0);
        readNext();
        return result;
    }

    /**
     * Reads the next element from the map file. If we are done with it, we put it in the <code>next</code>
     * list; if we are waiting to see its mate, we put it in the <code>pending</code> map. Calls itself
     * repeatedly until there is at least one element in <code>next</code>.
     */
    private void readNext() {
        // If there's already a record queued up, just return
        if (next.size() > 0) {
            return;
        }
        // If we've read all there is, then any remaining records in the pending map should be returned.
        // If this is not a PE run, then the pending map will be empty and we're done.
        if (recordsRead == recordCount) {
            if (pending.size() > 0) {
                StringBuffer sb = new StringBuffer();
                for (String item : pending.keySet()) {
                    sb.append(item).append("\n");
                }
                throw new PicardException("MapFileIterator pending map should have been empty but contained " +
                        "the following records: " + sb.toString());
            }
            queueNextMapFile();
            return;
        }
        // Otherwise, we read until there is at least one record in the <code>next</code> list
        readMapRecord();
        if (next.size() == 0) {
            readNext();
        }
    }

    /**
     * Reads one record from the map file and throws it onto the pending map or the next list,
     * depending on whether we have already seen its mate.  Field meanings below follow the
     * Maq map-record layout; fields that are read but unused are consumed to keep the
     * stream position correct.
     */
    private void readMapRecord() {
        // Now that we've got all the data from the binary file, write a SAMRecord and add it to
        // the new BAM file
        SAMRecord record = new SAMRecord();
        record.setAttribute(SAMTag.PG.toString(), PROGRAM_RECORD);
        record.setReadPairedFlag(this.pairedReads);
        // the last base is the single-end mapping quality.
        byte seqsAndQuals[] = new byte[MAX_READ_LENGTH-1];
        mapCodec.readBytes(seqsAndQuals);
        byte singleEndMappingQualityOrIndelLength = mapCodec.readByte();
        // the length of the read
        int readLength = mapCodec.readUByte();
        setSeqsAndQuals(seqsAndQuals, readLength, record);
        // the final mapping quality (unless <code>flag</code> below is 130, then it is the
        // position of the indel (or 0 if no indel)
        int mappingQuality = mapCodec.readUByte();
        // mismatches in the 28bp (higher 4 bits) and mismatches (lower 4 bits)
        mapCodec.readUByte();
        // sum of errors of the best hit
        mapCodec.readUByte();
        // counts of all 0- and 1-mismatch hits on the reference
        mapCodec.readUByte();
        mapCodec.readUByte();
        // A bitwise flag. See the Maq docs for its full meaning
        int flag = mapCodec.readUByte();
        // the lower mapQ of the two ends (equals map_qual if unpaired); if flag is 130: mapQ of its mate
        int altQual = mapCodec.readUByte();
        // Index of the sequence for this read
        record.setReferenceIndex((int)mapCodec.readUInt(), getHeader());
        // Start position and strand: position is packed into the high 31 bits
        // (zero-based, hence the +1); the strand is the low bit.
        long pos = mapCodec.readUInt();
        int startPos = ((int)((pos>>1)& 0x7FFFFFFF)) + 1;
        record.setAlignmentStart(startPos);
        record.setReadNegativeStrandFlag((pos&1) == 1);
        // offset of the mate (zero if unpaired, or two ends mapped to different chr)
        mapCodec.readInt();
        // The read name
        byte nameBytes[] = new byte[READ_NAME_LENGTH];
        mapCodec.readBytes(nameBytes);
        String name = StringUtil.bytesToString(nameBytes).trim();
        if (this.pairedReads) {
            // Maq encodes first/second of pair as a /1 or /2 name suffix.
            if (name.endsWith("/1")) {
                record.setFirstOfPairFlag(true);
                record.setSecondOfPairFlag(false);
            }
            else if (name.endsWith("/2")) {
                record.setFirstOfPairFlag(false);
                record.setSecondOfPairFlag(true);
            }
            else {
                throw new PicardException("Unrecognized ending for paired read name: " + name);
            }
            name = name.substring(0, name.length()-2);
        }
        record.setReadName(name);
        if (flag != 130 || singleEndMappingQualityOrIndelLength == 0) { // No indel
            record.setCigarString(readLength + "M");
            record.setMappingQuality(mappingQuality);
        }
        else { // Indel
            // When flag == 130, mappingQuality actually holds the indel position
            // and singleEndMappingQualityOrIndelLength holds the signed indel
            // length (positive = insertion, negative = deletion).
            int indelPos = mappingQuality;
            String cigar = indelPos + "M" + Math.abs(singleEndMappingQualityOrIndelLength);
            int remaining = readLength - indelPos;
            if (singleEndMappingQualityOrIndelLength > 0) {
                cigar += "I" + (remaining - singleEndMappingQualityOrIndelLength) + "M";
            }
            else {
                cigar += "D" + remaining + "M";
            }
            record.setCigarString(cigar);
            // In the docs, it looks like there is a mapping quality for the mate, do we use that?
            record.setMappingQuality(altQual);
        }
        if (!pairedReads) {
            record.setProperPairFlag(false);
            next.add(record);
        }
        else {
            record.setMateUnmappedFlag(flag == MATE_UNMAPPED_FLAG);
            // Pair up with the mate if we've already seen it; otherwise hold
            // this record in pending until the mate arrives.
            SAMRecord mate = pending.remove(record.getReadName());
            if (mate != null) {
                boolean proper = SamPairUtil.isProperPair(record, mate, jumpingLibrary);
                record.setProperPairFlag(proper);
                mate.setProperPairFlag(proper);
                SamPairUtil.setMateInfo(record, mate);
                int insertSize = SamPairUtil.computeInsertSize(record, mate);
                record.setInferredInsertSize(insertSize);
                mate.setInferredInsertSize(insertSize);
                if (!mate.getMateUnmappedFlag()) {
                    next.add(record);
                }
                if (!record.getMateUnmappedFlag()) {
                    next.add(mate);
                }
            }
            else {
                pending.put(record.getReadName(), record);
            }
        }
        // TODO: Figure out what to do about noise reads long-term
        // Note that it is possible that we have lost a "Noise read" annotation at this point. Since
        // we try to map a pair if only one of the reads is classified as "noise", then for any paired
        // reads where one was a noise read and one was not, we will lose the noise annotation on the
        // one noisy read. We have discussed either re-doing the noise evaluation here, modifying the
        // read name to carry the noise flag through Maq, or changing what reads we give to Maq.
        recordsRead++;
    }

    /**
     * Decodes the sequence and the qualities and sets them on the SAMRecord.
     * Each byte packs a 2-bit base (high bits) and a 6-bit quality (low bits);
     * a byte of zero denotes an 'N' base.
     *
     * @param seqsAndQuals the packed bases and qualities
     * @param readLength   the length of the read
     * @param sam          the SAMRecord to populate
     */
    private void setSeqsAndQuals(byte seqsAndQuals[], int readLength, SAMRecord sam) {
        byte sequence[] = new byte[readLength];
        byte qualities[] = new byte[readLength];
        for (int i = 0; i < readLength; i++) {
            byte b = seqsAndQuals[i];
            qualities[i] = (byte)(b & 0x3F);
            if (b == 0) {
                sequence[i] = 'N';
            }
            else {
                sequence[i] = ACGT[(seqsAndQuals[i] >> 6) & 3];
            }
        }
        sam.setReadBases(sequence);
        sam.setBaseQualities(qualities);
    }

    /**
     * @throws UnsupportedOperationException -- not implemented
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() not supported in MapFileIterator");
    }

    /** Returns the SAM header built from the map file's sequence dictionary. */
    public SAMFileHeader getHeader() { return header; }
}

View File

@ -1,211 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.aligner.maq;
import edu.mit.broad.picard.aligner.Aligner;
import edu.mit.broad.picard.aligner.AbstractBaseAligner;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.util.Log;
import java.io.File;
import java.io.FilenameFilter;
import java.util.*;
/**
* Maq implementation of the Aligner interface
*/
public class MaqAligner extends AbstractBaseAligner implements Aligner {
// Constants related to Maq output files
public static final String MAQ_MAP_SUFFIX = ".out.aln.map";
public static final String MAQ_LOG_SUFFIX = ".out.map.log";
// Internal constant for multi-plexing lane data
private static final int READ_CHUNK_SIZE = 2000000;
public static final String REFERENCE_FILE_SUFFIX = ".bfa";
private final Log log = Log.getInstance(MaqAligner.class);
private String commandLine = null;
/**
* Constructor that sets every parameter. All other constructors delegate to this one.
*
* @param stringency the stringency of the alignment
* @param readsBamFile the BAM file containing the reads
* @param outputPrefix the directory and filename prefix for output
* @param referenceFileDir the directory where the reference file is located
* @param clipPoints the clip points
* @param expectedInsertSize the expected insert size (null for non-PE lanes)
* @param readsToAlign the number of reads to align
* @param customParametersMap parameters specific to the Aligner implementation
*/
// Simply delegates to the base class, which validates and stores every parameter.
public MaqAligner(Stringency stringency, File readsBamFile, String outputPrefix,
                  String referenceFileDir, int clipPoints[], Integer expectedInsertSize,
                  Integer readsToAlign, Map<String, String> customParametersMap,
                  boolean pairedReads, int readLength) {
    super(stringency, readsBamFile, outputPrefix, referenceFileDir, clipPoints,
          expectedInsertSize, readsToAlign, customParametersMap, pairedReads, readLength);
}
/**
* Prepares all the necessary inputs for the alignment process from a BAM file of read data.
*/
public void prepareInputs() {
log.info("Preparing Maq inputs.");
BamToBfqWriter writer = new BamToBfqWriter(this.getReadsBamFile(), this.getOutputPrefix(),
this.getReadsToAlign(), READ_CHUNK_SIZE, isPairedReads());
writer.writeBfqFiles();
}
/**
* Does the alignment and produces output in the underlying form of the aligner.
*/
public void align() {
log.info("Running Maq alignment.");
// Temporary hack until we get the multi-tasking code from Seva
List<String> mapFileNames = new ArrayList<String>(); // All map files that we will merge together at the end
String maqParams = MaqConstants.SWITCH_RANDOM_SEED + " " + MaqConstants.DEFAULT_RANDOM_SEED;
if (this.getStringency() == Stringency.high) {
maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " + Math.round(
this.getExpectedInsertSize() * MaqConstants.HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER);
maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " +
MaqConstants.HIGH_STRINGENCY_SUM_MISMATCHES;
}
else {
maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " +
MaqConstants.LOW_STRINGENCY_MAX_OUTER_DISTANCE;
// For low stringency, get at least 30 bases and then let half of what's remaining mismatch
int maxMisMatches = (this.getReadLength() - 30)/2;
maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " +
(maxMisMatches * MaqConstants.LOW_STRINGENCY_QUALITY_FOR_MISMATCHES);
}
String referenceFile = new File(this.getReferenceFileDir()).listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(REFERENCE_FILE_SUFFIX);
}
})[0].getAbsolutePath();
ProcessBuilder builder;
// Map the bfq files, individually or in pairs
SortedSet<File> bfqs = new TreeSet<File>(this.getBfqFiles());
for (Iterator<File> it = bfqs.iterator(); it.hasNext();) {
String read1bfq = it.next().getAbsolutePath();
String read2bfq = (this.isPairedReads()) ? it.next().getAbsolutePath() : "";
String outputFileBase = read1bfq.substring(0, read1bfq.lastIndexOf('.')-2);
String mapFile = outputFileBase + MAQ_MAP_SUFFIX;
String logFile = outputFileBase + MAQ_LOG_SUFFIX;
String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " + MaqConstants.MAP_COMMAND +
" " + maqParams + " " + mapFile + " " + referenceFile + " " + read1bfq + " " + read2bfq +
" 2> " + logFile;
setCommandLine(getCommandLine() == null ? command : getCommandLine() + ";" + command);
log.info("Executing command: " + command);
try {
builder = new ProcessBuilder(command.split(" "));
Process p = builder.start();
p.waitFor();
}
catch (Exception e) {
throw new PicardException("Error starting Maq process", e);
}
mapFileNames.add(mapFile);
}
// If there's more than one map file, then merge them.
String finalFileName = this.getOutputPrefix() + "." + this.getStringency() + MAQ_MAP_SUFFIX;
if (mapFileNames.size() > 1) {
String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " +
MaqConstants.MERGE_COMMAND + " " + finalFileName;
for (String name : mapFileNames) {
command += " " + name;
}
setCommandLine(getCommandLine() == null ? command : getCommandLine() + ";" + command);
log.info("Executing command: " + command);
try {
builder = new ProcessBuilder(command.split(" "));
Process p = builder.start();
p.waitFor();
}
catch (Exception e) {
throw new PicardException("Error starting Maq process", e);
}
}
else { // Otherwise rename the single map file so we can find it later
File f = new File(mapFileNames.get(0));
if (!f.renameTo(new File(finalFileName))) {
throw new PicardException("Error renaming " + f.getAbsolutePath() + " to " + finalFileName);
}
}
}
/**
* Converts the output of the aligner to BAM format
*/
public void prepareOutput() {
log.info("Preparing output from Maq alignment.");
// TODO: MaqToBam
}
/**
* Cleans up intermediate files (the files created in by and for the underlying aligner by the
* prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file.
*/
public void cleanup() {
log.info("Cleaning up Maq intermediate files.");
this.deleteFiles(getBfqFiles());
// this.deleteFiles(getMaqAlignmentFiles());
}
/**
* Returns a list of zero to two BFQ files, depending on whether they are there
* and whether it was a paired-end run or not
*
* @return a list of BFQ files
*/
private List<File> getBfqFiles() {
File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/")));
return Arrays.asList(dir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".bfq");
}
}));
}
/**
* Returns the Maq map files
*
* @return a list of Maq .map files
*/
private List<File> getMaqAlignmentFiles() {
File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/")));
return Arrays.asList(dir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
// TODO: Add the text files if we do not read the binary map files
return name.endsWith(MAQ_MAP_SUFFIX) || name.endsWith(MAQ_LOG_SUFFIX);
}
}));
}
public String getCommandLine() { return commandLine; }
public void setCommandLine(String commandLine) { this.commandLine = commandLine; }
}

View File

@ -1,39 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.aligner.maq;
/**
 * Utility class to hold Maq-related constants (program name, location, switches, etc).
 * Pure constant holder; never instantiated.
 */
public class MaqConstants {

    // Not instantiable: constants only.
    private MaqConstants() {
    }

    // General Maq constants
    public static final String PROGRAM_NAME = "Maq";
    public static final String PROGRAM_VERSION = "0.7.1";
    // Installation directory of the Maq binary (note the trailing slash).
    public static final String MAQ_HOME = "/seq/dirseq/maq-0.7.1/";

    // Command-related constants
    public static final String MAQ_COMMAND = "maq";
    public static final String MAP_COMMAND = "map";
    public static final String MERGE_COMMAND = "mapmerge";

    // Constants related to Maq map switches
    public static final String SWITCH_SUM_MISMATCHES = "-e";
    public static final int HIGH_STRINGENCY_SUM_MISMATCHES = 100;
    public static final int LOW_STRINGENCY_QUALITY_FOR_MISMATCHES = 30;
    public static final String SWITCH_MAX_OUTER_DISTANCE = "-a";
    public static final int LOW_STRINGENCY_MAX_OUTER_DISTANCE = 1500;
    public static final double HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER = 1.5d;
    // Seed is pinned so repeated runs are reproducible.
    public static final String SWITCH_RANDOM_SEED = "-s";
    public static final int DEFAULT_RANDOM_SEED = 0;

    /** @return the version of the Maq program these constants target. */
    public static String getProgramVersion() { return PROGRAM_VERSION; }
}

View File

@ -1,125 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.aligner.maq;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.util.StringSortingCollectionFactory;
import edu.mit.broad.picard.util.Log;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.sam.util.SortingCollection;
import edu.mit.broad.sam.util.BinaryCodec;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.sam.*;
import java.io.File;
import java.io.BufferedInputStream;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.nio.ByteBuffer;
/**
 * Class to write a BAM file that includes the results from a Maq .map file along with the unaligned
 * reads from the original BAM file.
 *
 * Information on the meaning of the elements of the map file is drawn from the Maq documentation
 * on this page: http://maq.sourceforge.net/maqmap_format.shtml
 */
public class MaqMapMerger {

    private final File mapFile;
    private final File sourceBamFile;
    private final File targetBamFile;
    private final boolean pairedReads;
    private final Log log = Log.getInstance(MaqMapMerger.class);
    private String commandLine = null;
    private List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>();

    /**
     * Constructor
     *
     * @param mapFile The Maq map file to parse
     * @param sourceBamFile The BAM file that was used as the input to the Maq aligner, which will
     * include info on all the reads that did not map
     * @param targetBamFile The file to which to write the merged output
     * @param pairedReads whether the data is paired-end
     */
    public MaqMapMerger(File mapFile, File sourceBamFile, File targetBamFile, boolean pairedReads) {
        IoUtil.assertFileIsReadable(mapFile);
        IoUtil.assertFileIsReadable(sourceBamFile);
        IoUtil.assertFileIsWritable(targetBamFile);
        this.mapFile = mapFile;
        this.sourceBamFile = sourceBamFile;
        this.targetBamFile = targetBamFile;
        this.pairedReads = pairedReads;
    }

    /**
     * Merges the alignment from the map file with the remaining records from the source BAM file.
     */
    public void mergeAlignment() {
        log.info("Processing map file: " + mapFile.getAbsolutePath());
        // Write the header
        MapFileIterator it = new MapFileIterator(getCommandLine(), this.pairedReads, false, this.mapFile);
        SAMFileHeader header = it.getHeader();
        SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, false, targetBamFile);
        // Write the alignments, remembering which read names were aligned
        SortingCollection<String> readNames = writeAlignments(it, writer);
        // We're done with the map file, so close it
        it.close();
        writeUnalignedReads(writer, readNames.iterator());
        // Now close the writer
        writer.close();
    }

    /**
     * Copies records from the source BAM that were NOT aligned by Maq to the writer.
     *
     * NOTE(review): the skip logic assumes the source BAM and the sorted name
     * iterator encounter aligned read names in the same relative order — confirm
     * the upstream ordering before relying on this.
     */
    private void writeUnalignedReads(SAMFileWriter writer, CloseableIterator<String> nameIterator) {
        int skipCount = 0;
        SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(this.sourceBamFile));
        try {
            CloseableIterator<SAMRecord> bamRecords = reader.iterator();
            String readName = nameIterator.hasNext() ? nameIterator.next() : null;
            while (bamRecords.hasNext()) {
                SAMRecord rec = bamRecords.next();
                if (rec.getReadName().equals(readName)) {
                    // skip it and pull the next name off the name iterator
                    readName = nameIterator.hasNext() ? nameIterator.next() : null;
                    skipCount++;
                }
                else {
                    writer.addAlignment(rec);
                }
            }
            // Use the class logger rather than System.out (was a stray println).
            log.info("Skipped " + skipCount + " already-aligned records.");
            bamRecords.close();
        }
        finally {
            // The reader was previously never closed, leaking the file handle.
            reader.close();
            nameIterator.close();
        }
    }

    /**
     * Writes every record produced by the map-file iterator to the BAM writer and
     * collects the read names so the unaligned pass can skip them.
     */
    private SortingCollection<String> writeAlignments(MapFileIterator iterator, SAMFileWriter writer) {
        int wrote = 0;
        SortingCollection<String> readNames = StringSortingCollectionFactory.newCollection();
        while (iterator.hasNext()) {
            SAMRecord record = iterator.next();
            readNames.add(record.getReadName());
            writer.addAlignment(record);
            wrote++;
        }
        // Use the class logger rather than System.out (was a stray println).
        log.info("Wrote " + wrote + " alignment records.");
        return readNames;
    }

    public void setCommandLine(String commandLine) { this.commandLine = commandLine; }
    public String getCommandLine() { return this.commandLine; }
}

View File

@ -1,133 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.aligner.maq;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.aligner.Aligner;
import java.io.File;
import java.util.Map;
import java.util.List;
import java.util.HashMap;
import java.util.ArrayList;
/**
 * CommandLineProgram to invoke the Maq aligner.
 *
 * @author Kathleen Tibbetts
 */
public class RunMaq extends CommandLineProgram {

    private static final String PROGRAM_VERSION = "1.0";

    // The following attributes define the command-line arguments
    @Usage
    public String USAGE =
            "Usage: " + getClass().getName() + " [options]\n\n" +
            "Invoke the Maq aligner.\n" +
            "Version: " + PROGRAM_VERSION +"\n";

    @Option(shortName="I", doc="The BAM file to parse.", optional=true)
    public File INPUT;
    @Option(shortName="O", doc="The directory and file prefix for all output.", optional=false)
    public String OUTPUT;
    @Option(shortName="L", doc="The read length.", optional=false)
    public Integer READ_LENGTH;
    @Option(shortName="S", doc="Stringency of the alignment.", optional=true)
    public Aligner.Stringency STRINGENCY;
    @Option(shortName="R", doc="Directory where the reference file is located.", optional=true)
    public String REFERENCE;
    @Option(shortName="C", doc="Clip points for the alignment.", optional=true, minElements=0, maxElements=4)
    public List<Integer> CLIP_POINT = new ArrayList<Integer>();
    @Option(shortName="E", doc="Expected insert size.", optional=true)
    public Integer EXPECTED_INSERT_SIZE;
    @Option(doc="Whether this is a paired-end run.", optional=false)
    public Boolean PE;
    @Option(shortName="NUM", doc="Number of reads to align (null = all).", optional=true)
    public Integer READS_TO_ALIGN;
    @Option(shortName="CUSTOM", doc="Custom parameter in the form name=value.", optional=true)
    public List<String> CUSTOM_PARAMETER = new ArrayList<String>();
    @Option(shortName="PREP", doc="Whether to prepare inputs for the alignment.", optional=true)
    public Boolean PREPARE = true;
    @Option(doc="Whether to do the alignment.", optional=true)
    public Boolean ALIGN = true;
    @Option(shortName="BAM", doc="Whether to generate a BAM file from the alignment output.", optional=true)
    public Boolean BAM_OUTPUT = true;
    @Option(doc="Whether to clean up intermediate input and output.", optional=true)
    public Boolean CLEANUP = true;

    /**
     * Assembles the aligner arguments and runs whichever phases were requested.
     *
     * @return 0 on success
     */
    protected int doWork() {
        // CLIP_POINT is initialized to an empty list and is never null, so test
        // for emptiness; the old null check always built a zero-filled array,
        // silently passing clip points of [0,0,0,0] when none were supplied.
        int clipPoints[] = null;
        if (!CLIP_POINT.isEmpty()) {
            clipPoints = new int[4];
            int index = 0;
            for (Integer i : CLIP_POINT) {
                clipPoints[index++] = i;
            }
        }
        // Same reasoning for CUSTOM_PARAMETER: pass null (not an empty map)
        // downstream when no custom parameters were supplied.
        Map<String, String> params = null;
        if (!CUSTOM_PARAMETER.isEmpty()) {
            params = new HashMap<String, String>();
            for (String param : CUSTOM_PARAMETER) {
                // Split on the first '=' only so values may themselves contain '='.
                String nameAndVal[] = param.split("=", 2);
                params.put(nameAndVal[0], nameAndVal[1]);
            }
        }
        Aligner aligner = new MaqAligner(STRINGENCY, INPUT, OUTPUT, REFERENCE, clipPoints,
                EXPECTED_INSERT_SIZE, READS_TO_ALIGN, params, PE, READ_LENGTH);
        if (PREPARE) {
            aligner.prepareInputs();
        }
        if (ALIGN) {
            aligner.align();
        }
        if (BAM_OUTPUT) {
            aligner.prepareOutput();
        }
        if (CLEANUP) {
            aligner.cleanup();
        }
        return 0;
    }

    /**
     * Validates argument combinations. Almost everything is optional, since you
     * don't have to do all of the steps in the alignment.
     *
     * @return true if the supplied arguments are consistent
     */
    protected boolean customCommandLineValidation() {
        if (PREPARE) {
            if (INPUT == null) {
                System.err.println("ERROR: INPUT must be specified when preparing inputs for the alignment.");
                return false;
            }
            if (CLIP_POINT.size() != 0 && CLIP_POINT.size() != 4) {
                System.err.println("ERROR: You must supply either 0 or 4 values for CLIP_POINT: " + CLIP_POINT.size());
                return false;
            }
        }
        if (ALIGN) {
            if (STRINGENCY == null) {
                System.err.println("ERROR: STRINGENCY must be specified when doing an alignment.");
                return false;
            }
            if (REFERENCE == null) {
                System.err.println("ERROR: REFERENCE must be specified when doing an alignment.");
                return false;
            }
        }
        return true;
    }

    public static void main(String[] argv) {
        System.exit(new RunMaq().instanceMain(argv));
    }
}

View File

@ -1,27 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.cmdline;
/**
 * Exception thrown when a command line cannot be parsed into the caller's
 * options object (e.g. a value cannot be converted to the target field's type).
 */
public class CommandLineParseException extends RuntimeException {

    // Explicit version id so the serialized form stays stable across compilers.
    private static final long serialVersionUID = 1L;

    public CommandLineParseException() {
    }

    public CommandLineParseException(String s) {
        super(s);
    }

    public CommandLineParseException(String s, Throwable throwable) {
        super(s, throwable);
    }

    public CommandLineParseException(Throwable throwable) {
        super(throwable);
    }
}

View File

@ -1,638 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.cmdline;
import java.io.*;
import java.lang.reflect.Constructor;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import java.util.*;
import edu.mit.broad.picard.util.StringUtil;
import edu.mit.broad.picard.PicardException;
/**
* Annotation-driven utility for parsing command-line arguments, checking for errors, and producing usage message.
*
* This class supports options of the form KEY=VALUE, plus positional arguments. Positional arguments must not contain
* an equal sign lest they be mistaken for a KEY=VALUE pair.
*
* The caller must supply an object that both defines the command line and has the parsed options set into it.
* For each possible KEY=VALUE option, there must be a public data member annotated with @Option. The KEY name is
* the name of the data member. An abbreviated name may also be specified with the shortName attribute of @Option.
* If the data member is a List<T>, then the option may be specified multiple times. The type of the data member,
* or the type of the List element must either have a ctor T(String), or must be an Enum. List options must
* be initialized by the caller with some kind of list. Any other option that is non-null is assumed to have the given
* value as a default. If an option has no default value, and does not have the optional attribute of @Option set,
* is required. For List options, minimum and maximum number of elements may be specified in the @Option annotation.
*
* A single List data member may be annotated with the @PositionalArguments. This behaves similarly to a Option
* with List data member: the caller must initialize the data member, the type must be constructable from String, and
* min and max number of elements may be specified. If no @PositionalArguments annotation appears in the object,
* then it is an error for the command line to contain positional arguments.
*
* A single String public data member may be annotated with @Usage. This string, if present, is used to
* construct the usage message. Details about the possible options are automatically appended to this string.
* If @Usage does not appear, a boilerplate usage message is used.
*/
public class CommandLineParser {
// For formatting option section of usage message.
private static final int OPTION_COLUMN_WIDTH = 30;
private static final int DESCRIPTION_COLUMN_WIDTH = 50;
private static final Boolean[] TRUE_FALSE_VALUES = {Boolean.TRUE, Boolean.FALSE};
// Use these if no @Usage annotation
private static final String defaultUsagePreamble = "Usage: program [options...]\n";
private static final String defaultUsagePreambleWithPositionalArguments =
"Usage: program [options...] [positional-arguments...]\n";
private static final String OPTIONS_FILE = "OPTIONS_FILE";
/**
* A typical command line program will call this to get the beginning of the usage message,
* and then append a description of the program, like this:
*
* \@Usage(programVersion=PROGRAM_VERSION)
* public String USAGE = CommandLineParser.getStandardUsagePreamble(getClass()) + "Frobnicates the freebozzle."
*/
public static String getStandardUsagePreamble(Class mainClass) {
    // Same "USAGE: <class> [options]\n\n" text as before, built explicitly.
    final StringBuilder preamble = new StringBuilder("USAGE: ");
    preamble.append(mainClass.getName()).append(" [options]\n\n");
    return preamble.toString();
}
// This is the object that the caller has provided that contains annotations,
// and into which the values will be assigned.
private final Object callerOptions;
private String usagePreamble;
// null if no @PositionalArguments annotation
private Field positionalArguments;
private int minPositionalArguments;
private int maxPositionalArguments;
// List of all the data members with @Option annotation
private final List<OptionDefinition> optionDefinitions = new ArrayList<OptionDefinition>();
// Maps long name, and short name, if present, to an option definition that is
// also in the optionDefinitions list.
private final Map<String, OptionDefinition> optionMap = new HashMap<String, OptionDefinition>();
// For printing error messages when parsing command line.
private PrintStream messageStream;
// In case implementation wants to get at arg for some reason.
private String[] argv;
/**
* This attribute is here just to facilitate printing usage for OPTIONS_FILE
*/
public File IGNORE_THIS_PROPERTY;
/**
* Prepare for parsing command line arguments, by validating annotations.
* Scans every public field of the caller's class for @PositionalArguments,
* @Usage and @Option annotations and registers each one; a boilerplate usage
* preamble is chosen if the caller did not supply an @Usage field.
*
* @param callerOptions This object contains annotations that define the acceptable command-line options,
* and ultimately will receive the settings when a command line is parsed.
*/
public CommandLineParser(final Object callerOptions) {
this.callerOptions = callerOptions;
// getFields() returns only public fields; each handler validates its annotation.
for (final Field field : this.callerOptions.getClass().getFields()) {
if (field.getAnnotation(PositionalArguments.class) != null) {
handlePositionalArgumentAnnotation(field);
}
if (field.getAnnotation(Usage.class) != null) {
handleUsageAnnotation(field);
}
if (field.getAnnotation(Option.class) != null) {
handleOptionAnnotation(field);
}
}
// Fall back to a canned preamble when no @Usage annotation was found; the
// variant mentioning positional arguments is used if any were declared.
if (usagePreamble == null) {
if (positionalArguments == null) {
usagePreamble = defaultUsagePreamble;
} else {
usagePreamble = defaultUsagePreambleWithPositionalArguments;
}
}
}
/**
* Print a usage message based on the options object passed to the ctor.
* @param stream Where to write the usage message.
*/
public void usage(final PrintStream stream) {
stream.print(usagePreamble);
// Describe each registered @Option, if any were declared.
if (!optionDefinitions.isEmpty()) {
stream.println("\nOptions:\n");
for (final OptionDefinition optionDefinition : optionDefinitions) {
printOptionUsage(stream, optionDefinition);
}
}
// Build a synthetic definition for the OPTIONS_FILE pseudo-option so it is
// documented alongside the real options. IGNORE_THIS_PROPERTY exists only
// to supply a Field object of type File for this purpose.
final Field fileField;
try {
fileField = getClass().getField("IGNORE_THIS_PROPERTY");
} catch (NoSuchFieldException e) {
// The field is declared on this class, so this is unreachable in practice.
throw new PicardException("Should never happen", e);
}
final OptionDefinition optionsFileOptionDefinition =
new OptionDefinition(fileField, OPTIONS_FILE, "",
"File of OPTION_NAME=value pairs. No positional parameters allowed. Unlike command-line options, " +
"unrecognized options are ignored. " + "A single-valued option set in an options file may be overridden " +
"by a subsequent command-line option. " +
"A line starting with '#' is considered a comment.", false, true, 0, Integer.MAX_VALUE, null, new String[0]);
printOptionUsage(stream, optionsFileOptionDefinition);
}
/**
* Parse command-line options, and store values in callerOptions object passed to ctor.
* @param messageStream Where to write error messages.
* @param args Command line tokens.
* @return true if command line is valid.
*/
public boolean parseOptions(final PrintStream messageStream, final String[] args) {
this.argv = args;
this.messageStream = messageStream;
for (final String arg: args) {
// Either help flag short-circuits parsing, prints usage, and reports failure.
if (arg.equals("-h") || arg.equals("--help")) {
usage(messageStream);
return false;
}
// Split into KEY=VALUE on the first '=' only, so values may contain '='.
final String[] pair = arg.split("=", 2);
if (pair.length == 2) {
if (pair[0].equals(OPTIONS_FILE)) {
// Pseudo-option: load additional KEY=VALUE pairs from a file.
if (!parseOptionsFile(pair[1])) {
messageStream.println();
usage(messageStream);
return false;
}
} else {
if (!parseOption(pair[0], pair[1], false)) {
messageStream.println();
usage(messageStream);
return false;
}
}
// A token with no '=' is treated as a positional argument.
} else if (!parsePositionalArgument(arg)) {
messageStream.println();
usage(messageStream);
return false;
}
}
// Final pass: required options present, collection cardinalities satisfied,
// mutually-exclusive options not combined.
if (!checkNumArguments()) {
messageStream.println();
usage(messageStream);
return false;
}
return true;
}
/**
* After command line has been parsed, make sure that all required options have values, and that
* lists with minimum # of elements have sufficient.
* @return true if valid
*/
private boolean checkNumArguments() {
try {
for (final OptionDefinition optionDefinition : optionDefinitions) {
// Collect the names of any options that were set AND are declared
// mutually exclusive with this one.
StringBuilder mutextOptionNames = new StringBuilder();
for (String mutexOption : optionDefinition.mutuallyExclusive) {
OptionDefinition mutextOptionDef = optionMap.get(mutexOption);
if (mutextOptionDef != null && mutextOptionDef.hasBeenSet) {
mutextOptionNames.append(" ").append(mutextOptionDef.name);
}
}
// Both this option and one of its mutex partners were set: error out.
if (optionDefinition.hasBeenSet && mutextOptionNames.length() > 0) {
messageStream.println("ERROR: Option '" + optionDefinition.name +
"' cannot be used in conjunction with option(s)" +
mutextOptionNames.toString());
return false;
}
if (optionDefinition.isCollection) {
// Collection options: enforce the declared minimum element count.
final Collection c = (Collection)optionDefinition.field.get(callerOptions);
if (c.size() < optionDefinition.minElements) {
messageStream.println("ERROR: Option '" + optionDefinition.name + "' must be specified at least " +
optionDefinition.minElements + " times.");
return false;
}
// A required scalar option is satisfied either by being set directly or by
// one of its mutually-exclusive partners having been set instead.
} else if (!optionDefinition.optional && !optionDefinition.hasBeenSet && mutextOptionNames.length() == 0) {
messageStream.print("ERROR: Option '" + optionDefinition.name + "' is required");
if (optionDefinition.mutuallyExclusive.isEmpty()) {
messageStream.println(".");
} else {
messageStream.println(" unless any of " + optionDefinition.mutuallyExclusive + " are specified.");
}
return false;
}
}
// Enforce the minimum number of positional arguments, if they are accepted.
if (positionalArguments != null) {
final Collection c = (Collection)positionalArguments.get(callerOptions);
if (c.size() < minPositionalArguments) {
messageStream.println("ERROR: At least " + minPositionalArguments +
" positional arguments must be specified.");
return false;
}
}
return true;
} catch (IllegalAccessException e) {
// Should never happen because lack of publicness has already been checked.
throw new RuntimeException(e);
}
}
/**
 * Handles a command-line token containing no '=' by treating it as a positional
 * argument. Prints an error and returns false if positional arguments are not
 * accepted, the value cannot be converted to the element type, or the maximum
 * number of positional arguments has already been collected.
 */
private boolean parsePositionalArgument(final String stringValue) {
    // No @PositionalArguments member means positional arguments are not allowed.
    if (positionalArguments == null) {
        messageStream.println("ERROR: Invalid argument '" + stringValue + "'.");
        return false;
    }
    // Convert the raw token into the declared element type.
    final Object parsedValue;
    try {
        parsedValue = constructFromString(getUnderlyingType(positionalArguments), stringValue);
    } catch (CommandLineParseException parseFailure) {
        messageStream.println("ERROR: " + parseFailure.getMessage());
        return false;
    }
    // Fetch the caller-initialized collection that accumulates positional args.
    final Collection positionals;
    try {
        positionals = (Collection) positionalArguments.get(callerOptions);
    } catch (IllegalAccessException unexpected) {
        // Field publicness was validated up front, so this should be impossible.
        throw new RuntimeException(unexpected);
    }
    // Enforce the declared upper bound before accepting the value.
    if (positionals.size() >= maxPositionalArguments) {
        messageStream.println("ERROR: No more than " + maxPositionalArguments +
            " positional arguments may be specified on the command line.");
        return false;
    }
    positionals.add(parsedValue);
    return true;
}
/**
* Sets a single KEY=VALUE pair into the caller's options object. Option names are
* case-insensitive (upper-cased for lookup). When reading from an options file,
* unrecognized keys are ignored and previously-file-set scalar values may be
* overridden by later command-line values.
*/
private boolean parseOption(String key, final String stringValue, final boolean optionsFile) {
key = key.toUpperCase();
final OptionDefinition optionDefinition = optionMap.get(key);
if (optionDefinition == null) {
if (optionsFile) {
// Silently ignore unrecognized option from options file
return true;
}
messageStream.println("ERROR: Unrecognized option: " + key);
return false;
}
// A scalar option may only be set once on the command line, but a value that
// came from an options file may be overridden by a later command-line value.
if (!optionDefinition.isCollection) {
if (optionDefinition.hasBeenSet && !optionDefinition.hasBeenSetFromOptionsFile) {
messageStream.println("ERROR: Option '" + key + "' cannot be specified more than once.");
return false;
}
}
// Convert the raw string into the field's (or list element's) type.
final Object value;
try {
value = constructFromString(getUnderlyingType(optionDefinition.field), stringValue);
} catch (CommandLineParseException e) {
messageStream.println("ERROR: " + e.getMessage());
return false;
}
try {
if (optionDefinition.isCollection) {
// Collection options accumulate values, up to the declared maximum.
final Collection c = (Collection)optionDefinition.field.get(callerOptions);
if (c.size() >= optionDefinition.maxElements) {
messageStream.println("ERROR: Option '" + key + "' cannot be used more than " +
optionDefinition.maxElements + " times.");
return false;
}
c.add(value);
} else {
optionDefinition.field.set(callerOptions, value);
optionDefinition.hasBeenSet = true;
optionDefinition.hasBeenSetFromOptionsFile = optionsFile;
}
} catch (IllegalAccessException e) {
// Should never happen because we only iterate through public fields.
throw new RuntimeException(e);
}
return true;
}
/**
 * Parsing of options from file is looser than normal. Any unrecognized options are
 * ignored, and a single-valued option that is set in a file may be overridden by a
 * subsequent appearance of that option.
 * A line that starts with '#' is ignored.
 *
 * @param optionsFile path of the file of OPTION_NAME=value pairs to read
 * @return false if a fatal error occurred
 * @throws PicardException wrapping any IOException raised while reading the file
 */
private boolean parseOptionsFile(final String optionsFile) {
    try {
        final BufferedReader reader = new BufferedReader(new FileReader(optionsFile));
        // Close in finally: the original leaked the reader on every early-return
        // and exception path, only closing after a fully successful parse.
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("#")) {
                    continue;
                }
                // Split on the first '=' only, so values may contain '='.
                final String[] pair = line.split("=", 2);
                if (pair.length == 2) {
                    if (!parseOption(pair[0], pair[1], true)) {
                        messageStream.println();
                        usage(messageStream);
                        return false;
                    }
                } else {
                    messageStream.println("Strange line in OPTIONS_FILE " + optionsFile + ": " + line);
                    usage(messageStream);
                    return false;
                }
            }
            return true;
        } finally {
            reader.close();
        }
    } catch (IOException e) {
        throw new PicardException("I/O error loading OPTIONS_FILE=" + optionsFile, e);
    }
}
/**
 * Prints the usage entry for a single option: the long (and optional short)
 * label, followed by a word-wrapped description including the default value or
 * required-ness, the legal value set for enums/booleans, collection cardinality,
 * and any mutually-exclusive options.
 */
private void printOptionUsage(final PrintStream stream, final OptionDefinition optionDefinition) {
    final String type = getUnderlyingType(optionDefinition.field).getSimpleName();
    String optionLabel = optionDefinition.name + "=" + type;
    stream.print(optionLabel);
    if (optionDefinition.shortName.length() > 0) {
        // Long form on its own line, then the short form; the column math below
        // uses the short label since it is the last one printed.
        stream.println();
        optionLabel = optionDefinition.shortName + "=" + type;
        stream.print(optionLabel);
    }
    int numSpaces = OPTION_COLUMN_WIDTH - optionLabel.length();
    if (optionLabel.length() > OPTION_COLUMN_WIDTH) {
        // Label is too wide for the column: start the description on a fresh line.
        stream.println();
        numSpaces = OPTION_COLUMN_WIDTH;
    }
    printSpaces(stream, numSpaces);
    final StringBuilder sb = new StringBuilder();
    if (optionDefinition.doc.length() > 0) {
        sb.append(optionDefinition.doc);
        sb.append(" ");
    }
    if (optionDefinition.optional && !optionDefinition.isCollection) {
        sb.append("Default value: ");
        sb.append(optionDefinition.defaultValue);
        sb.append(". ");
    } else if (!optionDefinition.isCollection) {
        sb.append("Required. ");
    }
    // Enumerate the legal values for enum-typed options; Boolean gets the same
    // treatment via the canned TRUE/FALSE value set.
    Object[] enumConstants = getUnderlyingType(optionDefinition.field).getEnumConstants();
    if (enumConstants == null && getUnderlyingType(optionDefinition.field) == Boolean.class) {
        enumConstants = TRUE_FALSE_VALUES;
    }
    if (enumConstants != null) {
        sb.append("Possible values: {");
        for (int i = 0; i < enumConstants.length; ++i) {
            if (i > 0) {
                sb.append(", ");
            }
            sb.append(enumConstants[i].toString());
        }
        sb.append("} ");
    }
    if (optionDefinition.isCollection) {
        if (optionDefinition.minElements == 0) {
            if (optionDefinition.maxElements == Integer.MAX_VALUE) {
                sb.append("This option may be specified 0 or more times.");
            } else {
                // Fixed: the original omitted the space before "times.",
                // producing e.g. "no more than 4times."
                sb.append("This option must be specified no more than " +
                        optionDefinition.maxElements + " times.");
            }
        } else if (optionDefinition.maxElements == Integer.MAX_VALUE) {
            sb.append("This option must be specified at least " + optionDefinition.minElements + " times.");
        } else {
            sb.append("This option may be specified between " + optionDefinition.minElements +
                    " and " + optionDefinition.maxElements + " times.");
        }
    }
    if (!optionDefinition.mutuallyExclusive.isEmpty()) {
        // Fixed typo "conjuction" -> "conjunction" (matches the wording already
        // used by checkNumArguments()).
        sb.append(" Cannot be used in conjunction with option(s)");
        for (String option : optionDefinition.mutuallyExclusive) {
            OptionDefinition mutexOptionDefinition = optionMap.get(option);
            sb.append(" ").append(mutexOptionDefinition.name);
            if (mutexOptionDefinition.shortName.length() > 0) {
                sb.append(" (").append(mutexOptionDefinition.shortName).append(")");
            }
        }
    }
    // Wrap the description and indent continuation lines to the option column.
    final String wrappedDescription = StringUtil.wordWrap(sb.toString(), DESCRIPTION_COLUMN_WIDTH);
    final String[] descriptionLines = wrappedDescription.split("\n");
    for (int i = 0; i < descriptionLines.length; ++i) {
        if (i > 0) {
            printSpaces(stream, OPTION_COLUMN_WIDTH);
        }
        stream.println(descriptionLines[i]);
    }
    stream.println();
}
/** Writes {@code numSpaces} space characters to the stream in a single print call. */
private void printSpaces(final PrintStream stream, final int numSpaces) {
    // Build the padding up front rather than printing one space at a time.
    final char[] padding = new char[numSpaces];
    Arrays.fill(padding, ' ');
    stream.print(new String(padding));
}
/**
 * Registers a field annotated with {@code @Option} into the parser's option maps.
 * Validates collection cardinality bounds and that the option's value type can be
 * constructed from a String, records symmetric mutual-exclusion links, and rejects
 * duplicate long or short names.
 *
 * @param field the annotated data member of the caller's options object.
 * @throws CommandLineParserDefinitionException if the annotation is malformed,
 *         the name is already in use, or the field is not publicly accessible.
 */
private void handleOptionAnnotation(final Field field) {
    try {
        final Option optionAnnotation = field.getAnnotation(Option.class);
        final boolean isCollection = isCollectionField(field);
        if (isCollection) {
            if (optionAnnotation.maxElements() == 0) {
                // Fixed message: was missing the space after the field name.
                throw new CommandLineParserDefinitionException("@Option member " + field.getName() +
                        " has maxElements = 0");
            }
            if (optionAnnotation.minElements() > optionAnnotation.maxElements()) {
                throw new CommandLineParserDefinitionException("In @Option member " + field.getName() +
                        ", minElements cannot be > maxElements");
            }
        }
        if (!canBeMadeFromString(getUnderlyingType(field))) {
            throw new CommandLineParserDefinitionException("@Option member " + field.getName() +
                    " must have a String ctor or be an enum");
        }
        // An option is treated as optional if declared so, or if the field already
        // carries a non-null default value.
        final OptionDefinition optionDefinition = new OptionDefinition(field,
                field.getName(),
                optionAnnotation.shortName(),
                optionAnnotation.doc(), optionAnnotation.optional() || (field.get(callerOptions) != null),
                isCollection, optionAnnotation.minElements(),
                optionAnnotation.maxElements(), field.get(callerOptions),
                optionAnnotation.mutex());
        // Make mutual exclusion symmetric: options listed in this mutex that were
        // registered earlier get this option added to their exclusion sets too.
        for (final String option : optionAnnotation.mutex()) {
            final OptionDefinition mutexOptionDef = optionMap.get(option);
            if (mutexOptionDef != null) {
                mutexOptionDef.mutuallyExclusive.add(field.getName());
            }
        }
        if (optionMap.containsKey(optionDefinition.name)) {
            throw new CommandLineParserDefinitionException(optionDefinition.name + " has already been used");
        }
        optionMap.put(optionDefinition.name, optionDefinition);
        if (optionDefinition.shortName.length() > 0) {
            if (optionMap.containsKey(optionDefinition.shortName)) {
                throw new CommandLineParserDefinitionException(optionDefinition.shortName + " has already been used");
            }
            // The short name is an alias pointing at the same definition.
            optionMap.put(optionDefinition.shortName, optionDefinition);
        }
        optionDefinitions.add(optionDefinition);
    } catch (IllegalAccessException e) {
        // Preserve the underlying cause instead of dropping it.
        throw new CommandLineParserDefinitionException(field.getName() +
                " must have public visibility to have @Option annotation", e);
    }
}
/**
 * Captures the {@code @Usage} preamble string from the annotated field, appending
 * a "Version:" line when the annotation declares a program version.
 *
 * @throws CommandLineParserDefinitionException if @Usage appears more than once,
 *         is not public, or is not applied to a String member.
 */
private void handleUsageAnnotation(final Field field) {
    if (usagePreamble != null) {
        throw new CommandLineParserDefinitionException
                ("@Usage cannot be used more than once in an option class.");
    }
    try {
        usagePreamble = (String) field.get(callerOptions);
        final Usage usageAnnotation = field.getAnnotation(Usage.class);
        final String programVersion = usageAnnotation.programVersion();
        if (!programVersion.isEmpty()) {
            usagePreamble += "Version: " + programVersion + "\n";
        }
    } catch (IllegalAccessException e) {
        throw new CommandLineParserDefinitionException("@Usage data member must be public");
    } catch (ClassCastException e) {
        throw new CommandLineParserDefinitionException
                ("@Usage can only be applied to a String data member.");
    }
}
/**
 * Registers the single field annotated with {@code @PositionalArguments}.
 * The field must be a Collection whose element type is constructible from a String;
 * min/max element counts are taken from the annotation.
 *
 * @throws CommandLineParserDefinitionException if the annotation is used twice,
 *         applied to a non-Collection, the element type is not string-constructible,
 *         or minElements exceeds maxElements.
 */
private void handlePositionalArgumentAnnotation(final Field field) {
    if (positionalArguments != null) {
        throw new CommandLineParserDefinitionException
                ("@PositionalArguments cannot be used more than once in an option class.");
    }
    positionalArguments = field;
    if (!isCollectionField(field)) {
        throw new CommandLineParserDefinitionException("@PositionalArguments must be applied to a Collection");
    }
    if (!canBeMadeFromString(getUnderlyingType(field))) {
        // Fixed message: previously named the wrong annotation ("@PositionalParameters")
        // and was missing the space before "does not".
        throw new CommandLineParserDefinitionException("@PositionalArguments member " + field.getName() +
                " does not have a String ctor");
    }
    final PositionalArguments positionalArgumentsAnnotation = field.getAnnotation(PositionalArguments.class);
    minPositionalArguments = positionalArgumentsAnnotation.minElements();
    maxPositionalArguments = positionalArgumentsAnnotation.maxElements();
    if (minPositionalArguments > maxPositionalArguments) {
        throw new CommandLineParserDefinitionException("In @PositionalArguments, minElements cannot be > maxElements");
    }
}
/**
 * Returns true if the field's declared type implements {@link Collection}.
 * Uses {@code Class.isAssignableFrom} instead of the original catch-ClassCastException
 * pattern: same result, without using exceptions for control flow.
 */
private boolean isCollectionField(final Field field) {
    return Collection.class.isAssignableFrom(field.getType());
}
/**
 * Returns the type of value the field holds: for a Collection field, its declared
 * element type; otherwise the field's own type.
 *
 * NOTE(review): assumes a collection field is declared with a parameterized type
 * such as List&lt;String&gt; whose argument is itself a plain Class — a raw Collection
 * or a nested generic would make one of the casts below fail. Confirm callers
 * always declare options that way.
 *
 * @throws CommandLineParserDefinitionException if the collection does not have
 *         exactly one generic type argument.
 */
private Class getUnderlyingType(final Field field) {
    if (isCollectionField(field)) {
        final ParameterizedType clazz = (ParameterizedType)(field.getGenericType());
        final Type[] genericTypes = clazz.getActualTypeArguments();
        if (genericTypes.length != 1) {
            throw new CommandLineParserDefinitionException("Strange collection type for field " + field.getName());
        }
        return (Class)genericTypes[0];
    } else {
        return field.getType();
    }
}
/**
 * Returns true when values of {@code clazz} can be parsed from command-line text:
 * either the class is an enum, or it declares a public single-String constructor.
 */
private boolean canBeMadeFromString(final Class clazz) {
    // Enums are handled via Enum.valueOf at parse time, so no constructor is needed.
    if (!clazz.isEnum()) {
        try {
            clazz.getConstructor(String.class);
        } catch (NoSuchMethodException ignored) {
            // No single-String constructor: the type cannot be built from text.
            return false;
        }
    }
    return true;
}
/**
 * Builds an option or positional-argument value of the given type from its
 * command-line string. Enums are parsed with Enum.valueOf; every other type is
 * constructed via its single-String constructor (whose existence was verified
 * earlier by canBeMadeFromString).
 *
 * @throws CommandLineParseException wrapping any parse or construction failure,
 *         with the original exception (or its cause, for reflective invocation
 *         failures) preserved.
 */
private Object constructFromString(final Class clazz, final String s) {
    try {
        if (clazz.isEnum()) {
            try {
                return Enum.valueOf(clazz, s);
            } catch (IllegalArgumentException e) {
                throw new CommandLineParseException("'" + s + "' is not a valid value for " +
                        clazz.getSimpleName() + ".", e);
            }
        }
        final Constructor ctor = clazz.getConstructor(String.class);
        return ctor.newInstance(s);
    } catch (NoSuchMethodException e) {
        // Shouldn't happen because we've checked for presence of ctor
        throw new CommandLineParseException(e);
    } catch (InstantiationException e) {
        // NOTE(review): this message lacks a space before "cannot" — fix belongs in a code change.
        throw new CommandLineParseException("Abstract class '" + clazz.getSimpleName() +
                "'cannot be used for an option value type.", e);
    } catch (IllegalAccessException e) {
        throw new CommandLineParseException("String constructor for option value type '" + clazz.getSimpleName() +
                "' must be public.", e);
    } catch (InvocationTargetException e) {
        // Unwrap the reflective wrapper so the real failure is reported as the cause.
        throw new CommandLineParseException("Problem constructing " + clazz.getSimpleName() + " from the string '" + s + "'.",
                e.getCause());
    }
}
/**
 * Returns the raw command line most recently passed to the parser.
 * NOTE(review): this exposes the internal array itself, so callers can mutate the
 * parser's copy — consider returning argv.clone() if that ever matters.
 */
public String[] getArgv() {
    return argv;
}
/**
 * Internal record of a single parsed @Option declaration, plus mutable
 * parse-time state (whether the option has been supplied, and from where).
 */
private class OptionDefinition {
    final Field field;          // the annotated member on the caller's options object
    final String name;          // long option name, stored upper-cased
    final String shortName;     // short option name, stored upper-cased ("" if none)
    final String doc;
    final boolean optional;
    final boolean isCollection;
    final int minElements;
    final int maxElements;
    final String defaultValue;  // string form of the default, or "null"
    boolean hasBeenSet = false;
    boolean hasBeenSetFromOptionsFile = false;
    Set<String> mutuallyExclusive;

    private OptionDefinition(final Field field, final String name, final String shortName, final String doc,
                             final boolean optional, final boolean collection, final int minElements,
                             final int maxElements, final Object defaultValue,
                             final String[] mutuallyExclusive) {
        this.field = field;
        // Names are matched case-insensitively by normalizing them to upper case here.
        this.name = name.toUpperCase();
        this.shortName = shortName.toUpperCase();
        this.doc = doc;
        this.optional = optional;
        this.isCollection = collection;
        this.minElements = minElements;
        this.maxElements = maxElements;
        this.defaultValue = (defaultValue != null) ? defaultValue.toString() : "null";
        this.mutuallyExclusive = new HashSet<String>(Arrays.asList(mutuallyExclusive));
    }
}
}

View File

@ -1,27 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.cmdline;
/**
 * Thrown when the @Option/@Usage/@PositionalArguments annotations on an options
 * class are malformed. This indicates a programming error in the tool's option
 * declarations, hence an unchecked exception.
 */
public class CommandLineParserDefinitionException extends RuntimeException {

    public CommandLineParserDefinitionException() {
        super();
    }

    public CommandLineParserDefinitionException(final String s) {
        super(s);
    }

    public CommandLineParserDefinitionException(final String s, final Throwable throwable) {
        super(s, throwable);
    }

    public CommandLineParserDefinitionException(final Throwable throwable) {
        super(throwable);
    }
}

View File

@ -1,141 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.cmdline;
import edu.mit.broad.picard.util.Log;
import edu.mit.broad.picard.util.StringUtil;
import edu.mit.broad.picard.metrics.Header;
import edu.mit.broad.picard.metrics.StringHeader;
import edu.mit.broad.picard.metrics.MetricsFile;
import edu.mit.broad.picard.metrics.MetricBase;
import java.io.File;
import java.util.Date;
import java.util.List;
import java.util.ArrayList;
/**
 * Abstract class to facilitate writing command-line programs.
 *
 * To use:
 *
 * 1. Extend this class with a concrete class that has data members annotated with @Option, @PositionalArguments
 * and/or @Usage annotations.
 *
 * 2. If there is any custom command-line validation, override customCommandLineValidation(). When this method is
 * called, the command line has been parsed and set into the data members of the concrete class.
 *
 * 3. Implement a method doWork(). This is called after successful command-line processing. The value it returns is
 * the exit status of the program. It is assumed that the concrete class emits any appropriate error message before
 * returning non-zero. doWork() may throw unchecked exceptions, which are caught and reported appropriately.
 *
 * 4. Implement the following static method in the concrete class:
 *
 * public static void main(String[] argv) {
        System.exit(new MyConcreteClass().instanceMain(argv));
    }
 */
public abstract class CommandLineProgram {
    /** Directory for temporary files; defaults to &lt;java.io.tmpdir&gt;/&lt;user.name&gt;. */
    @Option
    public File TMP_DIR = new File(System.getProperty("java.io.tmpdir"), System.getProperty("user.name"));
    @Option(doc = "Control verbosity of logging")
    public Log.LogLevel VERBOSITY = Log.LogLevel.INFO;
    @Option(doc = "Whether to suppress job-summary info on System.out")
    public Boolean QUIET = false;
    private final String standardUsagePreamble = CommandLineParser.getStandardUsagePreamble(getClass());
    /**
     * Initialized in parseArgs. Subclasses may want to access this to do
     * their own validation, and then print usage using clp.
     */
    protected CommandLineParser clp;
    // Headers (command line and start time) prepended to every metrics file produced.
    private final List<Header> defaultHeaders = new ArrayList<Header>();
    /**
     * Do the work after command line has been parsed.
     * RuntimeException may be thrown by this method, and are reported appropriately.
     * @return program exit status.
     */
    protected abstract int doWork();
    /**
     * Template method driving the program lifecycle: records default headers,
     * parses the command line, configures logging and the temp directory, echoes
     * the command line (unless QUIET), runs doWork(), and reports completion.
     * @return 1 if the command line was invalid, otherwise doWork()'s exit status.
     */
    public int instanceMain(final String[] argv) {
        // Build the default headers
        final Date startDate = new Date();
        final String cmdline = getClass().getName() + " " + StringUtil.join(" ", argv);
        this.defaultHeaders.add(new StringHeader(cmdline));
        this.defaultHeaders.add(new StringHeader("Started on: " + startDate));
        if (!parseArgs(argv)) {
            return 1;
        }
        Log.setGlobalLogLevel(VERBOSITY);
        if (!TMP_DIR.exists()) {
            // Intentionally not checking the return value, because it may be that the program does not
            // need a tmp_dir. If this fails, the problem will be discovered downstream.
            TMP_DIR.mkdir();
        }
        System.setProperty("java.io.tmpdir", TMP_DIR.getAbsolutePath());
        if (!QUIET) {
            System.out.println("[" + new Date() + "] " + cmdline);
        }
        final int ret = doWork();
        if (!QUIET) {
            System.out.println("[" + new Date() + "] " + getClass().getName() + " done.");
            System.out.println("Runtime.totalMemory()=" + Runtime.getRuntime().totalMemory());
        }
        return ret;
    }
    /**
     * Put any custom command-line validation in an override of this method.
     * clp is initialized at this point and can be used to print usage and access argv.
     * Any options set by command-line parser can be validated.
     * @return true if command line is valid.
     */
    protected boolean customCommandLineValidation() {
        return true;
    }
    /**
     * Parses argv into this object's annotated fields, then runs the subclass's
     * customCommandLineValidation(). Parse errors are printed to System.err by the parser.
     * @return true if command line is valid
     */
    protected boolean parseArgs(final String[] argv) {
        clp = new CommandLineParser(this);
        final boolean ret = clp.parseOptions(System.err, argv);
        if (!ret) {
            return false;
        }
        return customCommandLineValidation();
    }
    /** Gets a MetricsFile with default headers already written into it. */
    protected <A extends MetricBase,B extends Comparable> MetricsFile<A,B> getMetricsFile() {
        final MetricsFile<A,B> file = new MetricsFile<A,B>();
        for (final Header h : this.defaultHeaders) {
            file.addHeader(h);
        }
        return file;
    }
    /** Returns the standard usage preamble derived from the concrete class name. */
    public String getStandardUsagePreamble() {
        return standardUsagePreamble;
    }
}

View File

@ -1,39 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.cmdline;
import java.io.*;
import java.util.regex.Pattern;
/**
 * Static helpers for command-line programs: common whitespace-splitting patterns
 * and checked access to input files.
 */
public class CommandLineUtils {
    /** Regex for splitting on spaces. */
    public static final Pattern SPACE_SPLITTER = Pattern.compile(" ");

    /** Regex for splitting on tab characters. */
    public static final Pattern TAB_SPLITTER = Pattern.compile("\\t");

    /** Utility class: no instances. */
    private CommandLineUtils() {
    }

    /** Checks that a file exists and is readable, and then returns a buffered reader for it. */
    public static BufferedReader getReader(File file) throws IOException {
        return new BufferedReader(new InputStreamReader(getInputStream(file)));
    }

    /**
     * Checks that a file exists and is readable, and then returns a input stream for it.
     * @throws RuntimeException if the file does not exist or is not readable.
     */
    public static InputStream getInputStream(File file) throws IOException {
        if (!file.exists()) {
            throw new RuntimeException("Specified file does not exist: " + file);
        }
        if (!file.canRead()) {
            throw new RuntimeException("Specified file is not readable: " + file);
        }
        return new FileInputStream(file);
    }
}

View File

@ -1,60 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.cmdline;
import java.lang.annotation.Documented;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Used to annotate which fields of a CommandLineProgram are options given at the command line.
 * If a command line call looks like "cmd option=foo x=y bar baz" the CommandLineProgram
 * would have annotations on fields to handle the values of option and x. All options
 * must be in the form name=value on the command line. The java type of the option
 * will be inferred from the type of the field or from the generic type of the collection
 * if this option is allowed more than once. The type must be an enum or
 * have a constructor with a single String parameter.
 *
 * @author Alec Wysoker
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
@Documented
public @interface Option {
    /** The short name of the option, usable on the command line in place of the field name (e.g. "I" for INPUT). */
    String shortName() default "";
    /** Text that appears for this option in text describing usage of the command line program. */
    String doc() default "";
    /**
     * If set to false, an exception will be thrown if the option is not specified.
     * If 2 options are mutually exclusive and both have optional=false it will be
     * interpreted as one or the other is required and an exception will only be thrown if
     * neither are specified.
     */
    boolean optional() default false;
    /**
     * Array of option names that cannot be used in conjunction with this one.
     * If 2 options are mutually exclusive and both have optional=false it will be
     * interpreted as one OR the other is required and an exception will only be thrown if
     * neither are specified.
     */
    String[] mutex() default {};
    /** The minimum number of times that this option is required (collection options only). */
    int minElements() default 0;
    /** The maximum number of times this option is allowed (collection options only). */
    int maxElements() default Integer.MAX_VALUE;
}

View File

@ -1,38 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.cmdline;
import java.lang.annotation.Documented;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Used to annotate which field of a CommandLineProgram should store parameters given at the
 * command line which are not options. Fields with this annotation must be a Collection
 * (and probably should be a List if order is important).
 * If a command line call looks like "cmd option=foo x=y bar baz" the values "bar" and "baz"
 * would be added to the collection with this annotation. The java type of the arguments
 * will be inferred from the generic type of the collection. The type must be an enum or
 * have a constructor with a single String parameter.
 *
 * At most one field per options class may carry this annotation.
 *
 * @author Alec Wysoker
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
@Documented
public @interface PositionalArguments {
    /** The minimum number of arguments required. */
    int minElements() default 0;
    /** The maximum number of arguments allowed. */
    int maxElements() default Integer.MAX_VALUE;
}

View File

@ -1,26 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.cmdline;
import java.lang.annotation.Documented;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Annotates the field that contains text to be displayed in a usage message.
 * Must be applied to a public String member; may appear at most once per options class.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
@Documented
public @interface Usage {
    /** Optional program version; when non-empty, a "Version:" line is appended to the preamble. */
    String programVersion() default "";
}

View File

@ -1,62 +0,0 @@
package edu.mit.broad.picard.directed;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.util.BasicTextFileParser;
import edu.mit.broad.picard.util.Interval;
import edu.mit.broad.picard.util.FormatUtil;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.SAMSequenceRecord;
import java.io.File;
import java.util.List;
/**
 * Converts an arachne style map file to the new interval list format.
 *
 * Reads 3-column (sequence index, start, end) records from the map file, shifts the
 * coordinates to 1-based, names each interval PREFIX_&lt;n&gt;, then sorts and writes the
 * resulting interval list using the sequence dictionary for contig names.
 *
 * @author Tim Fennell
 */
public class ArachneMapToIntervalList extends CommandLineProgram {
    @Option(shortName="M", doc="The path to an arachne style map file") public File MAP;
    @Option(shortName="SD", doc="A sequence dictionary in SAM or BAM format") public File SEQUENCE_DICTIONARY;
    @Option(shortName="O", doc="The output file to write the interval list to") public File OUTPUT;
    @Option(shortName="P", doc="Prefix to use when generating names") public String PREFIX;

    /** Stock main method. */
    public static void main(String[] argv) {
        System.exit(new ArachneMapToIntervalList().instanceMain(argv));
    }

    /** Parses the map file and writes a sorted interval list. @return 0 on success. */
    protected int doWork() {
        IoUtil.assertFileIsReadable(MAP);
        IoUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
        IoUtil.assertFileIsWritable(OUTPUT);
        SAMFileReader sam = new SAMFileReader(SEQUENCE_DICTIONARY);
        SAMFileHeader header = sam.getFileHeader();
        List<SAMSequenceRecord> seqs = header.getSequences();
        IntervalList list = new IntervalList(header);
        BasicTextFileParser parser = new BasicTextFileParser(true, 3, MAP);
        FormatUtil format = new FormatUtil();
        int i=1;
        while (parser.hasNext()) {
            String[] fields = parser.next();
            int seqIndex = format.parseInt(fields[0]);
            // NOTE(review): both start and end are shifted by +1. If the Arachne map is
            // 0-based half-open, the end should arguably NOT be incremented — confirm
            // against the map format before relying on exact interval ends.
            int start = format.parseInt(fields[1]) + 1;
            int end = format.parseInt(fields[2]) + 1;
            String seq = seqs.get(seqIndex).getSequenceName();
            Interval interval = new Interval(seq, start, end, false, PREFIX + "_" + i++);
            list.add(interval);
        }
        list.sort();
        list.write(OUTPUT);
        return 0;
    }
}

View File

@ -1,51 +0,0 @@
package edu.mit.broad.picard.directed;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.metrics.MetricsFile;
import edu.mit.broad.sam.SAMFileReader;
import java.io.File;
/**
 * Calculates a set of HS metrics from a sam or bam file.
 *
 * @author Tim Fennell
 */
public class CalculateHsMetrics extends CommandLineProgram {
    // Fixed usage text: the concatenation previously produced "SAMor BAM".
    @Usage public final String USAGE =
            "Calculates a set of Hybrid Selection specific metrics from an aligned SAM " +
            "or BAM file.";
    @Option(shortName="BI") public File BAIT_INTERVALS;
    @Option(shortName="TI") public File TARGET_INTERVALS;
    @Option(shortName="I") public File INPUT;
    @Option(shortName="M") public File METRICS_FILE;

    /** Stock main method. */
    public static void main(String[] argv) {
        System.exit(new CalculateHsMetrics().instanceMain(argv));
    }

    /**
     * Asserts that files are readable and writable and then fires off an
     * HsMetricsCalculator instance to do the real work.
     */
    protected int doWork() {
        IoUtil.assertFileIsReadable(BAIT_INTERVALS);
        IoUtil.assertFileIsReadable(TARGET_INTERVALS);
        IoUtil.assertFileIsReadable(INPUT);
        IoUtil.assertFileIsWritable(METRICS_FILE);
        HsMetricsCalculator calculator = new HsMetricsCalculator(BAIT_INTERVALS, TARGET_INTERVALS);
        SAMFileReader sam = new SAMFileReader(INPUT);
        calculator.analyze(sam.iterator());
        MetricsFile<HsMetrics, Integer> metrics = getMetricsFile();
        metrics.addMetric(calculator.getMetrics());
        metrics.write(METRICS_FILE);
        return 0;
    }
}

View File

@ -1,52 +0,0 @@
package edu.mit.broad.picard.directed;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.BitSet;
import java.util.SortedMap;
import java.util.TreeMap;
/**
 * Utility class to store coordinates of interest in per-sequence bitmasks:
 * one BitSet per contig index, held in a sorted map keyed by contig.
 */
public class GenomeMask {
    // if memory usage becomes a problem... this could be changed to a SparseBitSet
    // http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html
    private SortedMap<Integer, BitSet> maskByContig = new TreeMap<Integer, BitSet>();

    public GenomeMask() {
    }

    /** Returns true iff the given position on the given contig has been set. */
    public boolean get(int contig, int position) {
        final BitSet mask = maskByContig.get(contig);
        if (mask == null) {
            return false;
        }
        return mask.get(position);
    }

    /** Returns the BitSet for the contig, or null if none has been created. */
    public BitSet get(int contig) {
        return maskByContig.get(contig);
    }

    /**
     * Get an existing BitSet for the given contig, or create one if not already present. This is
     * useful when initializing a GenomeMask from an external source.
     * @param contig which BitSet
     * @param numBits if there was not already a BitSet for this contig, one is created and initialized to this size.
     * @return the BitSet for the given contig, creating one if necessary
     */
    public BitSet getOrCreate(int contig, int numBits) {
        final BitSet existing = maskByContig.get(contig);
        if (existing != null) {
            return existing;
        }
        final BitSet created = new BitSet(numBits);
        maskByContig.put(contig, created);
        return created;
    }

    /** Returns the largest contig index that has a mask (throws if the mask is empty). */
    public int getMaxContig() {
        return maskByContig.lastKey();
    }
}

View File

@ -1,47 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.directed;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.picard.util.Interval;
import edu.mit.broad.picard.io.IoUtil;
import java.util.List;
import java.util.BitSet;
import java.io.File;
/**
 * Create a GenomeMask from an IntervalList or a file containing an IntervalList
 */
public class GenomeMaskFactory {

    /** Builds a mask whose bits cover every base of the unique (merged) intervals in the list. */
    public GenomeMask makeGenomeMaskFromIntervalList(IntervalList intervalList) {
        // getUniqueIntervals needs coordinate order, so sort first if necessary.
        if (intervalList.getHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            intervalList.sort();
        }
        final GenomeMask mask = new GenomeMask();
        final SAMFileHeader samHeader = intervalList.getHeader();
        for (final Interval interval : intervalList.getUniqueIntervals()) {
            // TODO: Maybe figure out more intelligently how big the bitset might be?
            final int sequenceIndex = samHeader.getSequenceIndex(interval.getSequence());
            final BitSet bits = mask.getOrCreate(sequenceIndex, interval.getEnd() + 1);
            bits.set(interval.getStart(), interval.getEnd() + 1);
        }
        return mask;
    }

    /** Convenience overload that reads the IntervalList from a file first. */
    public GenomeMask makeGenomeMaskFromIntervalList(File intervalListFile) {
        IoUtil.assertFileIsReadable(intervalListFile);
        return makeGenomeMaskFromIntervalList(IntervalList.fromFile(intervalListFile));
    }
}

View File

@ -1,108 +0,0 @@
package edu.mit.broad.picard.directed;
import edu.mit.broad.picard.metrics.MetricBase;
/**
 * The set of metrics captured that are specific to a hybrid selection analysis.
 *
 * NOTE(review): several fields (MEAN_TARGET_COVERAGE, ZERO_CVG_TARGETS_PCT,
 * FOLD_80_BASE_PENALTY) are not filled in by calculateDerivedMetrics() — confirm
 * the calculator populates them before the metrics file is written.
 *
 * @author Tim Fennell
 */
public class HsMetrics extends MetricBase {
    /** The name of the bait set used in the hybrid selection. */
    public String BAIT_SET;
    /** The number of bases in the reference genome used for alignment. */
    public long GENOME_SIZE;
    /** The number of bases which have one or more baits on top of them. */
    public long BAIT_TERRITORY;
    /** The unique number of target bases in the experiment where target is usually exons etc. */
    public long TARGET_TERRITORY;
    /** Target territory / bait territory. 1 == perfectly efficient, 0.5 = half of baited bases are not target. */
    public double BAIT_DESIGN_EFFICIENCY;
    /** The total number of reads in the SAM or BAM file examined. */
    public int TOTAL_READS;
    /** The number of reads that pass the vendor's filter. */
    public int PF_READS;
    /** The number of PF reads that are not marked as duplicates. */
    public int PF_UNIQUE_READS;
    /** PF reads / total reads. The percent of reads passing filter. */
    public double PCT_PF_READS;
    /** PF Unique Reads / Total Reads. */
    public double PCT_PF_UQ_READS;
    /** The number of PF reads that are aligned with mapping score > 0 to the reference genome. */
    public int PF_READS_ALIGNED;
    /** PF Reads Aligned / PF Reads. */
    public double PCT_PF_READS_ALIGNED;
    /** The number of bases in the PF aligned reads that are mapped to a reference base. Accounts for clipping and gaps. */
    public int PF_BASES_ALIGNED;
    /** The number of PF aligned bases that mapped to a baited region of the genome. */
    public long ON_BAIT_BASES;
    /** The number of PF aligned bases that mapped to within a fixed interval of a baited region, but not on a baited region. */
    public long NEAR_BAIT_BASES;
    /** The number of PF aligned bases that mapped to neither on or near a bait. */
    public long OFF_BAIT_BASES;
    /** The number of PF aligned bases that mapped to a targeted region of the genome. */
    public long ON_TARGET_BASES;
    /** On+Near Bait Bases / PF Bases Aligned. */
    public double PCT_SELECTED_BASES;
    /** The percentage of aligned PF bases that mapped neither on or near a bait. */
    public double PCT_OFF_BAIT;
    /** The percentage of on+near bait bases that are on as opposed to near. */
    public double ON_BAIT_VS_SELECTED;
    /** The mean coverage of all baits in the experiment. */
    public double MEAN_BAIT_COVERAGE;
    /** The mean coverage of targets that received at least coverage depth = 2 at one base. */
    public double MEAN_TARGET_COVERAGE;
    /** The fold by which the baited region has been amplified above genomic background. */
    public double FOLD_ENRICHMENT;
    /** The number of targets that did not reach coverage=2 over any base. */
    public double ZERO_CVG_TARGETS_PCT;
    /**
     * The fold over-coverage necessary to raise 80% of bases in "non-zero-cvg" targets to
     * the mean coverage level in those targets.
     */
    public double FOLD_80_BASE_PENALTY;
    /**
     * Calculates the metrics in this class that can be derived from other metrics in the class.
     * NOTE(review): the divisions below yield NaN/Infinity when a denominator is zero
     * (e.g. no aligned bases); callers should guard against that if it matters.
     */
    public void calculateDerivedMetrics() {
        BAIT_DESIGN_EFFICIENCY = (double) TARGET_TERRITORY / (double) BAIT_TERRITORY;
        PCT_PF_READS = PF_READS / (double) TOTAL_READS;
        PCT_PF_UQ_READS = PF_UNIQUE_READS / (double) TOTAL_READS;
        // NOTE(review): the field doc says "PF Reads Aligned / PF Reads", but the
        // denominator here is PF_UNIQUE_READS — confirm which is intended.
        PCT_PF_READS_ALIGNED = PF_READS_ALIGNED / (double) PF_UNIQUE_READS;
        double denominator = (ON_BAIT_BASES + NEAR_BAIT_BASES + OFF_BAIT_BASES);
        PCT_SELECTED_BASES = (ON_BAIT_BASES + NEAR_BAIT_BASES) / denominator;
        PCT_OFF_BAIT = OFF_BAIT_BASES / denominator;
        ON_BAIT_VS_SELECTED = ON_BAIT_BASES / (double) (ON_BAIT_BASES + NEAR_BAIT_BASES);
        MEAN_BAIT_COVERAGE = ON_BAIT_BASES / (double) BAIT_TERRITORY;
        FOLD_ENRICHMENT = (ON_BAIT_BASES/ denominator) / ((double) BAIT_TERRITORY / GENOME_SIZE);
    }
}

View File

@ -1,207 +0,0 @@
package edu.mit.broad.picard.directed;
import edu.mit.broad.picard.util.*;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.AlignmentBlock;
import edu.mit.broad.sam.SAMSequenceRecord;
import java.util.*;
import java.io.*;
/**
* Calculates HS metrics for a given SAM or BAM file. Requires the input of a list of
* target intervals and a list of bait intervals. Can be invoked either on an entire
* iterator of SAMRecords or be passed SAMRecords one at a time.
*
* @author Tim Fennell
*/
public class HsMetricsCalculator {
// What is considered "near" to the bait
private static final int NEAR_BAIT_DISTANCE = 250;
private static final Log log = Log.getInstance(HsMetricsCalculator.class);
// Holds file names and other parameter related junk
private SAMFileReader sam;
private File baitFile;
private File targetFile;
private IntervalList baits;
private IntervalList targets;
// Overlap detector for finding overlaps between reads and the experimental targets
private OverlapDetector<Interval> targetDetector = new OverlapDetector<Interval>(0,0);
// Overlap detector for finding overlaps between the reads and the baits (and the near bait space)
private OverlapDetector<Interval> baitDetector = new OverlapDetector<Interval>(-NEAR_BAIT_DISTANCE,0);
// A Map to accumulate per-bait-region (i.e. merge of overlapping baits) coverage. */
private Map<Interval, Coverage> coverageByTarget = new HashMap<Interval, Coverage>();
private HsMetrics metrics = new HsMetrics();
/**
 * Loads the bait and target interval lists, builds overlap detectors for them,
 * initializes the metrics' BAIT_SET/territory/genome-size fields, and prepares a
 * per-target coverage counter for every target interval.
 *
 * @param baits   interval-list file describing the bait regions
 * @param targets interval-list file describing the target regions
 */
public HsMetricsCalculator(File baits, File targets) {
    this.baitFile = baits;
    this.targetFile = targets;
    this.baits = IntervalList.fromFile(baits);
    this.targets = IntervalList.fromFile(targets);
    // BAIT_SET is the bait file name truncated at the first '.' (if any).
    this.metrics.BAIT_SET = baits.getName();
    int tmp = this.metrics.BAIT_SET.indexOf(".");
    if (tmp > 0) {
        this.metrics.BAIT_SET = this.metrics.BAIT_SET.substring(0, tmp);
    }
    // Bait detector is widened by NEAR_BAIT_DISTANCE so "near bait" overlaps are found too.
    List<Interval> uniqueBaits = this.baits.getUniqueIntervals();
    this.baitDetector.addAll(uniqueBaits, uniqueBaits);
    this.metrics.BAIT_TERRITORY = Interval.countBases(uniqueBaits);
    List<Interval> uniqueTargets = this.targets.getUniqueIntervals();
    this.targetDetector.addAll(uniqueTargets, uniqueTargets);
    this.metrics.TARGET_TERRITORY = Interval.countBases(uniqueTargets);
    // Genome size is the sum of sequence lengths from the bait list's header.
    for (SAMSequenceRecord seq : this.baits.getHeader().getSequences()) {
        this.metrics.GENOME_SIZE += seq.getSequenceLength();
    }
    // Populate the coverage by target map
    for (Interval target : this.targets.getIntervals()) {
        this.coverageByTarget.put(target, new Coverage(target, 0));
    }
}
/** Iterates over all records in the file, collecting metrics and logging progress every million records. */
public void analyze(Iterator<SAMRecord> records) {
    int processed = 0;
    while (records.hasNext()) {
        analyze(records.next());
        ++processed;
        if (processed % 1000000 == 0) {
            log.info("Processed " + processed + " records so far.");
        }
    }
}
/**
 * Adds information about an individual SAMRecord to the statistics.
 * Secondary alignments are ignored entirely. Reads failing the vendor quality
 * check, duplicates, and unaligned/zero-mapping-quality reads are counted in
 * the read totals but contribute no base-level (on/near/off bait) metrics.
 */
public void analyze(SAMRecord rec) {
    // Just plain avoid records that are marked as not-primary
    if (rec.getNotPrimaryAlignmentFlag()) return;
    this.metrics.TOTAL_READS += 1;
    // Check for PF reads (reads failing the vendor check stop here)
    if (rec.getReadFailsVendorQualityCheckFlag()) {
        return;
    }
    else {
        ++this.metrics.PF_READS;
    }
    // Check for reads that are marked as duplicates
    if (rec.getDuplicateReadFlag()) {
        return;
    }
    else {
        ++this.metrics.PF_UNIQUE_READS;
    }
    // Don't bother with reads that didn't align uniquely (MAPQ 0 treated as non-unique)
    if (rec.getReadUnmappedFlag() || rec.getMappingQuality() == 0) {
        return;
    }
    this.metrics.PF_READS_ALIGNED += 1;
    for (AlignmentBlock block : rec.getAlignmentBlocks()) {
        this.metrics.PF_BASES_ALIGNED += block.getLength();
    }
    // Interval spanning the whole alignment, used to query both overlap detectors.
    Interval read = new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd());
    // Find the target overlaps: every aligned base falling inside a target counts as
    // on-target and its depth is recorded against that target.
    // NOTE(review): the detector was seeded with *merged* intervals but coverageByTarget
    // is keyed by raw intervals; if targets overlap/abut, get() below could return null
    // -- confirm target lists are pre-merged.
    Collection<Interval> targets = this.targetDetector.getOverlaps(read);
    if (targets != null && !targets.isEmpty()) {
        for (Interval target : targets) {
            Coverage coverage = this.coverageByTarget.get(target);
            for (AlignmentBlock block : rec.getAlignmentBlocks()) {
                int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength());
                for (int pos=block.getReferenceStart(); pos<=end; ++ pos) {
                    if (pos >= target.getStart() && pos <= target.getEnd()) {
                        ++this.metrics.ON_TARGET_BASES;
                        // Depth is tracked per offset from the target's start.
                        coverage.addBase(pos - target.getStart());
                    }
                }
            }
        }
    }
    // Now do the bait overlaps: aligned bases inside a bait are "on bait"; the remaining
    // bases of a bait-overlapping read are "near bait"; reads overlapping no bait (even
    // within NEAR_BAIT_DISTANCE) are entirely "off bait".
    int mappedBases = 0;
    for (AlignmentBlock block : rec.getAlignmentBlocks()) mappedBases += block.getLength();
    Collection<Interval> baits = this.baitDetector.getOverlaps(read);
    int onBaitBases = 0;
    if (baits != null && !baits.isEmpty()) {
        for (Interval bait : baits) {
            for (AlignmentBlock block : rec.getAlignmentBlocks()) {
                int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength());
                for (int pos=block.getReferenceStart(); pos<=end; ++pos) {
                    if (pos >= bait.getStart() && pos <= bait.getEnd()) ++onBaitBases;
                }
            }
        }
        this.metrics.ON_BAIT_BASES += onBaitBases;
        this.metrics.NEAR_BAIT_BASES += (mappedBases - onBaitBases);
    }
    else {
        this.metrics.OFF_BAIT_BASES += mappedBases;
    }
}
/**
 * Calculates a few last summary metrics and then returns the metrics calculated.
 * Mutates and returns the internal HsMetrics instance; the derived and
 * coverage-distribution metrics are recomputed on each call.
 */
public HsMetrics getMetrics() {
    this.metrics.calculateDerivedMetrics();
    calculateTargetCoverageMetrics();
    return this.metrics;
}
/**
 * Calculates the coverage-distribution metrics over the targets: mean target
 * coverage, the fraction of targets with zero coverage, and the fold-80 base
 * penalty (how much additional sequencing would be needed to raise 80% of
 * target bases to the mean coverage for the lane).
 */
private void calculateTargetCoverageMetrics() {
    short[] depths = new short[(int) this.metrics.TARGET_TERRITORY]; // may not use entire array
    int zeroCoverageTargets = 0;
    int depthIndex = 0;
    double totalCoverage = 0;
    int basesConsidered = 0;
    for (Coverage c : this.coverageByTarget.values()) {
        if (!c.hasCoverage()) {
            ++zeroCoverageTargets;
            continue;
        }
        // NOTE(review): coverageByTarget is keyed by raw (possibly overlapping) target
        // intervals while TARGET_TERRITORY counts unique bases; if targets overlap this
        // could overrun the depths array -- confirm target lists are pre-merged.
        final short[] targetDepths = c.getDepths();
        basesConsidered += targetDepths.length;
        for (short depth : targetDepths) {
            depths[depthIndex++] = depth;
            totalCoverage += depth;
        }
    }
    this.metrics.ZERO_CVG_TARGETS_PCT = zeroCoverageTargets / (double) this.targets.getIntervals().size();
    // Guard: if no target had any coverage there is no distribution to summarize.
    // Previously this fell through to 0/0 (NaN) for the mean and then indexed
    // depths[depths.length], throwing ArrayIndexOutOfBoundsException.
    if (basesConsidered == 0) {
        this.metrics.MEAN_TARGET_COVERAGE = 0;
        return;
    }
    this.metrics.MEAN_TARGET_COVERAGE = totalCoverage / basesConsidered;
    // Sort the array (ASCENDING); the unused (zero) slots sort to the front, so the
    // coverage value that 80% of covered bases meet or exceed lies 20% into the
    // populated tail of the array.
    Arrays.sort(depths);
    int indexOf80thPercentile = (depths.length - basesConsidered) + (int) (basesConsidered * 0.2);
    int coverageAt80thPercentile = depths[indexOf80thPercentile];
    this.metrics.FOLD_80_BASE_PENALTY = this.metrics.MEAN_TARGET_COVERAGE / coverageAt80thPercentile;
}
}

View File

@ -1,240 +0,0 @@
package edu.mit.broad.picard.directed;
import edu.mit.broad.picard.util.Interval;
import edu.mit.broad.picard.util.FormatUtil;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.SAMTextHeaderCodec;
import edu.mit.broad.sam.util.StringLineReader;
import java.util.*;
import java.io.*;
/**
 * Represents a list of intervals against a reference sequence that can be written to
 * and read from a file. The file format is relatively simple and reflects the SAM
 * alignment format to a degree.
 *
 * A SAM style header must be present in the file which lists the sequence records
 * against which the intervals are described. After the header the file then contains
 * records one per line in text format with the following values tab-separated:
 *    - Sequence name
 *    - Start position (1-based)
 *    - End position (1-based, end inclusive)
 *    - Strand (either + or -)
 *    - Interval name (an, ideally unique, name for the interval)
 *
 * @author Tim Fennell
 */
public class IntervalList implements Iterable<Interval> {
    private SAMFileHeader header;
    private List<Interval> intervals = new ArrayList<Interval>();

    /** Constructs a new interval list using the supplied header information. */
    public IntervalList(SAMFileHeader header) {
        if (header == null) {
            throw new IllegalArgumentException("SAMFileHeader must be supplied.");
        }
        this.header = header;
    }

    /** Gets the header (if there is one) for the interval list. */
    public SAMFileHeader getHeader() { return header; }

    /** Returns an iterator over the intervals. */
    public Iterator<Interval> iterator() { return this.intervals.iterator(); }

    /** Adds an interval to the list of intervals. */
    public void add(Interval interval) { this.intervals.add(interval); }

    /** Sorts the internal collection of intervals by coordinate. */
    public void sort() {
        Collections.sort(this.intervals, new IntervalCoordinateComparator(this.header));
        this.header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
    }

    /** Gets the intervals as held internally, as an unmodifiable view. */
    public List<Interval> getIntervals() {
        return Collections.unmodifiableList(this.intervals);
    }

    /**
     * Merges the list of intervals and then reduces them down where regions overlap
     * or are directly adjacent to one another. During this process the "merged" interval
     * will retain the strand and name of the 5' most interval merged.
     *
     * NOTE: assumes the intervals are coordinate sorted; call sort() first if unsure.
     *
     * @return the set of unique intervals condensed from the contained intervals;
     *         empty if this list contains no intervals
     */
    public List<Interval> getUniqueIntervals() {
        List<Interval> unique = new ArrayList<Interval>();
        // Fix: an empty interval list previously threw NoSuchElementException below.
        if (this.intervals.isEmpty()) {
            return unique;
        }
        ListIterator<Interval> iterator = this.intervals.listIterator();
        Interval previous = iterator.next();
        while (iterator.hasNext()) {
            Interval next = iterator.next();
            if (previous.intersects(next) || previous.abuts(next)) {
                // Extend the running interval, retaining the 5'-most strand and name.
                previous = new Interval(previous.getSequence(),
                                        previous.getStart(),
                                        Math.max(previous.getEnd(), next.getEnd()),
                                        previous.isNegativeStrand(),
                                        previous.getName());
            }
            else {
                unique.add(previous);
                previous = next;
            }
        }
        unique.add(previous);
        return unique;
    }

    /** Gets the (potentially redundant) sum of the length of the intervals in the list. */
    public long getBaseCount() {
        return Interval.countBases(this.intervals);
    }

    /** Gets the count of unique bases represented by the intervals in the list. */
    public long getUniqueBaseCount() {
        return Interval.countBases(getUniqueIntervals());
    }

    /**
     * Parses an interval list from a file.
     * @param file the file containing the intervals
     * @return an IntervalList object that contains the headers and intervals from the file
     */
    public static IntervalList fromFile(File file) {
        BufferedReader in = new BufferedReader(new InputStreamReader(IoUtil.openFileForReading(file)));
        try {
            // Read the SAM-style header ('@'-prefixed lines) into a buffer. The loop
            // exits with `line` holding the first non-header line, or null at EOF.
            StringBuilder builder = new StringBuilder(4096);
            String line = null;
            while ((line = in.readLine()) != null) {
                if (line.startsWith("@")) {
                    builder.append(line).append('\n');
                }
                else {
                    break;
                }
            }
            if (builder.length() == 0) {
                throw new IllegalStateException("Interval list file must contain header: " + file.getAbsolutePath());
            }
            StringLineReader headerReader = new StringLineReader(builder.toString());
            SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
            IntervalList list = new IntervalList(codec.decode(headerReader, file));
            // Then read in the interval records. Fix: `line` is null when the file
            // contains only header lines; previously this fell straight into the
            // do/while and threw a NullPointerException.
            if (line != null) {
                FormatUtil format = new FormatUtil();
                do {
                    if (line.trim().length() == 0) continue; // skip over blank lines
                    // Make sure we have the right number of fields
                    String fields[] = line.split("\t");
                    if (fields.length != 5) {
                        throw new PicardException("Invalid interval record contains " +
                                                  fields.length + " fields: " + line);
                    }
                    // Then parse them out
                    String seq = fields[0];
                    int start = format.parseInt(fields[1]);
                    int end = format.parseInt(fields[2]);
                    boolean negative;
                    if (fields[3].equals("-")) negative = true;
                    else if (fields[3].equals("+")) negative = false;
                    else throw new IllegalArgumentException("Invalid strand field: " + fields[3]);
                    String name = fields[4];
                    Interval interval = new Interval(seq, start, end, negative, name);
                    list.intervals.add(interval);
                }
                while ((line = in.readLine()) != null);
            }
            return list;
        }
        catch (IOException ioe) {
            throw new PicardException("Error parsing interval list file: " + file.getAbsolutePath(), ioe);
        }
        finally {
            try { in.close(); } catch (Exception e) { /* do nothing */ }
        }
    }

    /**
     * Writes out the list of intervals to the supplied file.
     * @param file a file to write to. If exists it will be overwritten.
     */
    public void write(File file) {
        try {
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(IoUtil.openFileForWriting(file)));
            FormatUtil format = new FormatUtil();
            // Write out the header
            if (this.header != null) {
                SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
                codec.encode(out, this.header);
            }
            // Write out the intervals: one tab-separated record per line in the
            // order sequence, start, end, strand, name.
            for (Interval interval : this) {
                out.write(interval.getSequence());
                out.write('\t');
                out.write(format.format(interval.getStart()));
                out.write('\t');
                out.write(format.format(interval.getEnd()));
                out.write('\t');
                out.write(interval.isPositiveStrand() ? '+' : '-');
                out.write('\t');
                out.write(interval.getName());
                out.newLine();
            }
            out.flush();
            out.close();
        }
        catch (IOException ioe) {
            throw new PicardException("Error writing out interval list to file: " + file.getAbsolutePath(), ioe);
        }
    }
}
/**
 * Comparator that orders intervals by sequence index (per the supplied header),
 * then by start and end coordinate, then by strand (positive first) and finally
 * by interval name.
 */
class IntervalCoordinateComparator implements Comparator<Interval> {
    private SAMFileHeader header;

    /** Constructs a comparator using the supplied sequence header. */
    IntervalCoordinateComparator(SAMFileHeader header) {
        this.header = header;
    }

    public int compare(Interval lhs, Interval rhs) {
        // Primary key: position of each sequence in the header's sequence dictionary.
        final int sequenceDelta = this.header.getSequenceIndex(lhs.getSequence())
                                - this.header.getSequenceIndex(rhs.getSequence());
        if (sequenceDelta != 0) return sequenceDelta;

        // Then by coordinates.
        if (lhs.getStart() != rhs.getStart()) return lhs.getStart() - rhs.getStart();
        if (lhs.getEnd() != rhs.getEnd()) return lhs.getEnd() - rhs.getEnd();

        // Then by strand: positive sorts ahead of negative.
        if (lhs.isPositiveStrand() && rhs.isNegativeStrand()) return -1;
        if (lhs.isNegativeStrand() && rhs.isPositiveStrand()) return 1;

        // Finally by name.
        return lhs.getName().compareTo(rhs.getName());
    }
}

View File

@ -1,46 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.filter;
import edu.mit.broad.sam.SAMRecord;
import java.util.List;
/**
 * Composite filter: aggregates multiple SamRecordFilters and applies them all to a
 * given record with a single method call.
 */
public class AggregateFilter implements SamRecordFilter {
    private final List<SamRecordFilter> filters;

    /**
     * Constructor
     * @param filters the list of filters that this Aggregator applies
     */
    public AggregateFilter(List<SamRecordFilter> filters) {
        this.filters = filters;
    }

    /**
     * Determines whether a SAMRecord matches this filter.
     *
     * @param record the SAMRecord to evaluate
     * @return true if the SAMRecord matches at least one constituent filter, otherwise false
     */
    public boolean filterOut(SAMRecord record) {
        final int filterCount = filters.size();
        for (int i = 0; i < filterCount; i++) {
            if (filters.get(i).filterOut(record)) {
                return true;
            }
        }
        return false;
    }
}

View File

@ -1,28 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.filter;
import edu.mit.broad.sam.SAMRecord;
/**
 * SamRecordFilter implementation that rejects reads flagged as failing the
 * vendor/platform quality check.
 */
public class FailsVendorReadQualityFilter implements SamRecordFilter {
    /**
     * Determines whether a SAMRecord matches this filter.
     *
     * @param record the SAMRecord to evaluate
     * @return true when the record is marked as failing the vendor quality check
     */
    public boolean filterOut(SAMRecord record) {
        final boolean failedVendorCheck = record.getReadFailsVendorQualityCheckFlag();
        return failedVendorCheck;
    }
}

View File

@ -1,94 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.filter;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.picard.util.CloserUtil;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
 * Filtering Iterator which takes a filter and an iterator and iterates
 * through only those records which are not rejected by the filter.
 *
 * Note: the first passing record is fetched eagerly in the constructor, so any
 * exception raised while reading/filtering the first record surfaces at
 * construction time rather than on the first call to next().
 *
 * @author Kathleen Tibbetts
 */
public class FilteringIterator implements CloseableIterator<SAMRecord> {
    // The underlying iterator supplying candidate records.
    private final Iterator<SAMRecord> iterator;
    // The filter deciding which records are suppressed.
    private final SamRecordFilter filter;
    // Look-ahead buffer: the next record that passes the filter, or null when exhausted.
    private SAMRecord next = null;
    /**
     * Constructor
     *
     * @param iterator  the backing iterator
     * @param filter    the filter (which may be a FilterAggregator)
     */
    public FilteringIterator(Iterator<SAMRecord> iterator, SamRecordFilter filter) {
        this.iterator = iterator;
        this.filter = filter;
        next = getNextRecord();
    }
    /**
     * Returns true if the iteration has more elements.
     *
     * @return  true if the iteration has more elements.  Otherwise returns false.
     */
    public boolean hasNext() {
        return next != null;
    }
    /**
     * Returns the next element in the iteration.
     *
     * @return  the next element in the iteration
     * @throws java.util.NoSuchElementException
     */
    public SAMRecord next() {
        if (next == null) {
            throw new NoSuchElementException("Iterator has no more elements.");
        }
        // Hand out the buffered record and advance the look-ahead.
        SAMRecord result = next;
        next = getNextRecord();
        return result;
    }
    /**
     * Required method for Iterator API.
     *
     * @throws UnsupportedOperationException
     */
    public void remove() {
        throw new UnsupportedOperationException("Remove() not supported by FilteringIterator");
    }
    /** Closes the backing iterator (if it is closeable) via CloserUtil. */
    public void close() {
        CloserUtil.close(iterator);
    }
    /**
     * Gets the next record from the underlying iterator that passes the filter
     *
     * @return SAMRecord    the next filter-passing record, or null when the
     *                      underlying iterator is exhausted
     */
    private SAMRecord getNextRecord() {
        while (iterator.hasNext()) {
            SAMRecord record = iterator.next();
            if (!filter.filterOut(record)) {
                return record;
            }
        }
        return null;
    }
}

View File

@ -1,26 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.filter;
import edu.mit.broad.sam.SAMRecord;
/**
 * API for filtering SAMRecords. Implementations decide, record by record,
 * whether a SAMRecord should be excluded from processing (see FilteringIterator).
 */
public interface SamRecordFilter {
    /**
     * Determines whether a SAMRecord matches this filter
     *
     * @param record the SAMRecord to evaluate
     * @return true if the SAMRecord matches the filter (i.e. should be filtered
     *         out / excluded), otherwise false
     */
    public boolean filterOut(SAMRecord record);
}

View File

@ -1,37 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.filter;
import edu.mit.broad.picard.util.SequenceUtil;
import edu.mit.broad.sam.SAMRecord;
/**
 * Filter to determine whether a read is "noisy" due to a poly-A run that is a
 * sequencing artifact. Currently only reads composed entirely of As (or no-calls)
 * are filtered out.
 */
public class SolexaNoiseFilter implements SamRecordFilter {
    /**
     * Determines whether a SAMRecord matches this filter.
     *
     * @param record the SAMRecord to evaluate
     * @return true if every base is A/a or a no-call (noise), otherwise false
     */
    public boolean filterOut(SAMRecord record) {
        final byte[] bases = record.getReadBases();
        for (int i = 0; i < bases.length; ++i) {
            final byte base = bases[i];
            final boolean isAdenine = (base == 'A' || base == 'a');
            if (!isAdenine && !SequenceUtil.isNoCall(base)) {
                // Found a called, non-A base: the read carries real signal.
                return false;
            }
        }
        return true;
    }
}

View File

@ -1,56 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.filter;
import edu.mit.broad.sam.SAMRecord;
import java.util.List;
import java.util.Arrays;
/**
 * Filter class for matching tag attributes in SAMRecords: a record is filtered
 * out when the value of the configured tag is one of the configured values.
 */
public class TagFilter implements SamRecordFilter {
    private final String tag;          // the key of the tag to match
    private final List<Object> values; // attribute values that cause a match

    /**
     * Constructor for a single value
     *
     * @param tag   the key of the tag to match
     * @param value the value to match
     */
    public TagFilter(String tag, Object value) {
        this(tag, Arrays.asList(value));
    }

    /**
     * Constructor for multiple values
     *
     * @param tag    the key of the tag to match
     * @param values the matching values
     */
    public TagFilter(String tag, List<Object> values) {
        this.tag = tag;
        this.values = values;
    }

    /**
     * Determines whether a SAMRecord matches this filter.
     *
     * @param record the SAMRecord to evaluate
     * @return true if the record's tag value is one of the configured values
     */
    public boolean filterOut(SAMRecord record) {
        final Object attribute = record.getAttribute(tag);
        return values.contains(attribute);
    }
}

View File

@ -1,30 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.genotype;
import edu.mit.broad.picard.PicardException;
/**
 * Generic exception thrown by the GELI format machinery.
 *
 * @author Doug Voet
 */
public class GeliException extends PicardException {
    /** Constructs an exception with a message only. */
    public GeliException(String message) {
        super(message);
    }

    /** Constructs an exception with a message and an underlying cause. */
    public GeliException(String message, Throwable throwable) {
        super(message, throwable);
    }
}

View File

@ -1,20 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.genotype;
/**
 * Misc constants for the GELI (GEnotype LIkelihood) file format.
 *
 * @author Doug Voet
 */
public interface GeliFileConstants {
    // Magic number at the start of every GELI file ("GELI" in ASCII). Declared as an
    // explicit byte literal rather than "GELI".getBytes() so the value cannot vary
    // with the JVM's default charset.
    public static final byte[] GELI_MAGIC = {'G', 'E', 'L', 'I'};
}

View File

@ -1,103 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.picard.genotype;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.util.BlockCompressedInputStream;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.sam.util.RuntimeIOException;
/**
 * Class for reading GELI (GEnotype LIkelihood) files.
 *
 * @author Doug Voet
 */
public class GeliFileReader implements Iterable<GenotypeLikelihoods>
{
    // The active implementation; null once close() has been called.
    private ReaderImplementation mReader = null;
    /**
     * Internal interface for SAM/BAM file reader implementations.
     * Implemented as an abstract class to enforce better access control.
     */
    static abstract class ReaderImplementation {
        abstract SAMFileHeader getFileHeader();
        abstract CloseableIterator<GenotypeLikelihoods> getIterator();
        abstract void close();
    }
    /**
     * Opens a reader over an arbitrary input stream (not seekable, so the file can
     * be iterated only once).
     * NOTE(review): when the stream is not a valid block-compressed file the stream
     * is NOT closed before throwing -- the caller retains ownership.
     */
    public GeliFileReader(final InputStream stream) {
        try {
            final BufferedInputStream bufferedStream = toBufferedStream(stream);
            if (isValidGELIFile(bufferedStream)) {
                mReader = new GeliFileReaderImplementation(bufferedStream);
            } else {
                throw new GeliException("Unrecognized file format");
            }
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }
    /**
     * Opens a reader over a file. The file is probed for validity via a temporary
     * stream (closed in both branches) and then reopened by the implementation,
     * which enables seeking back to the first record for repeated iteration.
     */
    public GeliFileReader(final File file) {
        try {
            final BufferedInputStream bufferedStream =
                new BufferedInputStream(new FileInputStream(file));
            if (isValidGELIFile(bufferedStream)) {
                bufferedStream.close();
                final GeliFileReaderImplementation reader = new GeliFileReaderImplementation(file);
                mReader = reader;
            } else {
                bufferedStream.close();
                throw new GeliException("Unrecognized file format");
            }
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }
    /** Closes the underlying implementation; safe to call more than once. */
    public void close() {
        if (mReader != null) {
            mReader.close();
        }
        mReader = null;
    }
    /** Returns the SAM-style header parsed from the file. */
    public SAMFileHeader getFileHeader() {
        return mReader.getFileHeader();
    }
    /** Returns an iterator over the genotype likelihood records. */
    public CloseableIterator<GenotypeLikelihoods> iterator() {
        return mReader.getIterator();
    }
    // "Valid" here means only that the stream is block-compressed (BGZF); the GELI
    // magic bytes are verified later when the header is read.
    private boolean isValidGELIFile(final InputStream stream)
        throws IOException {
        return BlockCompressedInputStream.isValidFile(stream);
    }
    // Wraps the stream in a BufferedInputStream unless it already is one.
    private BufferedInputStream toBufferedStream(final InputStream stream) {
        if (stream instanceof BufferedInputStream) {
            return (BufferedInputStream) stream;
        } else {
            return new BufferedInputStream(stream);
        }
    }
}

View File

@ -1,189 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.picard.genotype;
import java.io.DataInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.LineNumberReader;
import java.io.StringReader;
import java.util.Arrays;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.SAMSequenceRecord;
import edu.mit.broad.sam.SAMTextHeaderCodec;
import edu.mit.broad.sam.util.BinaryCodec;
import edu.mit.broad.sam.util.BlockCompressedInputStream;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.sam.util.StringLineReader;
/**
 * Internal class for reading GELI files: parses the binary header and exposes a
 * single CloseableIterator over the genotype-likelihood records.
 */
class GeliFileReaderImplementation extends GeliFileReader.ReaderImplementation {
    // True when constructed from a File, allowing getIterator() to seek back to
    // the first record so the file can be iterated repeatedly.
    private boolean mIsSeekable = false;
    private BinaryCodec mStream = null;
    private final BlockCompressedInputStream mCompressedInputStream;
    private SAMFileHeader mFileHeader = null;
    // Virtual file offset of the first record, captured right after the header is read.
    private long mFirstRecordPointer = 0;
    // At most one iterator may be open at a time; tracked so close() can be validated.
    private CloseableIterator<GenotypeLikelihoods> mCurrentIterator = null;
    /** Stream-based construction: not seekable, single pass only. */
    GeliFileReaderImplementation(final InputStream stream)
        throws IOException {
        mIsSeekable = false;
        mCompressedInputStream = new BlockCompressedInputStream(stream);
        mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
        readHeader(null);
    }
    /** File-based construction: seekable, so iteration can restart at the first record. */
    GeliFileReaderImplementation(final File file)
        throws IOException {
        mIsSeekable = true;
        mCompressedInputStream = new BlockCompressedInputStream(file);
        mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
        readHeader(file);
        mFirstRecordPointer = mCompressedInputStream.getFilePointer();
    }
    /** Closes the stream and discards the header; the reader is unusable afterwards. */
    void close() {
        if (mStream != null) {
            mStream.close();
        }
        mStream = null;
        mFileHeader = null;
    }
    SAMFileHeader getFileHeader() {
        return mFileHeader;
    }
    /**
     * Returns an iterator positioned at the first record. Throws if the reader is
     * closed or if a previous iterator has not been closed yet.
     */
    CloseableIterator<GenotypeLikelihoods> getIterator() {
        if (mStream == null) {
            throw new IllegalStateException("File reader is closed");
        }
        if (mCurrentIterator != null) {
            throw new IllegalStateException("Iteration in progress");
        }
        if (mIsSeekable) {
            // Rewind to the first record so each iterator starts from the beginning.
            try {
                mCompressedInputStream.seek(mFirstRecordPointer);
            } catch (IOException exc) {
                throw new RuntimeException(exc.getMessage(), exc);
            }
        }
        mCurrentIterator = new GELIFileIterator();
        return mCurrentIterator;
    }
    /**
     * Parses the binary GELI header: magic bytes, length-prefixed SAM text header,
     * then one binary record per sequence (cross-checked only by count against the
     * text header -- per-sequence validation is currently disabled, see below).
     */
    private void readHeader(final File file)
        throws IOException {
        final byte[] buffer = new byte[4];
        mStream.readBytes(buffer);
        if (!Arrays.equals(buffer, GeliFileConstants.GELI_MAGIC)) {
            throw new IOException("Invalid GELI file header");
        }
        final int headerTextLength = mStream.readInt();
        final String textHeader = mStream.readString(headerTextLength);
        mFileHeader = new SAMTextHeaderCodec().decode(new StringLineReader(textHeader),
                file);
        final int sequenceCount = mStream.readInt();
        if (sequenceCount != mFileHeader.getSequences().size()) {
            throw new GeliException("Number of sequences in text header (" + mFileHeader.getSequences().size() +
                    ") != number of sequences in binary header (" + sequenceCount + ") for file " + file);
        }
        for (int i = 0; i < sequenceCount; i++) {
            // The binary sequence record is read to advance the stream, but its
            // contents are discarded; the name/length consistency checks below
            // are intentionally commented out.
            readSequenceRecord(file);
//            final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
//            if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
//                throw new GELIException("For sequence " + i + ", text and binary have different names in file " +
//                        file);
//            }
//            if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
//                throw new GELIException("For sequence " + i + ", text and binary have different lengths in file " +
//                        file);
//            }
        }
    }
    /** Reads one binary sequence record (null-terminated name plus length). */
    private SAMSequenceRecord readSequenceRecord(final File file) {
        final int nameLength = mStream.readInt();
        if (nameLength <= 1) {
            throw new GeliException("Invalid BAM file header: missing sequence name in file " + file);
        }
        final String sequenceName = mStream.readString(nameLength - 1);
        // Skip the null terminator
        mStream.readByte();
        final int sequenceLength = mStream.readInt();
        final SAMSequenceRecord record = new SAMSequenceRecord(sequenceName);
        record.setSequenceLength(sequenceLength);
        return record;
    }
    /**
     * Iterator over the records of the open file. Uses one-record look-ahead:
     * the next record is decoded eagerly so hasNext() is a null check.
     */
    private class GELIFileIterator
        implements CloseableIterator<GenotypeLikelihoods> {
        private GenotypeLikelihoods mNextRecord = null;
        private final GenotypeLikelihoodsCodec likelihoodsCodec = new GenotypeLikelihoodsCodec();
        GELIFileIterator() {
            this(true);
        }
        GELIFileIterator(final boolean advance) {
            likelihoodsCodec.setInputStream(mStream.getInputStream());
            if (advance) {
                advance();
            }
        }
        /** Releases this iterator's claim so getIterator() may be called again. */
        public void close() {
            if (this != mCurrentIterator) {
                throw new IllegalStateException("Attempt to close non-current iterator");
            }
            mCurrentIterator = null;
        }
        public boolean hasNext() {
            return (mNextRecord != null);
        }
        public GenotypeLikelihoods next() {
            final GenotypeLikelihoods result = mNextRecord;
            advance();
            return result;
        }
        public void remove() {
            throw new UnsupportedOperationException("Not supported: remove");
        }
        /** Decodes the next record into the look-ahead buffer. */
        void advance() {
            try {
                mNextRecord = getNextRecord();
            } catch (IOException exc) {
                throw new RuntimeException(exc.getMessage(), exc);
            }
        }
        GenotypeLikelihoods getNextRecord()
            throws IOException {
            return likelihoodsCodec.decode();
        }
    }
}

View File

@ -1,168 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.genotype;
import java.io.DataOutputStream;
import java.io.File;
import java.io.StringWriter;
import edu.mit.broad.picard.genotype.GenotypeLikelihoods.GenotypeLikelihoodsComparator;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.SAMSequenceRecord;
import edu.mit.broad.sam.SAMTextHeaderCodec;
import edu.mit.broad.sam.SAMFileHeader.SortOrder;
import edu.mit.broad.sam.util.BinaryCodec;
import edu.mit.broad.sam.util.BlockCompressedOutputStream;
import edu.mit.broad.sam.util.SortingCollection;
/**
* Class for writing GELI (GEnotype LIkelihood) files.
*/
public class GeliFileWriter {
private static final int MAX_RECORDS_IN_RAM = 1000000;
private SAMFileHeader.SortOrder sortOrder = SortOrder.coordinate;
private SAMFileHeader header;
private SortingCollection<GenotypeLikelihoods> likelihoodsSorter;
// These two fields are for validating presorted records.
private GenotypeLikelihoods prevLikelihoods;
private GenotypeLikelihoodsComparator presortedComparator;
// If true, records passed to addAlignment are already in the order specified by sortOrder
private boolean presorted;
protected final BinaryCodec outputBinaryCodec;
private GenotypeLikelihoodsCodec genotypeLikelihoodsCodec = null;
public GeliFileWriter(final File path) {
this(path, false);
}
public GeliFileWriter(final File path, boolean presorted) {
outputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(path)));
outputBinaryCodec.setOutputFileName(path.toString());
this.presorted = presorted;
}
/**
* Must be called before addAlignment.
* @param header
*/
public void setHeader(final SAMFileHeader header)
{
this.header = header;
header.setSortOrder(sortOrder);
final StringWriter headerTextBuffer = new StringWriter();
new SAMTextHeaderCodec().encode(headerTextBuffer, header);
final String headerText = headerTextBuffer.toString();
writeHeader(headerText);
if (presorted) {
presortedComparator = makeComparator();
} else if (!sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) {
likelihoodsSorter = SortingCollection.newInstance(GenotypeLikelihoods.class,
new GenotypeLikelihoodsCodec(), makeComparator(), MAX_RECORDS_IN_RAM);
}
}
protected SAMFileHeader getHeader() {
return header;
}
private GenotypeLikelihoodsComparator makeComparator() {
return new GenotypeLikelihoodsComparator();
}
public void addGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods)
{
if (presorted) {
assertPresorted(genotypeLikelihoods);
writeGenotypeLikelihoods(genotypeLikelihoods);
} else {
likelihoodsSorter.add(genotypeLikelihoods);
}
}
private void assertPresorted(final GenotypeLikelihoods genotypeLikelihoods) {
if (prevLikelihoods != null) {
if (presortedComparator.compare(prevLikelihoods, genotypeLikelihoods) > 0) {
throw new IllegalArgumentException("GenotypeLikelihoods added out of order in GELIFileWriterImpl.addGenotypeLikelihoods for " +
getFilename() + ". Sort order is " + this.sortOrder + ". Offending records are at ["
+ prevLikelihoods.getReferenceIndex() + ":" + prevLikelihoods.getPosition() + "] and ["
+ genotypeLikelihoods.getReferenceIndex() + ":" + genotypeLikelihoods.getPosition() + "]");
}
}
prevLikelihoods = genotypeLikelihoods;
}
/**
 * Drains any buffered records (in sorted order) and closes the underlying stream.
 * Must be called once after all records have been added.
 */
public final void close()
{
    if (likelihoodsSorter != null) {
        // Non-presorted mode: records were buffered; write them in coordinate order now.
        for (final GenotypeLikelihoods genotypeLikelihoods : likelihoodsSorter) {
            writeGenotypeLikelihoods(genotypeLikelihoods);
        }
        likelihoodsSorter.cleanup();
    }
    finish();
}
// Lazily sets up the record codec on first write.  (The name is a holdover from the
// SAM writer this class was modeled on — it prepares likelihood records, not alignments.)
private void prepareToWriteAlignments() {
    if (genotypeLikelihoodsCodec == null) {
        genotypeLikelihoodsCodec = new GenotypeLikelihoodsCodec();
        genotypeLikelihoodsCodec.setOutputStream(outputBinaryCodec.getOutputStream());
    }
}
/**
 * Writes the record to disk.  Sort order has been taken care of by the time
 * this method is called.
 * @param genotypeLikelihoods the record to encode onto the output stream
 */
protected void writeGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods) {
    prepareToWriteAlignments();
    genotypeLikelihoodsCodec.encode(genotypeLikelihoods);
}
/**
 * Write the header to disk: the GELI magic bytes, the length-prefixed SAM text header,
 * and a binary copy of the sequence dictionary.  Header object is available via getHeader().
 * @param textHeader for convenience if the implementation needs it.
 */
protected void writeHeader(final String textHeader) {
    outputBinaryCodec.writeBytes(GeliFileConstants.GELI_MAGIC);
    // calculate and write the length of the SAM file header text and the header text
    // NOTE(review): the length written is the char count, but the bytes are encoded with
    // the platform default charset — these differ for non-ASCII text; confirm headers are ASCII.
    outputBinaryCodec.writeInt(textHeader.length());
    outputBinaryCodec.writeBytes(textHeader.getBytes());
    // write the sequences binarily.  This is redundant with the text header
    outputBinaryCodec.writeInt(getHeader().getSequences().size());
    for (final SAMSequenceRecord sequenceRecord: getHeader().getSequences()) {
        // sequence name is written null-terminated, hence the +1 on the length
        outputBinaryCodec.writeInt(sequenceRecord.getSequenceName().length() + 1);
        outputBinaryCodec.writeBytes(sequenceRecord.getSequenceName().getBytes());
        outputBinaryCodec.writeByte(0);
        outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength());
    }
}
/**
 * Do any required flushing here.  Closing the binary codec closes (and flushes)
 * the underlying block-compressed stream.
 */
protected void finish() {
    outputBinaryCodec.close();
}
/**
 * For producing error messages.
 * @return Output filename, or null if there isn't one.
 */
protected String getFilename() {
    return outputBinaryCodec.getOutputFileName();
}
}

View File

@ -1,164 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.genotype;
import java.util.Arrays;
import java.util.Comparator;
/**
* Data object for Genotype Likelihoods. One object represents one row in a GELI file.
*
* @author Doug Voet
*/
public class GenotypeLikelihoods {
/** this is a guess at how much memory an instance of this object occupies */
public static final int OBJECT_SIZE_BYTES = 150;
public static final int AA_GENOTYPE = 0;
public static final int AC_GENOTYPE = 1;
public static final int AG_GENOTYPE = 2;
public static final int AT_GENOTYPE = 3;
public static final int CC_GENOTYPE = 4;
public static final int CG_GENOTYPE = 5;
public static final int CT_GENOTYPE = 6;
public static final int GG_GENOTYPE = 7;
public static final int GT_GENOTYPE = 8;
public static final int TT_GENOTYPE = 9;
private static final char[][] GENOTYPES = {
"AA".toCharArray(),
"AC".toCharArray(),
"AG".toCharArray(),
"AT".toCharArray(),
"CC".toCharArray(),
"CG".toCharArray(),
"CT".toCharArray(),
"GG".toCharArray(),
"GT".toCharArray(),
"TT".toCharArray()
};
/** compares first by reference index then by position */
public static class GenotypeLikelihoodsComparator implements Comparator<GenotypeLikelihoods> {
@Override
public int compare(GenotypeLikelihoods thing1, GenotypeLikelihoods thing2) {
long refCompare = thing1.referenceIndex - thing2.referenceIndex;
if (refCompare == 0) {
long posCompare = thing1.position - thing2.position;
return (int) posCompare;
} else {
return (int) refCompare;
}
}
}
private long referenceIndex;
private long position;
private byte referenceBase;
private int numReads;
private short maxMappingQuality;
private float[] likelihoods = new float[10];
private byte bestLikelihoodIndex = -1; // stored as byte to reduce memory footprint
private byte secondBestLikelihoodIndex = -1; // stored as byte to reduce memory footprint
public static int getLikelihoodIndex(char[] genotype) {
char first = Character.isLowerCase(genotype[0]) ? Character.toUpperCase(genotype[0]) : genotype[0];
char second = Character.isLowerCase(genotype[1]) ? Character.toUpperCase(genotype[1]) : genotype[1];
if (first > second) {
char temp = first;
first = second;
second = temp;
}
for (int i=0; i<GENOTYPES.length; i++) {
if (first == GENOTYPES[i][0] && second == GENOTYPES[i][1]) {
return i;
}
}
throw new IllegalArgumentException("Unknown genotype string [" + new String(genotype) +
"], any pair of ACTG case insensitive is acceptable");
}
public float getLikelihood(int genotype) {
return likelihoods[genotype];
}
public void setLikelihood(int genotype, float value) {
likelihoods[genotype] = value;
}
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("referenc ").append(referenceIndex).append(":").append(position);
builder.append(", ref base ").append((char) referenceBase);
builder.append(", #reads ").append(numReads);
builder.append(", quality ").append(maxMappingQuality);
builder.append(" [");
for (int i=0; i<likelihoods.length; i++) {
builder.append(GENOTYPES[i]).append(":").append(likelihoods[i]).append(" ");
}
builder.append("]");
return builder.toString();
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + Arrays.hashCode(likelihoods);
result = prime * result + maxMappingQuality;
result = prime * result + numReads;
result = prime * result + (int) (position ^ (position >>> 32));
result = prime * result + referenceBase;
result = prime * result + (int) (referenceIndex ^ (referenceIndex >>> 32));
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj)
return true;
if (obj == null)
return false;
if (getClass() != obj.getClass())
return false;
GenotypeLikelihoods other = (GenotypeLikelihoods) obj;
if (!Arrays.equals(likelihoods, other.likelihoods))
return false;
if (maxMappingQuality != other.maxMappingQuality)
return false;
if (numReads != other.numReads)
return false;
if (position != other.position)
return false;
if (referenceBase != other.referenceBase)
return false;
if (referenceIndex != other.referenceIndex)
return false;
return true;
}
public long getReferenceIndex() { return referenceIndex; }
public void setReferenceIndex(long sequenceIndex) { this.referenceIndex = sequenceIndex; }
public long getPosition() { return position; }
public void setPosition(long position) { this.position = position; }
public byte getReferenceBase() { return referenceBase; }
public void setReferenceBase(byte referenceBase) { this.referenceBase = referenceBase; }
public int getNumReads() { return numReads; }
public void setNumReads(int numReads) { this.numReads = numReads; }
public short getMaxMappingQuality() { return maxMappingQuality; }
public void setMaxMappingQuality(short maxMappingQuality) { this.maxMappingQuality = maxMappingQuality; }
float[] getLikelihoods() { return likelihoods; }
public int getBestLikelihoodIndex() { return bestLikelihoodIndex; }
public void setBestLikelihoodIndex(int bestLikelihoodIndex) { this.bestLikelihoodIndex = (byte) bestLikelihoodIndex; }
public int getSecondBestLikelihoodIndex() { return secondBestLikelihoodIndex; }
public void setSecondBestLikelihoodIndex(int secondBestLikelihoodIndex) { this.secondBestLikelihoodIndex = (byte) secondBestLikelihoodIndex; }
}

View File

@ -1,126 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.genotype;
import java.io.InputStream;
import java.io.OutputStream;
import edu.mit.broad.sam.util.BinaryCodec;
import edu.mit.broad.sam.util.RuntimeEOFException;
import edu.mit.broad.sam.util.SortingCollection;
/**
 * SortingCollection codec for GenotypeLikelihoods records: serializes each record
 * to a fixed-size binary block and deserializes it back.
 */
public class GenotypeLikelihoodsCodec implements SortingCollection.Codec<GenotypeLikelihoods> {
    // Likelihoods are persisted as ints scaled by this factor (two decimal places kept).
    private static final int SIG_FIG_MULTIPLIER = 100;
    // 12 fixed bytes (ref index 4 + position 4 + ref base 1 + num reads 2 + max mapq 1)
    // plus 10 likelihoods of 4 bytes each.
    private static final short BLOCK_SIZE = 12 + 10 * 4;

    private OutputStream os;
    private InputStream is;
    private BinaryCodec binaryCodec;

    /** Returns a new genotype likelihood codec. */
    public SortingCollection.Codec<GenotypeLikelihoods> clone() {
        return new GenotypeLikelihoodsCodec();
    }

    /**
     * Write object to OutputStream.
     *
     * @param genotypeLikelihoods what to write
     */
    public void encode(final GenotypeLikelihoods genotypeLikelihoods) {
        // Each record is prefixed with its (constant) length so decode() can verify framing.
        this.binaryCodec.writeShort(BLOCK_SIZE);
        this.binaryCodec.writeUInt(genotypeLikelihoods.getReferenceIndex());
        this.binaryCodec.writeUInt(genotypeLikelihoods.getPosition());
        this.binaryCodec.writeByte(genotypeLikelihoods.getReferenceBase());
        this.binaryCodec.writeUShort(genotypeLikelihoods.getNumReads());
        // NOTE(review): maxMappingQuality is a short but is written as a single byte —
        // values above 127 would be truncated; confirm mapping qualities never exceed 127.
        this.binaryCodec.writeByte(genotypeLikelihoods.getMaxMappingQuality());
        for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) {
            writeLikelihood(genotypeLikelihoods.getLikelihoods()[i]);
        }
    }

    /**
     * Read the next record from the input stream and convert into a java object.
     *
     * @return null if no more records.  Should throw exception if EOF is encountered in the middle of
     * a record.
     */
    public GenotypeLikelihoods decode() {
        int recordLength = 0;
        try {
            recordLength = this.binaryCodec.readShort();
        } catch (RuntimeEOFException e) {
            // EOF at a record boundary simply means there are no more records.
            return null;
        }
        if (recordLength != BLOCK_SIZE) {
            throw new GeliException("Invalid record length: " + recordLength);
        }

        final GenotypeLikelihoods genotypeLikelihoods = new GenotypeLikelihoods();
        genotypeLikelihoods.setReferenceIndex(this.binaryCodec.readUInt());
        genotypeLikelihoods.setPosition(this.binaryCodec.readUInt());
        genotypeLikelihoods.setReferenceBase(this.binaryCodec.readByte());
        genotypeLikelihoods.setNumReads(this.binaryCodec.readUShort());
        genotypeLikelihoods.setMaxMappingQuality(this.binaryCodec.readByte());

        // Track the best and second-best likelihood indices while reading the ten values.
        int bestIndex = -1;
        int secondBestIndex = -1;
        for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) {
            float likelihood = readLikelihood();
            genotypeLikelihoods.getLikelihoods()[i] = likelihood;
            if (bestIndex == -1 || genotypeLikelihoods.getLikelihood(bestIndex) < likelihood) {
                // new best: the previous best becomes the second best
                secondBestIndex = bestIndex;
                bestIndex = i;
            } else if (secondBestIndex == -1 || genotypeLikelihoods.getLikelihood(secondBestIndex) < likelihood) {
                secondBestIndex = i;
            }
        }
        genotypeLikelihoods.setBestLikelihoodIndex(bestIndex);
        genotypeLikelihoods.setSecondBestLikelihoodIndex(secondBestIndex);
        return genotypeLikelihoods;
    }

    /**
     * Where to write encoded output
     *
     * @param os
     */
    public void setOutputStream(final OutputStream os) {
        this.os = os;
        this.binaryCodec = new BinaryCodec(os);
    }

    /**
     * Where to read encoded input from
     *
     * @param is
     */
    public void setInputStream(final InputStream is) {
        this.is = is;
        this.binaryCodec = new BinaryCodec(is);
    }

    /** Writes one likelihood, scaled and rounded to an int (two decimal places kept). */
    private void writeLikelihood(float likelihood) {
        float shiftedLikelihood = likelihood * SIG_FIG_MULTIPLIER;
        this.binaryCodec.writeInt((int) Math.round(shiftedLikelihood));
    }

    /**
     * @return the next likelihood, descaled back to a float
     */
    private float readLikelihood() {
        float likelihood = (float) this.binaryCodec.readInt() / SIG_FIG_MULTIPLIER;
        return likelihood;
    }
}

View File

@ -1,192 +0,0 @@
package edu.mit.broad.picard.genotype.caller;
import edu.mit.broad.picard.sam.SamLocusIterator;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.PicardException;
import java.io.IOException;
import java.io.BufferedWriter;
import java.io.File;
import java.util.SortedSet;
import java.util.List;
/**
 * Base class for AlleleCallers.  Handles efficient access to the reference, output of data to a
 * standard file format, and application of priors.
 */
public abstract class AbstractAlleleCaller {
    // writer for output
    private final BufferedWriter writer;

    // for providing access to reference data
    private final ReferenceSequenceFile referenceSequenceFile;
    private final SAMFileHeader samHeader;
    private ReferenceSequence referenceSequence;  // currently cached contig; see cacheReferenceSequence()

    public AbstractAlleleCaller(final File reference, final SAMFileHeader samHeader, final BufferedWriter writer) {
        this.writer = writer;
        this.referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(reference);
        this.samHeader = samHeader;
    }

    /**
     * emit allele calls to the writer specified in the constructor
     *
     * @param li Locus to call
     * @throws IOException if the underlying writer fails
     */
    public void callAlleles(final SamLocusIterator.LocusInfo li) throws IOException {
        cacheReferenceSequence(li.getSequenceIndex());

        // Reference base at this locus; bases are stored as unsigned bytes, positions are 1-based.
        final char ref = Character.toUpperCase((char)(referenceSequence.getBases()[li.getPosition() - 1] & 0xff));

        // delegate to the specific implementation
        final SortedSet<GenotypeTheory> likelihoods = call(ref, li.getBasesAsString(), li.getQualities());

        // NOTE(review): the code below assumes call() returns at least two theories, one of
        // which is the reference genotype; both subclasses in this package score all ten
        // diploid genotypes, so nextBestTheory and refTheory are always found.
        final GenotypeTheory bestTheory = likelihoods.first();
        GenotypeTheory nextBestTheory = null;
        GenotypeTheory refTheory = null;

        final String refString = new String(new char[]{ref,ref});
        final DiploidGenotype refGenotype = DiploidGenotype.valueOf(refString);

        // Walk theories in descending-likelihood order, capturing the runner-up and the
        // reference theory while formatting every theory for the output line.
        final StringBuilder theoryString = new StringBuilder();
        int k=0;
        for(final GenotypeTheory t : likelihoods) {
            if (k == 1) { nextBestTheory = t; }
            if (t.getGenotype() == refGenotype) { refTheory = t; }
            theoryString.append(t.getGenotype())
                    .append(":")
                    .append(String.format("%.2f",t.getLikelihood()))
                    .append(" ");
            k++;
        }

        // Log-likelihood margins: best vs next best, and best vs reference genotype.
        final double btnb = bestTheory.getLikelihood() - nextBestTheory.getLikelihood();
        final double btr = bestTheory.getLikelihood() - refTheory.getLikelihood();

        final DiploidGenotype gt = likelihoods.first().getGenotype();

        // Classify the winning call relative to the reference base.
        final String type;
        if (!gt.isHet() && gt.getAllele1() == ref) {
            type = "homozygous";
        } else if (!gt.isHet() && gt.getAllele1() != ref) {
            type = "homozygous-SNP";
        } else {
            type = "heterozygous-SNP";
        }

        // Tally the pileup composition for the output line.
        final String bases = li.getBasesAsString();
        int a = 0,c = 0,g = 0,t = 0;
        for(int i=0; i<bases.length(); i++) {
            if (bases.charAt(i) == 'A') { a++; }
            else if (bases.charAt(i) == 'C') { c++; }
            else if (bases.charAt(i) == 'G') { g++; }
            else if (bases.charAt(i) == 'T') { t++; }
            else { throw new RuntimeException("Unknown Base " + bases.charAt(i)); }
        }

        writer.write(
                li.getSequenceIndex() + ":" +
                (li.getPosition()-1) + " " + // arachne output is 0-based
                ref + " " +
                gt + " " +
                String.format("%f %f", btnb,btr) + " " +
                type + " " +
                "A:" + a + " " +
                "C:" + c + " " +
                "G:" + g + " " +
                "T:" + t + " " +
                bases.length() + " " +
                "0 1 1 " + // used prior, is alignable, bait present
                theoryString
        );
        writer.write("\n");
    }

    /**
     * Ensure that the referenceSequence member points to the sequenceIndex-th sequence.  Note that
     * this is not random access: the reference file is read strictly forward, so it is required
     * that current sequenceIndex is >= the arg in the previous call to this method.
     */
    private void cacheReferenceSequence(int sequenceIndex) {
        // Already positioned on the requested contig.
        if (referenceSequence != null && referenceSequence.getContigIndex() == sequenceIndex) {
            return;
        }

        // Scan forward through the reference until the requested contig is reached.
        referenceSequence = null;
        for(referenceSequence = referenceSequenceFile.nextSequence();
            referenceSequence != null;
            referenceSequence = referenceSequenceFile.nextSequence()) {

            // Sanity check the sequence names against the sequence dictionary while scanning through.
            if (!referenceSequence.getName().equals(samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName())) {
                throw new PicardException("Sequence name mismatch at sequence index " + referenceSequence.getContigIndex() +
                        ": " + referenceSequence.getName() + " != " +
                        samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName());
            }
            if (referenceSequence.getContigIndex() == sequenceIndex) {
                break;
            }
            if (referenceSequence.getContigIndex() > sequenceIndex) {
                throw new PicardException("Never found reference sequence with index " + sequenceIndex);
            }
        }
        if (referenceSequence == null) {
            throw new PicardException("Reference sequence with index " + sequenceIndex + " was not found");
        }
    }

    /**
     * Override this to implement a concrete genotype caller
     * @param ref the reference base
     * @param bases each element in the String is the base at current locus for a given read
     * @param quals same length as bases.  the ith element corresponds to the ith element of bases.
     * @return genotype theories sorted by descending likelihood
     */
    abstract protected SortedSet<GenotypeTheory> call(char ref, String bases, List<Byte> quals);

    /**
     * Apply a general population-based prior to the likelihood:
     * <ul>
     * <li>ref is .999</li>
     * <li>het is 10^-3</li>
     * <li>homozygous, non-reference is 10^-5</li>
     * </ul>
     *
     * @param ref reference allele
     * @param gt the genotype theory being scored
     * @return prior, given the reference and genotype alleles
     */
    protected double getPrior(final char ref, final DiploidGenotype gt) {
        final double prior;
        if (gt.isHom() && gt.getAllele1() == ref) {
            prior = 0.999; // reference
        } else {
            if (gt.getAllele1() != ref && gt.getAllele2() != ref) {
                prior = 0.00001; // neither base is reference
            } else {
                prior = 0.001; // het, one base is reference
            }
        }
        return prior;
    }

    // --------------------------------------------------------------------------------------------
    // Helper methods below this point...
    // --------------------------------------------------------------------------------------------

    /** @return true if the two characters of the genotype string differ */
    public boolean isHet(final String alleles) {
        return (alleles.charAt(0) != (alleles.charAt(1)));
    }
}

View File

@ -1,93 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.genotype.caller;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.directed.GenomeMaskFactory;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.picard.sam.SamLocusIterator;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
/**
 * Call genotypes given a SAM file of aligned reads, reference sequences, and optionally a target map.
 */
public class CallGenotypes extends CommandLineProgram {
    // Usage and parameters
    @Usage(programVersion="1.0") public String USAGE = "Basic Allele Caller\n";
    @Option(shortName="I", doc="SAM or BAM file for calling") public File INPUT_FILE;
    @Option(shortName="O", doc="Allele Call output GELI file") public File OUTPUT_FILE;
    @Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
    @Option(shortName="T", doc="IntervalList-format target map file", optional = true) public File TARGET_FILE;
    @Option(shortName="Q", doc="Minimum quality score threshold to use in allele calling", optional = true) public Integer QUAL_SCORE_THRESHOLD;

    /** Required main method implementation. */
    public static void main(final String[] argv) {
        System.exit(new CallGenotypes().instanceMain(argv));
    }

    /**
     * Iterates every covered locus of the (coordinate-sorted) input and writes one
     * allele call per locus to OUTPUT_FILE.
     * @return 0 on success
     */
    protected int doWork() {
        try {
            final BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE));
            try {
                final SAMFileReader samReader = getSamReader(INPUT_FILE);

                // TODO -- parameterize, or create separate executables...
                // AbstractAlleleCaller caller = new FlatQualityAlleleCaller(reference, writer);
                final AbstractAlleleCaller caller = new QualityScoreAlleleCaller(REF_FILE, samReader.getFileHeader(), writer);

                final long startTime = System.currentTimeMillis();

                final SamLocusIterator sli = new SamLocusIterator(samReader.iterator());
                if (TARGET_FILE != null) {
                    sli.setGenomeMask(new GenomeMaskFactory().makeGenomeMaskFromIntervalList(TARGET_FILE));
                }
                if (QUAL_SCORE_THRESHOLD != null) {
                    System.out.println("Masking out bases with < Q"+QUAL_SCORE_THRESHOLD);
                    sli.setQualityScoreCutoff(QUAL_SCORE_THRESHOLD);
                }

                for (final SamLocusIterator.LocusInfo li : sli) {
                    if (li != null) caller.callAlleles(li);
                }

                final long elapsed = System.currentTimeMillis() - startTime;
                System.out.println("Completed in " + elapsed + "ms");

                writer.flush();
            } finally {
                // Previously the writer was leaked (and output possibly truncated) when
                // any exception was thrown before the explicit close().
                writer.close();
            }
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
        return 0;
    }

    /** Opens the input and enforces that it is coordinate-sorted (exits the JVM otherwise). */
    private SAMFileReader getSamReader(final File samFile) {
        final SAMFileReader samReader = new SAMFileReader(samFile);

        // ensure the file is sorted
        if (samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            System.out.println("SAM Files must be coordinate-sorted, this is " + samReader.getFileHeader().getSortOrder());
            System.exit(1);
        }
        return samReader;
    }
}

View File

@ -1,27 +0,0 @@
package edu.mit.broad.picard.genotype.caller;
/**
 * The ten unordered diploid genotypes over the bases A, C, G and T.
 * Constant names spell the two alleles in alphabetical order.
 */
public enum DiploidGenotype {
    AA('A','A'),
    AC('A','C'),
    AG('A','G'),
    AT('A','T'),
    CC('C','C'),
    CG('C','G'),
    CT('C','T'),
    GG('G','G'),
    GT('G','T'),
    TT('T','T');

    private final char allele1;
    private final char allele2;

    private DiploidGenotype(final char firstAllele, final char secondAllele) {
        this.allele1 = firstAllele;
        this.allele2 = secondAllele;
    }

    /** @return the first (alphabetically smaller or equal) allele */
    public char getAllele1() { return allele1; }

    /** @return the second allele */
    public char getAllele2() { return allele2; }

    /** @return true when the two alleles differ (heterozygous) */
    public boolean isHet() { return allele1 != allele2; }

    /** @return true when both alleles are the same base (homozygous) */
    public boolean isHom() { return !isHet(); }
}

View File

@ -1,76 +0,0 @@
package edu.mit.broad.picard.genotype.caller;
import edu.mit.broad.sam.SAMFileHeader;
import java.io.IOException;
import java.io.BufferedWriter;
import java.io.File;
import java.util.*;
import static java.lang.Math.*;
/**
 * Bayesian allele caller that ignores per-base qualities, assuming a flat 1e-3
 * error rate; based on the CRD algorithm.
 */
public class FlatQualityAlleleCaller extends AbstractAlleleCaller {

    public FlatQualityAlleleCaller(final File fastbReference, SAMFileHeader samHeader, final BufferedWriter writer) {
        super(fastbReference, samHeader, writer);
    }

    /**
     * Scores the ten diploid genotype theories against the pileup using a fixed
     * error rate, then applies the population prior from the base class.
     *
     * @param ref   the reference base
     * @param bases the base observed at this locus in each read
     * @param quals ignored by this caller
     * @return the ten theories sorted by descending posterior likelihood
     */
    protected SortedSet<GenotypeTheory> call(final char ref, final String bases, final List<Byte> quals) {
        final float eps = 1e-3f;

        // Tally the pileup by nucleotide.
        final int depth = bases.length();
        int countA = 0;
        int countC = 0;
        int countG = 0;
        int countT = 0;
        for (int idx = 0; idx < depth; idx++) {
            final char base = bases.charAt(idx);
            switch (base) {
                case 'A': countA++; break;
                case 'C': countC++; break;
                case 'G': countG++; break;
                case 'T': countT++; break;
                default:  throw new RuntimeException("Unknown Base " + base);
            }
        }
        final Map<Character, Integer> counts = new HashMap<Character, Integer>();
        counts.put('A', countA);
        counts.put('C', countC);
        counts.put('G', countG);
        counts.put('T', countT);

        // Score each of the ten diploid genotype theories.
        final SortedSet<GenotypeTheory> results = new TreeSet<GenotypeTheory>();
        for (final DiploidGenotype theory : DiploidGenotype.values()) {
            final char allele1 = theory.getAllele1();
            final char allele2 = theory.getAllele2();

            final double likelihood;
            if (!theory.isHet()) {
                // Homozygous: every matching base supports the call, every mismatch is an error.
                likelihood = log10(1-eps)*counts.get(allele1) + log10(eps)*(depth - counts.get(allele1));
            } else {
                // Heterozygous: each allele is expected in roughly half the reads.
                final int major_allele_counts;
                final int minor_allele_counts;
                if (counts.get(allele1) > counts.get(allele2)) {
                    major_allele_counts = counts.get(allele1);
                    minor_allele_counts = counts.get(allele2);
                } else {
                    major_allele_counts = counts.get(allele2);
                    minor_allele_counts = counts.get(allele1);
                }
                likelihood = log10(0.5 - (eps/2.0) )*major_allele_counts +
                             log10(0.5 - (eps/2.0) )*minor_allele_counts +
                             log10(eps)*(depth - major_allele_counts - minor_allele_counts);
            }

            final double prior = getPrior(ref, theory);
            results.add(new GenotypeTheory(theory, likelihood + log10(prior)));
        }

        return results;
    }
}

View File

@ -1,46 +0,0 @@
package edu.mit.broad.picard.genotype.caller;
/**
 * Pairs a single diploid genotype with the (log-scale) likelihood computed for it.
 */
public class GenotypeTheory implements Comparable<GenotypeTheory> {
    private DiploidGenotype genotype;
    private double likelihood;

    public GenotypeTheory(final DiploidGenotype genotype, final double likelihood) {
        this.genotype = genotype;
        this.likelihood = likelihood;
    }

    public DiploidGenotype getGenotype() {
        return genotype;
    }

    public void setGenotype(final DiploidGenotype genotype) {
        this.genotype = genotype;
    }

    public double getLikelihood() {
        return likelihood;
    }

    public void setLikelihood(final double likelihood) {
        this.likelihood = likelihood;
    }

    /**
     * Genotype Theories are sorted first by descending likelihood (i.e. the theory
     * with the biggest likelihood comes first).  Exact likelihood ties are broken
     * by the lexical (natural enum) order of the genotypes themselves.
     */
    public int compareTo(final GenotypeTheory other) {
        final double mine = this.getLikelihood();
        final double theirs = other.getLikelihood();
        if (mine == theirs) {
            return this.getGenotype().compareTo(other.getGenotype());
        }
        // Larger likelihood sorts earlier.
        return mine > theirs ? -1 : 1;
    }
}

View File

@ -1,82 +0,0 @@
package edu.mit.broad.picard.genotype.caller;
import edu.mit.broad.sam.SAMFileHeader;
import java.util.*;
import static java.lang.Math.log10;
import static java.lang.Math.pow;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.File;
/**
 * Bayesian-based allele caller using quality scores, based on CRD algorithm.
 */
public class QualityScoreAlleleCaller extends AbstractAlleleCaller {

    public QualityScoreAlleleCaller(final File fastbReference, SAMFileHeader samHeader, final BufferedWriter writer) {
        super(fastbReference, samHeader, writer);
    }

    /**
     * Scores the ten diploid genotype theories against the pileup using each base's
     * phred-scaled quality, then applies the population prior from the base class.
     *
     * @param ref   the reference base
     * @param bases the base observed at this locus in each read
     * @param quals phred-scaled quality of the corresponding element of bases
     * @return the ten theories sorted by descending posterior likelihood
     */
    protected SortedSet<GenotypeTheory> call(final char ref, final String bases, final List<Byte> quals) {
        // for each of the 10 theories, calculate the likelihood using quality scores
        final SortedSet<GenotypeTheory> results = new TreeSet<GenotypeTheory>();
        for (final DiploidGenotype theory : DiploidGenotype.values()) {
            double likelihood = 0;

            for (int i = 0; i < bases.length(); i++) {
                final char base = bases.charAt(i);
                final byte qual = quals.get(i);
                if (theory.isHom()) {
                    if (base == theory.getAllele1() || base == theory.getAllele2()) {
                        likelihood += getOneMinusQual(qual);
                    } else {
                        // the real math would be
                        //     likelihood += log10(pow(10,(qual/-10.0)));
                        // but it simplifies to
                        likelihood += qual/-10.0;
                    }
                } else {
                    if (base == theory.getAllele1() || base == theory.getAllele2()) {
                        likelihood += getOneHalfMinusQual(qual);
                    } else {
                        // the real math would be
                        //     likelihood += log10(pow(10,(qual/-10.0)));
                        // but it simplifies to
                        likelihood += qual/-10.0;
                    }
                }
            }
            final double prior = getPrior(ref, theory);
            results.add(new GenotypeTheory(theory, likelihood + log10(prior)));
        }

        return results;
    }

    /**
     * Precomputed log10(1 - 10^(-q/10)), indexed by quality score.
     * NOTE(review): entry 0 is log10(0) = -Infinity; presumably quality-0 bases are
     * masked out upstream (see the quality cutoff in CallGenotypes) — confirm.
     */
    private static final double[] oneMinusData = new double[Byte.MAX_VALUE];
    static {
        // Was an instance initializer: it refilled this static table on every
        // construction (racily, if callers were built on multiple threads).
        // A static initializer fills it exactly once, at class-load time.
        for (int qual = 0; qual < Byte.MAX_VALUE; qual++) {
            oneMinusData[qual] = log10(1.0 - pow(10, (qual / -10.0)));
        }
    }

    private double getOneMinusQual(final byte qual) {
        return oneMinusData[qual];
    }

    /** Precomputed log10(0.5 - 10^(-q/10)/2), indexed by quality score. */
    private static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE];
    static {
        for (int qual = 0; qual < Byte.MAX_VALUE; qual++) {
            oneHalfMinusData[qual] = log10(0.5 - pow(10, (qual / -10.0)) / 2.0);
        }
    }

    private double getOneHalfMinusQual(final byte qual) {
        return oneHalfMinusData[qual];
    }
}

View File

@ -1,257 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
import edu.mit.broad.picard.util.PasteParser;
import edu.mit.broad.picard.util.FormatUtil;
import edu.mit.broad.picard.util.BasicTextFileParser;
import edu.mit.broad.picard.PicardException;
import java.io.File;
import java.io.FilenameFilter;
import java.io.Closeable;
import java.util.*;
/**
* Class to parse the data in an Illumina Bustard directory and return an iterator over that data, in order
* by tile.
*
* @author Kathleen Tibbetts
*/
public class BustardFileParser implements Iterator<BustardReadData>, Iterable<BustardReadData>, Closeable {
private final File bustardDirectory;                 // directory containing the qseq/sig2 files
private final int lane;                              // lane number used to select files by name
private final boolean pairedEnd;                     // whether read-2 qseq files are expected
private PasteParser parser;                          // joins the sorted qseq/sig2 parsers row by row
private BustardReadData next = null;                 // look-ahead record for the iterator
private final FormatUtil formatter = new FormatUtil();
private boolean iterating = false;                   // set once iteration starts; usage not shown in this view

/**
 * Constructor
 *
 * @param bustardDirectory directory where the Bustard files can be located
 * @param lane the lane to parse
 * @param pairedEnd whether this is a paired-end run
 */
public BustardFileParser(File bustardDirectory, int lane, boolean pairedEnd) {
    this.bustardDirectory = bustardDirectory;
    this.lane = lane;
    this.pairedEnd = pairedEnd;
    initialize();  // eagerly locates, validates and sorts the input files
}
/**
 * Finds the relevant files in the bustardDirectory, sorts them, and wires them into the
 * <code>parser</code>.  Does some basic sanity checking to ensure that some files
 * are found and that they are the expected multiple for paired-end or not.
 *
 * @throws PicardException if the directory is unreadable or the file counts are inconsistent
 */
private void initialize()
{
    // Bustard naming convention: s_<lane>_<read>_<tile>_qseq.txt for base calls,
    // s_<lane>_<tile>_sig2.txt for intensities; all optionally gzipped.
    final String qseq1Regex = "s_" + lane + "_1_\\d{4}_qseq.txt(.gz)?";
    final String qseq2Regex = "s_" + lane + "_2_\\d{4}_qseq.txt(.gz)?";
    final String intensityRegex = "s_" + lane + "_\\d{4}_sig2.txt(.gz)?";

    File read1files[] = bustardDirectory.listFiles( new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.matches(qseq1Regex);
        }
    });
    File read2files[] = bustardDirectory.listFiles( new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.matches(qseq2Regex);
        }
    });
    File intensityFiles[] = bustardDirectory.listFiles( new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.matches(intensityRegex);
        }
    });

    // File.listFiles() returns null (not an empty array) when the path does not denote a
    // readable directory; fail with a clear message instead of an NPE further down.
    if (read1files == null || read2files == null || intensityFiles == null) {
        throw new PicardException("Not a readable directory: " + bustardDirectory.getAbsolutePath());
    }

    // Some basic sanity checking on file counts
    if (read1files.length == 0 && read2files.length == 0 && intensityFiles.length == 0) {
        throw new PicardException("No Bustard files found in " +
                bustardDirectory.getAbsolutePath() + " for lane " + lane);
    }
    if (pairedEnd) {
        if (read1files.length != read2files.length || read2files.length != intensityFiles.length) {
            throw new PicardException("Incorrect number of Bustard files found in " +
                    bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Found " +
                    read1files.length + " read 1 qseq files, " + read2files.length + " read 2 " +
                    "qseq files, and " + intensityFiles.length + " sig2 files. There should be " +
                    "the same number of each type of file");
        }
    }
    else {
        if (read1files.length != intensityFiles.length) {
            throw new PicardException("Incorrect number of Bustard files found in " +
                    bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Found " +
                    read1files.length + " qseq files and " + intensityFiles.length + " sig2 files, " +
                    "which should be equal.");
        }
        if (read2files.length > 0) {
            throw new PicardException("Read 2 Bustard files found in " +
                    bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Lane " +
                    " was specified as a non-PE run, and so should not have any read 2 data.");
        }
    }

    // Sort each set of reads and create a text parser for it
    SortedSet<File> sortedRead1 = new TreeSet<File>(new BustardFilenameComparator());
    sortedRead1.addAll(Arrays.asList(read1files));
    read1files = sortedRead1.toArray(read1files);
    BasicTextFileParser read1Parser = new BasicTextFileParser(true, read1files);

    SortedSet<File> sortedIntensity = new TreeSet<File>(new BustardFilenameComparator());
    sortedIntensity.addAll(Arrays.asList(intensityFiles));
    intensityFiles = sortedIntensity.toArray(intensityFiles);
    BasicTextFileParser intensityParser = new BasicTextFileParser(true, intensityFiles);

    // And create a paste parser for all of them
    if (pairedEnd) {
        SortedSet<File> sortedRead2 = new TreeSet<File>(new BustardFilenameComparator());
        sortedRead2.addAll(Arrays.asList(read2files));
        read2files = sortedRead2.toArray(read2files);
        BasicTextFileParser read2Parser = new BasicTextFileParser(true, read2files);
        parser = new PasteParser(read1Parser, read2Parser, intensityParser);
    }
    else {
        parser = new PasteParser(read1Parser, intensityParser);
    }
}
/**
 * Reads the next cluster's worth of data from the underlying PasteParser and builds a
 * BustardReadData from it.
 *
 * Layout of the pasted record (one sub-array per source file):
 *   data[0]              = read 1 qseq fields (machine, run, lane, tile, x, y, ..., bases, quals, pf)
 *   data[1]              = read 2 qseq fields when the lane is paired-end
 *   data[intensityIndex] = sig2 fields; the first FOUR are lane, tile, x, y, and the rest are
 *                          intensities, four channel values (A/C/G/T) per sequenced cycle.
 *
 * @param validate whether to check that the expected number of intensity values are present
 * @return a fully populated BustardReadData, or null if the parser is exhausted
 */
private BustardReadData readNext(boolean validate) {
if (!parser.hasNext()) {
return null;
}
String data[][] = parser.next();
// Cluster coordinates come from the read 1 qseq record (fields 0-5).
String machine = data[0][0];
int run = formatter.parseInt(data[0][1]);
int lane = formatter.parseInt(data[0][2]);
int tile = formatter.parseInt(data[0][3]);
int x = formatter.parseInt(data[0][4]);
int y = formatter.parseInt(data[0][5]);
// Fields 8-10 of a qseq record: bases, qualities, passing-filter flag (1 == PF).
String firstSeq = data[0][8];
String firstQual = data[0][9];
boolean pf = formatter.parseInt(data[0][10]) == 1;
String secondSeq = null;
String secondQual = null;
// Index of the sig2 sub-array: 1 for fragment lanes, 2 when a read 2 qseq is present.
int intensityIndex = 1;
if (pairedEnd) {
secondSeq = data[1][8];
secondQual = data[1][9];
intensityIndex = 2;
}
// One intensity quad per cycle; paired-end lanes have cycles for both reads.
// NOTE(review): assumes read 1 and read 2 have equal length — confirm with upstream qseq spec.
int numIntensities = firstSeq.length() * (pairedEnd ? 2 : 1);
// Sanity check since some of those files look a little weird
if (validate) {
// Skip the 4 leading positional fields (lane, tile, x, y) of the sig2 record.
int remaining = data[intensityIndex].length - 4;
if ((remaining % 4 != 0) || (remaining/4) != numIntensities) {
throw new PicardException("Unexpected number of intensity fields for " + machine + "/" + run +
"/" + lane + "/" + tile + ": " + remaining);
}
}
// intensities[cycle][channel] with channels in file order (4 per cycle).
double intensities[][] = new double[numIntensities][4];
int intensityArrayIndex = 4;
for (int i = 0; i < numIntensities; i++) {
for (int j = 0; j < 4; j++) {
intensities[i][j] = formatter.parseDouble(data[intensityIndex][intensityArrayIndex++]);
}
}
return new BustardReadData(
machine, run, lane, tile, firstSeq, firstQual, secondSeq, secondQual, pf, intensities, x, y);
}
/**
 * Returns this parser as an iterator over BustardReadData elements and primes the first
 * element. May be invoked at most once, and only before the first call to hasNext().
 *
 * @return this object, acting as its own iterator
 * @throws IllegalStateException if iteration has already begun
 */
public Iterator<BustardReadData> iterator() {
    if (!iterating) {
        // First use: pre-fetch the initial record (with validation) and mark iteration started.
        next = readNext(true);
        iterating = true;
        return this;
    }
    throw new IllegalStateException("iterator() method can only be called once, before the" +
            "first call to hasNext()");
}
/**
 * Reports whether another BustardReadData is available, lazily starting iteration if
 * iterator() was never called.
 *
 * @return true if there is at least one more element; false otherwise
 */
public boolean hasNext() {
    if (iterating) {
        return next != null;
    }
    // Iteration was started implicitly, without a call to iterator(): prime the first record.
    next = readNext(true);
    iterating = true;
    return next != null;
}
/**
 * Returns the current element and pre-fetches the one after it.
 *
 * @return the next BustardReadData in the iteration
 * @throws NoSuchElementException if the iteration is exhausted
 */
public BustardReadData next() {
    if (hasNext()) {
        final BustardReadData current = next;
        // Only the first record is validated; subsequent reads skip the intensity-count check.
        next = readNext(false);
        return current;
    }
    throw new NoSuchElementException("Iteration has no more elements.");
}
/**
 * Required by the Iterator interface but deliberately unsupported: this parser is a
 * read-only view of the Bustard files.
 *
 * @throws UnsupportedOperationException always
 */
public void remove() {
throw new UnsupportedOperationException("Remove() not supported.");
}
/**
 * Releases the underlying PasteParser and its file handles. Safe to call even if the
 * parser was never created.
 */
public void close() {
    if (parser == null) {
        return;
    }
    parser.close();
}
/** @return the lane number whose Bustard files this parser reads. */
public int getLane() { return this.lane; }
/** @return true if this lane was parsed as a paired-end run. */
public boolean isPairedEnd() { return this.pairedEnd; }
}

View File

@ -1,78 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
import java.io.File;
import java.util.Comparator;
/**
 * Comparator for getting Bustard files in "sorted" order for use by the BustardFileParser.
 * Expected order is by lane in ascending order, then by tile in ascending order, then:
 *      the read 1 qseq file
 *      the read 2 qseq file
 *      the sig2 file
 *
 * IMPORTANT: Currently this class expects to receive ONLY qseq and sig2 files, i.e. names of
 * the form s_&lt;lane&gt;_&lt;read&gt;_&lt;tile&gt;_qseq.txt or s_&lt;lane&gt;_&lt;tile&gt;_sig2.txt.
 *
 * @author Kathleen Tibbetts
 */
public class BustardFilenameComparator implements Comparator<File> {

    /**
     * Compares its two arguments for order. Returns a negative integer, zero, or a positive integer as
     * the first argument is less than, equal to, or greater than the second.
     *
     * @param file1 the first file to compare
     * @param file2 the second file to compare
     * @return a negative integer, zero, or a positive integer as
     * the first argument is less than, equal to, or greater than the second.
     */
    public int compare(File file1, File file2)
    {
        Integer parts1[] = parseFileNameParts(file1.getName());
        Integer parts2[] = parseFileNameParts(file2.getName());
        // Compare lane, tile, type, then read, in that order.  Starting at index 0 (the lane)
        // matches the sort order documented on this class; the earlier version began at index 1
        // and never compared lanes.  The read slot (index 3) is null for sig2 files, so a null
        // on either side is treated as "equal here" and the scan moves on, instead of throwing
        // a NullPointerException when two sig2 files tie on tile and type.
        for (int i = 0; i < parts1.length; i++)
        {
            if (parts1[i] == null || parts2[i] == null) {
                continue; // read number is absent for sig2 files
            }
            if (!parts1[i].equals(parts2[i])) {
                return parts1[i].compareTo(parts2[i]);
            }
        }
        return 0;
    }

    /**
     * Utility method that returns an array of integers that represent, in order,
     * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any)
     * represented by the given file name.
     *
     * @param name the Bustard file name to dissect
     * @return an array of integers that represent, in order,
     * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any;
     * null for sig2 files) represented by the given file name
     */
    private Integer[] parseFileNameParts(String name)
    {
        Integer parts[] = new Integer[4]; // Lane, tile, type, read
        String src[] = name.split("_");
        parts[0] = Integer.valueOf(src[1]); // Lane is always the second part
        if (src[2].length() == 4) { // Tile is 3rd or fourth
            parts[1] = Integer.valueOf(src[2]);
        }
        else {
            parts[1] = Integer.valueOf(src[3]);
        }
        parts[2] = (src[src.length-1].equals("qseq.txt")) ? 0 : 1; // qseq files sort lower
        if (src[2].length() == 1) { // read number is the third part for qseq files only
            parts[3] = Integer.valueOf(src[2]);
        }
        return parts;
    }
}

View File

@ -1,128 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
/**
 * Holds all the Bustard-level data we need (so far) about an individual read: the cluster's
 * position on the flowcell, the base calls and Solexa-encoded qualities for one or both reads,
 * the passing-filter flag, and the raw per-cycle channel intensities.
 *
 * Instances are immutable; all fields are populated by the constructor.
 *
 * @author Kathleen Tibbetts
 */
public class BustardReadData {

    final private String machineName;
    final private int runNumber;
    final private int laneNumber;
    final private int tileNumber;
    final private String firstReadSequence;
    final private String firstReadQualities;   // Solexa-encoded quality string
    final private String secondReadSequence;   // null for fragment (non-PE) runs
    final private String secondReadQualities;  // null for fragment (non-PE) runs
    final private boolean pf;                  // passing-filter flag from the qseq record
    final private double intensities[][];      // [cycle][channel] raw intensities
    final private int xCoordinate;
    final private int yCoordinate;
    // Used to translate Solexa-encoded quality characters to Phred scores.
    private final SolexaQualityConverter converter = new SolexaQualityConverter();

    /**
     * Constructor that takes everything to populate this object.
     *
     * @param machineName         the sequencing machine name
     * @param runNumber           the run number on that machine
     * @param laneNumber          the flowcell lane
     * @param tileNumber          the tile within the lane
     * @param firstReadSequence   base calls for read 1
     * @param firstReadQualities  Solexa-encoded qualities for read 1
     * @param secondReadSequence  base calls for read 2, or null for a fragment run
     * @param secondReadQualities Solexa-encoded qualities for read 2, or null for a fragment run
     * @param pf                  whether the cluster passed the vendor quality filter
     * @param intensities         per-cycle, per-channel raw intensities
     * @param xCoordinate         x position of the cluster on the tile
     * @param yCoordinate         y position of the cluster on the tile
     */
    public BustardReadData(String machineName, int runNumber, int laneNumber, int tileNumber,
                           String firstReadSequence, String firstReadQualities,
                           String secondReadSequence, String secondReadQualities,
                           boolean pf, double[][] intensities, int xCoordinate, int yCoordinate ) {
        this.machineName = machineName;
        this.runNumber = runNumber;
        this.laneNumber = laneNumber;
        this.tileNumber = tileNumber;
        this.firstReadSequence = firstReadSequence;
        this.firstReadQualities = firstReadQualities;
        this.secondReadSequence = secondReadSequence;
        this.secondReadQualities = secondReadQualities;
        this.pf = pf;
        this.intensities = intensities;
        this.xCoordinate = xCoordinate;
        this.yCoordinate = yCoordinate;
    }

    // TODO: Finalize read name -- ask Tim
    /**
     * Composes a name for this read from its values: machine:lane:tile:x:y.
     * Note that the run number is intentionally not part of the name.
     *
     * @return the read name
     */
    public String getReadName() {
        return this.machineName + ":" + this.laneNumber + ":" + this.tileNumber +
                ":" + this.xCoordinate + ":" + this.yCoordinate;
    }

    /**
     * Gets Phred-style qualities for the first read.
     *
     * @return the String of qualities
     */
    public String getFirstReadPhredQualities() {
        return decodeSolexaQualitiesToPhred(getFirstReadQualities());
    }

    /**
     * Gets Phred-style qualities for the second read.
     * NOTE(review): throws NullPointerException on a fragment run where the second read is
     * null — callers are expected to check isPairedEnd()/null first.
     *
     * @return the String of qualities
     */
    public String getSecondReadPhredQualities() {
        return decodeSolexaQualitiesToPhred(getSecondReadQualities());
    }

    /**
     * Converts a string of Solexa qualities to a Phred-style quality String.
     *
     * @param qualities the Solexa qualities to decode
     * @return the String of Phred qualities
     */
    private String decodeSolexaQualitiesToPhred(String qualities) {
        StringBuilder sb = new StringBuilder();
        for (char c : qualities.toCharArray()) {
            // Quality char is phred score + 33
            sb.append((char)(converter.solexaToPhred((byte)c)+33));
        }
        return sb.toString();
    }

    public String getMachineName() { return machineName; }
    public int getRunNumber() { return runNumber; }
    public int getLaneNumber() { return laneNumber; }
    public int getTileNumber() { return tileNumber; }
    public String getFirstReadSequence() { return firstReadSequence; }
    public String getFirstReadQualities() { return firstReadQualities; }
    public String getSecondReadSequence() { return secondReadSequence; }
    public String getSecondReadQualities() { return secondReadQualities; }
    public double[][] getIntensities() { return intensities; }
    public boolean isPf() { return pf; }
    public int getXCoordinate() { return xCoordinate; }
    public int getYCoordinate() { return yCoordinate; }
}

View File

@ -1,58 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
import java.io.File;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
/**
 * Command-line entry point that reads one lane of Illumina Bustard output and writes it to an
 * unmapped BAM file via BustardToSamWriter.
 *
 * @author Kathleen Tibbetts
 */
public class BustardToSam extends CommandLineProgram {
// The following attributes define the command-line arguments
@Usage(programVersion="1.0")
public String USAGE =
"Usage: " + getClass().getName() + " [options]\n\n" +
"Generate a BAM binary file from data in an illumina Bustard directory.\n";
@Option(shortName = "B", doc = "Bustard directory to parse. ")
public File BUSTARD_DIRECTORY;
@Option(shortName = "F", doc = "The flowcell. ")
public String FLOWCELL;
@Option(shortName = "L", doc = "The lane for which to parse data. ")
public Integer LANE;
@Option(shortName = "P", doc = "Whether the lane was a paired-end run. ")
public Boolean PE;
@Option(shortName = "O", doc = "The directory for the binary output file. ")
public File OUTPUT;
// Wires the Bustard parser for the requested lane into a BAM writer and runs it to
// completion.  Returns 0 (success) per the CommandLineProgram contract; errors surface
// as runtime exceptions from the parser/writer.
@Override
protected int doWork() {
BustardToSamWriter writer = new BustardToSamWriter(
new BustardFileParser(BUSTARD_DIRECTORY, LANE, PE), OUTPUT, FLOWCELL);
writer.writeBamFile();
return 0;
}
public static void main(String[] argv) {
System.exit(new BustardToSam().instanceMain(argv));
}
}

View File

@ -1,138 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
import edu.mit.broad.sam.*;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.util.Log;
import edu.mit.broad.picard.filter.AggregateFilter;
import edu.mit.broad.picard.filter.SamRecordFilter;
import edu.mit.broad.picard.filter.SolexaNoiseFilter;
import edu.mit.broad.picard.sam.ReservedTagConstants;
import java.io.File;
import java.util.*;
/**
 * Writes the data from a BustardFileParser to a BAM file of unmapped reads, one SAM record per
 * read (so two per cluster for paired-end lanes).  Reads that fail the noise filter are tagged
 * rather than dropped.
 */
public class BustardToSamWriter {
private final BustardFileParser parser;
private SAMFileWriter writer;
private final File outputFile;
// Filters applied to each record; matches set the XN attribute rather than excluding the read.
private AggregateFilter filters;
private int recordsWritten = 0;
private Log log = Log.getInstance(BustardToSamWriter.class);
/**
* Constructor
*
* @param parser The parser for the Bustard data
* @param outputDirectory The directory in which to write the BAM file
* @param flowcell The flowcell from which the data is drawn; used in the output file name
*/
public BustardToSamWriter(BustardFileParser parser, File outputDirectory, String flowcell) {
this.parser = parser;
this.outputFile = getOutputFile(outputDirectory, flowcell);
initializeFilters();
}
/**
* Alternate constructor for testing: takes the exact output file rather than deriving
* its name from the flowcell and lane.
*
* @param parser The parser for the Bustard data
* @param outputFile The file to which to write the BAM output
*/
BustardToSamWriter(BustardFileParser parser, File outputFile) {
this.parser = parser;
this.outputFile = outputFile;
initializeFilters();
}
// Currently the only filter is the Solexa noise filter (all-A / low-information reads).
private void initializeFilters() {
filters = new AggregateFilter(Arrays.asList(
(SamRecordFilter)new SolexaNoiseFilter()
));
}
/**
* Writes all data from the BustardFileParser to a BAM file.  The header is marked unsorted,
* and for paired-end lanes each cluster produces two records (read 1 then read 2).
*/
public void writeBamFile() {
SAMFileHeader header = new SAMFileHeader();
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
writer = new SAMFileWriterFactory().makeBAMWriter(header, false, outputFile);
while (parser.hasNext()) {
BustardReadData brd = parser.next();
SAMRecord sam = createSamRecord(brd, true);
writer.addAlignment(sam);
this.recordsWritten++;
if (parser.isPairedEnd()) {
SAMRecord sam2 = createSamRecord(brd, false);
writer.addAlignment(sam2);
this.recordsWritten++;
}
}
writer.close();
log.info("Wrote " + this.recordsWritten + " read records to BAM file " +
this.outputFile.getAbsolutePath());
}
/**
* Creates a SAMRecord from Bustard data.  All records are unmapped (and their mates marked
* unmapped); the vendor-quality-check flag is the inverse of the Bustard PF flag.
*
* @param brd The BustardReadData to use in populating the SAMRecord
* @param isFirstRead whether this is the first read of a pair
* @return SAMRecord fully populated SAMRecord
*/
private SAMRecord createSamRecord(BustardReadData brd, boolean isFirstRead) {
SAMRecord sam = new SAMRecord();
sam.setReadName(brd.getReadName());
sam.setReadString(isFirstRead ? brd.getFirstReadSequence() : brd.getSecondReadSequence());
sam.setBaseQualityString(isFirstRead ? brd.getFirstReadPhredQualities() : brd.getSecondReadPhredQualities());
// Flag values
sam.setReadPairedFlag(parser.isPairedEnd());
sam.setReadUmappedFlag(true);
sam.setReadFailsVendorQualityCheckFlag(!brd.isPf());
sam.setMateUnmappedFlag(true);
if (parser.isPairedEnd()) {
sam.setFirstOfPairFlag(isFirstRead);
sam.setSecondOfPairFlag(!isFirstRead);
}
// Noisy reads are kept but flagged with the reserved XN attribute.
if (filters.filterOut(sam)) {
sam.setAttribute(ReservedTagConstants.XN, 1);
}
return sam;
}
/**
* Constructs the name for the output file (<flowcell>.<lane>.unmapped.bam in the given
* directory), determines whether it is writeable, and returns the file.
*
* @param outputDirectory the directory in which to write the BAM file
* @param flowcell the flowcell from which the data is drawn
* @return a new File object for the BAM file.
*/
private File getOutputFile(File outputDirectory, String flowcell) {
File result = new File(outputDirectory.getAbsolutePath() + "/" +
flowcell + "." + parser.getLane() + ".unmapped.bam");
IoUtil.assertFileIsWritable(result);
return result;
}
}

View File

@ -1,235 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
import edu.mit.broad.picard.util.PasteParser;
import edu.mit.broad.picard.util.TabbedTextFileParser;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.sam.util.CloseableIterator;
import java.io.File;
import java.util.Iterator;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.text.ParsePosition;
import java.text.NumberFormat;
/**
 * Parse the pair of files (eland_extended.txt and export.txt) that correspond to an end of a
 * Gerald run for a lane.  The two files are iterated in lock-step via a PasteParser, and each
 * pasted record yields one GeraldAlignment: alignment placement from eland_extended.txt plus
 * the passing-filter flag and qualities from export.txt.
 *
 * Like BustardFileParser, this class is its own single-use iterator.
 */
public class GeraldParser implements Iterable<GeraldParser.GeraldAlignment>, CloseableIterator<GeraldParser.GeraldAlignment> {
// Minimum column count of an eland_extended.txt record: name, bases, match counts, positions.
private static final int EXPECTED_ELAND_FIELDS = 4;
// Regex used to split apart multiple alignments in the eland output
private static final Pattern ALIGN_SPLITTER = Pattern.compile("\\,+");
// export.txt constants
private static final int PASSING_FILTER_COLUMN = 21;
private static final int QUALITIES_COLUMN = 9;
private static final int REQUIRED_EXPORT_COLUMNS = PASSING_FILTER_COLUMN + 1;
// Used with ParsePosition to read the leading digits of an alignment position in place.
private final NumberFormat integerFormat = NumberFormat.getIntegerInstance();
private final SquashedCoordinateMap geraldToArachne;
private final PasteParser pasteParser;
// Kept only for inclusion in error messages.
private final File elandExtended;
private final File export;
private boolean iteratorCalled = false;
// Lookup table: index by Solexa quality byte to get the corresponding Phred quality byte.
private final byte[] solexaToPhredQualityConverter = new SolexaQualityConverter().getSolexaToPhredConversionTable();
/**
* @param geraldToArachne for converting btw Gerald coordinate and genomic coordinate
* @param elandExtended the eland_extended.txt file for this end
* @param export the export.txt file for this end
*/
public GeraldParser(final SquashedCoordinateMap geraldToArachne, final File elandExtended, final File export) {
this.geraldToArachne = geraldToArachne;
this.elandExtended = elandExtended;
this.export = export;
// Paste the two tab-delimited files together so corresponding rows arrive as one record.
final TabbedTextFileParser[] parsers = {
new TabbedTextFileParser(false, elandExtended),
new TabbedTextFileParser(false, export)
};
pasteParser = new PasteParser(parsers);
}
/**
* Returns this parser as an iterator; may only be called once per instance.
*/
public Iterator<GeraldAlignment> iterator() {
if (iteratorCalled) {
throw new IllegalStateException("iterator() cannot be called more than once on a GeraldParser instance.");
}
iteratorCalled = true;
return this;
}
/** Closes the underlying PasteParser and its file handles. */
public void close() {
pasteParser.close();
}
public boolean hasNext() {
return pasteParser.hasNext();
}
/**
* Builds the next GeraldAlignment.  fields[0] holds the eland_extended.txt columns and
* fields[1] the export.txt columns for the same read.
*/
public GeraldAlignment next() {
final GeraldAlignment ret = new GeraldAlignment();
final String[][] fields = pasteParser.next();
// Parse eland_extended.txt fields
final String[] elandExtendedFields = fields[0];
if (elandExtendedFields.length < EXPECTED_ELAND_FIELDS) {
throw new PicardException("Not enough fields in file: " + elandExtended);
}
// Column 0 is the read name with a leading marker character (stripped here).
ret.readName = elandExtendedFields[0].substring(1);
ret.readBases = elandExtendedFields[1];
ret.readLength = ret.readBases.length();
// Column 2 is "zero:one:two" mismatch placement counts; anything else is left as 0s.
final String[] alignCounts = elandExtendedFields[2].split(":");
if (alignCounts.length == 3) {
ret.zeroMismatchPlacements = Short.parseShort(alignCounts[0]);
ret.oneMismatchPlacements = Short.parseShort(alignCounts[1]);
ret.twoMismatchPlacements = Short.parseShort(alignCounts[2]);
}
// Column 3 lists comma-separated placements; only a unique placement ("-" means none)
// is converted into primary alignment coordinates.
final String[] alignments = ALIGN_SPLITTER.split(elandExtendedFields[3]);
if (alignments.length == 1 && !"-".equals(alignments[0])) {
// Placement looks like <chunk-file>.<seq>:<start><orientation><mismatch-string>.
final int lastDot = alignments[0].lastIndexOf(".");
final int colon = alignments[0].indexOf(':');
final String tmp = alignments[0].substring(colon + 1);
final ParsePosition pos = new ParsePosition(0);
final long start = integerFormat.parse(tmp, pos).longValue();
if (pos.getIndex() == 0) {
throw new RuntimeException("Problem parsing eland extended alignment record: " + Arrays.toString(elandExtendedFields));
}
// Map from squashed-reference coordinates to genomic (Arachne) coordinates in place.
final SimpleMapping m = new SimpleMapping(alignments[0].substring(lastDot+1, colon).trim(),
start, start + ret.readLength - 1, null);
geraldToArachne.convertToArachneCoords(m);
ret.primaryChrom = m.getSequenceName();
ret.primaryStart = m.getStartPos();
ret.primaryStop = m.getEndPos();
// After the digits: one orientation character, then the mismatch descriptor.
ret.orientation = tmp.substring(pos.getIndex(), pos.getIndex() + 1);
ret.mismatchString = tmp.substring(pos.getIndex() + 1);
// Count the mismatches in the alignment
for (int i=pos.getIndex(); i<tmp.length(); ++i) {
final char ch = tmp.charAt(i);
if (ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T') {
ret.primaryMismatches += 1;
}
}
}
final String[] exportFields = fields[1];
// Parse export.txt fields
if (exportFields.length < REQUIRED_EXPORT_COLUMNS) {
throw new RuntimeException("Not enough columns in _export.txt file " + export);
}
if (exportFields[PASSING_FILTER_COLUMN].equals("Y")) {
ret.passingFilter = true;
} else if (exportFields[PASSING_FILTER_COLUMN].equals("N")) {
ret.passingFilter = false;
} else {
throw new RuntimeException("Strange value for PF column in _export.txt file " + export + ": '" +
exportFields[PASSING_FILTER_COLUMN] + "'.");
}
ret.phredQualities = exportFields[QUALITIES_COLUMN].getBytes();
decodeSolexaQualitiesToPhred(ret.phredQualities);
return ret;
}
/** Not supported; this is a read-only iterator. */
public void remove() {
throw new UnsupportedOperationException();
}
/** Decodes an array of solexa quality chars into SOLEXA numeric space.
* Decode in place in order to avoid extra object allocation */
private void decodeSolexaQualitiesToPhred(final byte[] solexaQuals) {
for (int i=0; i<solexaQuals.length; ++i) {
solexaQuals[i] = solexaToPhredQualityConverter[solexaQuals[i]];
}
}
/**
* One read's worth of Gerald output: alignment placement parsed from eland_extended.txt
* plus the PF flag and Phred qualities from export.txt.  Fields are populated only by the
* enclosing parser; coordinate/placement fields remain at their defaults (null/0) when the
* read has no unique placement.
*/
public class GeraldAlignment {
// From eland_extended.txt
private String readName = null;
private String readBases = null;
private int readLength = 0;
private short zeroMismatchPlacements = 0;
private short oneMismatchPlacements = 0;
private short twoMismatchPlacements = 0;
private String primaryChrom = null;
private long primaryStart = 0;
private long primaryStop = 0;
private String orientation = null;
private short primaryMismatches = 0;
private String mismatchString = null;
// from export.txt
private boolean passingFilter;
private byte[] phredQualities;
public String getMismatchString() {
return mismatchString;
}
public short getOneMismatchPlacements() {
return oneMismatchPlacements;
}
public String getOrientation() {
return orientation;
}
public boolean isPassingFilter() {
return passingFilter;
}
public byte[] getPhredQualities() {
return phredQualities;
}
public String getPrimaryChrom() {
return primaryChrom;
}
public short getPrimaryMismatches() {
return primaryMismatches;
}
public long getPrimaryStart() {
return primaryStart;
}
public long getPrimaryStop() {
return primaryStop;
}
public String getReadBases() {
return readBases;
}
public int getReadLength() {
return readLength;
}
public String getReadName() {
return readName;
}
public short getTwoMismatchPlacements() {
return twoMismatchPlacements;
}
public short getZeroMismatchPlacements() {
return zeroMismatchPlacements;
}
}
}

View File

@ -1,58 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
import edu.mit.broad.picard.io.IoUtil;
import java.io.File;
/**
 * Given a Gerald directory, create a GeraldParser for one end or both ends as appropriate.
 * Detects whether the lane was run paired-end by probing for the per-end eland_query files.
 */
public class GeraldParserFactory {
    // A Map of squashed reference chunk to reference genome sequence/chromosome. The chunk is represented as
    // a mapping (sequence=chunk file, startPos=offset into chunk file).
    private final SquashedCoordinateMap geraldToArachne;
    private final File geraldDir;
    private final int lane;

    /**
     * @param geraldDir directory containing the Gerald output files
     * @param lane lane number used to select the s_&lt;lane&gt;_* files
     * @param squashedMapFile file from which the squashed-coordinate map is loaded
     */
    public GeraldParserFactory(final File geraldDir, final int lane, final File squashedMapFile) {
        this.geraldDir = geraldDir;
        this.lane = lane;
        geraldToArachne = new SquashedCoordinateMap(squashedMapFile);
    }

    /** Attempts to determine if an analysis on a lane is PE or single. */
    public boolean isPairedRun() {
        final File pairedMarker = new File(geraldDir, "s_" + lane + "_1_eland_query.txt");
        if (pairedMarker.exists()) {
            return true;
        }
        final File fragmentMarker = new File(geraldDir, "s_" + lane + "_eland_query.txt");
        if (fragmentMarker.exists()) {
            return false;
        }
        throw new IllegalStateException("Could not determine if gerald run is PE or fragment.");
    }

    /** Builds the "s_<lane>_" (fragment) or "s_<lane>_<read>_" (paired) file-name prefix. */
    private String makeLanePrefix(final Integer readNumber) {
        final StringBuilder prefix = new StringBuilder("s_").append(lane).append('_');
        if (readNumber != null) {
            prefix.append(readNumber).append('_');
        }
        return prefix.toString();
    }

    /**
     * @param readNumber 1 == first end of pair; 2 == second end of pair; null == unpaired
     * @return a GeraldParser for the given end
     */
    public GeraldParser makeParser(final Integer readNumber) {
        final String prefix = makeLanePrefix(readNumber);
        final File elandExtendedFile = new File(geraldDir, prefix + "eland_extended.txt");
        final File exportFile = new File(geraldDir, prefix + "export.txt");
        IoUtil.assertFileIsReadable(elandExtendedFile);
        IoUtil.assertFileIsReadable(exportFile);
        return new GeraldParser(geraldToArachne, elandExtendedFile, exportFile);
    }
}

View File

@ -1,348 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Iterator;
import edu.mit.broad.picard.util.*;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.CommandLineParser;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMFileWriter;
import edu.mit.broad.sam.SAMFileWriterFactory;
import edu.mit.broad.sam.SAMProgramRecord;
import edu.mit.broad.sam.SAMReadGroupRecord;
import edu.mit.broad.sam.SAMRecord;
/**
* Read alignments for a lane (paired or unpaired) from Gerald directory and write to SAM file.
*/
public class GeraldToSam extends CommandLineProgram {
// These are all written to the SAM header
private static final String DEFAULT_CN = "broad";
private static final String DEFAULT_PL = "illumina";
private static final String PROGRAM_VERSION = "1.0";
private static final String READ_GROUP_ID = "0";
private static final String PROGRAM_RECORD_ID = "0";
private static final String UNKNOWN_SAMPLE = "N/A";
private static final Log log = Log.getInstance(GeraldToSam.class);
// The following attributes define the command-line arguments
@Usage(programVersion=PROGRAM_VERSION)
public String USAGE =
getStandardUsagePreamble() +
"Read Gerald alignments for the given lane, and write in SAM format, coordinate sorted.\n";
@Option(shortName = "G", doc = "Location of Gerald files.")
public File GERALD_DIR;
@Option(shortName = "L")
public Integer LANE;
@Option(shortName = "M", doc = "Translates from Gerald alignment coordinates to genomic coordinates.")
public File SQUASHED_MAP;
@Option(shortName = "D", doc = "Input SAM or BAM file defining the names, sizes and order of the reference contig, " +
"and other reference metadata.")
public File SEQUENCE_DICT;
@Option(shortName = "O", doc = "SAM or BAM file to be written (file extension determines format).")
public File OUTPUT;
@Option(doc = "Populates SM field of read group. Use pool name when a pool is being sequenced. " +
"If any other read group fields are specified, then this is required.")
public String SAMPLE = UNKNOWN_SAMPLE;
@Option(doc = "Populates LB field of read group.")
public String LIBRARY;
@Option(doc = "Populates DS field of read group.", optional = true)
public String DESCRIPTION;
@Option(doc = "Flowcell.lane. Populates PU field of read group.")
public String RUN;
@Option(doc = "Predicted median insert size (may be different from the actual median insert size. " +
"Populates the PI field of read group.", optional = true)
public Integer PI;
@Option(doc = "Sequencing center that produced the reads. Populates CN field of read group.")
public String CN = DEFAULT_CN;
@Option(doc = "Date the run was produced. Populates the DT field of read group.")
public Date RUN_DATE;
@Option(doc = "Platform/technology used to produce the reads. Populates the PL field of read group")
public String PL = DEFAULT_PL;
@Option(shortName = "JUMPING", doc = "True if this is a jumping library")
public Boolean JUMPING_LIBRARY = Boolean.FALSE;
@Option(doc = "String to put in the PG:CL header field. If not present, the GeraldToSam command line is put there",
optional = true)
public String ALIGNMENT_COMMAND;
@Option(doc = "Write no more than this number of alignment records. Default: Write all the alignment records",
optional = true)
public Integer MAX_ALIGNMENTS;
private SAMFileWriter writer;
SAMFileHeader header;
private boolean paired;
// Command-line entry point: delegates to the CommandLineProgram framework and exits with
// its return code.
public static void main(final String[] argv) {
System.exit(new GeraldToSam().instanceMain(argv));
}
// Builds the SAM header from the command-line options, opens the output writer (SAM vs BAM
// chosen by the file extension), streams all Gerald alignments through it, and closes it.
// Returns 0 (success) per the CommandLineProgram contract.
@Override
public int doWork() {
makeHeader(clp.getArgv());
writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT);
writeAlignments();
writer.close();
return 0;
}
/**
 * If any of the read group options are specified on the command line, then SAMPLE must be
 * specified.  This is currently not doing anything because SAMPLE has a non-null default
 * value.
 *
 * @return false if there is a problem with the command line
 */
@Override
protected boolean customCommandLineValidation() {
    // True when the user supplied any read-group field, either explicitly or by overriding
    // a default (CN/PL compare against their defaults rather than null).
    final boolean readGroupOptionUsed =
            LIBRARY != null || DESCRIPTION != null || RUN != null || PI != null
            || !CN.equals(DEFAULT_CN) || RUN_DATE != null || !PL.equals(DEFAULT_PL);
    if (SAMPLE == null && readGroupOptionUsed) {
        System.err.println("SAMPLE must be specified if any read group options are used.");
        clp.usage(System.err);
        return false;
    }
    return true;
}
/**
* Create the SAMFileHeader given the cmd-line args: coordinate sort order, a program record
* carrying the alignment command line, the sequence dictionary copied from SEQUENCE_DICT,
* and (when SAMPLE is set) a single read group populated from the read-group options.
*
* @param argv the raw command-line arguments, used for the PG:CL field when
*             ALIGNMENT_COMMAND is not given
*/
private void makeHeader(final String[] argv) {
header = new SAMFileHeader();
header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_ID);
programRecord.setProgramVersion(PROGRAM_VERSION);
// PG:CL is either the explicit ALIGNMENT_COMMAND or this program's own command line.
String commandLine = ALIGNMENT_COMMAND;
if (commandLine == null) {
commandLine = StringUtil.join(" ", argv);
}
programRecord.setCommandLine(commandLine);
header.addProgramRecord(programRecord);
// Reference sequence names/sizes/order are copied from the supplied SAM/BAM's header.
final SAMFileReader sequenceDictionary = new SAMFileReader(SEQUENCE_DICT);
final SAMFileHeader sequenceDictionaryHeader = sequenceDictionary.getFileHeader();
header.setSequences(sequenceDictionaryHeader.getSequences());
// SAMPLE has a non-null default, so in practice a read group is always written.
if (SAMPLE != null) {
final SAMReadGroupRecord readGroup = new SAMReadGroupRecord(READ_GROUP_ID);
final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
readGroups.add(readGroup);
readGroup.setSample(SAMPLE);
if (LIBRARY != null) {
readGroup.setLibrary(LIBRARY);
}
setRGAttributeIfNotNull(readGroup, DESCRIPTION, "DS");
setRGAttributeIfNotNull(readGroup, RUN, "PU");
setRGAttributeIfNotNull(readGroup, PI, SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG);
setRGAttributeIfNotNull(readGroup, CN, "CN");
setRGAttributeIfNotNull(readGroup, RUN_DATE, SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG);
setRGAttributeIfNotNull(readGroup, PL, "PL");
header.setReadGroups(readGroups);
}
}
/** Sets the given read-group attribute only when a value was actually supplied. */
private void setRGAttributeIfNotNull(final SAMReadGroupRecord readGroup, final Object value, final String key) {
    if (value != null) {
        readGroup.setAttribute(key, value);
    }
}
/**
* Iterate through the Gerald output and write alignments. eland_extended.txt and export.txt are
* iterated together using PasteParser. If paired end lane, then two PasteParsers are iterated in tandem,
* so that mate info is available when a SAMRecord is created.
*/
private void writeAlignments() {
final GeraldParserFactory geraldParserFactory = new GeraldParserFactory(GERALD_DIR, LANE, SQUASHED_MAP);
paired = geraldParserFactory.isPairedRun();
// For paired lanes, end 1 drives the loop and end 2 is advanced in lock-step.
final GeraldParser firstEndIterator = geraldParserFactory.makeParser(paired ? 1: null);
GeraldParser secondEndIterator = null;
if (paired) {
secondEndIterator = geraldParserFactory.makeParser(2);
}
// Counts clusters, not records: a read pair counts once.
int numAlignmentsOrPairsWritten = 0;
while (firstEndIterator.hasNext()) {
final GeraldParser.GeraldAlignment firstEnd = firstEndIterator.next();
GeraldParser.GeraldAlignment secondEnd = null;
if (paired) {
// The end-2 files must have a record for every end-1 record.
hasNextAssert(secondEndIterator);
secondEnd = secondEndIterator.next();
}
final SAMRecord firstEndAlignment = createSAMRecordFromGerald(firstEnd);
SAMRecord secondEndAlignment = null;
if (paired) {
secondEndAlignment = createSAMRecordFromGerald(secondEnd);
// Cross-populate mate coordinates/flags, then derive pairing-dependent fields.
setMateInfo(secondEndAlignment, firstEnd);
setMateInfo(firstEndAlignment, secondEnd);
secondEndAlignment.setSecondOfPairFlag(true);
firstEndAlignment.setFirstOfPairFlag(true);
final boolean properPair = SamPairUtil.isProperPair(firstEndAlignment, secondEndAlignment, JUMPING_LIBRARY);
firstEndAlignment.setProperPairFlag(properPair);
secondEndAlignment.setProperPairFlag(properPair);
// Insert size is signed: positive on end 1, negative on end 2.
int insertSize = SamPairUtil.computeInsertSize(firstEndAlignment, secondEndAlignment);
firstEndAlignment.setInferredInsertSize(insertSize);
secondEndAlignment.setInferredInsertSize(-insertSize);
}
writer.addAlignment(firstEndAlignment);
if (secondEndAlignment != null) {
writer.addAlignment(secondEndAlignment);
}
++numAlignmentsOrPairsWritten;
if (MAX_ALIGNMENTS != null && numAlignmentsOrPairsWritten >= MAX_ALIGNMENTS) {
break;
}
if (numAlignmentsOrPairsWritten % 500000 == 0) {
log.info("Loaded " + numAlignmentsOrPairsWritten + " reads");
}
}
// Unless truncated by MAX_ALIGNMENTS, both inputs must be exhausted together.
if (MAX_ALIGNMENTS == null) {
noMoreAssert(firstEndIterator);
if (paired) {
noMoreAssert(secondEndIterator);
}
}
log.info("Done loading " + numAlignmentsOrPairsWritten + " reads");
}
/**
 * Write into the samRecord the mate info (reference, start, strand) taken from the
 * mate's gerald alignment; an unmapped mate gets the NO_ALIGNMENT sentinels and the
 * mate-unmapped flag.
 */
private void setMateInfo(final SAMRecord samRecord, final GeraldParser.GeraldAlignment mateGeraldAlignment) {
    if (mateGeraldAlignment.getPrimaryChrom() == null) {
        // Mate did not align anywhere.
        samRecord.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
        samRecord.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
        samRecord.setMateUnmappedFlag(true);
    } else {
        samRecord.setMateReferenceName(mateGeraldAlignment.getPrimaryChrom());
        samRecord.setMateAlignmentStart((int) mateGeraldAlignment.getPrimaryStart());
        samRecord.setMateNegativeStrandFlag(isNegativeStrand(mateGeraldAlignment));
    }
}
/**
 * Translate Gerald's orientation code into a strand flag.
 *
 * @param alignment gerald alignment whose orientation string is "F" or "R"
 * @return true for reverse ("R"), false for forward ("F")
 * @throws RuntimeException on any other orientation value; the offending value is
 *         included in the message so bad input can be diagnosed
 */
private boolean isNegativeStrand(final GeraldParser.GeraldAlignment alignment) {
    final String orientation = alignment.getOrientation();
    if ("F".equals(orientation)) {
        return false;
    }
    if ("R".equals(orientation)) {
        return true;
    }
    // Include the unexpected value so the failure is actionable.
    throw new RuntimeException("Strange orientation in eland_extended file: " + orientation);
}
/**
 * Build a SAMRecord from one Gerald alignment: read name (pair suffix stripped),
 * flags, bases/qualities (reverse-complemented for minus-strand reads), position,
 * a trivial all-M CIGAR, and the RG/PG attributes.
 */
private SAMRecord createSAMRecordFromGerald(final GeraldParser.GeraldAlignment alignment) {
final SAMRecord samRecord = new SAMRecord();
// Consider an alignment with a negative start (i.e. that hangs off the beginning of the contig)
// to be unmapped.
final boolean isMapped = alignment.getPrimaryChrom() != null && alignment.getPrimaryStart() >= 0;
String readName = alignment.getReadName();
// Strip the Gerald "/1" / "/2" pair suffix: both ends of a pair share one SAM read name.
if (readName.endsWith("/1") || readName.endsWith("/2")) {
readName = readName.substring(0, readName.length() - 2);
}
samRecord.setReadName(readName);
// Set all the flags
samRecord.setReadPairedFlag(paired);
// NOTE(review): "Umapped" is the method name as provided by this SAM API version.
samRecord.setReadUmappedFlag(!isMapped);
if (isMapped) {
samRecord.setReadNegativeStrandFlag(isNegativeStrand(alignment));
}
// For now we are only taking the primary alignment
samRecord.setNotPrimaryAlignmentFlag(false);
String readBases = alignment.getReadBases();
// Strand flag is only ever set for mapped reads, so unmapped reads are never reversed.
if (samRecord.getReadNegativeStrandFlag()) {
readBases = SequenceUtil.reverseComplement(readBases);
}
samRecord.setReadString(readBases);
final byte[] phredQualities = alignment.getPhredQualities();
// Qualities must be reversed in place to stay in sync with the reverse-complemented bases.
if (isMapped && samRecord.getReadNegativeStrandFlag()) {
ArrayUtil.reverseArray(phredQualities);
}
samRecord.setBaseQualities(phredQualities);
if (isMapped) {
// Dead code retained from an earlier reference-renaming scheme (23/24 -> X/Y):
/*
if ("23".equals(geraldReferenceName)) {
geraldReferenceName = "X";
} else if ("24".equals(geraldReferenceName)) {
geraldReferenceName = "Y";
}
return REFERENCE_PREFIX + geraldReferenceName;
*/
samRecord.setReferenceName(alignment.getPrimaryChrom());
samRecord.setAlignmentStart((int)alignment.getPrimaryStart());
samRecord.setMappingQuality(SAMRecord.UNKNOWN_MAPPING_QUALITY);
// CIGAR is trivial because there are no indels or clipping in Gerald
final String cigar = Integer.toString(alignment.getReadLength()) + "M";
samRecord.setCigarString(cigar);
// We've decided not to bother with this, and just load the reference
// if we want to determine mismatches.
// samRecord.setAttribute("MD", alignment.getMismatchString());
} else {
// Unmapped: use the SAM sentinel values throughout.
samRecord.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
samRecord.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
samRecord.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
samRecord.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
}
if (SAMPLE != null) {
// There is a read group (id = READ_GROUP_ID)
samRecord.setAttribute("RG", READ_GROUP_ID);
}
samRecord.setAttribute("PG", PROGRAM_RECORD_ID);
return samRecord;
}
/** Fail fast when the gerald output runs out before we expect it to. */
private void hasNextAssert(final Iterator iterator) {
    if (iterator.hasNext()) {
        return;
    }
    throw new RuntimeException("gerald output file ends unexpectedly.");
}
/** Fail fast when the gerald output still has lines after we expected the end. */
private void noMoreAssert(final Iterator iterator) {
    if (!iterator.hasNext()) {
        return;
    }
    throw new RuntimeException("gerald output file has more lines than expected.");
}
}

View File

@ -1,117 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
import edu.mit.broad.sam.util.CoordMath;
/**
 * An interval on a reference (identified by "arachne index") together with the name
 * of the underlying sequence. Ordered by arachne index, then start, then end.
 * Note: sequenceName deliberately takes no part in compareTo/equals/hashCode.
 */
class SimpleMapping implements Comparable<SimpleMapping> {
    String arachneIndex;
    long startPos;
    long endPos;
    String sequenceName;

    /**
     * @throws IllegalArgumentException when endPos precedes startPos
     */
    public SimpleMapping(final String arachneIndex, final long startPos, final long endPos, final String sequenceName) {
        this.arachneIndex = arachneIndex;
        this.startPos = startPos;
        this.endPos = endPos;
        this.sequenceName = sequenceName;
        // Equal start and end (a 1-base interval) is allowed.
        if (this.endPos < this.startPos) throw new IllegalArgumentException("startPos must be less than or equal to endPos!");
    }

    public String getArachneIndex() {
        return arachneIndex;
    }

    public void setArachneIndex(final String arachneIndex) {
        this.arachneIndex = arachneIndex;
    }

    public long getStartPos() {
        return startPos;
    }

    public void setStartPos(final long startPos) {
        this.startPos = startPos;
    }

    public long getEndPos() {
        return endPos;
    }

    public void setEndPos(final long endPos) {
        this.endPos = endPos;
    }

    public String getSequenceName() {
        return sequenceName;
    }

    public void setSequenceName(final String sequenceName) {
        this.sequenceName = sequenceName;
    }

    /**
     * @return the overlap of this mapping and {@code other} (keeping this mapping's
     *         arachne index and sequence name), or null when they do not intersect
     */
    public SimpleMapping intersection(final SimpleMapping other) {
        if (this.intersects(other)) {
            return new SimpleMapping(this.getArachneIndex(),
                (this.getStartPos() >= other.getStartPos())?this.getStartPos():other.getStartPos(),
                (this.getEndPos() <= other.getEndPos())?this.getEndPos():other.getEndPos(), this.getSequenceName());
        }
        return null;
    }

    /** @return true when both mappings share an arachne index and their coordinates overlap */
    public boolean intersects(final SimpleMapping other) {
        return (this.getArachneIndex().equals(other.getArachneIndex()) &&
                CoordMath.overlaps(this.getStartPos(), this.getEndPos(), other.getStartPos(), other.getEndPos()));
    }

    public long length() {
        return CoordMath.getLength(startPos, endPos);
    }

    /**
     * Sort based on arachneIndex.compareTo, then start pos, then end pos,
     * with null objects coming lexically last.
     * (Returning -1 for null differs from the Comparable contract, which specifies
     * NullPointerException; preserved for compatibility with existing callers.)
     */
    public int compareTo(final SimpleMapping that) {
        if (that == null) return -1; // nulls last
        int result = this.getArachneIndex().compareTo(that.getArachneIndex());
        if (result == 0) {
            // Compare the longs directly. The previous code cast (a - b) to int,
            // which overflows for widely-separated positions and can report the
            // wrong ordering sign.
            if (this.getStartPos() != that.getStartPos()) {
                result = (this.getStartPos() < that.getStartPos()) ? -1 : 1;
            } else if (this.getEndPos() != that.getEndPos()) {
                result = (this.getEndPos() < that.getEndPos()) ? -1 : 1;
            }
        }
        // normalize to -1, 0, 1 (String.compareTo may return any magnitude)
        if (result > 1) result = 1;
        else if (result < -1) result = -1;
        return result;
    }

    /**
     * Equality consistent with compareTo and with hashCode below (sequenceName is
     * ignored by all three). Previously only the overload equals(SimpleMapping)
     * existed, so Object.equals identity semantics applied in collections despite
     * hashCode being overridden.
     */
    @Override
    public boolean equals(final Object o) {
        return (o instanceof SimpleMapping) && (this.compareTo((SimpleMapping) o) == 0);
    }

    public boolean equals(final SimpleMapping that) {
        return (this.compareTo(that) == 0);
    }

    public int hashCode() {
        int result;
        result = arachneIndex.hashCode();
        result = 31 * result + (int) (startPos ^ (startPos >>> 32));
        result = 31 * result + (int) (endPos ^ (endPos >>> 32));
        return result;
    }

    public String toString() {
        return getArachneIndex() + ":" + getStartPos() + "-" + getEndPos();
    }
}

View File

@ -1,58 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
/**
 * Optimized method for converting Solexa ASCII qualities into Phred scores.
 * Pre-computes all 256 possible values in the constructor in order to eliminate
 * repeated transcendental-function computation.
 */
public class SolexaQualityConverter {
    /**
     * This value is added to a Solexa quality score to make it printable ASCII.
     * Declared final: it is a constant and must never be reassigned.
     */
    private static final int SOLEXA_ADDEND = 64;

    /**
     * Mapping from ASCII value in Gerald export file to phred score.
     */
    private final byte[] phredScore = new byte[256];

    public SolexaQualityConverter() {
        // ASCII values below the addend are not legal Solexa qualities; map them to phred 0.
        for (int i = 0; i < SOLEXA_ADDEND; ++i) {
            phredScore[i] = 0;
        }
        for (int i = SOLEXA_ADDEND; i < phredScore.length; ++i) {
            phredScore[i] = decodeSolexaQualityToPhred(i);
        }
    }

    /**
     * Converts a solexa character quality into a phred numeric quality using the
     * standard Solexa-odds to Phred-probability transform:
     * phred = 10 * log10(1 + 10^((solexa - 64) / 10)), rounded to nearest.
     */
    private byte decodeSolexaQualityToPhred(final int solexaQuality) {
        return (byte) Math.round(10d * Math.log10(1d+Math.pow(10d, (solexaQuality - SOLEXA_ADDEND)/10d)));
    }

    /**
     * Convert a solexa quality ASCII character into a phred score via table lookup.
     */
    public byte solexaToPhred(final byte solexaQuality) {
        return phredScore[solexaQuality];
    }

    /**
     * @return a byte array that can be indexed by Solexa ASCII quality, with value
     * of corresponding Phred score. Elements 0-63 are invalid because Solexa qualities
     * should all be >= 64. Do not modify this array!
     */
    public byte[] getSolexaToPhredConversionTable() {
        return phredScore;
    }
}

View File

@ -1,75 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.illumina;
import edu.mit.broad.sam.util.CoordMath;
import edu.mit.broad.picard.cmdline.CommandLineUtils;
import java.util.Map;
import java.util.HashMap;
import java.io.File;
import java.io.BufferedReader;
import java.io.IOException;
/**
 * Parses a "squashed map" file (whitespace-separated lines of:
 * arachne-index, squashed-reference-index, squashed-start, length, sequence-name)
 * and converts read coordinates from the squashed reference space back to
 * per-arachne-index coordinates.
 */
public class SquashedCoordinateMap {
    // Each key is a chunk of the squashed reference; the value is the arachne index
    // that chunk came from.
    private final Map<SimpleMapping, String> geraldToArachne = new HashMap<SimpleMapping, String>();
    // Sum of all chunk lengths read from the map file.
    private long genomeSize;

    /**
     * @param squashedMapFile the map file to parse
     * @throws RuntimeException wrapping any IOException from reading the file
     */
    public SquashedCoordinateMap(final File squashedMapFile) {
        try {
            final BufferedReader in = CommandLineUtils.getReader(squashedMapFile);
            try {
                String line;
                genomeSize = 0;
                while ((line = in.readLine()) != null) {
                    final String[] fields = CommandLineUtils.SPACE_SPLITTER.split(line);
                    final String arachneIndex = fields[0].trim().intern();
                    final String squashedRefIndex = fields[1].trim().intern();
                    final long squashedStart = Long.parseLong(fields[2]);
                    final long length = Long.parseLong(fields[3]);
                    final String sequenceName = fields[4];
                    final SimpleMapping mapping = new SimpleMapping(squashedRefIndex, squashedStart,
                            CoordMath.getEnd(squashedStart, length), sequenceName);
                    geraldToArachne.put(mapping, arachneIndex);
                    genomeSize += length;
                }
            } finally {
                // Close the reader even when parsing fails part-way through,
                // so the file handle is not leaked.
                in.close();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Converts a read's mapping from Gerald's squashed-reference space to
     * arachne index + coordinates, in place.
     *
     * @throws RuntimeException when the read falls in no known chunk
     */
    public void convertToArachneCoords(final SimpleMapping read) {
        // Defensive guard: the map is built in the constructor, so this only fires
        // when the map file contained no usable lines.
        if (this.geraldToArachne == null || this.geraldToArachne.isEmpty()) {
            throw new IllegalStateException("Cannot invoke convertToArachneCoords before parseSquashedMapFile");
        }
        // Linear scan over chunks; the first chunk that overlaps the read wins.
        for (final Map.Entry<SimpleMapping,String> entry : this.geraldToArachne.entrySet()) {
            final SimpleMapping chunk = entry.getKey();
            if (chunk.intersects(read)) {
                read.setArachneIndex(entry.getValue());
                // Rebase the read's coordinates onto the start of the chunk.
                read.setStartPos( read.getStartPos() - chunk.getStartPos() );
                read.setEndPos( read.getEndPos() - chunk.getStartPos() );
                read.setSequenceName(chunk.getSequenceName());
                return;
            }
        }
        throw new RuntimeException("Could not convert read: " + read);
    }

    /** @return the sum of all chunk lengths in the squashed map */
    long getGenomeSize() {
        return genomeSize;
    }
}

View File

@ -1,82 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.importer.genotype;
import java.io.Closeable;
import java.io.File;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.sam.util.BinaryCodec;
/**
 * Reader for PLINK binary genotype (.bed) files: validates the magic number,
 * exposes the storage mode (individual-major vs snp-major), and unpacks the
 * four 2-bit genotypes stored per byte.
 *
 * @author Doug Voet
 */
public class BedFileReader implements Closeable {
    private static final int LOWEST_2_BIT_MASK = 3; // binary 11
    // 7020 == 0x1B6C, the byte-swapped form of the documented magic 0x6C1B
    // ("0110110000011011"); presumably because the codec reads little-endian —
    // TODO confirm against BinaryCodec.readShort().
    private static final short BED_MAGIC_NUMBER = 7020;
    // private static final short BED_MAGIC_NUMBER = Short.parseShort("0110110000011011", 2);
    public static final byte MODE_INDIVIDUAL_MAJOR = 0;
    public static final byte MODE_SNP_MAJOR = 1;
    public static final byte GENOTYPE_AA = 0; // binary 00
    public static final byte GENOTYPE_NO_CALL = 1; // binary 01
    public static final byte GENOTYPE_AB = 2; // binary 10
    public static final byte GENOTYPE_BB = 3; // binary 11

    private final byte mode;
    private final BinaryCodec codec;
    // The byte currently being unpacked; genotypes are consumed 2 bits at a time.
    private byte currentBlock;
    // Number of genotypes read from currentBlock so far (reset per individual/snp).
    private int genotypeCount = 0;

    /**
     * @throws PicardException when the file does not start with the bed magic number
     */
    public BedFileReader(File bedFile) {
        this.codec = new BinaryCodec(bedFile, false);
        short fileMagicNumber = this.codec.readShort();
        if (fileMagicNumber != BED_MAGIC_NUMBER) {
            this.codec.close();
            throw new PicardException("Given file [" + bedFile.getAbsolutePath() +
                    "] is not in bed file format... magic number does not match");
        }
        this.mode = codec.readByte();
    }

    /** @return MODE_INDIVIDUAL_MAJOR or MODE_SNP_MAJOR */
    public byte getMode() {
        return mode;
    }

    @Override
    public void close() {
        this.codec.close();
    }

    /** @return the next 2-bit genotype code (one of the GENOTYPE_* constants) */
    public byte nextGenotype() {
        // there are 4 genotypes per byte so get a new byte every 4 genotypes read
        if (this.genotypeCount++ % 4 == 0) {
            this.currentBlock = this.codec.readByte();
        }
        // the 2 lowest order bits of currentBlock are the next genotype, pop them off
        byte genotype = (byte) (LOWEST_2_BIT_MASK & this.currentBlock);
        // Shift the consumed bits out. Mask to 8 bits before shifting: ">>>" promotes
        // a byte to int with sign extension, so the previous "currentBlock >>>= 2"
        // smeared the sign bit into the upper bits of the block instead of filling
        // them with zeros.
        this.currentBlock = (byte) ((this.currentBlock & 0xFF) >>> 2);
        return genotype;
    }

    /**
     * Call this method when moving on to the next individual (in indiv-major mode) or next
     * snp (in snp-major mode). Any genotypes remaining in the current byte are discarded.
     */
    public void dropRemainingBlock() {
        this.genotypeCount = 0;
    }
}

View File

@ -1,371 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.importer.genotype;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.genotype.GeliFileWriter;
import edu.mit.broad.picard.genotype.GenotypeLikelihoods;
import edu.mit.broad.picard.genotype.GenotypeLikelihoodsCodec;
import edu.mit.broad.picard.genotype.GenotypeLikelihoods.GenotypeLikelihoodsComparator;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.util.BasicTextFileParser;
import edu.mit.broad.picard.util.Log;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.SAMSequenceRecord;
import edu.mit.broad.sam.SAMTextHeaderCodec;
import edu.mit.broad.sam.util.AsciiLineReader;
import edu.mit.broad.sam.util.SortingCollection;
/**
 * Converts a BED/BIM/FAM file trio to a number of GELI files (1 per individual).
 * BED files come in 2 formats, individual-major and snp-major. The former lists all SNPs for the
 * first individual then all SNPs for the second individual, etc. The latter list all individuals
 * for first SNP then all individuals for second SNP, etc. The order for snps is dictated by
 * the bim file and the order for individuals is dictated by the fam file.
 * <p>
 * See <a href="http://pngu.mgh.harvard.edu/~purcell/plink/binary.shtml">this page</a> for details
 * of the format.
 *
 * @author Doug Voet
 */
public class BedToGeli extends CommandLineProgram {
// Fixed likelihood recorded for every called genotype; array genotypes carry no
// per-call confidence, so one constant value is used throughout.
static final float LIKELIHOOD = 500;
private static final Log log = Log.getInstance(BedToGeli.class);
@Usage(programVersion="1.0")
public final String USAGE = "";
@Option(doc="The bed file name.", mutex="BFILE")
public File BED;
@Option(doc="The bim file name.", mutex="BFILE")
public File BIM;
@Option(doc="The fam file name.", mutex="BFILE")
public File FAM;
@Option(doc="The root file name of the bed, bim & fam files.", mutex={"BED", "BIM", "FAM"})
public String BFILE;
@Option(doc="The directory to write the output GELI files", shortName="D")
public File OUTPUT_DIR;
@Option(doc="Set to 'true' if the family name should be included in the output file names, default false",
shortName="F",
optional=true)
public Boolean USE_FAMILY = Boolean.FALSE;
@Option(doc="Name of file containing sequence dictionary to embed in new GELI files",
shortName="DICT")
public File SEQUENCE_DICTIONARY;
// SNPs in bim-file order; may contain nulls for markers with no genome position
// (see cacheSnps/getNextGenotype). Used only in individual-major mode.
private List<SNP> snpCache;
// Output geli file names in fam-file order. Used only in snp-major mode.
private List<String> geliFileNames;
// Sequence dictionary embedded in every output geli header.
private List<SAMSequenceRecord> sequenceDictionary;
// Reference name -> index within sequenceDictionary.
private Map<String, Byte> referenceIndexes;
public static void main(String[] argv) {
System.exit(new BedToGeli().instanceMain(argv));
}
@Override
protected int doWork() {
// Resolve BED/BIM/FAM from BFILE first, if the root-name form was used.
populateFileNames();
IoUtil.assertFileIsReadable(this.BED);
IoUtil.assertFileIsReadable(this.BIM);
IoUtil.assertFileIsReadable(this.FAM);
IoUtil.assertFileIsReadable(this.SEQUENCE_DICTIONARY);
IoUtil.assertDirectoryIsWritable(this.OUTPUT_DIR);
populateSequenceDictionary();
BedFileReader bedReader = new BedFileReader(this.BED);
// The bed file's own header byte decides which traversal strategy applies.
if (bedReader.getMode() == BedFileReader.MODE_INDIVIDUAL_MAJOR) {
log.debug("Detected BED file in individual-major mode");
parseIndividualMajor(bedReader);
} else {
log.debug("Detected BED file in snp-major mode");
parseSnpMajor(bedReader);
}
return 0;
}
/**
 * loads the SEQUENCE_DICTIONARY file and builds the name -> index lookup.
 */
private void populateSequenceDictionary() {
try {
final SAMFileHeader header = new SAMTextHeaderCodec().decode(new AsciiLineReader(new FileInputStream(this.SEQUENCE_DICTIONARY)), null);
this.sequenceDictionary = header.getSequences();
this.referenceIndexes = new HashMap<String, Byte>();
for (byte i = 0; i < sequenceDictionary.size(); i++) {
this.referenceIndexes.put(sequenceDictionary.get(i).getSequenceName().intern(), i);
}
} catch (FileNotFoundException e) {
throw new PicardException("Unexpected exception", e);
}
}
// Individual-major: one pass over fam; for each individual, read one genotype per
// cached SNP (bim order) and write that individual's geli file directly.
private void parseIndividualMajor(BedFileReader bedReader) {
cacheSnps();
BasicTextFileParser famReader = new BasicTextFileParser(true, this.FAM);
for (String[] famFields : famReader) {
// famFields[0] = family id, famFields[1] = individual id.
GeliFileWriter geliWriter = getGeliFileWriter(getGeliFileName(famFields[0], famFields[1]), false);
for (SNP snp : this.snpCache) {
GenotypeLikelihoods genotypeLikelihoods = constructGenotypeLikelihoods(
bedReader, snp);
// null means no-call or unplaced marker; nothing is written for those.
if (genotypeLikelihoods != null) {
geliWriter.addGenotypeLikelihoods(genotypeLikelihoods);
}
}
// Discard any unused genotypes padding out the individual's final byte.
bedReader.dropRemainingBlock();
geliWriter.close();
}
famReader.close();
}
/**
 * @return null for a no-call or when the snp has no position on the genome
 */
private char[] getNextGenotype(BedFileReader bedReader, SNP snp) {
char[] genotype = new char[2];
// Read BEFORE the null-snp check: the reader's position must advance even for
// markers we cannot place.
byte genotypeCode = bedReader.nextGenotype();
if (snp == null) {
// unplaced marker... we need to read the genotype off the reader so we don't lose
// our place, but we cannot put the marker in the geli file.
return null;
}
switch (genotypeCode) {
case BedFileReader.GENOTYPE_AA:
genotype[0] = (char) snp.getAllele1();
genotype[1] = (char) snp.getAllele1();
break;
case BedFileReader.GENOTYPE_AB:
genotype[0] = (char) snp.getAllele1();
genotype[1] = (char) snp.getAllele2();
break;
case BedFileReader.GENOTYPE_BB:
genotype[0] = (char) snp.getAllele2();
genotype[1] = (char) snp.getAllele2();
break;
case BedFileReader.GENOTYPE_NO_CALL:
// don't record a genotype likelihood for a no call
return null;
default:
throw new PicardException("Unknown genotype code: " + Integer.toBinaryString(genotypeCode));
}
return genotype;
}
// Loads every bim line into snpCache, in file order. Unplaced markers are cached
// as null on purpose so the cache stays index-aligned with the bed genotypes.
private void cacheSnps() {
BasicTextFileParser bimReader = null;
try {
bimReader = new BasicTextFileParser(true, this.BIM);
this.snpCache = new LinkedList<SNP>();
for (String[] bimFields : bimReader) {
SNP snp = constructSnp(bimFields);
snpCache.add(snp);
}
} finally {
// Best-effort close; ignore failures (including bimReader never opening).
try {
bimReader.close();
} catch (Exception e) {
}
}
}
// bimFields: [0]=chromosome, [3]=position, [4]=allele1, [5]=allele2.
// Returns null for markers that cannot be placed on a reference sequence.
private SNP constructSnp(String[] bimFields) {
byte referenceIndex = getReferenceIndex(bimFields[0]);
if (referenceIndex == -1) {
return null;
}
SNP snp = new SNP(
referenceIndex,
Integer.parseInt(bimFields[3]),
bimFields[4].toUpperCase().getBytes()[0],
bimFields[5].toUpperCase().getBytes()[0]);
return snp;
}
/**
 * determines the index in the sequence dictionary for the given chromosome,
 * translating PLINK's numeric codes (23=X, 24=Y, 25=XY pseudo-autosomal -> chrX,
 * 26=MT) into "chr*" names; returns -1 for unplaced markers.
 */
private byte getReferenceIndex(String chromosome) {
final String referenceName;
int chromosomeNumber;
try {
chromosomeNumber = Integer.parseInt(chromosome);
} catch (NumberFormatException e) {
// Non-numeric codes (X/Y/MT/XY) are handled by name below.
chromosomeNumber = -1;
}
if (chromosomeNumber >= 1 && chromosomeNumber <= 22) {
referenceName = ("chr" + chromosome).intern();
} else if (chromosomeNumber == 26 || chromosome.equalsIgnoreCase("MT")) {
referenceName = "chrM";
} else if (chromosomeNumber == 23 || chromosomeNumber == 25 ||
chromosome.equalsIgnoreCase("XY") || chromosome.equalsIgnoreCase("X")) {
referenceName = "chrX";
} else if (chromosomeNumber == 24 || chromosome.equalsIgnoreCase("Y")) {
referenceName = "chrY";
} else {
// unplaced marker
return -1;
}
Byte referenceIndex = this.referenceIndexes.get(referenceName);
if (referenceIndex == null) {
throw new PicardException("Reference sequence [" + referenceName + "] not found in sequence dictionary");
}
return referenceIndex;
}
// Builds geliFileNames from the fam file, preserving fam-file order (which is the
// order individuals appear within each snp block of a snp-major bed file).
private void cacheGELIFileNames() {
BasicTextFileParser famReader = null;
try {
famReader = new BasicTextFileParser(true, this.FAM);
this.geliFileNames = new LinkedList<String>();
for (String[] fields : famReader) {
this.geliFileNames.add(getGeliFileName(fields[0], fields[1]));
}
} finally {
// Best-effort close; ignore failures (including famReader never opening).
try {
famReader.close();
} catch (Exception e) {
}
}
}
// Snp-major: genotypes arrive grouped by SNP, but output is grouped by individual,
// so likelihoods are buffered in one SortingCollection per output file and the
// files are written at the end.
private void parseSnpMajor(BedFileReader bedReader) {
cacheGELIFileNames();
BasicTextFileParser bimReader = new BasicTextFileParser(true, this.BIM);
Map<String, SortingCollection<GenotypeLikelihoods>> likelihoodsByFile =
new HashMap<String, SortingCollection<GenotypeLikelihoods>>(
(int) Math.ceil(this.geliFileNames.size() * 1.34));
// Split the in-memory budget evenly across the per-individual collections.
int maxRecordsInRam = calculateMaxRecordsInRam();
for (String geliFileName : this.geliFileNames) {
likelihoodsByFile.put(geliFileName, SortingCollection.newInstance(
GenotypeLikelihoods.class,
new GenotypeLikelihoodsCodec(),
new GenotypeLikelihoodsComparator(),
maxRecordsInRam));
}
for (String[] bimFields : bimReader) {
// Inner loop order matches the per-snp individual order in the bed file.
for (String fileName : this.geliFileNames) {
SNP snp = constructSnp(bimFields);
GenotypeLikelihoods genotypeLikelihoods = constructGenotypeLikelihoods(
bedReader, snp);
if (genotypeLikelihoods != null) {
likelihoodsByFile.get(fileName).add(genotypeLikelihoods);
}
}
// Discard any unused genotypes padding out this snp's final byte.
bedReader.dropRemainingBlock();
}
bimReader.close();
writeGeliFiles(likelihoodsByFile);
}
/**
 * @return the per-individual record budget: ~80% of max heap divided by the
 *         estimated record size, split evenly across all output files
 */
private int calculateMaxRecordsInRam() {
Runtime.getRuntime().gc();
double memoryToUse = Runtime.getRuntime().maxMemory() * .8; // use up to 80%
int objectCountLimit = (int) (memoryToUse / GenotypeLikelihoods.OBJECT_SIZE_BYTES);
return objectCountLimit / this.geliFileNames.size();
}
/**
 * Drains every buffered (sorted) collection into its geli file.
 *
 * @param likelihoodsByFile buffered likelihoods keyed by output file name
 */
private void writeGeliFiles(
Map<String, SortingCollection<GenotypeLikelihoods>> likelihoodsByFile) {
for (Map.Entry<String, SortingCollection<GenotypeLikelihoods>> entry : likelihoodsByFile.entrySet()) {
// presorted=true: the SortingCollection already yields records in order.
GeliFileWriter fileWriter = getGeliFileWriter(entry.getKey(), true);
for (GenotypeLikelihoods likelihoods : entry.getValue()) {
fileWriter.addGenotypeLikelihoods(likelihoods);
}
fileWriter.close();
}
}
// Opens a geli writer in OUTPUT_DIR with a fresh header carrying the shared
// sequence dictionary.
private GeliFileWriter getGeliFileWriter(
String fileName, boolean presorted) {
File geliFile = new File(this.OUTPUT_DIR, fileName);
GeliFileWriter fileWriter = new GeliFileWriter(geliFile, presorted);
SAMFileHeader header = new SAMFileHeader();
header.setAttribute(SAMFileHeader.VERSION_TAG, "1.0");
header.setSequences(this.sequenceDictionary);
fileWriter.setHeader(header);
return fileWriter;
}
/**
 * Reads the next genotype and wraps it as a likelihoods record.
 *
 * @param bedReader source of 2-bit genotype codes (always advanced by one)
 * @param snp the marker the genotype belongs to; may be null for unplaced markers
 * @return the likelihoods record, or null for a no-call or unplaced marker
 */
private GenotypeLikelihoods constructGenotypeLikelihoods(
BedFileReader bedReader, SNP snp) {
char[] genotype = getNextGenotype(bedReader, snp);
if (genotype == null) {
// no call or unplaced marker
return null;
}
GenotypeLikelihoods genotypeLikelihoods = new GenotypeLikelihoods();
genotypeLikelihoods.setLikelihood(
GenotypeLikelihoods.getLikelihoodIndex(genotype),
LIKELIHOOD);
genotypeLikelihoods.setReferenceIndex(snp.getReferenceIndex());
genotypeLikelihoods.setPosition(snp.getPosition());
return genotypeLikelihoods;
}
/**
 * populates bed/bim/fam if the bfile (root-name) option is used
 */
private void populateFileNames() {
if (this.BFILE != null) {
this.BED = new File(this.BFILE + ".bed");
this.BIM = new File(this.BFILE + ".bim");
this.FAM = new File(this.BFILE + ".fam");
}
}
/**
 * @return "<individual>.geli", prefixed with "<family>_" when USE_FAMILY is set
 */
private String getGeliFileName(String family, String individual) {
StringBuilder fileName = new StringBuilder(individual).append(".geli");
if (this.USE_FAMILY) {
fileName.insert(0, "_").insert(0, family);
}
return fileName.toString();
}
}

View File

@ -1,35 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.importer.genotype;
/**
 * Immutable value object describing a single SNP: which reference sequence it
 * sits on, its position, and its two allele bases.
 *
 * @author Doug Voet
 */
public class SNP {
    private final byte referenceIndex;
    private final int position;
    private final byte allele1;
    private final byte allele2;

    /**
     * @param chromosome index of the reference sequence within the sequence dictionary
     * @param position   1-based position of the SNP on that sequence
     * @param allele1    first allele base (ASCII byte)
     * @param allele2    second allele base (ASCII byte)
     */
    public SNP(byte chromosome, int position, byte allele1, byte allele2) {
        this.referenceIndex = chromosome;
        this.position = position;
        this.allele1 = allele1;
        this.allele2 = allele2;
    }

    /** @return index of the reference sequence this SNP sits on */
    public byte getReferenceIndex() {
        return referenceIndex;
    }

    /** @return position of the SNP on the reference sequence */
    public int getPosition() {
        return position;
    }

    /** @return first allele base */
    public byte getAllele1() {
        return allele1;
    }

    /** @return second allele base */
    public byte getAllele2() {
        return allele2;
    }
}

View File

@ -1,183 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.io;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import edu.mit.broad.picard.PicardException;
/**
* A class for utility methods that wrap or aggregate functionality in Java IO.
*
* @author Tim Fennell
*/
public class IoUtil {
/**
* Checks that a file is non-null, exists, is not a directory and is readable. If any
* condition is false then a runtime exception is thrown.
*
* @param file the file to check for readability
*/
public static void assertFileIsReadable(File file) {
if (file == null) {
throw new IllegalArgumentException("Cannot check readability of null file.");
} else if (!file.exists()) {
throw new PicardException("Cannot read non-existent file: " + file.getAbsolutePath());
}
else if (file.isDirectory()) {
throw new PicardException("Cannot read file because it is a directory: " + file.getAbsolutePath());
}
else if (!file.canRead()) {
throw new PicardException("File exists but is not readable: " + file.getAbsolutePath());
}
}
/**
* Checks that a file is non-null, and is either extent and writable, or non-existent but
* that the parent directory exists and is writable. If any
* condition is false then a runtime exception is thrown.
*
* @param file the file to check for writability
*/
public static void assertFileIsWritable(File file) {
if (file == null) {
throw new IllegalArgumentException("Cannot check readability of null file.");
} else if (!file.exists()) {
// If the file doesn't exist, check that it's parent directory does and is writable
File parent = file.getAbsoluteFile().getParentFile();
if (!parent.exists()) {
throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " +
"Neither file nor parent directory exist.");
}
else if (!parent.isDirectory()) {
throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " +
"File does not exist and parent is not a directory.");
}
else if (!parent.canWrite()) {
throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " +
"File does not exist and parent directory is not writable..");
}
}
else if (file.isDirectory()) {
throw new PicardException("Cannot write file because it is a directory: " + file.getAbsolutePath());
}
else if (!file.canWrite()) {
throw new PicardException("File exists but is not writable: " + file.getAbsolutePath());
}
}
/**
* Checks that a directory is non-null, extent, writable and a directory
* otherwise a runtime exception is thrown.
*
* @param dir the dir to check for writability
*/
public static void assertDirectoryIsWritable(File dir) {
if (dir == null) {
throw new IllegalArgumentException("Cannot check readability of null file.");
}
else if (!dir.exists()) {
throw new PicardException("Directory does not exist: " + dir.getAbsolutePath());
}
else if (!dir.isDirectory()) {
throw new PicardException("Cannot write to directory because it is not a directory: " + dir.getAbsolutePath());
}
else if (!dir.canWrite()) {
throw new PicardException("Directory exists but is not writable: " + dir.getAbsolutePath());
}
}
/**
* Opens a file for reading, decompressing it if necessary
*
* @param file The file to open
* @return the input stream to read from
*/
public static InputStream openFileForReading(File file) {
try {
if (file.getName().endsWith(".gz") ||
file.getName().endsWith(".bfq") ||
file.getName().endsWith(".map")) {
return new GZIPInputStream(new FileInputStream(file));
}
//TODO: Other compression formats
else {
return new FileInputStream(file);
}
}
catch (IOException ioe) {
throw new PicardException("File not found: " + file.getName(), ioe);
}
}
/**
 * Opens a file for writing, overwriting the file if it already exists.
 *
 * @param file the file to write to
 * @return the output stream to write to
 */
public static OutputStream openFileForWriting(File file) {
    // Delegate with append=false: any existing file is truncated.
    return openFileForWriting(file, false);
}
/**
 * Opens a file for writing, compressing on the fly when the file name
 * indicates a compressed format.
 *
 * @param file the file to write to
 * @param append whether to append to the file if it already exists (we overwrite it if false)
 * @return the output stream to write to
 */
public static OutputStream openFileForWriting(File file, boolean append) {
    final String name = file.getName();
    final boolean compress =
            name.endsWith(".gz") || name.endsWith(".bfq") || name.endsWith(".map");
    try {
        final OutputStream raw = new FileOutputStream(file, append);
        //TODO: Other compression formats
        return compress ? new GZIPOutputStream(raw) : raw;
    }
    catch (IOException ioe) {
        throw new PicardException("Error opening file for writing: " + file.getName(), ioe);
    }
}
/**
 * Utility method to copy the contents of input to output. The caller is responsible for
 * opening and closing both streams.
 *
 * @param input contents to be copied
 * @param output destination
 * @throws PicardException if an I/O error occurs while copying
 */
public static void copyStream(InputStream input, OutputStream output) {
    try {
        // 8KB buffer rather than the previous 1KB: fewer read/write round trips
        // for typical file-sized copies, at negligible memory cost.
        final byte[] buffer = new byte[8192];
        int bytesRead;
        // read() returns -1 at end of stream (never 0 for a non-empty buffer),
        // so "> 0" terminates the loop exactly at EOF.
        while ((bytesRead = input.read(buffer)) > 0) {
            output.write(buffer, 0, bytesRead);
        }
    } catch (IOException e) {
        throw new PicardException("Exception copying stream", e);
    }
}
}

View File

@ -1,50 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.metrics;
import edu.mit.broad.sam.SAMRecord;
/**
 * A MetricCollector that fans every call out to a list of subordinate collectors,
 * allowing several collectors to be driven by a single pass over the records.
 * {@link #getMetrics()} returns the metrics object of the first collector supplied.
 */
public class AggregateMetricCollector<T extends MetricBase> implements MetricCollector<T> {
    private final MetricCollector<T>[] collectors;

    /**
     * Constructs an aggregate over one or more collectors.
     *
     * @param collectors the collectors to delegate to, in order
     * @throws IllegalArgumentException if no collectors are supplied
     */
    public AggregateMetricCollector(MetricCollector<T>... collectors) {
        if (collectors.length == 0) {
            throw new IllegalArgumentException("Must supply at least one collector.");
        }
        // Defensive copy: the varargs array is caller-visible, so storing it
        // directly would let later mutation of that array change our behavior.
        this.collectors = collectors.clone();
    }

    /** Passes the record to every subordinate collector, in construction order. */
    @Override
    public void addRecord(SAMRecord record) {
        for (MetricCollector<T> collector : this.collectors) {
            collector.addRecord(record);
        }
    }

    /** Signals completion to every subordinate collector. */
    @Override
    public void onComplete() {
        for (MetricCollector<T> collector : this.collectors) {
            collector.onComplete();
        }
    }

    /** Installs the same metrics object into every subordinate collector. */
    @Override
    public void setMetrics(T metrics) {
        for (MetricCollector<T> collector : this.collectors) {
            collector.setMetrics(metrics);
        }
    }

    /** Returns the metrics from the first collector only. */
    @Override
    public T getMetrics() {
        return this.collectors[0].getMetrics();
    }
}

View File

@ -1,17 +0,0 @@
package edu.mit.broad.picard.metrics;
/**
 * A header for a metrics file. A header simply consists of a type and some arbitrary
 * data, but must be able to turn itself into a String and parse its data back out
 * of that String at a later date.
 *
 * Implementations must ensure that toString() and parse() round-trip: calling
 * parse() on the output of toString() should reconstruct an equal header.
 *
 * @author Tim Fennell
 */
public interface Header {
    /** Converts the header to a String for persisting to a file. */
    public String toString();

    /** Parses the data contained in the String version of the header. */
    public void parse(String in);
}

View File

@ -1,77 +0,0 @@
package edu.mit.broad.picard.metrics;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.util.FormatUtil;
import java.lang.reflect.Field;
/**
 * A base class from which all Metric classes should inherit.
 * Equality, hashing and string rendering are all driven reflectively
 * off the public fields of the concrete subclass.
 *
 * @author Tim Fennell
 */
public class MetricBase {
    /**
     * An equals method that checks equality by asserting that the classes are of the exact
     * same type and that all public fields are equal.
     *
     * @param o an instance to compare to
     * @return true if they are equal, false otherwise
     */
    public boolean equals(Object o) {
        if (o == null) return false;
        if (o.getClass() != getClass()) return false;

        // Loop through all the fields and check that they are either
        // null in both objects or equal in both objects
        for (Field f : getClass().getFields()) {
            try {
                Object lhs = f.get(this);
                Object rhs = f.get(o);

                // Simplified from the original nested if/else ladder:
                // unequal when exactly one side is null, or both non-null and not equal.
                if (lhs == null ? rhs != null : !lhs.equals(rhs)) {
                    return false;
                }
            }
            catch (IllegalAccessException iae) {
                throw new PicardException("Could not read field " + f.getName() + " from a " + getClass().getSimpleName());
            }
        }

        // If we got this far all the fields are equal
        return true;
    }

    /**
     * Computes a hash code from the same public fields used by equals(), preserving
     * the equals/hashCode contract (previously equals was overridden without hashCode,
     * which breaks hash-based collections of metric beans).
     *
     * @return a hash code consistent with equals()
     */
    public int hashCode() {
        int result = 1;
        for (Field f : getClass().getFields()) {
            try {
                Object value = f.get(this);
                result = 31 * result + (value == null ? 0 : value.hashCode());
            }
            catch (IllegalAccessException iae) {
                throw new PicardException("Could not read field " + f.getName() + " from a " + getClass().getSimpleName());
            }
        }
        return result;
    }

    /**
     * Converts the metric class to a human readable string: one tab-separated
     * "name\tvalue" line per public field, values rendered via FormatUtil.
     */
    public String toString() {
        StringBuilder buffer = new StringBuilder();
        FormatUtil formatter = new FormatUtil();

        for (Field f : getClass().getFields()) {
            try {
                buffer.append(f.getName());
                buffer.append("\t");
                buffer.append(formatter.format(f.get(this)));
                buffer.append("\n");
            }
            catch (IllegalAccessException iae) {
                throw new PicardException("Could not read field " + f.getName() + " from a " + getClass().getSimpleName());
            }
        }

        return buffer.toString();
    }
}

View File

@ -1,24 +0,0 @@
package edu.mit.broad.picard.metrics;
import edu.mit.broad.sam.SAMRecord;
/**
 * Interface for objects that collect metrics about SAMRecords.
 *
 * Expected lifecycle: construct, setMetrics(), addRecord() once per record,
 * onComplete() after the last record, then getMetrics() to read results.
 */
public interface MetricCollector<T extends MetricBase> {
    /** Returns the metrics object being populated by this collector. */
    T getMetrics();

    /** Called after collector is constructed to populate the metrics object. */
    void setMetrics(T metrics);

    /**
     * Called when collection is complete. Implementations can do any calculations
     * that must wait until all records are visited at this time.
     */
    void onComplete();

    /**
     * Visitor method called to have the record considered by the collector.
     */
    void addRecord(SAMRecord record);
}

View File

@ -1,370 +0,0 @@
package edu.mit.broad.picard.metrics;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.Writer;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.util.FormatUtil;
import edu.mit.broad.picard.util.Histogram;
import edu.mit.broad.picard.util.StringUtil;
/**
 * Contains a set of metrics that can be written to a file and parsed back
 * again. The set of metrics is composed of zero or more instances of a class,
 * BEAN, that extends {@link MetricBase} (all instances must be of the same type)
 * and may optionally include a histogram of data.
 *
 * On-disk layout (see write()): header section, blank line, metrics table,
 * blank line, histogram. Section headers start with "## " and carry the class
 * name used to reflectively re-instantiate objects in read().
 *
 * @author Tim Fennell
 */
public class MetricsFile<BEAN extends MetricBase, HKEY extends Comparable> {
    public static final String MAJOR_HEADER_PREFIX = "## ";
    public static final String MINOR_HEADER_PREFIX = "# ";
    public static final String SEPARATOR = "\t";
    public static final String HISTO_HEADER = "## HISTOGRAM\t";
    public static final String METRIC_HEADER = "## METRICS CLASS\t";

    private List<Header> headers = new ArrayList<Header>();
    private List<BEAN> metrics = new ArrayList<BEAN>();
    // May remain null if the file carries no histogram section.
    private Histogram<HKEY> histogram;

    /** Adds a header to the collection of metrics. */
    public void addHeader(Header h) { this.headers.add(h); }

    /** Returns the list of headers. */
    public List<Header> getHeaders() { return Collections.unmodifiableList(this.headers); }

    /** Adds a bean to the collection of metrics. */
    public void addMetric(BEAN bean) { this.metrics.add(bean); }

    /** Returns the list of metric beans (unmodifiable view). */
    public List<BEAN> getMetrics() { return Collections.unmodifiableList(this.metrics); }

    /** Returns the histogram contained in the metrics file if any. */
    public Histogram<HKEY> getHistogram() { return histogram; }

    /** Sets the histogram contained in the metrics file. */
    public void setHistogram(Histogram<HKEY> histogram) { this.histogram = histogram; }

    /** Returns the list of headers with the specified type (exact class match, not subtypes). */
    public List<Header> getHeaders(Class<? extends Header> type) {
        List<Header> tmp = new ArrayList<Header>();
        for (Header h : this.headers) {
            if (h.getClass().equals(type)) {
                tmp.add(h);
            }
        }

        return tmp;
    }

    /**
     * Writes out the metrics file to the supplied file. The file is written out
     * headers first, metrics second and histogram third.
     *
     * @param f a File into which to write the metrics
     */
    public void write(File f) {
        FileWriter w = null;
        try {
            w = new FileWriter(f);
            write(w);
        }
        catch (IOException ioe) {
            throw new PicardException("Could not write metrics to file: " + f.getAbsolutePath(), ioe);
        }
        finally {
            if (w != null) {
                try {
                    w.close();
                } catch (IOException e) {
                    // NOTE(review): close() failure is silently swallowed here, which can
                    // hide an incomplete write (buffered data lost on close). Consider
                    // at least logging this — verify no caller relies on silence.
                }
            }
        }
    }

    /**
     * Writes out the metrics file to the supplied writer. The file is written out
     * headers first, metrics second and histogram third.
     *
     * @param w a Writer into which to write the metrics
     */
    public void write(Writer w) {
        try {
            FormatUtil formatter = new FormatUtil();
            BufferedWriter out = new BufferedWriter(w);
            printHeaders(out);
            out.newLine();

            printBeanMetrics(out, formatter);
            out.newLine();

            printHistogram(out, formatter);
            out.newLine();
            out.flush();
        }
        catch (IOException ioe) {
            throw new PicardException("Could not write metrics file.", ioe);
        }
    }

    /** Prints the headers: for each one a "## ClassName" line then a "# value" line. */
    private void printHeaders(BufferedWriter out) throws IOException {
        for (Header h : this.headers) {
            out.append(MAJOR_HEADER_PREFIX);
            out.append(h.getClass().getName());
            out.newLine();
            out.append(MINOR_HEADER_PREFIX);
            out.append(h.toString());
            out.newLine();
        }
    }

    /**
     * Prints the metrics table: a "## METRICS CLASS" line, a tab-separated column
     * header row of public field names, then one tab-separated row per bean.
     */
    private void printBeanMetrics(BufferedWriter out, FormatUtil formatter) throws IOException {
        if (this.metrics.isEmpty()) {
            return;
        }

        // Write out a header row with the type of the metric class
        out.append(METRIC_HEADER + getBeanType().getName());
        out.newLine();

        // Write out the column headers
        Field[] fields = getBeanType().getFields();
        final int fieldCount = fields.length;

        for (int i=0; i<fieldCount; ++i) {
            out.append(fields[i].getName());
            if (i < fieldCount - 1) {
                out.append(MetricsFile.SEPARATOR);
            }
            else {
                out.newLine();
            }
        }

        // Write out each of the data rows; values are checked for embedded
        // tab/newline characters which would corrupt the tabular format.
        for (BEAN bean : this.metrics) {
            for (int i=0; i<fieldCount; ++i) {
                try {
                    Object value = fields[i].get(bean);
                    out.append(StringUtil.assertCharactersNotInString(formatter.format(value), '\t', '\n'));

                    if (i < fieldCount - 1) {
                        out.append(MetricsFile.SEPARATOR);
                    }
                    else {
                        out.newLine();
                    }
                }
                catch (IllegalAccessException iae) {
                    throw new PicardException("Could not read property " + fields[i].getName()
                            + " from class of type " + bean.getClass());
                }
            }
        }
        out.flush();
    }

    /** Prints the histogram, if one is present, as a "## HISTOGRAM" section with bin/value rows. */
    private void printHistogram(BufferedWriter out, FormatUtil formatter) throws IOException {
        if (this.histogram == null || this.histogram.isEmpty()) {
            return;
        }

        // Add a header for the histogram key type; the key class is taken from the
        // first key, so all keys are assumed to share a runtime class.
        out.append(HISTO_HEADER + this.histogram.keySet().iterator().next().getClass().getName());
        out.newLine();

        // Histogram cannot be null here (checked above); this guard is redundant but harmless.
        if (this.histogram != null) {
            out.append(StringUtil.assertCharactersNotInString(this.histogram.getBinLabel(), '\t', '\n'));
            out.append(SEPARATOR);
            out.append(StringUtil.assertCharactersNotInString(this.histogram.getValueLabel(), '\t', '\n'));
            out.newLine();

            for (Histogram<HKEY>.Bin bin : this.histogram.values()) {
                out.append(StringUtil.assertCharactersNotInString(formatter.format(bin.getId()), '\t', '\n'));
                out.append(MetricsFile.SEPARATOR);
                out.append(formatter.format(bin.getValue()));
                out.newLine();
            }
        }
    }

    /** Gets the type of the metrics bean being used, or null when no beans have been added. */
    private Class<?> getBeanType() {
        if (this.metrics == null || this.metrics.isEmpty()) {
            return null;
        } else {
            return this.metrics.get(0).getClass();
        }
    }

    /**
     * Reads the Metrics in from the given reader, populating headers, metrics
     * and histogram by reflectively instantiating the classes named in the
     * "## " section headers. Inverse of write(Writer).
     */
    public void read(Reader r) {
        BufferedReader in = new BufferedReader(r);
        FormatUtil formatter = new FormatUtil();
        String line = null;

        try {
            // First read the headers
            Header header = null;
            boolean inHeader = true;
            while ((line = in.readLine()) != null && inHeader) {
                line = line.trim();

                // A blank line signals the end of the headers, otherwise parse out
                // the header types and values and build the headers.
                if ("".equals(line)) {
                    inHeader = false;
                }
                else if (line.startsWith(MAJOR_HEADER_PREFIX)) {
                    if (header != null) {
                        throw new IllegalStateException("Consecutive header class lines encountered.");
                    }

                    String className = line.substring(MAJOR_HEADER_PREFIX.length()).trim();
                    try {
                        header = (Header) Class.forName(className).newInstance();
                    }
                    catch (Exception e) {
                        throw new PicardException("Error load and/or instantiating an instance of " + className, e);
                    }
                }
                else if (line.startsWith(MINOR_HEADER_PREFIX)) {
                    if (header == null) {
                        throw new IllegalStateException("Header class must precede header value:" + line);
                    }
                    header.parse(line.substring(MINOR_HEADER_PREFIX.length()));
                    this.headers.add(header);
                    header = null;
                }
                else {
                    throw new PicardException("Illegal state. Found following string in metrics file header: " + line);
                }
            }

            // Then read the metrics if there are any.
            // NOTE(review): readLine() can return null at EOF here, in which case the
            // .trim() call (and the loop condition on the next iteration) would throw
            // NullPointerException for a file with headers but no metrics section —
            // verify inputs always contain a "## " line after the headers.
            while (!line.startsWith(MAJOR_HEADER_PREFIX)) {
                line = in.readLine().trim();
            }
            if (line.startsWith(METRIC_HEADER)) {
                // Get the metric class from the header
                String className = line.split(SEPARATOR)[1];
                Class<?> type = null;
                try {
                    type = Class.forName(className);
                }
                catch (ClassNotFoundException cnfe) {
                    throw new PicardException("Could not locate class with name " + className, cnfe);
                }

                // Read the next line with the column headers
                String[] fieldNames = in.readLine().split(SEPARATOR);
                Field[] fields = new Field[fieldNames.length];
                for (int i=0; i<fieldNames.length; ++i) {
                    try {
                        fields[i] = type.getField(fieldNames[i]);
                    }
                    catch (Exception e) {
                        throw new PicardException("Could not get field with name " + fieldNames[i] +
                                " from class " + type.getName());
                    }
                }

                // Now read the values
                while ((line = in.readLine()) != null) {
                    line = line.trim();
                    if ("".equals(line)) {
                        break;
                    }
                    else {
                        String[] values = line.split(SEPARATOR);
                        BEAN bean = null;

                        // Unchecked cast: the file is trusted to name a class assignable to BEAN.
                        try { bean = (BEAN) type.newInstance(); }
                        catch (Exception e) { throw new PicardException("Error instantiating a " + type.getName(), e); }

                        for (int i=0; i<fields.length; ++i) {
                            Object value = null;
                            if (values[i] != null && values[i].length() > 0) {
                                value = formatter.parseObject(values[i], fields[i].getType());
                            }

                            try { fields[i].set(bean, value); }
                            catch (Exception e) {
                                throw new PicardException("Error setting field " + fields[i].getName() +
                                        " on class of type " + type.getName(), e);
                            }
                        }

                        this.metrics.add(bean);
                    }
                }
            }

            // Then read the histogram if it is present
            while (line != null && !line.startsWith(MAJOR_HEADER_PREFIX)) {
                line = in.readLine();
            }
            if (line != null && line.startsWith(HISTO_HEADER)) {
                // Get the key type of the histogram
                String keyClassName = line.split(SEPARATOR)[1].trim();
                Class<?> keyClass = null;

                try { keyClass = Class.forName(keyClassName); }
                catch (ClassNotFoundException cnfe) { throw new PicardException("Could not load class with name " + keyClassName); }

                // Read the next line with the bin and value labels
                String[] labels = in.readLine().split(SEPARATOR);
                this.histogram = new Histogram(labels[0], labels[1]);

                // Read the entries in the histogram
                while ((line = in.readLine()) != null && !"".equals(line)) {
                    String[] fields = line.trim().split(SEPARATOR);
                    // Unchecked cast: key class comes from the file's histogram header.
                    HKEY key = (HKEY) formatter.parseObject(fields[0], keyClass);
                    double value = formatter.parseDouble(fields[1]);
                    this.histogram.increment(key, value);
                }
            }
        }
        catch (IOException ioe) {
            throw new PicardException("Could not read metrics from reader.", ioe);
        }
    }

    /** Checks that the headers, metrics and histogram are all equal. */
    @Override
    public boolean equals(Object o) {
        // NOTE(review): no null check before getClass() — equals(null) would NPE,
        // and hashCode is not overridden alongside equals. Confirm callers never
        // compare against null or use MetricsFile as a hash key.
        if (getClass() != o.getClass()) {
            return false;
        }

        MetricsFile that = (MetricsFile) o;
        if (!this.headers.equals(that.headers)) {
            return false;
        }
        if (!this.metrics.equals(that.metrics)) {
            return false;
        }
        if (this.histogram == null && that.histogram == null) {
            return true;
        } else if (this.histogram != null) {
            return this.histogram.equals(that.histogram);
        } else if (that.histogram != null) {
            return that.histogram.equals(this.histogram);
        }
        return true;
    }
}

View File

@ -1,43 +0,0 @@
package edu.mit.broad.picard.metrics;
import edu.mit.broad.picard.util.StringUtil;
/**
 * A simple header whose data type is a single String. Should not be used for anything other
 * than comments or descriptive text.
 *
 * @author Tim Fennell
 */
public class StringHeader implements Header {
    private String value;

    /** Default constructor. */
    public StringHeader() {}

    /** Constructor that uses the supplied value as the value of the header. */
    public StringHeader(String value) {
        setValue(value);
    }

    /** Parses the persisted form of the header; leading/trailing whitespace is stripped. */
    public void parse(String in) { value = in.trim(); }

    /** Renders the header as its raw value for persisting to a file. */
    public String toString() { return value; }

    public String getValue() { return value; }

    /** Sets the value, rejecting embedded newlines which would corrupt the file format. */
    public void setValue(String value) { this.value = StringUtil.assertCharactersNotInString(value, '\n'); }

    /** Checks equality on the value of the header. */
    public boolean equals(Object o) {
        // instanceof is false for null, so the previous explicit null check was redundant.
        if (o instanceof StringHeader) {
            StringHeader that = (StringHeader) o;
            return this.value == null ? that.value == null : this.value.equals(that.value);
        }
        return false;
    }

    /** Hash code consistent with equals(); previously hashCode was not overridden. */
    public int hashCode() {
        return value == null ? 0 : value.hashCode();
    }
}

View File

@ -1,50 +0,0 @@
package edu.mit.broad.picard.metrics;
import edu.mit.broad.picard.util.StringUtil;
/**
 * Header that stores information about the version of some piece of software or
 * data used to create the metrics file. Payload consists of a name or description
 * of the versioned item and a version string, tab-separated.
 *
 * @author Tim Fennell
 */
public class VersionHeader implements Header {
    private String versionedItem;
    private String versionString;

    /** Parses the tab-separated "item\tversion" persisted form. */
    public void parse(String in) {
        String[] fields = in.split("\t");
        this.versionedItem = fields[0];
        this.versionString = fields[1];
    }

    /** Renders the header as "item\tversion" for persisting to a file. */
    public String toString() {
        return this.versionedItem + "\t" + this.versionString;
    }

    public String getVersionedItem() { return versionedItem; }

    /** Sets the item name, rejecting tabs/newlines which would corrupt the file format. */
    public void setVersionedItem(String versionedItem) {
        this.versionedItem = StringUtil.assertCharactersNotInString(versionedItem, '\t', '\n');
    }

    public String getVersionString() { return versionString; }

    /** Sets the version string, rejecting tabs/newlines which would corrupt the file format. */
    public void setVersionString(String versionString) {
        this.versionString = StringUtil.assertCharactersNotInString(versionString, '\t', '\n');
    }

    /** Equals method that checks that both the item and version string are equal. */
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        VersionHeader that = (VersionHeader) o;

        if (versionString != null ? !versionString.equals(that.versionString) : that.versionString != null)
            return false;
        if (versionedItem != null ? !versionedItem.equals(that.versionedItem) : that.versionedItem != null)
            return false;

        return true;
    }

    /** Hash code consistent with equals(); previously hashCode was not overridden. */
    public int hashCode() {
        int result = versionedItem != null ? versionedItem.hashCode() : 0;
        result = 31 * result + (versionString != null ? versionString.hashCode() : 0);
        return result;
    }
}

View File

@ -1,148 +0,0 @@
package edu.mit.broad.picard.quality;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
import edu.mit.broad.picard.variation.DbSnpFileReader;
import edu.mit.broad.picard.util.Log;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMFileWriter;
import edu.mit.broad.sam.SAMFileWriterFactory;
import edu.mit.broad.sam.SAMRecord;
import java.io.File;
import java.io.PrintStream;
/**
 * Command line program to calibrate quality scores using alignment and dbsnp data. Calibrates
 * qualities cycle by cycle and separately for reads one and two in a pair. Bases that fall
 * within dbSNP loci are ignored otherwise the empircal mismatch rate is calculated for
 * each quality at each cycle and used to calculate the calibrated quality value.
 *
 * @author Tim Fennell
 */
public class CalibrateQualityScores extends CommandLineProgram {
    // Source of aligned reads used to train the calibration matrices.
    @Option(shortName="A", doc="A file of aligned reads in SAM or BAM format")
    public File ALIGNED_SAM;

    @Option(shortName="I", doc="A SAM or BAM file to rewrite with calibrated qualities. If omitted ALIGNED_SAM is used.", optional=true)
    public File INPUT;

    @Option(shortName="O", doc="The SAM or BAM file to write with updated qualities.")
    public File OUTPUT;

    @Option(shortName="R", doc="Reference sequence file")
    public File REFERENCE;

    @Option(shortName="SNP", doc="Binary file of dbSNP information", optional=true)
    public File DBSNP_FILE;

    @Option(shortName="TABLE", doc="A file to output the calibration table(s) to.")
    public File CALIBRATION_TABLE_OUT;

    // -1 (the default) means "no limit": process every aligned read.
    @Option(doc="Optional limit to the number of aligned reads that should be procesed", optional=true)
    public Integer READ_LIMIT = -1;

    /** Stock main method for a command line program. */
    public static void main(String[] argv) {
        System.exit(new CalibrateQualityScores().instanceMain(argv));
    }

    /**
     * Main method for the program. Checks that all input files are present and
     * readable and that the output file can be written to. Then loads up all the
     * data and calibrates the quality scores and proceeds to write an output file
     * with calibrated quality scores instead of the input quality scores.
     *
     * @return 0 on success (the CommandLineProgram exit status)
     */
    protected int doWork() {
        final Log log = Log.getInstance(getClass());

        // Some quick parameter checking
        if (INPUT == null) INPUT = ALIGNED_SAM;
        IoUtil.assertFileIsReadable(ALIGNED_SAM);
        IoUtil.assertFileIsReadable(REFERENCE);
        IoUtil.assertFileIsReadable(INPUT);
        IoUtil.assertFileIsWritable(OUTPUT);
        IoUtil.assertFileIsWritable(CALIBRATION_TABLE_OUT);

        log.info("Reading input files and calculating calibration matrices.");

        // Load things up and calculate the quality score calibrations
        SAMFileReader sam = new SAMFileReader(ALIGNED_SAM);
        ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE);
        DbSnpFileReader dbsnp = null;

        if (DBSNP_FILE != null) {
            IoUtil.assertFileIsReadable(DBSNP_FILE);
            dbsnp = new DbSnpFileReader(DBSNP_FILE);
        }

        QualityScoreCalibrator calibrator = new QualityScoreCalibrator(sam, ref, dbsnp);
        calibrator.calibrate(READ_LIMIT);

        // Dump the calibration tables
        log.info("Writing out calibration table.");
        PrintStream stream = new PrintStream(IoUtil.openFileForWriting(CALIBRATION_TABLE_OUT));
        stream.println("Read 1 Calibration Table:");
        print(stream, calibrator.getRead1Matrix().getCalibratedQualities());

        // Read-2 table is only printed for paired data (matrix is empty otherwise).
        if (!calibrator.getRead2Matrix().isEmpty()) {
            stream.println();
            stream.println("Read 2 Calibration Table:");
            print(stream, calibrator.getRead2Matrix().getCalibratedQualities());
        }

        // And then load up the input and rewrite with calibrated qualities.
        // NOTE(review): sam, stream, and in are never closed, and stream is a
        // PrintStream wrapping a possibly-buffered GZIP stream — the calibration
        // table file may be truncated if it is never flushed/closed. Verify
        // whether process exit is being relied on here.
        log.info("Writing file with calibrated qualities.");
        SAMFileReader in = new SAMFileReader(INPUT);
        SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);

        for (SAMRecord rec : in) {
            byte[] quals = rec.getBaseQualities();
            byte[] calibrated = new byte[quals.length];
            // Pick the matrix for read 1 vs read 2 of the pair; cycles are 1-based.
            QualityScoreMatrix matrix = rec.getFirstOfPairFlag() ? calibrator.getRead1Matrix() : calibrator.getRead2Matrix();

            for (int i=0; i<quals.length; ++i) {
                calibrated[i] = (byte) matrix.getCalibratedQuality(i+1, quals[i]);
            }

            rec.setBaseQualities(calibrated);
            out.addAlignment(rec);
        }

        out.close();
        return 0;
    }

    /**
     * Static helper method to dump a calibration matrix to the screen for debugging.
     * Rows are cycles (1-based), columns are input qualities; cell values are the
     * calibrated qualities.
     */
    private void print(PrintStream out, int[][] matrix) {
        int maxY = 0;
        for (int x=0; x<matrix.length; ++x) {
            if (matrix[x] != null) {
                maxY = Math.max(maxY, matrix[x].length - 1);
            }
        }

        // Print out the header row
        for (int i=0;i<=maxY; ++i) {
            out.print(i + "\t");
        }
        out.println();

        // Now print out the data cycle by cycle
        for (int cycle=1; cycle<matrix.length; ++cycle) {
            out.print(cycle + "\t");
            int[] quals = matrix[cycle];

            for (int qual=1; qual<quals.length; ++qual) {
                out.print(quals[qual] + "\t");
            }

            out.println();
        }
    }
}

View File

@ -1,155 +0,0 @@
package edu.mit.broad.picard.quality;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.AlignmentBlock;
import edu.mit.broad.sam.util.CloseableIterator;
import edu.mit.broad.picard.variation.DbSnpFileReader;
import edu.mit.broad.picard.variation.KnownVariant;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.util.CoordMath;
import edu.mit.broad.picard.util.Histogram;
import edu.mit.broad.picard.util.SequenceUtil;
import java.util.Map;
import java.util.BitSet;
import java.util.TreeMap;
/**
 * Takes a set of aligned reads with qualities and determines the empirical quality
 * score for each of the bins.
 *
 * Assumes the SAM records are sorted in the same contig order as the reference
 * sequence file, and that the dbSNP reader yields variants in contig order —
 * TODO confirm both orderings are guaranteed by the callers.
 *
 * @author Tim Fennell
 */
public class QualityScoreCalibrator {
    private final SAMFileReader sam;
    private final ReferenceSequenceFile ref;
    private final DbSnpFileReader dbsnp;   // may be null: no dbSNP masking
    private QualityScoreMatrix read1Matrix;
    private QualityScoreMatrix read2Matrix;

    /**
     * Constructs a calibrator that will read records from the specified SAMFileReader
     * and compare them the supplied reference. Optionally takes a set of known variants
     * who's positions will be excluded during calibration.
     *
     * @param sam the set of SAM records to use to calibrate qualities
     * @param ref the reference sequence against which the records were aligned
     * @param dbsnp the (optional) set of dbsnp positions to mask during calibration
     */
    public QualityScoreCalibrator(SAMFileReader sam, ReferenceSequenceFile ref, DbSnpFileReader dbsnp) {
        this.sam = sam;
        this.dbsnp = dbsnp;
        this.ref = ref;
    }

    /**
     * Calculates calibrated quality scores using at most the specified number of aligned
     * reads. If the end of the file is hit first then fewer reads will be used.
     *
     * @param readLimit the number of aligned reads to use if the file contains more;
     *                  a value &lt;= 0 means no limit
     */
    public void calibrate(final int readLimit) {
        ReferenceSequence reference = null;
        SAMFileHeader header = this.sam.getFileHeader();
        CloseableIterator<SAMRecord> samIterator = this.sam.iterator();
        SAMRecord read = samIterator.next();
        int readsProcessed = 0;

        // Quality score matrixes for reads 1 and 2 separately
        this.read1Matrix = new QualityScoreMatrix();
        this.read2Matrix = new QualityScoreMatrix();

        // Walk the reference one contig at a time, consuming the SAM records
        // aligned to that contig in lock-step.
        refloop: while ((reference = this.ref.nextSequence()) != null) {
            final byte[] refBases = reference.getBases();
            final BitSet snps = getDbSnpMask(reference);

            while (read != null && read.getReferenceIndex(header) == reference.getContigIndex()) {
                // Only primary, mapped alignments contribute observations.
                if (!read.getReadUnmappedFlag() && !read.getNotPrimaryAlignmentFlag()) {
                    final QualityScoreMatrix matrix = read.getFirstOfPairFlag() ? this.read1Matrix : this.read2Matrix;

                    final byte[] readBases = read.getReadBases();
                    final byte[] qualities = read.getBaseQualities();

                    // Compare each aligned block base-by-base against the reference.
                    for (AlignmentBlock block : read.getAlignmentBlocks()) {
                        final int readIndex = block.getReadStart() - 1;
                        final int refIndex = block.getReferenceStart() - 1;
                        final int length = block.getLength();

                        for (int i=0; i<length; ++i) {
                            // Skip dbSNP loci
                            if (snps.get(refIndex+i+1)) continue;

                            final int readBaseIndex = readIndex+i;
                            boolean match = SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex+i]);
                            // Cycle is 1-based and accounts for strand orientation.
                            int cycle = CoordMath.getCycle(
                                    read.getReadNegativeStrandFlag(), readBases.length, readBaseIndex);
                            matrix.addObservation(cycle, qualities[readBaseIndex], !match);
                        }
                    }

                    if (readLimit > 0 && ++readsProcessed >= readLimit) {
                        break refloop;
                    }
                }

                // Advance the sam iterator
                if (samIterator.hasNext()) {
                    read = samIterator.next();
                }
                else {
                    read = null;
                }
            }
        }

        this.read1Matrix.computeCalibratedQualities();
        if (!this.read2Matrix.isEmpty()) this.read2Matrix.computeCalibratedQualities();
    }

    /** Gets the calibration matrix for the first read. */
    public QualityScoreMatrix getRead1Matrix() { return read1Matrix; }

    /** Gets the calibration matrix for the second read. May be empty if there was no second read data. */
    public QualityScoreMatrix getRead2Matrix() { return read2Matrix; }

    /**
     * Returns a BitSet that denotes whether a dbSNP entry is present at each
     * base in the reference sequence. The set is reference.length() + 1 so that
     * it can be indexed by 1-based reference base. True means dbSNP present,
     * false means no dbSNP present.
     */
    private BitSet getDbSnpMask(ReferenceSequence reference) {
        int index = reference.getContigIndex();
        BitSet bits = new BitSet(reference.length() + 1);

        /* Just return an all false bit set if we don't have dbsnp data. */
        if (this.dbsnp == null) {
            return bits;
        }

        /* Read off the next contig's worth of data. Variants for earlier contigs
           are consumed and discarded; a variant for a later contig stops the scan
           and is left in the reader for the next call. */
        while (this.dbsnp.hasNext()) {
            KnownVariant variant = this.dbsnp.peek();

            if (variant.getSequenceIndex() < index) {
                this.dbsnp.next();
            }
            else if (variant.getSequenceIndex() == index) {
                variant = this.dbsnp.next();
                // Mark every 1-based position spanned by the variant.
                for (int i=variant.getStartPos(); i<=variant.getEndPos(); ++i) {
                    bits.set(i, true);
                }
            }
            else {
                break;
            }
        }

        return bits;
    }
}

View File

@ -1,133 +0,0 @@
package edu.mit.broad.picard.quality;
import edu.mit.broad.picard.util.Histogram;
import java.util.TreeMap;
import java.util.Map;
import java.util.SortedMap;
/**
* <p>Holds all the information necessary to perform quality score calibration for a single
* end/read for a lane or run of sequencing. General usage is to construct an instance
* an call {@link #addObservation(int, int, boolean)} repeatedly and when all input data
* is consumed call {@link #computeCalibratedQualities()}.</p>
*
* <p>Once this is done then {@link #getCalibratedQualities()} can be called to get a matrix
* of quality score calibrations by cycle and input quality. However it is preferred to call
* {@link #getCalibratedQuality(int, int)} which will attempt to infer the correct value in the
* case that the input quality was not observed in the training data.</p>
*
* @author Tim Fennell
*/
public class QualityScoreMatrix {
// Maps by cycle, histograms by quality
private SortedMap<Integer, Histogram<Integer>> observations = new TreeMap<Integer, Histogram<Integer>>();
private SortedMap<Integer, Histogram<Integer>> errors = new TreeMap<Integer, Histogram<Integer>>();
private int[][] calibratedQualities = null;
/**
* Adds an observation to the matrix.
* @param cycle the cycle in the read (1-based)
* @param quality the uncalibrated quality
* @param error true if the base did not match the reference, false otherwise
*/
public void addObservation(int cycle, int quality, boolean error) {
Histogram<Integer> obs = this.observations.get(cycle);
if (obs == null) {
obs = new Histogram<Integer>();
this.observations.put(cycle, obs);
}
obs.increment(quality);
if (error) {
Histogram<Integer> errs = this.errors.get(cycle);
if (errs == null) {
errs = new Histogram<Integer>();
this.errors.put(cycle, errs);
}
errs.increment(quality);
}
}
/**
* Takes the input observations so far and builds a matrix of input cycle and
* uncalibrated quality to calibrated quality value.
*/
public void computeCalibratedQualities() {
this.calibratedQualities = new int[this.observations.lastKey() + 1][];
for (int cycle=1; cycle<this.calibratedQualities.length; ++cycle) {
Histogram<Integer> obs = this.observations.get(cycle);
Histogram<Integer> err = this.errors.get(cycle);
this.calibratedQualities[cycle] = new int[obs.lastKey() + 1];
for (Integer qual : obs.keySet()) {
double o = obs.get(qual).getValue();
Histogram<Integer>.Bin errBin = err.get(qual);
double e = (errBin == null) ? 1 : errBin.getValue();
this.calibratedQualities[cycle][qual] = computePhredScore(e, o);
}
}
}
/**
* Returns the set of calibrated quality scores from the training data. The array is
* indexed first by the cycle (1-based, index 0 is empty) and then by input quality
* (again, the actualy quality, not shifted).
*
* @return an array of calibrated qualities for the read
*/
public int[][] getCalibratedQualities() {
return calibratedQualities;
}
/**
 * Accesses the calibrated quality for the given input cycle and quality. If the quality
 * is outside the range given in the training data then the upper or lower bound of
 * the calibrated qualities is used instead.
 *
 * @param cycle the input cycle (1-based)
 * @param quality the uncalibrated quality
 * @return the calibrated quality for the cycle and uncalibrated quality
 */
public final int getCalibratedQuality(int cycle, int quality) {
    final int[] quals = this.calibratedQualities[cycle];
    // TODO: proper interpolation where we don't have the right quality

    // If the requested quality is outside the calibrated range use either 1 or the
    // maximum calibrated quality based on which side we were out of range on.
    // (Explicit bounds checks replace the previous catch of IndexOutOfBoundsException,
    // which used exceptions for ordinary control flow.)
    if (quality < 0 || quality >= quals.length) {
        if (quality < 1) return 1;
        else return quals[quals.length - 1];
    }

    int retval = quals[quality];
    // If we didn't calibrate this exact quality value, search down and then up for
    // the nearest non-zero calibrated value.
    for (int i = quality; i > 0 && retval == 0; --i) {
        if (quals[i] != 0) retval = quals[i];
    }
    for (int i = quality; i < quals.length && retval == 0; ++i) {
        if (quals[i] != 0) retval = quals[i];
    }
    return retval;
}
/**
 * Returns true if no observations were made, otherwise false.
 *
 * @return true when no qualities have been recorded in the observation histograms
 */
public boolean isEmpty() {
return this.observations.isEmpty();
}
/**
 * Just does the simple phred scaling given the errors and observations:
 * Q = -10 * log10(errors / observations), rounded to the nearest int.
 * Callers in this class pass errors >= 1 (pseudo-count), so the log argument
 * is positive; assumes observations > 0 -- TODO confirm for all call sites.
 */
private int computePhredScore(double errors, double observations) {
return (int) Math.round(-10d * Math.log10(errors / observations));
}
}

View File

@ -1,137 +0,0 @@
package edu.mit.broad.picard.reference;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.sam.SAMSequenceRecord;
import edu.mit.broad.sam.SAMTextHeaderCodec;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.util.LineReader;
import edu.mit.broad.sam.util.AsciiLineReader;
import java.io.*;
import java.nio.charset.Charset;
import java.util.List;
/**
 * Implementation of ReferenceSequenceFile for reading from FASTA files.
 *
 * <p>Sequences are read lazily: each call to {@link #nextSequence()} scans forward
 * to the next FASTA header line (">name") and accumulates the base lines that
 * follow it. If a sequence dictionary file (same base name with a ".dict"
 * extension) exists next to the FASTA file it is loaded at construction time.
 *
 * @author Tim Fennell
 */
class FastaSequenceFile implements ReferenceSequenceFile {
    private static final Charset ASCII = Charset.forName("US-ASCII");

    private File file;
    private BufferedReader in;
    private List<SAMSequenceRecord> sequenceDictionary;
    private String cachedLine = null;   // single-line pushback buffer used when we overrun into the next header
    private int index = -1;             // 0-based index of the sequence most recently returned

    /** Constructs a FastaSequenceFile that reads from the specified file. */
    FastaSequenceFile(File file) {
        this.file = file;
        this.in = new BufferedReader(new InputStreamReader(IoUtil.openFileForReading(file)));

        // Try and locate the dictionary: the reference path with its extension
        // replaced by ".dict". BUG FIX: the previous code assumed the file name
        // contained ".fasta" and threw StringIndexOutOfBoundsException for other
        // supported names (e.g. ".txt", ".txt.gz" accepted by the factory); fall
        // back to stripping the final extension, or none at all.
        String dictionaryName = file.getAbsolutePath();
        int extensionIndex = dictionaryName.lastIndexOf(".fasta");
        if (extensionIndex == -1) {
            extensionIndex = dictionaryName.lastIndexOf('.');
        }
        if (extensionIndex != -1) {
            dictionaryName = dictionaryName.substring(0, extensionIndex);
        }
        dictionaryName += ".dict";
        File dictionary = new File(dictionaryName);

        if (dictionary.exists()) {
            IoUtil.assertFileIsReadable(dictionary);
            try {
                SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
                SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary);
                if (header.getSequences() != null && header.getSequences().size() > 0) {
                    this.sequenceDictionary = header.getSequences();
                }
            }
            catch (Exception e) {
                throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e);
            }
        }
    }

    /**
     * Returns the list of sequence records associated with the reference sequence if found
     * otherwise null.
     */
    public List<SAMSequenceRecord> getSequenceDictionary() {
        return this.sequenceDictionary;
    }

    /**
     * Reads and returns the next sequence in the file.
     *
     * @return the next ReferenceSequence, or null when the end of the file is reached
     */
    public ReferenceSequence nextSequence() {
        String line = null;
        String name = null;

        // Scan forward to a header line
        while ((line = readNextLine()) != null) {
            if (line.startsWith(">")) {
                name = line.substring(1).trim();
                this.index += 1;
                break;
            }
        }

        // No more!
        if (name == null) return null;

        // Read the sequence
        int basesRead = 0;
        byte[] bases = new byte[250000000]; // big enough to hold human chr1!
        while ((line = readNextLine()) != null) {
            if (line.startsWith(">")) {
                // Overran into the next sequence: push the header back for the next call.
                pushBackLine(line);
                break;
            }
            else {
                final byte[] nextBases = line.getBytes(ASCII);
                final int lineLength = nextBases.length;

                // If the array isn't big enough to hold the next chunk, resize it
                if (basesRead + lineLength > bases.length) {
                    byte[] tmp = new byte[bases.length * 2];
                    System.arraycopy(bases, 0, tmp, 0, basesRead);
                    bases = tmp;
                }

                // Now shunt the most recent bases onto the end of the array
                System.arraycopy(nextBases, 0, bases, basesRead, lineLength);
                basesRead += lineLength;
            }
        }

        // And lastly resize the array down to the right size
        if (basesRead != bases.length) {
            byte[] tmp = new byte[basesRead];
            System.arraycopy(bases, 0, tmp, 0, basesRead);
            bases = tmp;
        }

        return new ReferenceSequence(name, this.index, bases);
    }

    /**
     * Reads the next line from the file, or if we've saved a line earlier, returns that
     * instead.
     */
    private String readNextLine() {
        // If we have a cached line use it
        if (this.cachedLine != null) {
            String tmp = this.cachedLine;
            this.cachedLine = null;
            return tmp;
        }
        else {
            try { return this.in.readLine(); }
            catch (IOException ioe) {
                throw new PicardException("Error reading line from file: " + this.file.getAbsolutePath(), ioe);
            }
        }
    }

    /** Pushes a line back so that the next call to readNextLine() will return it. */
    private void pushBackLine(String line) {
        this.cachedLine = line;
    }
}

View File

@ -1,48 +0,0 @@
package edu.mit.broad.picard.reference;
/**
 * Wrapper around a reference sequence that has been read from a reference file.
 * Carries the sequence name, its 0-based position within the source file, and
 * the raw bases.
 *
 * @author Tim Fennell
 */
public class ReferenceSequence {
    private final String sequenceName;
    private final int contigPosition;
    private final byte[] baseArray;
    private final int numBases;

    /**
     * Package level constructor that creates a fully formed ReferenceSequence.
     *
     * @param name  the name of the sequence from the source file
     * @param index the zero based index of this contig in the source file
     * @param bases the bases themselves stored as one-byte characters
     */
    ReferenceSequence(String name, int index, byte[] bases) {
        this.sequenceName = name;
        this.contigPosition = index;
        this.baseArray = bases;
        this.numBases = bases.length;
    }

    /** Gets the name given to this sequence in the source file. */
    public String getName() {
        return sequenceName;
    }

    /**
     * Gets the array of bases that define this sequence. The bases can include any
     * letter and possibly include masking information in the form of lower case
     * letters. The returned array is the live internal array, NOT a copy -- do
     * not modify it!
     */
    public byte[] getBases() {
        return baseArray;
    }

    /** Gets the 0-based index of this contig in the source file from which it came. */
    public int getContigIndex() {
        return contigPosition;
    }

    /** Gets the length of this reference sequence in bases. */
    public int length() {
        return numBases;
    }

    @Override
    public String toString() {
        return "ReferenceSequence " + getName();
    }
}

View File

@ -1,29 +0,0 @@
package edu.mit.broad.picard.reference;
import edu.mit.broad.sam.SAMSequenceRecord;
import java.util.List;
/**
 * An interface for working with files of reference sequences regardless of the file format
 * being used.
 *
 * @author Tim Fennell
 */
public interface ReferenceSequenceFile {
/**
 * Must return a sequence dictionary with at least the following fields completed
 * for each sequence: name, length. Implementations may return null when no
 * dictionary is available (e.g. FastaSequenceFile when no ".dict" file exists).
 *
 * @return a list of sequence records representing the sequences in this reference file
 */
public List<SAMSequenceRecord> getSequenceDictionary();
/**
 * Retrieves the next whole sequence from the file.
 *
 * @return a ReferenceSequence or null if at the end of the file
 */
public ReferenceSequence nextSequence();
}

View File

@ -1,28 +0,0 @@
package edu.mit.broad.picard.reference;
import java.io.File;
/**
 * Factory class for creating ReferenceSequenceFile instances for reading reference
 * sequences stored in various formats.
 *
 * @author Tim Fennell
 */
public class ReferenceSequenceFileFactory {
    /** File name endings recognized as FASTA-formatted reference files. */
    private static final String[] FASTA_ENDINGS = {".fasta", "fasta.gz", ".txt", ".txt.gz"};

    /**
     * Attempts to determine the type of the reference file and return an instance
     * of ReferenceSequenceFile that is appropriate to read it.
     *
     * @param file the reference sequence file on disk
     */
    public static ReferenceSequenceFile getReferenceSequenceFile(File file) {
        final String fileName = file.getName();
        for (String ending : FASTA_ENDINGS) {
            if (fileName.endsWith(ending)) {
                return new FastaSequenceFile(file);
            }
        }
        throw new IllegalArgumentException("File is not a supported reference file type: " + file.getAbsolutePath());
    }
}

View File

@ -1,352 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.sam;
import java.io.File;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.metrics.AggregateMetricCollector;
import edu.mit.broad.picard.metrics.MetricBase;
import edu.mit.broad.picard.metrics.MetricCollector;
import edu.mit.broad.picard.metrics.MetricsFile;
import edu.mit.broad.picard.metrics.StringHeader;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
import edu.mit.broad.picard.sam.CollectAlignmentSummaryMetrics.AlignmentSummaryMetrics.Type;
import edu.mit.broad.picard.util.CoordMath;
import edu.mit.broad.picard.util.Histogram;
import edu.mit.broad.picard.util.SequenceUtil;
import edu.mit.broad.sam.AlignmentBlock;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.util.CloseableIterator;
/**
 * A command line tool to read a BAM file and produce standard alignment metrics that would be applicable to any alignment.
 * Metrics to include, but not limited to:
 * <ul>
 * <li>Total number of reads (total, period, no exclusions)</li>
 * <li>Total number of PF reads (PF == does not fail vendor check flag)</li>
 * <li>Number of PF noise reads (does not fail vendor check and has noise attr set)</li>
 * <li>Total aligned PF reads (any PF read that has a sequence and position)</li>
 * <li>High quality aligned PF reads (high quality == mapping quality >= 20)</li>
 * <li>High quality aligned PF bases (actual aligned bases, calculate off alignment blocks)</li>
 * <li>High quality aligned PF Q20 bases (subset of above where base quality >= 20)</li>
 * <li>Median mismatches in HQ aligned PF reads (how many aligned bases != ref on average)</li>
 * <li>Reads aligned in pairs (vs. reads aligned with mate unaligned/not present)</li>
 * <li>Read length (how to handle mixed lengths?)</li>
 * <li>Bad Cycles - how many machine cycles yielded combined no-call and mismatch rates of >= 80%</li>
 * <li>Strand balance - reads mapped to positive strand / total mapped reads</li>
 * </ul>
 * Metrics are written for the first read of a pair, the second read, and combined for the pair.
 *
 * @author Doug Voet
 */
public class CollectAlignmentSummaryMetrics extends CommandLineProgram {
    /** Reads with mapping quality at or above this are counted as high-quality (HQ). */
    private static final int MAPPING_QUALITY_THRESHOLD = 20;
    /** Aligned bases with base quality at or above this count toward PF_HQ_ALIGNED_Q20_BASES. */
    private static final int BASE_QUALITY_THRESHOLD = 20;

    // Usage and parameters
    @Usage(programVersion="1.0")
    public String USAGE = "Reads a SAM or BAM file and writes a file containing summary metrics.\n";
    @Option(shortName="I", doc="SAM or BAM file") public File INPUT;
    @Option(shortName="O", doc="File to write insert size metrics to") public File OUTPUT;
    @Option(shortName="R", doc="Reference sequence file") public File REFERENCE;
    @Option(doc="If true (default), \"unsorted\" SAM/BAM files will be considered coordinate sorted")
    public Boolean ASSUME_COODINATE_SORTED = Boolean.TRUE;

    private ReferenceSequenceFile ref;
    // Most recently loaded reference contig; advanced lazily as records are processed.
    private ReferenceSequence refSequence;
    private SAMFileHeader samFileHeader;

    /** Required main method implementation. */
    public static void main(String[] argv) {
        System.exit(new CollectAlignmentSummaryMetrics().instanceMain(argv));
    }

    /**
     * Validates the inputs, iterates the SAM/BAM file collecting summary metrics
     * and writes them to OUTPUT.
     *
     * @return program exit status (0 on success)
     */
    @Override
    protected int doWork() {
        IoUtil.assertFileIsReadable(INPUT);
        IoUtil.assertFileIsReadable(REFERENCE);
        IoUtil.assertFileIsWritable(OUTPUT);
        SAMFileReader in = new SAMFileReader(INPUT);
        assertCoordinateSortOrder(in);
        this.ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE);
        this.samFileHeader = in.getFileHeader();
        MetricsFile<AlignmentSummaryMetrics, Comparable<?>> file = collectMetrics(in.iterator());
        in.close();
        file.write(OUTPUT);
        return 0;
    }

    /**
     * Fails fast unless the input is coordinate sorted (or declared "unsorted" and
     * ASSUME_COODINATE_SORTED is set).
     */
    private void assertCoordinateSortOrder(SAMFileReader in) {
        switch (in.getFileHeader().getSortOrder()) {
            case coordinate:
                break;
            case unsorted:
                if (this.ASSUME_COODINATE_SORTED) {
                    break;
                }
                // intentional fall-through when not assuming coordinate order
            default:
                throw new PicardException("Cannot collect summary statistics in file " + INPUT.getAbsoluteFile() +
                        " because it is not sorted in coordinate order.");
        }
    }

    /**
     * Returns the reference contig that the given record aligns to, advancing through the
     * reference file as needed. Relies on the coordinate sort order so the reference is
     * only ever read forward.
     * NOTE(review): the loop uses getReferenceIndex(samFileHeader) while the error check
     * below uses the no-arg getReferenceIndex() -- confirm both resolve to the same index.
     */
    private ReferenceSequence getReference(SAMRecord record) {
        while (refSequence == null ||
                record.getReferenceIndex(samFileHeader) > refSequence.getContigIndex()) {
            refSequence = ref.nextSequence();
            // BUG FIX: if the reference file is exhausted, nextSequence() returns null and
            // the loop condition stays true forever; break so the error check below fires.
            if (refSequence == null) break;
        }
        if (refSequence == null || record.getReferenceIndex() != refSequence.getContigIndex()) {
            throw new PicardException("Cannot find reference sequence [" +
                    record.getReferenceIndex() + "] in reference file");
        }
        return refSequence;
    }

    /**
     * Does all the work of iterating through the sam file and collecting summary alignment metrics.
     */
    private MetricsFile<AlignmentSummaryMetrics, Comparable<?>> collectMetrics(
            CloseableIterator<SAMRecord> samIterator) {
        final MetricCollector<AlignmentSummaryMetrics> unpairedCollector =
                constructCollector(Type.UNPAIRED);
        final MetricCollector<AlignmentSummaryMetrics> firstOfPairCollector =
                constructCollector(Type.FIRST_OF_PAIR);
        final MetricCollector<AlignmentSummaryMetrics> secondOfPairCollector =
                constructCollector(Type.SECOND_OF_PAIR);
        final MetricCollector<AlignmentSummaryMetrics> pairCollector =
                constructCollector(Type.PAIR);

        // Each paired record feeds both its end-specific collector and the PAIR collector.
        while (samIterator.hasNext()) {
            SAMRecord record = samIterator.next();
            if (record.getReadPairedFlag()) {
                if (record.getFirstOfPairFlag()) {
                    firstOfPairCollector.addRecord(record);
                } else {
                    secondOfPairCollector.addRecord(record);
                }
                pairCollector.addRecord(record);
            } else {
                unpairedCollector.addRecord(record);
            }
        }

        firstOfPairCollector.onComplete();
        secondOfPairCollector.onComplete();
        pairCollector.onComplete();
        unpairedCollector.onComplete();

        MetricsFile<AlignmentSummaryMetrics, Comparable<?>> file = getMetricsFile();
        file.addHeader(new StringHeader("Input file: " + INPUT.getAbsolutePath()));
        file.addHeader(new StringHeader("Output file: " + OUTPUT.getAbsolutePath()));
        file.addHeader(new StringHeader("Reference file: " + REFERENCE.getAbsolutePath()));

        if (firstOfPairCollector.getMetrics().TOTAL_READS > 0) {
            file.addMetric(firstOfPairCollector.getMetrics());
            // override how bad cycle is determined for paired reads, it should be
            // the sum of first and second reads
            pairCollector.getMetrics().BAD_CYCLES =
                    firstOfPairCollector.getMetrics().BAD_CYCLES +
                    secondOfPairCollector.getMetrics().BAD_CYCLES;
            file.addMetric(secondOfPairCollector.getMetrics());
            file.addMetric(pairCollector.getMetrics());
        }
        if (unpairedCollector.getMetrics().TOTAL_READS > 0) {
            file.addMetric(unpairedCollector.getMetrics());
        }
        return file;
    }

    /** Creates an aggregate collector pre-configured with a fresh metrics bean of the given type. */
    private MetricCollector<AlignmentSummaryMetrics> constructCollector(Type type) {
        MetricCollector<AlignmentSummaryMetrics> collector =
                new AggregateMetricCollector<AlignmentSummaryMetrics>(new ReadCounter(), new QualityMappingCounter());
        collector.setMetrics(new AlignmentSummaryMetrics());
        collector.getMetrics().TYPE = type;
        return collector;
    }

    /** Bean holding the summary metrics written out for each read category. */
    public static class AlignmentSummaryMetrics extends MetricBase {
        public enum Type { UNPAIRED, FIRST_OF_PAIR, SECOND_OF_PAIR, PAIR }

        /** Which category of reads these metrics describe. */
        public Type TYPE;
        public long TOTAL_READS;
        public long PF_READS;
        public long PF_NOISE_READS;
        public long PF_READS_ALIGNED;
        public long PF_HQ_ALIGNED_READS;
        public long PF_HQ_ALIGNED_BASES;
        public long PF_HQ_ALIGNED_Q20_BASES;
        public double PF_HQ_MEDIAN_MISMATCHES;
        public double MEAN_READ_LENGTH;
        public long READS_ALIGNED_IN_PAIRS;
        public long BAD_CYCLES;
        public double STRAND_BALANCE;
    }

    /** counts reads that match various conditions */
    private class ReadCounter implements MetricCollector<AlignmentSummaryMetrics> {
        private long numPositiveStrand = 0;
        private Histogram<Integer> readLengthHistogram = new Histogram<Integer>();
        private AlignmentSummaryMetrics metrics;

        @Override
        public void addRecord(SAMRecord record) {
            if (record.getNotPrimaryAlignmentFlag()) {
                // only want 1 count per read so skip non primary alignments
                return;
            }
            metrics.TOTAL_READS++;
            readLengthHistogram.increment(record.getReadBases().length);
            if (!record.getReadFailsVendorQualityCheckFlag()) {
                metrics.PF_READS++;
                if (isNoiseRead(record)) {
                    metrics.PF_NOISE_READS++;
                }
                if (!record.getReadUnmappedFlag()) {
                    metrics.PF_READS_ALIGNED++;
                }
            }
            if (!record.getReadUnmappedFlag() &&
                    record.getReadPairedFlag() &&
                    !record.getMateUnmappedFlag()) {
                metrics.READS_ALIGNED_IN_PAIRS++;
            }
            if (!record.getReadNegativeStrandFlag()) {
                numPositiveStrand++;
            }
        }

        @Override
        public void onComplete() {
            metrics.MEAN_READ_LENGTH = readLengthHistogram.getMean();
            metrics.STRAND_BALANCE = numPositiveStrand / (double) metrics.TOTAL_READS;
        }

        /** A noise read is flagged via the XN attribute with value 1. */
        private boolean isNoiseRead(SAMRecord record) {
            // NOTE(review): ReservedTagConstants is not in this file's import list; confirm it resolves.
            final Object noiseAttribute = record.getAttribute(ReservedTagConstants.XN);
            return (noiseAttribute != null && noiseAttribute.equals(1));
        }

        @Override
        public void setMetrics(AlignmentSummaryMetrics metrics) {
            this.metrics = metrics;
        }

        @Override
        public AlignmentSummaryMetrics getMetrics() {
            return this.metrics;
        }
    }

    /** counts quality mappings & base calls that match various conditions */
    private class QualityMappingCounter implements MetricCollector<AlignmentSummaryMetrics> {
        // mismatch counts per HQ aligned read; the median becomes PF_HQ_MEDIAN_MISMATCHES
        private Histogram<Long> mismatchHistogram = new Histogram<Long>();
        // per-cycle counts of no-calls/mismatches, used to find bad cycles
        private Histogram<Integer> badCycleHistogram = new Histogram<Integer>();
        private AlignmentSummaryMetrics metrics;

        @Override
        public void addRecord(SAMRecord record) {
            if (record.getNotPrimaryAlignmentFlag()) {
                return;
            }
            if (record.getReadUnmappedFlag()) {
                // Unmapped reads still contribute their no-calls to the bad-cycle histogram.
                final byte[] readBases = record.getReadBases();
                for (int i = 0; i < readBases.length; i++) {
                    if (SequenceUtil.isNoCall(readBases[i])) {
                        badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i));
                    }
                }
            } else {
                boolean highQualityMapping = isHighQualityMapping(record);
                if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++;

                final byte[] readBases = record.getReadBases();
                final byte[] refBases = getReference(record).getBases();
                final byte[] qualities = record.getBaseQualities();
                long mismatchCount = 0;

                for (AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) {
                    final int readIndex = alignmentBlock.getReadStart() - 1;
                    final int refIndex = alignmentBlock.getReferenceStart() - 1;
                    final int length = alignmentBlock.getLength();
                    if (highQualityMapping) metrics.PF_HQ_ALIGNED_BASES += alignmentBlock.getLength();

                    for (int i=0; i<length; ++i) {
                        final int readBaseIndex = readIndex + i;
                        boolean mismatch = !SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex+i]);
                        if (highQualityMapping) {
                            if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD) {
                                metrics.PF_HQ_ALIGNED_Q20_BASES++;
                            }
                            if (mismatch) {
                                mismatchCount++;
                            }
                        }
                        if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) {
                            // BUG FIX: the cycle must be computed from the read-relative
                            // index (readBaseIndex), not the block-relative offset i --
                            // the unmapped branch above already uses the read index.
                            badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, readBaseIndex));
                        }
                    }
                }
                mismatchHistogram.increment(mismatchCount);
            }
        }

        /** HQ == passes the vendor check and has mapping quality at or above the threshold. */
        private boolean isHighQualityMapping(SAMRecord record) {
            return !record.getReadFailsVendorQualityCheckFlag() &&
                    record.getMappingQuality() >= MAPPING_QUALITY_THRESHOLD;
        }

        @Override
        public void onComplete() {
            metrics.PF_HQ_MEDIAN_MISMATCHES = mismatchHistogram.getMedian();
            // A cycle is "bad" when >= 80% of all reads had a mismatch or no-call at that cycle.
            metrics.BAD_CYCLES = 0;
            for (Histogram<Integer>.Bin cycleBin : badCycleHistogram.values()) {
                double badCyclePercentage = cycleBin.getValue() / metrics.TOTAL_READS;
                if (badCyclePercentage >= .8) {
                    metrics.BAD_CYCLES++;
                }
            }
        }

        @Override
        public void setMetrics(AlignmentSummaryMetrics metrics) {
            this.metrics = metrics;
        }

        @Override
        public AlignmentSummaryMetrics getMetrics() {
            return this.metrics;
        }
    }
}

View File

@ -1,154 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.sam;
import java.io.File;
import edu.mit.broad.picard.PicardException;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.io.IoUtil;
import edu.mit.broad.picard.metrics.MetricsFile;
import edu.mit.broad.picard.util.Histogram;
import edu.mit.broad.picard.util.Log;
import edu.mit.broad.picard.util.RExecutor;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMRecord;
import edu.mit.broad.sam.util.CloseableIterator;
/**
 * Command line program to read non-duplicate insert sizes, create a histogram
 * and report distribution statistics.
 *
 * <p>Writes a MetricsFile containing one InsertSizeMetrics record plus the raw
 * insert-size histogram, then invokes an R script to plot the histogram.
 *
 * @author Doug Voet
 */
public class CollectInsertSizeMetrics extends CommandLineProgram {
private static Log log = Log.getInstance(CollectInsertSizeMetrics.class);
private static final String HISTOGRAM_R_SCRIPT = "edu/mit/broad/picard/sam/insertSizeHistogram.R";
// Usage and parameters
@Usage(programVersion="1.0")
public String USAGE = "Reads a SAM or BAM file and writes a file containing metrics about " +
"the statistical distribution of insert size (excluding duplicates) " +
"and generates a histogram plot.\n";
@Option(shortName="I", doc="SAM or BAM file") public File INPUT;
@Option(shortName="O", doc="File to write insert size metrics to") public File OUTPUT;
@Option(shortName="H", doc="File to write insert size histogram chart to") public File HISTOGRAM_FILE;
/** Required main method implementation. */
public static void main(String[] argv) {
System.exit(new CollectInsertSizeMetrics().instanceMain(argv));
}
/**
 * Validates the input/output files, collects the metrics, writes them to OUTPUT
 * and renders the histogram chart via R.
 *
 * @return program exit status (0 on success)
 */
@Override
protected int doWork() {
IoUtil.assertFileIsReadable(INPUT);
IoUtil.assertFileIsWritable(OUTPUT);
IoUtil.assertFileIsWritable(HISTOGRAM_FILE);
SAMFileReader in = new SAMFileReader(INPUT);
MetricsFile<InsertSizeMetrics, Integer> file = collectMetrics(in.iterator());
in.close();
file.write(OUTPUT);
// Only attempt to plot when at least one usable read pair was seen.
if (file.getMetrics().get(0).READ_PAIRS == 0) {
log.warn("Input file did not contain any records with insert size information.");
} else {
int rResult = RExecutor.executeFromClasspath(
HISTOGRAM_R_SCRIPT,
OUTPUT.getAbsolutePath(),
HISTOGRAM_FILE.getAbsolutePath(),
INPUT.getName());
if (rResult != 0) {
throw new PicardException("R script " + HISTOGRAM_R_SCRIPT + " failed with return code " + rResult);
}
}
return 0;
}
/**
 * Does all the work of iterating through the sam file and collecting insert size metrics.
 */
MetricsFile<InsertSizeMetrics, Integer> collectMetrics(CloseableIterator<SAMRecord> samIterator) {
Histogram<Integer> insertSizeHistogram = new Histogram<Integer>("insert_size", "count");
while (samIterator.hasNext()) {
SAMRecord record = samIterator.next();
if (skipRecord(record)) {
continue;
}
// Use the absolute value so reads on either strand contribute the same size.
int insertSize = Math.abs(record.getInferredInsertSize());
insertSizeHistogram.increment(insertSize);
}
MetricsFile<InsertSizeMetrics, Integer> file = new MetricsFile<InsertSizeMetrics, Integer>();
file.setHistogram(insertSizeHistogram);
InsertSizeMetrics metrics = new InsertSizeMetrics();
metrics.READ_PAIRS = (long) insertSizeHistogram.getCount();
metrics.MAX_INSERT_SIZE = (int) insertSizeHistogram.getMax();
metrics.MIN_INSERT_SIZE = (int) insertSizeHistogram.getMin();
metrics.MEAN_INSERT_SIZE = insertSizeHistogram.getMean();
metrics.STANDARD_DEVIATION = insertSizeHistogram.getStandardDeviation();
metrics.MEDIAN_INSERT_SIZE = insertSizeHistogram.getMedian();
// Starting at the median and walking outward one bin at a time in both directions,
// record how wide a window (in insert-size units) is needed to cover 10%, 20%, ...
// 99% of all counted read pairs.
final double total = insertSizeHistogram.getCount();
final double median = insertSizeHistogram.getMedian();
double covered = 0;
double low = median;
double high = median;
while (low >= insertSizeHistogram.getMin() || high <= insertSizeHistogram.getMax()) {
Histogram<Integer>.Bin lowBin = insertSizeHistogram.get((int) low);
if (lowBin != null) covered += lowBin.getValue();
// Avoid double-counting the median bin on the first iteration (low == high).
if (low != high) {
Histogram<Integer>.Bin highBin = insertSizeHistogram.get((int) high);
if (highBin != null) covered += highBin.getValue();
}
double percentCovered = covered / total;
int distance = (int) (high - low) + 1;
if (percentCovered >= 0.1 && metrics.WIDTH_OF_10_PERCENT == 0) metrics.WIDTH_OF_10_PERCENT = distance;
if (percentCovered >= 0.2 && metrics.WIDTH_OF_20_PERCENT == 0) metrics.WIDTH_OF_20_PERCENT = distance;
if (percentCovered >= 0.3 && metrics.WIDTH_OF_30_PERCENT == 0) metrics.WIDTH_OF_30_PERCENT = distance;
if (percentCovered >= 0.4 && metrics.WIDTH_OF_40_PERCENT == 0) metrics.WIDTH_OF_40_PERCENT = distance;
if (percentCovered >= 0.5 && metrics.WIDTH_OF_50_PERCENT == 0) metrics.WIDTH_OF_50_PERCENT = distance;
if (percentCovered >= 0.6 && metrics.WIDTH_OF_60_PERCENT == 0) metrics.WIDTH_OF_60_PERCENT = distance;
if (percentCovered >= 0.7 && metrics.WIDTH_OF_70_PERCENT == 0) metrics.WIDTH_OF_70_PERCENT = distance;
if (percentCovered >= 0.8 && metrics.WIDTH_OF_80_PERCENT == 0) metrics.WIDTH_OF_80_PERCENT = distance;
if (percentCovered >= 0.9 && metrics.WIDTH_OF_90_PERCENT == 0) metrics.WIDTH_OF_90_PERCENT = distance;
if (percentCovered >= 0.99 && metrics.WIDTH_OF_99_PERCENT == 0) metrics.WIDTH_OF_99_PERCENT = distance;
--low;
++high;
}
file.addMetric(metrics);
return file;
}
/**
 * Figures out whether or not the record should be included in the counting of insert sizes.
 * Skips unpaired reads, reads with unmapped mates, first-of-pair reads (so each pair is
 * counted exactly once via its second end), secondary alignments, duplicates, and records
 * with no inferred insert size.
 */
private boolean skipRecord(SAMRecord record) {
return !record.getReadPairedFlag() ||
record.getMateUnmappedFlag() ||
record.getFirstOfPairFlag() ||
record.getNotPrimaryAlignmentFlag() ||
record.getDuplicateReadFlag() ||
record.getInferredInsertSize() == 0;
}
}

View File

@ -1,64 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright Jan 22, 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.sam;
import edu.mit.broad.picard.util.PeekableIterator;
import edu.mit.broad.sam.SAMFileReader;
import edu.mit.broad.sam.SAMRecord;
import java.util.Comparator;
/**
 * Iterator for SAM records that implements comparable to enable sorting of iterators.
 * The comparison is performed by comparing the next record in the iterator to the next
 * record in another iterator and returning the ordering between those SAM records.
 */
class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements Comparable<ComparableSamRecordIterator> {
private Comparator<SAMRecord> comparator;
private SAMFileReader reader;
/**
 * Constructs an iterator for iteration over the supplied SAM file that will be
 * able to compare itself to other ComparableSAMRecordIterator instances using
 * the supplied comparator for ordering SAMRecords.
 *
 * @param sam the SAM file to read records from
 * @param comparator the Comparator to use to provide ordering of SAMRecords
 */
public ComparableSamRecordIterator(SAMFileReader sam, Comparator<SAMRecord> comparator) {
super(sam.iterator());
this.reader = sam;
this.comparator = comparator;
}
/** Returns the reader from which this iterator was constructed. */
public SAMFileReader getReader() {
return reader;
}
/**
 * Compares this iterator to another comparable iterator based on the next record
 * available in each iterator. If the two comparable iterators have different
 * comparator types internally an exception is thrown.
 *
 * <p>NOTE(review): this ordering is inconsistent with equals() (not overridden),
 * so instances should not be stored in sorted sets/maps. NOTE(review): comparing
 * a drained iterator presumably passes null records to the comparator -- confirm
 * callers never compare exhausted iterators.
 *
 * @param that another iterator to compare to
 * @return a negative, 0 or positive number as described in the Comparator interface
 */
public int compareTo(ComparableSamRecordIterator that) {
if (this.comparator.getClass() != that.comparator.getClass()) {
throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " +
"have different orderings internally");
}
SAMRecord record = this.peek();
SAMRecord record2 = that.peek();
return comparator.compare(record, record2);
}
}

View File

@ -1,145 +0,0 @@
/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*/
package edu.mit.broad.picard.sam;
import edu.mit.broad.sam.SAMSequenceRecord;
import edu.mit.broad.sam.SAMFileWriter;
import edu.mit.broad.sam.SAMFileWriterFactory;
import edu.mit.broad.sam.SAMFileHeader;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.PicardException;
import java.util.List;
import java.util.ArrayList;
import java.io.File;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.math.BigInteger;
/**
 * Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no
 * SAMRecords, and the header contains only sequence records.
 */
public class CreateSequenceDictionary extends CommandLineProgram {
    private static final String PROGRAM_VERSION = "1.0";

    // The following attributes define the command-line arguments
    @Usage(programVersion=PROGRAM_VERSION)
    public String USAGE =
            "Usage: " + getClass().getName() + " [options]\n\n" +
            "Read fasta or fasta.gz containing reference sequences, and write as a SAM or BAM file with only sequence dictionary.\n";

    @Option(doc = "Input reference fasta or fasta.gz")
    public File REFERENCE;

    @Option(doc = "Output SAM or BAM file containing only the sequence dictionary")
    public File OUTPUT;

    @Option(doc = "Put into AS field of sequence dictionary entry if supplied", optional = true)
    public String GENOME_ASSEMBLY;

    @Option(doc = "Put into URI field of sequence dictionary entry. If not supplied, input reference file is used",
            optional = true)
    public String URI;

    @Option(doc = "Put into SP field of sequence dictionary entry", optional = true)
    public String SPECIES;

    /** Digest used to compute the M5 attribute of each sequence. Not thread-safe; reset before each use. */
    private final MessageDigest md5;

    public CreateSequenceDictionary() {
        try {
            md5 = MessageDigest.getInstance("MD5");
        } catch (NoSuchAlgorithmException e) {
            throw new PicardException("MD5 algorithm not found", e);
        }
    }

    public static void main(final String[] argv) {
        System.exit(new CreateSequenceDictionary().instanceMain(argv));
    }

    /**
     * Use reference filename to create URI to go into header if URI was not passed on cmd line.
     */
    protected boolean customCommandLineValidation() {
        if (URI == null) {
            URI = "file:" + REFERENCE.getAbsolutePath();
        }
        return true;
    }

    /**
     * Do the work after command line has been parsed.
     * RuntimeException may be thrown by this method, and are reported appropriately.
     *
     * @return program exit status.
     */
    protected int doWork() {
        final List<SAMSequenceRecord> sequences = makeSequenceDictionary(REFERENCE);
        final SAMFileHeader samHeader = new SAMFileHeader();
        samHeader.setSequences(sequences);
        final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(samHeader, false, OUTPUT);
        samWriter.close();
        return 0;
    }

    /**
     * Read all the sequences from the given reference file, and convert into SAMSequenceRecords
     * @param referenceFile fasta or fasta.gz
     * @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments.
     */
    List<SAMSequenceRecord> makeSequenceDictionary(final File referenceFile) {
        final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile);
        ReferenceSequence refSeq;
        final List<SAMSequenceRecord> ret = new ArrayList<SAMSequenceRecord>();
        while ((refSeq = refSeqFile.nextSequence()) != null) {
            ret.add(makeSequenceRecord(refSeq));
        }
        return ret;
    }

    /**
     * Create one SAMSequenceRecord from a single fasta sequence.
     * Note: upper-cases the sequence's base array in place while computing the MD5.
     */
    private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) {
        final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName());
        ret.setSequenceLength(refSeq.length());

        // Compute MD5 of upcased bases
        final byte[] bases = refSeq.getBases();
        for (int i = 0; i < bases.length; ++i) {
            bases[i] = (byte) (Character.toUpperCase(bases[i]) & 0xff);
        }

        ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases));
        if (GENOME_ASSEMBLY != null) {
            ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY);
        }
        ret.setAttribute(SAMSequenceRecord.URI_TAG, URI);
        if (SPECIES != null) {
            ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES);
        }
        return ret;
    }

    /** Returns the MD5 digest of the given bytes as a 32-character lower-case hex string. */
    private String md5Hash(final byte[] bytes) {
        md5.reset();
        md5.update(bytes);
        // BUG FIX: BigInteger.toString(16) drops leading zeros, which yielded a
        // malformed (short) MD5 string roughly 1 time in 16; zero-pad to 32 chars.
        return String.format("%032x", new BigInteger(1, md5.digest()));
    }
}

Some files were not shown because too many files have changed in this diff Show More