Clean up 3rd party dependencies.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@27 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
e4bde58353
commit
ea0a826f8f
|
|
@ -3,10 +3,10 @@
|
|||
simple build file
|
||||
</description>
|
||||
<!-- set global properties for this build -->
|
||||
<property name="src" location=""/>
|
||||
<property name="build" location="out/production/AnalysisTK"/>
|
||||
<property name="src" location="src"/>
|
||||
<property name="build" location="build"/>
|
||||
<property name="dist" location="dist"/>
|
||||
<property name="jars" location="jars/functionalj.jar"/>
|
||||
<property name="lib" location="lib"/>
|
||||
|
||||
<target name="init">
|
||||
<!-- Create the time stamp -->
|
||||
|
|
@ -18,16 +18,31 @@
|
|||
<target name="compile" depends="init"
|
||||
description="compile the source " >
|
||||
<!-- Compile the java code from ${src} into ${build} -->
|
||||
<javac srcdir="${src}" destdir="${build}" classpath="${jars}"/>
|
||||
<javac srcdir="${src}" destdir="${build}" >
|
||||
<classpath>
|
||||
<fileset dir="lib">
|
||||
<include name="*.jar" />
|
||||
</fileset>
|
||||
</classpath>
|
||||
</javac>
|
||||
</target>
|
||||
|
||||
<target name="dist" depends="compile"
|
||||
description="generate the distribution" >
|
||||
<!-- Create the distribution directory -->
|
||||
<mkdir dir="${dist}/lib"/>
|
||||
<mkdir dir="${dist}"/>
|
||||
|
||||
<!-- Put everything in ${build} into the MyProject-${DSTAMP}.jar file -->
|
||||
<jar jarfile="${dist}/AnalysisTK-${DSTAMP}.jar" basedir="${build}"/>
|
||||
<jar jarfile="${dist}/AnalysisTK.jar" basedir="${build}">
|
||||
<manifest>
|
||||
<attribute name="Class-Path" value="functionalj.jar picard.jar sam-1.0.jar" />
|
||||
<attribute name="Main-Class" value="edu.mit.broad.sting.atk.AnalysisTK" />
|
||||
</manifest>
|
||||
</jar>
|
||||
|
||||
<copy todir="${dist}">
|
||||
<fileset dir="${lib}" includes="*.jar" />
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
<target name="clean"
|
||||
|
|
|
|||
|
|
@ -1,242 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.arachne;
|
||||
|
||||
|
||||
/**
|
||||
* This class represents an arachne LookAlign alignment (or other related data structures).
|
||||
*/
|
||||
public class Alignment {
|
||||
|
||||
private static final char TAB = '\t';
|
||||
|
||||
private int mASequenceId;
|
||||
private int mASequenceLength;
|
||||
private int mAStart;
|
||||
private int mAEnd;
|
||||
private int mBSequenceId;
|
||||
private int mBSequenceLength;
|
||||
private int mBStart;
|
||||
private int mBEnd;
|
||||
private char mOrientation;
|
||||
private int[] mAlignmentBlocks;
|
||||
|
||||
|
||||
public Alignment() {
|
||||
}
|
||||
|
||||
public int getASequenceId() {
|
||||
return mASequenceId;
|
||||
}
|
||||
|
||||
public void setASequenceId(int value) {
|
||||
mASequenceId = value;
|
||||
}
|
||||
|
||||
public int getASequenceLength() {
|
||||
return mASequenceLength;
|
||||
}
|
||||
|
||||
public void setASequenceLength(int value) {
|
||||
mASequenceLength = value;
|
||||
}
|
||||
|
||||
public int getAStart() {
|
||||
return mAStart;
|
||||
}
|
||||
|
||||
public void setAStart(int value) {
|
||||
mAStart = value;
|
||||
}
|
||||
|
||||
public int getAEnd() {
|
||||
return mAEnd;
|
||||
}
|
||||
|
||||
public void setAEnd(int value) {
|
||||
mAEnd = value;
|
||||
}
|
||||
|
||||
public int getBSequenceId() {
|
||||
return mBSequenceId;
|
||||
}
|
||||
|
||||
public void setBSequenceId(int value) {
|
||||
mBSequenceId = value;
|
||||
}
|
||||
|
||||
public int getBSequenceLength() {
|
||||
return mBSequenceLength;
|
||||
}
|
||||
|
||||
public void setBSequenceLength(int value) {
|
||||
mBSequenceLength = value;
|
||||
}
|
||||
|
||||
public int getBStart() {
|
||||
return mBStart;
|
||||
}
|
||||
|
||||
public void setBStart(int value) {
|
||||
mBStart = value;
|
||||
}
|
||||
|
||||
public int getBEnd() {
|
||||
return mBEnd;
|
||||
}
|
||||
|
||||
public void setBEnd(int value) {
|
||||
mBEnd = value;
|
||||
}
|
||||
|
||||
public char getOrientation() {
|
||||
return mOrientation;
|
||||
}
|
||||
|
||||
public void setOrientation(char value) {
|
||||
mOrientation = value;
|
||||
}
|
||||
|
||||
public int[] getAlignmentBlocks() {
|
||||
return mAlignmentBlocks;
|
||||
}
|
||||
|
||||
public void setAlignmentBlocks(int[] value) {
|
||||
mAlignmentBlocks = value;
|
||||
}
|
||||
|
||||
public static Alignment parse(String text) {
|
||||
|
||||
if (text == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
String[] fields = text.trim().split("\t");
|
||||
if (fields.length == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!fields[0].equals("QUERY")) {
|
||||
throw new IllegalArgumentException("Invalid alignment: " + text);
|
||||
}
|
||||
if (fields.length < 14) {
|
||||
throw new IllegalArgumentException("Invalid alignment: " + text);
|
||||
}
|
||||
|
||||
int seqAId = parseIntField(fields[1]);
|
||||
int seqAStart = parseIntField(fields[2]);
|
||||
int seqAEnd = parseIntField(fields[3]);
|
||||
int seqALength = parseIntField(fields[4]);
|
||||
int orientation = parseIntField(fields[5]);
|
||||
int seqBId = parseIntField(fields[6]);
|
||||
int seqBStart = parseIntField(fields[7]);
|
||||
int seqBEnd = parseIntField(fields[8]);
|
||||
int seqBLength = parseIntField(fields[9]);
|
||||
int blockCount = parseIntField(fields[10]);
|
||||
|
||||
if (seqAStart < 0 || seqAEnd <= 0 || seqALength <= 0 ||
|
||||
seqAStart >= seqALength || seqAEnd > seqALength || seqAStart >= seqAEnd) {
|
||||
throw new IllegalArgumentException("Invalid alignment: " + text);
|
||||
}
|
||||
if (seqBStart < 0 || seqBEnd <= 0 || seqBLength <= 0 ||
|
||||
seqBStart >= seqBLength || seqBEnd > seqBLength || seqBStart >= seqBEnd) {
|
||||
throw new IllegalArgumentException("Invalid alignment: " + text);
|
||||
}
|
||||
if (orientation < 0 || orientation > 1) {
|
||||
throw new IllegalArgumentException("Invalid alignment: " + text);
|
||||
}
|
||||
if (fields.length != (11 + 3*blockCount)) {
|
||||
throw new IllegalArgumentException("Invalid alignment: " + text);
|
||||
}
|
||||
|
||||
int[] alignmentBlocks = new int[3*blockCount];
|
||||
for (int i = 0; i < 3*blockCount; i++) {
|
||||
alignmentBlocks[i] = parseIntField(fields[11 + i]);
|
||||
}
|
||||
|
||||
Alignment alignment = new Alignment();
|
||||
alignment.setASequenceId(seqAId);
|
||||
alignment.setASequenceLength(seqALength);
|
||||
alignment.setAStart(seqAStart+1);
|
||||
alignment.setAEnd(seqAEnd);
|
||||
alignment.setBSequenceId(seqBId);
|
||||
alignment.setBSequenceLength(seqBLength);
|
||||
alignment.setBStart(seqBStart+1);
|
||||
alignment.setBEnd(seqBEnd);
|
||||
alignment.setOrientation((orientation == 0) ? '+' : '-');
|
||||
alignment.setAlignmentBlocks(alignmentBlocks);
|
||||
return alignment;
|
||||
}
|
||||
|
||||
private static int parseIntField(String text) {
|
||||
try {
|
||||
return Integer.parseInt(text);
|
||||
} catch (NumberFormatException exc) {
|
||||
throw new IllegalArgumentException("Illegal alignment field: " + text);
|
||||
}
|
||||
}
|
||||
|
||||
public String arachneFormat() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("QUERY");
|
||||
builder.append(TAB);
|
||||
builder.append(mASequenceId);
|
||||
builder.append(TAB);
|
||||
builder.append(mAStart-1); // zero based
|
||||
builder.append(TAB);
|
||||
builder.append(mAEnd);
|
||||
builder.append(TAB);
|
||||
builder.append(mASequenceLength);
|
||||
builder.append(TAB);
|
||||
builder.append(mOrientation == '+' ? 0 : 1);
|
||||
builder.append(TAB);
|
||||
builder.append(mBSequenceId);
|
||||
builder.append(TAB);
|
||||
builder.append(mBStart-1); // zero based
|
||||
builder.append(TAB);
|
||||
builder.append(mBEnd);
|
||||
builder.append(TAB);
|
||||
builder.append(mBSequenceLength);
|
||||
builder.append(TAB);
|
||||
builder.append(mAlignmentBlocks.length / 3);
|
||||
for (int i = 0; i < mAlignmentBlocks.length; i++) {
|
||||
builder.append(TAB);
|
||||
builder.append(mAlignmentBlocks[i]);
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public String format() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("Alignment");
|
||||
builder.append(' ');
|
||||
builder.append(mASequenceId);
|
||||
builder.append(' ');
|
||||
builder.append(mAStart);
|
||||
builder.append(' ');
|
||||
builder.append(mAEnd);
|
||||
builder.append(' ');
|
||||
builder.append(mOrientation);
|
||||
builder.append(' ');
|
||||
builder.append(mBSequenceId);
|
||||
builder.append(' ');
|
||||
builder.append(mBStart);
|
||||
builder.append(' ');
|
||||
builder.append(mBEnd);
|
||||
builder.append(' ');
|
||||
builder.append(mAlignmentBlocks.length / 3);
|
||||
for (int i = 0; i < mAlignmentBlocks.length; i++) {
|
||||
builder.append(' ');
|
||||
builder.append(mAlignmentBlocks[i]);
|
||||
}
|
||||
return builder.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,132 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.arachne;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Utility to convert fastb to fasta files.
|
||||
* More importantly, can be used to extract a subset of the reads.
|
||||
*/
|
||||
public class Fastb2Fasta {
|
||||
|
||||
private boolean mVerbose = false;
|
||||
private boolean mDebug = false;
|
||||
private String mInputPath = null;
|
||||
private String mIdListFilePath = null;
|
||||
|
||||
|
||||
public static void main(String[] args)
|
||||
throws Exception {
|
||||
new Fastb2Fasta().run(args);
|
||||
}
|
||||
|
||||
private void usage() {
|
||||
System.out.println("Usage: Fastb2Fasta ... <fastb-file>");
|
||||
System.out.println(" -idlist <file-of-read-ids>");
|
||||
System.out.println(" -verbose");
|
||||
System.out.println(" -debug");
|
||||
}
|
||||
|
||||
private boolean parseArguments(String[] args) {
|
||||
|
||||
int argpos = 0;
|
||||
int argsleft = 0;
|
||||
|
||||
while (argpos < args.length) {
|
||||
argsleft = args.length - argpos;
|
||||
String arg = args[argpos];
|
||||
if (arg.equals("-idlist") && argsleft > 1) {
|
||||
argpos++;
|
||||
mIdListFilePath = args[argpos++];
|
||||
} else if (arg.equals("-verbose")) {
|
||||
argpos++;
|
||||
mVerbose = true;
|
||||
} else if (arg.equals("-debug")) {
|
||||
argpos++;
|
||||
mDebug = true;
|
||||
} else if (arg.startsWith("-")) {
|
||||
usage();
|
||||
return false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
argsleft = args.length - argpos;
|
||||
if (argsleft != 1) {
|
||||
usage();
|
||||
return false;
|
||||
}
|
||||
|
||||
mInputPath = args[argpos];
|
||||
return true;
|
||||
}
|
||||
|
||||
private void run(String[] args)
|
||||
throws Exception {
|
||||
|
||||
if (!parseArguments(args)) {
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
FastbReader fastbReader = new FastbReader(new File(mInputPath));
|
||||
try {
|
||||
if (mIdListFilePath != null) {
|
||||
LineNumberReader reader = new LineNumberReader(new FileReader(mIdListFilePath));
|
||||
while (true) {
|
||||
String line = reader.readLine();
|
||||
if (line == null) {
|
||||
reader.close();
|
||||
break;
|
||||
}
|
||||
Integer id = parseReadId(line);
|
||||
if (id == null) {
|
||||
continue;
|
||||
}
|
||||
if (id < 0 || id >= fastbReader.getSequenceCount()) {
|
||||
System.out.println("ERROR: Illegal sequence id: " + id);
|
||||
System.exit(1);
|
||||
}
|
||||
String sequence = fastbReader.readSequence(id);
|
||||
System.out.println(">" + id);
|
||||
System.out.println(sequence);
|
||||
}
|
||||
} else {
|
||||
int id = 0;
|
||||
while (fastbReader.hasNext()) {
|
||||
String sequence = fastbReader.next();
|
||||
System.out.println(">" + id);
|
||||
System.out.println(sequence);
|
||||
id++;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
fastbReader.close();
|
||||
}
|
||||
}
|
||||
|
||||
private Integer parseReadId(String line) {
|
||||
String text = line.trim();
|
||||
if (text.length() == 0 || text.charAt(0) == '#') {
|
||||
return null;
|
||||
}
|
||||
String token = text.split("\\s+")[0];
|
||||
Integer id = null;
|
||||
try {
|
||||
id = new Integer(token);
|
||||
} catch (NumberFormatException exc) {
|
||||
System.out.println("ERROR: Invalid sequence id: " + token);
|
||||
System.exit(1);
|
||||
}
|
||||
return id;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,220 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.arachne;
|
||||
|
||||
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
|
||||
/**
|
||||
* Reader for arachne Fastb files.
|
||||
*/
|
||||
public class FastbReader
|
||||
implements CloseableIterator<String> {
|
||||
|
||||
// Notes on fastb file format
|
||||
//
|
||||
// Fastb files contain the serialized contents of an arachne vecbasevector,
|
||||
// which is a typedef for mastervec<basevector, unsigned int>.
|
||||
// The serialization of mastervec objects starts with a 24 byte mv_file_control_block,
|
||||
// followed by N variable length segments (one for each element of the mastervec vector),
|
||||
// followed by an offset table containing N 8-byte file offsets to the N variable length
|
||||
// segments, followed by N fixed length data segments, one for each vector element.
|
||||
// Thus, reading a single element of the mastervec vector requires reading from three
|
||||
// separate places in the file (the offset table, the variable length section and the
|
||||
// fixed length section).
|
||||
//
|
||||
// The mastervec file header is 24 bytes arranged as follows:
|
||||
// n 4-byte signed(?) integer (number of entries)
|
||||
// c1 1-byte unsigned bit mask (see below)
|
||||
// reserved 1-byte unused
|
||||
// sizeX 1-byte unsigned, sizeof first template parameter (16 for fastb files)
|
||||
// sizeA 1-byte unsigned, sizeof second template parameter (4 for fastb files)
|
||||
// offsets_start 8-byte signed(?) integer, file offset of offset table
|
||||
// static_start 8-byte signed(?) integer, file offset of static data (fixed size section)
|
||||
//
|
||||
// For fastb files, the fixed size section contains 4 bytes for each object, which is the
|
||||
// unsigned(?) count of the number of bases in this entry.
|
||||
// For fastb files, the variable length section contains a bit vector with two bits per base.
|
||||
// The bases are encoded as follows: A = 0, C = 1, G = 2, T = 3.
|
||||
//
|
||||
// For fastb files, in the file header N is the number of entries in the fastb file.
|
||||
// c1 is unused/unimplemented except that the two low-order bits should be 0x01, indicating
|
||||
// that we are using the single-file representation. There is also apparently a three-file
|
||||
// representation that looks the same except that the offset table and static (fixed length)
|
||||
// table are in separate files named <basename>.offsets and <basename>.static.
|
||||
// The sizeX should be 16 for fastb files and sizeA should be 4.
|
||||
//
|
||||
// Note that in fastb files, the sequences are not identified by name or id, only by index
|
||||
// (zero based) into the mastervec object. There is no representation for bases other than
|
||||
// ACGT (i.e. Ns cannot be encoded).
|
||||
|
||||
private static final char[] BASES = { 'A', 'C', 'G', 'T' };
|
||||
|
||||
private File mFile;
|
||||
private RandomAccessFile mRandomFile;
|
||||
private int mEntryCount;
|
||||
private long mOffsetTableOffset;
|
||||
private long mLengthTableOffset;
|
||||
private int mCurrentPosition;
|
||||
private byte[] mIOBuffer = new byte[8];
|
||||
|
||||
|
||||
public FastbReader(File file)
|
||||
throws IOException {
|
||||
mFile = file;
|
||||
mRandomFile = new RandomAccessFile(mFile, "r");
|
||||
readHeader();
|
||||
}
|
||||
|
||||
public int getSequenceCount() {
|
||||
return mEntryCount;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return (mCurrentPosition < mEntryCount);
|
||||
}
|
||||
|
||||
public String next() {
|
||||
if (!hasNext()) {
|
||||
throw new IllegalStateException("Iterator exhausted");
|
||||
}
|
||||
try {
|
||||
return readSequence(mCurrentPosition);
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (mRandomFile != null) {
|
||||
mEntryCount = 0;
|
||||
mCurrentPosition = 0;
|
||||
try {
|
||||
mRandomFile.close();
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
} finally {
|
||||
mRandomFile = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String readSequence(int n)
|
||||
throws IOException {
|
||||
if (mRandomFile == null) {
|
||||
throw new IllegalStateException("Reader is closed");
|
||||
}
|
||||
if (n < 0 || n >= mEntryCount) {
|
||||
throw new IndexOutOfBoundsException("Illegal index: " + n);
|
||||
}
|
||||
long offset = getEntryOffset(n);
|
||||
int length = getEntryBaseCount(n);
|
||||
String result = readBases(offset, length);
|
||||
mCurrentPosition = n+1;
|
||||
return result;
|
||||
}
|
||||
|
||||
private void readHeader()
|
||||
throws IOException {
|
||||
|
||||
byte[] fileControlBlock = new byte[24];
|
||||
mRandomFile.readFully(fileControlBlock, 0, 24);
|
||||
|
||||
int word2 = deserializeInt(fileControlBlock, 4);
|
||||
int nFiles = word2 & 0x3;
|
||||
int sizeX = (word2 >> 16) & 0xFF;
|
||||
int sizeA = (word2 >> 24) & 0xFF;
|
||||
if (nFiles != 1) {
|
||||
throw new RuntimeException(mFile + ": Invalid file header: nFiles = " + nFiles);
|
||||
}
|
||||
if (sizeX != 16) {
|
||||
throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeX);
|
||||
}
|
||||
if (sizeA != 4) {
|
||||
throw new RuntimeException(mFile + ": Invalid file header: sizeX = " + sizeA);
|
||||
}
|
||||
mEntryCount = deserializeInt(fileControlBlock, 0);
|
||||
mOffsetTableOffset = deserializeLong(fileControlBlock, 8);
|
||||
mLengthTableOffset = deserializeLong(fileControlBlock, 16);
|
||||
}
|
||||
|
||||
private long getEntryOffset(int n)
|
||||
throws IOException {
|
||||
mRandomFile.seek(mOffsetTableOffset + 8 * n);
|
||||
mRandomFile.readFully(mIOBuffer, 0, 8);
|
||||
return deserializeLong(mIOBuffer, 0);
|
||||
}
|
||||
|
||||
private int getEntryBaseCount(int n)
|
||||
throws IOException {
|
||||
mRandomFile.seek(mLengthTableOffset + 4 * n);
|
||||
mRandomFile.readFully(mIOBuffer, 0, 4);
|
||||
return deserializeInt(mIOBuffer, 0);
|
||||
}
|
||||
|
||||
private String readBases(long fileOffset, int baseCount)
|
||||
throws IOException {
|
||||
|
||||
|
||||
int byteCount = (baseCount + 3) / 4;
|
||||
byte[] data = new byte[byteCount];
|
||||
mRandomFile.seek(fileOffset);
|
||||
mRandomFile.readFully(data, 0, byteCount);
|
||||
|
||||
int baseIndex = 0;
|
||||
int dataIndex = 0;
|
||||
char[] baseBuffer = new char[baseCount];
|
||||
while (baseIndex < baseCount) {
|
||||
int b = data[dataIndex++];
|
||||
int count = Math.min(4, baseCount - baseIndex);
|
||||
for (int i = 0; i < count; i++) {
|
||||
baseBuffer[baseIndex++] = BASES[b & 0x3];
|
||||
b = b >> 2;
|
||||
}
|
||||
}
|
||||
return new String(baseBuffer);
|
||||
}
|
||||
|
||||
private int deserializeInt(byte[] buffer, int offset) {
|
||||
int byte1 = buffer[offset] & 0xFF;
|
||||
int byte2 = buffer[offset+1] & 0xFF;
|
||||
int byte3 = buffer[offset+2] & 0xFF;
|
||||
int byte4 = buffer[offset+3] & 0xFF;
|
||||
return (byte1 | (byte2 << 8) | (byte3 << 16) | (byte4 << 24));
|
||||
}
|
||||
|
||||
private long deserializeLong(byte[] buffer, int offset) {
|
||||
long int1 = deserializeInt(buffer, offset) & 0xFFFFFFFFL;
|
||||
long int2 = deserializeInt(buffer, offset+4) & 0xFFFFFFFFL;
|
||||
return (int1 | (int2 << 32));
|
||||
}
|
||||
|
||||
// Stub for interactive use (see also Fastb2Fasta)
|
||||
public static void main(String[] args)
|
||||
throws Exception {
|
||||
FastbReader reader = new FastbReader(new File(args[0]));
|
||||
int readId = 0;
|
||||
while (reader.hasNext()) {
|
||||
System.out.println(">" + readId);
|
||||
System.out.println(reader.next());
|
||||
readId++;
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,83 +0,0 @@
|
|||
package edu.mit.broad.arachne;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.BitSet;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* Utility class to read in a set of contig-based genomic intervals in zero-based end inclusive
|
||||
* and store them efficiently in memory as a 1-based bit-mask
|
||||
*/
|
||||
public class GenomeMask {
|
||||
|
||||
// if memory usage becomes a problem... this could be changed to a SparseBitSet
|
||||
// http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html
|
||||
private SortedMap<Integer, BitSet> data = new TreeMap<Integer, BitSet>();
|
||||
|
||||
|
||||
public GenomeMask(File maskFile) throws IOException {
|
||||
BufferedReader baitReader = null;
|
||||
try {
|
||||
baitReader = new BufferedReader(new FileReader(maskFile));
|
||||
String line;
|
||||
while ((line = baitReader.readLine()) != null) {
|
||||
String[] arr = line.split(" ");
|
||||
int contig = Integer.parseInt(arr[0]);
|
||||
|
||||
// covert the coordinates from 0-based, end inclusive to
|
||||
// 1-based end inclusive
|
||||
int startPos = Integer.parseInt(arr[1]) + 1;
|
||||
int endPos = Integer.parseInt(arr[2]) + 1;
|
||||
|
||||
BitSet bits = data.get(contig);
|
||||
if (bits == null) {
|
||||
bits = new BitSet(endPos);
|
||||
data.put(contig,bits);
|
||||
}
|
||||
|
||||
bits.set(startPos, endPos + 1); // set method is end exclusive
|
||||
}
|
||||
} finally {
|
||||
if (baitReader != null) { baitReader.close(); }
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This ctor is useful if initializing a GenomeMask externally.
|
||||
*/
|
||||
public GenomeMask() {
|
||||
}
|
||||
|
||||
public boolean get(int contig, int position) {
|
||||
BitSet bits = data.get(contig);
|
||||
return (bits != null) && bits.get(position);
|
||||
}
|
||||
|
||||
public BitSet get(int contig) {
|
||||
return data.get(contig);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an existing BitSet for the given contig, or create one if not already present. This is
|
||||
* useful when initializing a GenomeMask from an external source.
|
||||
* @param contig which BitSet
|
||||
* @param numBits if there was not already a BitSet for this contig, one is created and initialized to this size.
|
||||
* @return the BitSet for the given contig, creating one if necessary
|
||||
*/
|
||||
public BitSet getOrCreate(int contig, int numBits) {
|
||||
BitSet ret = data.get(contig);
|
||||
if (ret == null) {
|
||||
ret = new BitSet(numBits);
|
||||
data.put(contig, ret);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public int getMaxContig() {
|
||||
return data.lastKey();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,136 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.arachne;
|
||||
|
||||
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
|
||||
/**
|
||||
* Reader for arachne LookAlign text format alignment files.
|
||||
* Supports filtering of the input by genomic locus.
|
||||
*/
|
||||
public class LookAlignReader
|
||||
implements CloseableIterator<Alignment> {
|
||||
|
||||
private LineNumberReader mReader = null;
|
||||
private Alignment mNextAlignment = null;
|
||||
private int mBSequenceId = -1;
|
||||
private int mBStart = 0;
|
||||
private int mBEnd = 0;
|
||||
|
||||
|
||||
public LookAlignReader(File file)
|
||||
throws IOException {
|
||||
this(new FileReader(file));
|
||||
}
|
||||
|
||||
public LookAlignReader(Reader reader) {
|
||||
if (reader instanceof LineNumberReader) {
|
||||
mReader = (LineNumberReader) reader;
|
||||
} else {
|
||||
mReader = new LineNumberReader(reader);
|
||||
}
|
||||
}
|
||||
|
||||
public void setBSequenceId(int value) {
|
||||
mBSequenceId = value;
|
||||
}
|
||||
|
||||
public void setBStart(int value) {
|
||||
mBStart = value;
|
||||
}
|
||||
|
||||
public void setBEnd(int value) {
|
||||
mBEnd = value;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if (mNextAlignment != null) {
|
||||
return true;
|
||||
}
|
||||
try {
|
||||
mNextAlignment = nextAlignment();
|
||||
return (mNextAlignment != null);
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
|
||||
public Alignment next() {
|
||||
if (!hasNext()) {
|
||||
throw new IllegalStateException("Iterator exhausted");
|
||||
}
|
||||
try {
|
||||
Alignment result = mNextAlignment;
|
||||
mNextAlignment = nextAlignment();
|
||||
return result;
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Not supported: remove");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (mReader != null) {
|
||||
try {
|
||||
mReader.close();
|
||||
} catch (IOException exc) {
|
||||
throw new RuntimeException(exc.getMessage(), exc);
|
||||
}
|
||||
mReader = null;
|
||||
}
|
||||
}
|
||||
|
||||
private Alignment nextAlignment()
|
||||
throws IOException {
|
||||
if (mReader == null) {
|
||||
return null;
|
||||
}
|
||||
while (true) {
|
||||
String line = mReader.readLine();
|
||||
if (line == null) {
|
||||
close();
|
||||
break;
|
||||
}
|
||||
if (!line.startsWith("QUERY")) {
|
||||
continue;
|
||||
}
|
||||
Alignment alignment = Alignment.parse(line);
|
||||
if (matchesFilters(alignment)) {
|
||||
return alignment;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private boolean matchesFilters(Alignment alignment) {
|
||||
if (mBSequenceId < 0) {
|
||||
return true;
|
||||
}
|
||||
if (alignment.getBSequenceId() != mBSequenceId) {
|
||||
return false;
|
||||
}
|
||||
if (mBStart > 0 && alignment.getBEnd() < mBStart) {
|
||||
return false;
|
||||
}
|
||||
if (mBEnd > 0 && alignment.getBStart() > mBEnd) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,437 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.cnv;
|
||||
|
||||
import edu.mit.broad.arachne.Alignment;
|
||||
import edu.mit.broad.arachne.LookAlignReader;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Utility class to do data reduction on CNV data.
|
||||
*/
|
||||
public class AnalyzeCnvs {
|
||||
|
||||
public static void main(String[] args)
|
||||
throws Exception {
|
||||
new AnalyzeCnvs().run(args);
|
||||
}
|
||||
|
||||
private void usage() {
|
||||
System.out.println("Usage: AnalyzeCnvs ...");
|
||||
System.out.println(" -action <action>");
|
||||
System.out.println(" -alignments <alignment-file> or -");
|
||||
System.out.println(" -alignmentList <alignment-fofn>");
|
||||
System.out.println(" -chromosome <chrN>");
|
||||
System.out.println(" -start <start-coordinate>");
|
||||
System.out.println(" -end <end-coordinate>");
|
||||
System.out.println(" -bestAlignments");
|
||||
System.out.println(" -mismatchThreshold <n>");
|
||||
System.out.println(" -binsize <n>");
|
||||
System.out.println(" -output <coverage|all>");
|
||||
System.out.println(" -verbose");
|
||||
System.out.println(" -debug");
|
||||
}
|
||||
|
||||
private boolean parseArguments(String[] args) {
|
||||
|
||||
int argpos = 0;
|
||||
int argsleft = 0;
|
||||
|
||||
while (argpos < args.length) {
|
||||
argsleft = args.length - argpos;
|
||||
String arg = args[argpos];
|
||||
if (arg.equals("-action") && argsleft > 1) {
|
||||
argpos++;
|
||||
mAction = args[argpos++];
|
||||
} else if (arg.equals("-alignments") && argsleft > 1) {
|
||||
argpos++;
|
||||
mAlignmentFilePath = args[argpos++];
|
||||
} else if (arg.equals("-alignmentList") && argsleft > 1) {
|
||||
argpos++;
|
||||
mAlignmentListFilePath = args[argpos++];
|
||||
} else if (arg.equals("-chromosome") && argsleft > 1) {
|
||||
argpos++;
|
||||
mChromosome = args[argpos++];
|
||||
} else if (arg.equals("-start") && argsleft > 1) {
|
||||
argpos++;
|
||||
mStartPosition = new Integer(args[argpos++]);
|
||||
} else if (arg.equals("-end") && argsleft > 1) {
|
||||
argpos++;
|
||||
mEndPosition = new Integer(args[argpos++]);
|
||||
} else if (arg.equals("-verbose")) {
|
||||
argpos++;
|
||||
mVerbose = true;
|
||||
} else if (arg.equals("-mismatchThreshold") && argsleft > 1) {
|
||||
argpos++;
|
||||
mMismatchThreshold = new Integer(args[argpos++]);
|
||||
} else if (arg.equals("-bestAlignments")) {
|
||||
argpos++;
|
||||
mReturnBestHits = true;
|
||||
} else if (arg.equals("-binsize") && argsleft > 1) {
|
||||
argpos++;
|
||||
mBinSize = Integer.parseInt(args[argpos++]);
|
||||
} else if (arg.equals("-output") && argsleft > 1) {
|
||||
argpos++;
|
||||
mOutputColumns = args[argpos++];
|
||||
} else if (arg.equals("-debug")) {
|
||||
argpos++;
|
||||
mDebug = true;
|
||||
} else if (arg.startsWith("-")) {
|
||||
usage();
|
||||
return false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
argsleft = args.length - argpos;
|
||||
if (argsleft != 0) {
|
||||
usage();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private void run(String[] args)
|
||||
throws Exception {
|
||||
|
||||
if (!parseArguments(args)) {
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
if (mAction == null) {
|
||||
mAction = "alignmentCoverage";
|
||||
}
|
||||
|
||||
if (mAction.equals("alignmentCoverage")) {
|
||||
mainAlignmentCoverage();
|
||||
} else {
|
||||
System.out.println("Unknown action: " + mAction);
|
||||
usage();
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
private void mainAlignmentCoverage()
|
||||
throws IOException {
|
||||
|
||||
if (mStartPosition == null || mEndPosition == null) {
|
||||
usage();
|
||||
System.exit(1);
|
||||
} else if (mStartPosition <= 0 || mEndPosition <= 0 || mStartPosition > mEndPosition) {
|
||||
System.out.println("Invalid start/end positions: " + mStartPosition + " " + mEndPosition);
|
||||
usage();
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
mSequenceId = chromosomeToSequenceId(mChromosome);
|
||||
if (mSequenceId < 0) {
|
||||
System.out.println("Invalid chromosome: " + mChromosome);
|
||||
usage();
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
if (mBinSize <= 0) {
|
||||
System.out.println("Invalid bin size: " + mBinSize);
|
||||
usage();
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
runAlignmentCoverage();
|
||||
}
|
||||
|
||||
private void runAlignmentCoverage()
|
||||
throws IOException {
|
||||
|
||||
int length = (mEndPosition - mStartPosition + 1);
|
||||
if (length <= 0) {
|
||||
throw new RuntimeException("Invalid start/end positions");
|
||||
}
|
||||
|
||||
int binSize = mBinSize;
|
||||
int binCount = (length + binSize - 1) / binSize;
|
||||
int[] readStarts = new int[binCount];
|
||||
int[] readDepths = new int[binCount];
|
||||
List<String> alignmentFiles = getAlignmentFiles();
|
||||
for (String path : alignmentFiles) {
|
||||
processAlignmentFile(path, readStarts, readDepths);
|
||||
}
|
||||
printStats(readStarts, readDepths);
|
||||
}
|
||||
|
||||
private List<String> getAlignmentFiles()
|
||||
throws IOException {
|
||||
List<String> fileList = new ArrayList<String>();
|
||||
if (mAlignmentListFilePath != null) {
|
||||
LineNumberReader reader = new LineNumberReader(new FileReader(mAlignmentListFilePath));
|
||||
while (true) {
|
||||
String line = reader.readLine();
|
||||
if (line == null) {
|
||||
reader.close();
|
||||
break;
|
||||
}
|
||||
String path = line.trim();
|
||||
if (path.length() == 0 || path.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
fileList.add(path);
|
||||
}
|
||||
} else if (mAlignmentFilePath != null) {
|
||||
fileList.add(mAlignmentFilePath);
|
||||
}
|
||||
return fileList;
|
||||
}
|
||||
|
||||
private void processAlignmentFile(String path, int[] readStarts, int[] readDepths)
|
||||
throws IOException {
|
||||
|
||||
LookAlignReader reader = null;
|
||||
if (path == null || path.equals("-")) {
|
||||
reader = new LookAlignReader(new InputStreamReader(System.in));
|
||||
} else {
|
||||
reader = new LookAlignReader(new File(path));
|
||||
}
|
||||
|
||||
while (true) {
|
||||
Alignment alignment = getNextAlignment(reader);
|
||||
if (alignment == null) {
|
||||
reader.close();
|
||||
break;
|
||||
}
|
||||
processAlignment(alignment, readStarts, readDepths);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Accumulates one alignment into the per-bin statistics arrays.
 * Bins are mBinSize bases wide, with bin 0 starting at mStartPosition.
 * Either array may be null to skip that statistic.
 */
private void processAlignment(Alignment alignment,
                              int[] readStarts,
                              int[] readDepths) {

    // Count the alignment's start in the bin containing its first base.
    if (readStarts != null) {
        int baseOffset = alignment.getBStart() - mStartPosition;
        int binIndex = baseOffset / mBinSize;
        if (binIndex >= 0 && binIndex < readStarts.length) {
            readStarts[binIndex]++;
        }
    }

    // Walk the alignment blocks (triples of gap/duration/mismatches —
    // see getAlignmentMismatches) and credit one unit of depth to the bin
    // of every aligned base.
    if (readDepths != null) {
        int baseOffset = alignment.getBStart() - mStartPosition;
        int[] alignmentBlocks = alignment.getAlignmentBlocks();
        for (int i = 0; i < alignmentBlocks.length; i += 3) {
            int gap = alignmentBlocks[i];
            int duration = alignmentBlocks[i+1];
            if (gap > 0) {
                // Gap in B sequence (genome)
                // Negative gaps are gaps in A sequence (read)
                baseOffset += gap;
            }
            for (int j = 0; j < duration; j++) {
                int binIndex = baseOffset / mBinSize;
                // Bases outside the target region fall outside the array
                // and are silently ignored.
                if (binIndex >= 0 && binIndex < readDepths.length) {
                    readDepths[binIndex]++;
                }
                baseOffset++;
            }
        }
    }
}
|
||||
|
||||
/**
 * Returns the next alignment that passes the filters, or null at end of
 * stream.  In -bestAlignments mode, all alignments sharing one A-sequence
 * id (i.e. one read) are grouped and only a uniquely-best hit is returned;
 * groups with no unique best hit are skipped.  Requires the input to be
 * sorted by A sequence id.
 */
private Alignment getNextAlignment(LookAlignReader reader)
    throws IOException {

    // Simple mode: return the next alignment passing the filters.
    if (!mReturnBestHits) {
        while (reader.hasNext()) {
            Alignment alignment = reader.next();
            if (passesAlignmentFilters(alignment)) {
                return alignment;
            }
        }
        return null;
    }

    // Best-hits mode: gather one read's alignments per iteration.
    while (true) {
        // The seed is the first alignment of the current read; it may have
        // been read ahead (and stashed) while finishing the previous read.
        Alignment seed = mPendingAlignment;
        mPendingAlignment = null;
        if (seed == null && reader.hasNext()) {
            seed = reader.next();
        }
        if (seed == null) {
            return null;
        }
        // Collect further hits for the same read; allocate lazily so the
        // common single-hit case does no extra work.
        List<Alignment> secondaryHits = null;
        while (reader.hasNext()) {
            Alignment alignment = reader.next();
            if (alignment.getASequenceId() != seed.getASequenceId()) {
                if (alignment.getASequenceId() < seed.getASequenceId()) {
                    throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format());
                }
                // First alignment of the next read: stash it for the next
                // outer iteration.
                mPendingAlignment = alignment;
                break;
            }
            if (secondaryHits == null) {
                secondaryHits = new ArrayList<Alignment>();
            }
            secondaryHits.add(alignment);
        }
        // Single hit: trivially the best.
        if (secondaryHits == null) {
            if (!passesAlignmentFilters(seed)) {
                continue;
            }
            return seed;
        }
        // Multiple hits: keep only a uniquely-best one.
        secondaryHits.add(seed);
        Alignment result = getUniqueBestAlignment(secondaryHits);
        if (result != null && passesAlignmentFilters(result)) {
            return result;
        }
    }
}
|
||||
|
||||
private Alignment getUniqueBestAlignment(List<Alignment> alignments) {
|
||||
int bestMismatches = 0;
|
||||
List<Alignment> best = new ArrayList<Alignment>();
|
||||
for (Alignment a : alignments) {
|
||||
int mismatches = getAlignmentMismatches(a);
|
||||
if (best.isEmpty()) {
|
||||
best.add(a);
|
||||
bestMismatches = mismatches;
|
||||
}
|
||||
if (mismatches == bestMismatches) {
|
||||
best.add(a);
|
||||
} else if (mismatches < bestMismatches) {
|
||||
best.clear();
|
||||
best.add(a);
|
||||
bestMismatches = mismatches;
|
||||
}
|
||||
}
|
||||
if (best.size() != 1) {
|
||||
return null;
|
||||
}
|
||||
return best.get(0);
|
||||
}
|
||||
|
||||
private boolean passesAlignmentFilters(Alignment alignment) {
|
||||
|
||||
if (mMismatchThreshold != null) {
|
||||
if (getAlignmentMismatches(alignment) > mMismatchThreshold) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (mSequenceId != null) {
|
||||
if (alignment.getBSequenceId() != mSequenceId) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (mStartPosition != null) {
|
||||
if (alignment.getBEnd() < mStartPosition) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (mEndPosition != null) {
|
||||
if (alignment.getBStart() > mEndPosition) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private int getAlignmentMismatches(Alignment alignment) {
|
||||
int mismatches = 0;
|
||||
int[] blocks = alignment.getAlignmentBlocks();
|
||||
for (int i = 0; i < blocks.length; i += 3) {
|
||||
int gap = blocks[i];
|
||||
int duration = blocks[i+1];
|
||||
int mm = blocks[i+2];
|
||||
if (mm > duration) {
|
||||
throw new RuntimeException("Invalid alignment? : " + alignment.format());
|
||||
}
|
||||
mismatches += Math.abs(gap);
|
||||
mismatches += mm;
|
||||
}
|
||||
return mismatches;
|
||||
}
|
||||
|
||||
private void printStats(int[] readStarts, int[] readDepths) {
|
||||
if (mOutputColumns != null && mOutputColumns.equals("coverage")) {
|
||||
// No headers, just coverage
|
||||
for (int i = 0; i < readDepths.length; i++) {
|
||||
String line = "";
|
||||
if (mBinSize == 1) {
|
||||
line += readDepths[i];
|
||||
} else {
|
||||
line += (readDepths[i] / (double) mBinSize);
|
||||
}
|
||||
System.out.println(line);
|
||||
}
|
||||
} else {
|
||||
System.out.println("Position" + "\t" + "Starts" + "\t" + "Coverage");
|
||||
for (int i = 0; i < readDepths.length; i++) {
|
||||
String line = "";
|
||||
int position = mStartPosition + i*mBinSize;
|
||||
line += position + "\t" + readStarts[i] + "\t";
|
||||
if (mBinSize == 1) {
|
||||
line += readDepths[i];
|
||||
} else {
|
||||
line += (readDepths[i] / (double) mBinSize);
|
||||
}
|
||||
System.out.println(line);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private int chromosomeToSequenceId(String text) {
|
||||
if (text == null || text.length() == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (text.matches("\\d+")) {
|
||||
return Integer.parseInt(text);
|
||||
}
|
||||
if (text.startsWith("chr") && text.length() > 3) {
|
||||
text = text.substring(3);
|
||||
}
|
||||
if (text.matches("\\d+") && !text.startsWith("0")) {
|
||||
return Integer.parseInt(text);
|
||||
}
|
||||
if (text.equals("M")) {
|
||||
return 0;
|
||||
} else if (text.equals("X")) {
|
||||
return 23;
|
||||
} else if (text.equals("Y")) {
|
||||
return 24;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
// Diagnostic flags set from the command line.
private boolean mDebug = false;
private boolean mVerbose = false;

// Action to run; defaults to "alignmentCoverage" in run().
private String mAction = null;
// Single alignment file path, or a list file naming one path per line.
private String mAlignmentFilePath = null;
private String mAlignmentListFilePath = null;
// Target region: chromosome name plus inclusive start/end positions.
private String mChromosome = null;
private Integer mStartPosition = null;
private Integer mEndPosition = null;
// Numeric id derived from mChromosome (see chromosomeToSequenceId).
private Integer mSequenceId = null;
// When true, only uniquely-best hits per read are counted.
private boolean mReturnBestHits = false;
// Maximum mismatches+gaps per alignment, or null for no limit.
private Integer mMismatchThreshold = null;
// Width of each coverage bin, in bases.
private int mBinSize = 1;
// Output selector; "coverage" prints bare coverage values only.
private String mOutputColumns = null;
// One-alignment lookahead used by getNextAlignment in best-hits mode.
private Alignment mPendingAlignment = null;
}
|
||||
|
|
@ -1,283 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.cnv;
|
||||
|
||||
import edu.mit.broad.arachne.Alignment;
|
||||
import edu.mit.broad.arachne.LookAlignReader;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Utility to count alignments (rather than gathering).
|
||||
*/
|
||||
public class CountAlignments {
|
||||
|
||||
public static void main(String[] args)
|
||||
throws Exception {
|
||||
new CountAlignments().run(args);
|
||||
}
|
||||
|
||||
private void usage() {
|
||||
System.out.println("Usage: CountAlignments ...");
|
||||
System.out.println(" -alignments <alignment-file> (- for stdin)");
|
||||
System.out.println(" -chromosome <chromosome>");
|
||||
System.out.println(" -start <start>");
|
||||
System.out.println(" -end <end>");
|
||||
System.out.println(" -bestAlignments");
|
||||
System.out.println(" -mismatchThreshold <n>");
|
||||
System.out.println(" -verbose");
|
||||
System.out.println(" -debug");
|
||||
}
|
||||
|
||||
private boolean parseArguments(String[] args) {
|
||||
|
||||
int argpos = 0;
|
||||
int argsleft = 0;
|
||||
|
||||
while (argpos < args.length) {
|
||||
argsleft = args.length - argpos;
|
||||
String arg = args[argpos];
|
||||
if (arg.equals("-alignments") && argsleft > 1) {
|
||||
argpos++;
|
||||
mAlignmentFilePath = args[argpos++];
|
||||
} else if (arg.equals("-mismatchThreshold") && argsleft > 1) {
|
||||
argpos++;
|
||||
mMismatchThreshold = new Integer(args[argpos++]);
|
||||
} else if (arg.equals("-bestAlignments")) {
|
||||
argpos++;
|
||||
mReturnBestHits = true;
|
||||
} else if (arg.equals("-chromosome") && argsleft > 1) {
|
||||
argpos++;
|
||||
String chromosome = args[argpos++];
|
||||
mSequenceId = chromosomeToSequenceId(chromosome);
|
||||
if (mSequenceId < 0) {
|
||||
System.out.println("Invalid chromosome: " + chromosome);
|
||||
return false;
|
||||
}
|
||||
} else if (arg.equals("-start") && argsleft > 1) {
|
||||
argpos++;
|
||||
mStartPosition = new Integer(args[argpos++]);
|
||||
} else if (arg.equals("-end") && argsleft > 1) {
|
||||
argpos++;
|
||||
mEndPosition = new Integer(args[argpos++]);
|
||||
} else if (arg.equals("-verbose")) {
|
||||
argpos++;
|
||||
mVerbose = true;
|
||||
} else if (arg.equals("-debug")) {
|
||||
argpos++;
|
||||
mDebug = true;
|
||||
} else if (arg.startsWith("-")) {
|
||||
usage();
|
||||
return false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
argsleft = args.length - argpos;
|
||||
if (argsleft != 0) {
|
||||
usage();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private void run(String[] args)
|
||||
throws Exception {
|
||||
|
||||
if (!parseArguments(args)) {
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
long[] counts = countAlignments(mAlignmentFilePath);
|
||||
String line = counts[0] + " " + counts[1];
|
||||
if (mAlignmentFilePath != null) {
|
||||
line = mAlignmentFilePath + " " + line;
|
||||
}
|
||||
System.out.println(line);
|
||||
}
|
||||
|
||||
private long[] countAlignments(String path)
|
||||
throws IOException {
|
||||
long alignmentCount = 0;
|
||||
long baseCount = 0;
|
||||
LookAlignReader reader = null;
|
||||
if (path == null || path.equals("-")) {
|
||||
reader = new LookAlignReader(new InputStreamReader(System.in));
|
||||
} else {
|
||||
reader = new LookAlignReader(new File(path));
|
||||
}
|
||||
while (true) {
|
||||
Alignment alignment = getNextAlignment(reader);
|
||||
if (alignment == null) {
|
||||
reader.close();
|
||||
break;
|
||||
}
|
||||
if (mMismatchThreshold != null) {
|
||||
if (getAlignmentMismatches(alignment) > mMismatchThreshold) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (mSequenceId != null) {
|
||||
if (alignment.getBSequenceId() != mSequenceId) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (mStartPosition != null) {
|
||||
if (alignment.getBEnd() < mStartPosition) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (mEndPosition != null) {
|
||||
if (alignment.getBStart() > mEndPosition) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
alignmentCount++;
|
||||
baseCount += getBaseCount(alignment);
|
||||
}
|
||||
long[] result = { alignmentCount, baseCount };
|
||||
return result;
|
||||
}
|
||||
|
||||
private Alignment getNextAlignment(LookAlignReader reader)
|
||||
throws IOException {
|
||||
if (!mReturnBestHits) {
|
||||
if (!reader.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
return reader.next();
|
||||
}
|
||||
while (true) {
|
||||
Alignment seed = mPendingAlignment;
|
||||
mPendingAlignment = null;
|
||||
if (seed == null && reader.hasNext()) {
|
||||
seed = reader.next();
|
||||
}
|
||||
if (seed == null) {
|
||||
return null;
|
||||
}
|
||||
List<Alignment> secondaryHits = null;
|
||||
while (reader.hasNext()) {
|
||||
Alignment alignment = reader.next();
|
||||
if (alignment.getASequenceId() != seed.getASequenceId()) {
|
||||
if (alignment.getASequenceId() < seed.getASequenceId()) {
|
||||
throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format());
|
||||
}
|
||||
mPendingAlignment = alignment;
|
||||
break;
|
||||
}
|
||||
if (secondaryHits == null) {
|
||||
secondaryHits = new ArrayList<Alignment>();
|
||||
}
|
||||
secondaryHits.add(alignment);
|
||||
}
|
||||
if (secondaryHits == null) {
|
||||
return seed;
|
||||
}
|
||||
secondaryHits.add(seed);
|
||||
Alignment result = getUniqueBestAlignment(secondaryHits);
|
||||
if (result != null) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Alignment getUniqueBestAlignment(List<Alignment> alignments) {
|
||||
int bestMismatches = 0;
|
||||
List<Alignment> best = new ArrayList<Alignment>();
|
||||
for (Alignment a : alignments) {
|
||||
int mismatches = getAlignmentMismatches(a);
|
||||
if (best.isEmpty()) {
|
||||
best.add(a);
|
||||
bestMismatches = mismatches;
|
||||
}
|
||||
if (mismatches == bestMismatches) {
|
||||
best.add(a);
|
||||
} else if (mismatches < bestMismatches) {
|
||||
best.clear();
|
||||
best.add(a);
|
||||
bestMismatches = mismatches;
|
||||
}
|
||||
}
|
||||
if (best.size() != 1) {
|
||||
return null;
|
||||
}
|
||||
return best.get(0);
|
||||
}
|
||||
|
||||
private int getAlignmentMismatches(Alignment alignment) {
|
||||
int mismatches = 0;
|
||||
int[] blocks = alignment.getAlignmentBlocks();
|
||||
for (int i = 0; i < blocks.length; i += 3) {
|
||||
int gap = blocks[i];
|
||||
int duration = blocks[i+1];
|
||||
int mm = blocks[i+2];
|
||||
if (mm > duration) {
|
||||
throw new RuntimeException("Invalid alignment? : " + alignment.format());
|
||||
}
|
||||
mismatches += Math.abs(gap);
|
||||
mismatches += mm;
|
||||
}
|
||||
return mismatches;
|
||||
}
|
||||
|
||||
// Return the number of reference bases covered by this alignment.
|
||||
private int getBaseCount(Alignment alignment) {
|
||||
int count = 0;
|
||||
int[] blocks = alignment.getAlignmentBlocks();
|
||||
for (int i = 0; i < blocks.length; i += 3) {
|
||||
// int gap = blocks[i];
|
||||
int duration = blocks[i+1];
|
||||
// int mm = blocks[i+2];
|
||||
count += duration;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private int chromosomeToSequenceId(String text) {
|
||||
if (text == null || text.length() == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (text.matches("\\d+")) {
|
||||
return Integer.parseInt(text);
|
||||
}
|
||||
if (text.startsWith("chr") && text.length() > 3) {
|
||||
text = text.substring(3);
|
||||
}
|
||||
if (text.matches("\\d+") && !text.startsWith("0")) {
|
||||
return Integer.parseInt(text);
|
||||
}
|
||||
if (text.equals("M")) {
|
||||
return 0;
|
||||
} else if (text.equals("X")) {
|
||||
return 23;
|
||||
} else if (text.equals("Y")) {
|
||||
return 24;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean mDebug = false;
|
||||
private boolean mVerbose = false;
|
||||
|
||||
private String mAlignmentFilePath = null;
|
||||
private boolean mReturnBestHits = false;
|
||||
private Integer mMismatchThreshold = null;
|
||||
private Integer mSequenceId = null;
|
||||
private Integer mStartPosition = null;
|
||||
private Integer mEndPosition = null;
|
||||
private Alignment mPendingAlignment = null;
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -1,399 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.cnv;
|
||||
|
||||
import edu.mit.broad.arachne.Alignment;
|
||||
import edu.mit.broad.arachne.LookAlignReader;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Utility program to gather CNV alignments from LookAlign files in an I/O efficient manner.
|
||||
*/
|
||||
public class GatherAlignments {
|
||||
|
||||
public static void main(String[] args)
|
||||
throws Exception {
|
||||
new GatherAlignments().run(args);
|
||||
}
|
||||
|
||||
private void usage() {
|
||||
System.out.println("Usage: GatherAlignments ...");
|
||||
System.out.println(" -cnpList <cnp-file>");
|
||||
System.out.println(" -sampleId <sample-id>");
|
||||
System.out.println(" -inputFileList <fofn>");
|
||||
System.out.println(" -outputDirectory <dir>");
|
||||
System.out.println(" -padding <n-bases>");
|
||||
System.out.println(" -bestAlignments");
|
||||
System.out.println(" -verbose");
|
||||
System.out.println(" -debug");
|
||||
}
|
||||
|
||||
private boolean parseArguments(String[] args) {
|
||||
|
||||
int argpos = 0;
|
||||
int argsleft = 0;
|
||||
|
||||
while (argpos < args.length) {
|
||||
argsleft = args.length - argpos;
|
||||
String arg = args[argpos];
|
||||
if (arg.equals("-cnpList") && argsleft > 1) {
|
||||
argpos++;
|
||||
mCnpListPath = args[argpos++];
|
||||
} else if (arg.equals("-sampleId") && argsleft > 1) {
|
||||
argpos++;
|
||||
mSampleId = args[argpos++];
|
||||
} else if (arg.equals("-inputFileList") && argsleft > 1) {
|
||||
argpos++;
|
||||
mInputFileListPath = args[argpos++];
|
||||
} else if (arg.equals("-outputDirectory") && argsleft > 1) {
|
||||
argpos++;
|
||||
mOutputDirectory = args[argpos++];
|
||||
} else if (arg.equals("-padding") && argsleft > 1) {
|
||||
argpos++;
|
||||
mCnpRegionPadding = Integer.parseInt(args[argpos++]);
|
||||
} else if (arg.equals("-bestAlignments")) {
|
||||
argpos++;
|
||||
mReturnBestHits = true;
|
||||
} else if (arg.equals("-verbose")) {
|
||||
argpos++;
|
||||
mVerbose = true;
|
||||
} else if (arg.equals("-debug")) {
|
||||
argpos++;
|
||||
mDebug = true;
|
||||
} else if (arg.startsWith("-")) {
|
||||
usage();
|
||||
return false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
argsleft = args.length - argpos;
|
||||
if (argsleft != 0) {
|
||||
usage();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private void run(String[] args)
|
||||
throws Exception {
|
||||
|
||||
if (!parseArguments(args)) {
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
List<File> mInputFileList = parseInputFiles(mInputFileListPath);
|
||||
Map<Integer, List<CnpRegion>> mCnpMap = parseCnpFile(mCnpListPath);
|
||||
for (File inputFile : mInputFileList) {
|
||||
scanInputFile(inputFile, mCnpMap);
|
||||
}
|
||||
}
|
||||
|
||||
private List<File> parseInputFiles(String path)
|
||||
throws IOException {
|
||||
List<File> fileList = new ArrayList<File>();
|
||||
LineNumberReader reader = new LineNumberReader(new FileReader(path));
|
||||
while (true) {
|
||||
String line = reader.readLine();
|
||||
if (line == null) {
|
||||
reader.close();
|
||||
break;
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0 || line.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
String[] fields = line.split("\\s+");
|
||||
fileList.add(new File(fields[0]));
|
||||
}
|
||||
return fileList;
|
||||
}
|
||||
|
||||
private Map<Integer, List<CnpRegion>> parseCnpFile(String path)
|
||||
throws IOException {
|
||||
Map<Integer, List<CnpRegion>> cnpMap = new HashMap<Integer, List<CnpRegion>>();
|
||||
LineNumberReader reader = new LineNumberReader(new FileReader(path));
|
||||
while (true) {
|
||||
String line = reader.readLine();
|
||||
if (line == null) {
|
||||
reader.close();
|
||||
break;
|
||||
}
|
||||
line = line.trim();
|
||||
if (line.length() == 0 || line.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
String[] fields = line.split("\\s+");
|
||||
if (fields.length != 4) {
|
||||
throw new RuntimeException("Invalid CNP line: " + line);
|
||||
}
|
||||
if (fields[0].equalsIgnoreCase("CNPID")) {
|
||||
continue;
|
||||
}
|
||||
String cnpId = fields[0];
|
||||
String chromosome = fields[1];
|
||||
int start = Integer.parseInt(fields[2].replaceAll(",", ""));
|
||||
int end = Integer.parseInt(fields[3].replaceAll(",", ""));
|
||||
int sequenceId = chromosomeToSequenceId(chromosome);
|
||||
if (sequenceId < 0) {
|
||||
throw new RuntimeException("Unrecognized chromosome: " + chromosome);
|
||||
}
|
||||
if (mCnpRegionPadding > 0) {
|
||||
start = Math.max(1, start - mCnpRegionPadding);
|
||||
end = end + mCnpRegionPadding;
|
||||
}
|
||||
CnpRegion cnp = new CnpRegion(cnpId, sequenceId, start, end);
|
||||
List<CnpRegion> cnpList = cnpMap.get(sequenceId);
|
||||
if (cnpList == null) {
|
||||
cnpList = new ArrayList<CnpRegion>();
|
||||
cnpMap.put(sequenceId, cnpList);
|
||||
}
|
||||
cnpList.add(cnp);
|
||||
}
|
||||
return cnpMap;
|
||||
}
|
||||
|
||||
private int chromosomeToSequenceId(String text) {
|
||||
if (text == null || text.length() == 0) {
|
||||
return -1;
|
||||
}
|
||||
if (text.matches("\\d+")) {
|
||||
return Integer.parseInt(text);
|
||||
}
|
||||
if (text.startsWith("chr") && text.length() > 3) {
|
||||
text = text.substring(3);
|
||||
}
|
||||
if (text.matches("\\d+") && !text.startsWith("0")) {
|
||||
return Integer.parseInt(text);
|
||||
}
|
||||
if (text.equals("M")) {
|
||||
return 0;
|
||||
} else if (text.equals("X")) {
|
||||
return 23;
|
||||
} else if (text.equals("Y")) {
|
||||
return 24;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
private void scanInputFile(File inputFile,
|
||||
Map<Integer, List<CnpRegion>> cnpMap)
|
||||
throws IOException {
|
||||
LookAlignReader reader = new LookAlignReader(inputFile);
|
||||
while (true) {
|
||||
Alignment alignment = getNextAlignment(reader);
|
||||
if (alignment == null) {
|
||||
reader.close();
|
||||
break;
|
||||
}
|
||||
List<CnpRegion> cnpList = cnpMap.get(alignment.getBSequenceId());
|
||||
if (cnpList == null) {
|
||||
continue;
|
||||
}
|
||||
for (CnpRegion cnp : cnpList) {
|
||||
if (overlaps(cnp, alignment)) {
|
||||
saveCnpAlignment(cnp, alignment, inputFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
flushCnpAlignments(inputFile);
|
||||
}
|
||||
|
||||
private Alignment getNextAlignment(LookAlignReader reader)
|
||||
throws IOException {
|
||||
if (!mReturnBestHits) {
|
||||
if (reader.hasNext()) {
|
||||
return reader.next();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
while (true) {
|
||||
Alignment seed = mPendingAlignment;
|
||||
mPendingAlignment = null;
|
||||
if (seed == null && reader.hasNext()) {
|
||||
seed = reader.next();
|
||||
}
|
||||
if (seed == null) {
|
||||
return null;
|
||||
}
|
||||
List<Alignment> secondaryHits = null;
|
||||
while (reader.hasNext()) {
|
||||
Alignment alignment = reader.next();
|
||||
if (alignment.getASequenceId() != seed.getASequenceId()) {
|
||||
if (alignment.getASequenceId() < seed.getASequenceId()) {
|
||||
throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format());
|
||||
}
|
||||
mPendingAlignment = alignment;
|
||||
break;
|
||||
}
|
||||
if (secondaryHits == null) {
|
||||
secondaryHits = new ArrayList<Alignment>();
|
||||
}
|
||||
secondaryHits.add(alignment);
|
||||
}
|
||||
if (secondaryHits == null) {
|
||||
return seed;
|
||||
}
|
||||
secondaryHits.add(seed);
|
||||
Alignment result = getUniqueBestAlignment(secondaryHits);
|
||||
if (result != null) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Alignment getUniqueBestAlignment(List<Alignment> alignments) {
|
||||
int bestMismatches = 0;
|
||||
List<Alignment> best = new ArrayList<Alignment>();
|
||||
for (Alignment a : alignments) {
|
||||
int mismatches = getAlignmentMismatches(a);
|
||||
if (best.isEmpty()) {
|
||||
best.add(a);
|
||||
bestMismatches = mismatches;
|
||||
}
|
||||
if (mismatches == bestMismatches) {
|
||||
best.add(a);
|
||||
} else if (mismatches < bestMismatches) {
|
||||
best.clear();
|
||||
best.add(a);
|
||||
bestMismatches = mismatches;
|
||||
}
|
||||
}
|
||||
if (best.size() != 1) {
|
||||
return null;
|
||||
}
|
||||
return best.get(0);
|
||||
}
|
||||
|
||||
private int getAlignmentMismatches(Alignment alignment) {
|
||||
int mismatches = 0;
|
||||
int[] blocks = alignment.getAlignmentBlocks();
|
||||
for (int i = 0; i < blocks.length; i += 3) {
|
||||
int gap = blocks[i];
|
||||
int duration = blocks[i+1];
|
||||
int mm = blocks[i+2];
|
||||
if (mm > duration) {
|
||||
throw new RuntimeException("Invalid alignment? : " + alignment.format());
|
||||
}
|
||||
mismatches += Math.abs(gap);
|
||||
mismatches += mm;
|
||||
}
|
||||
return mismatches;
|
||||
}
|
||||
|
||||
private boolean overlaps(CnpRegion cnp, Alignment alignment) {
|
||||
return (cnp.getSequenceId() == alignment.getBSequenceId() &&
|
||||
cnp.getStart() <= alignment.getBEnd() &&
|
||||
cnp.getEnd() >= alignment.getBStart());
|
||||
}
|
||||
|
||||
private void saveCnpAlignment(CnpRegion cnp, Alignment alignment, File inputFile)
|
||||
throws IOException {
|
||||
if (mCnpAlignmentCount > mCnpAlignmentLimit) {
|
||||
flushCnpAlignments(inputFile);
|
||||
}
|
||||
String cnpId = cnp.getCnpId();
|
||||
List<Alignment> alignmentList = mCnpAlignmentMap.get(cnpId);
|
||||
if (alignmentList == null) {
|
||||
alignmentList = new ArrayList<Alignment>();
|
||||
mCnpAlignmentMap.put(cnpId, alignmentList);
|
||||
}
|
||||
alignmentList.add(alignment);
|
||||
mCnpAlignmentCount++;
|
||||
}
|
||||
|
||||
private void flushCnpAlignments(File inputFile)
|
||||
throws IOException {
|
||||
while (!mCnpAlignmentMap.isEmpty()) {
|
||||
String cnpId = mCnpAlignmentMap.keySet().iterator().next();
|
||||
List<Alignment> alignmentList = mCnpAlignmentMap.get(cnpId);
|
||||
writeAlignments(cnpId, mSampleId, alignmentList, inputFile);
|
||||
mCnpAlignmentMap.remove(cnpId);
|
||||
mCnpAlignmentCount -= alignmentList.size();
|
||||
}
|
||||
if (mCnpAlignmentCount != 0) {
|
||||
throw new RuntimeException("Unsynchronized alignment count");
|
||||
}
|
||||
}
|
||||
|
||||
private void writeAlignments(String cnpId, String sampleId, List<Alignment> alignmentList, File inputFile)
|
||||
throws IOException {
|
||||
File outputDir = new File(".");
|
||||
if (mOutputDirectory != null) {
|
||||
outputDir = new File(mOutputDirectory);
|
||||
}
|
||||
String cnpSample = cnpId;
|
||||
if (sampleId != null) {
|
||||
cnpSample = cnpSample + "_" + sampleId;
|
||||
}
|
||||
File cnpSampleDir = new File(outputDir, cnpSample);
|
||||
if (!cnpSampleDir.exists()) {
|
||||
if (!cnpSampleDir.mkdir()) {
|
||||
throw new RuntimeException("Failed to create directory " + cnpSampleDir);
|
||||
}
|
||||
}
|
||||
String fileName = inputFile.getName();
|
||||
File alignmentFile = new File(cnpSampleDir, fileName);
|
||||
PrintWriter writer = new PrintWriter(new FileWriter(alignmentFile, true));
|
||||
for (Alignment alignment : alignmentList) {
|
||||
writer.println(alignment.arachneFormat());
|
||||
}
|
||||
writer.flush();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private GatherAlignments() {
|
||||
}
|
||||
|
||||
private static class CnpRegion {
|
||||
|
||||
private CnpRegion(String cnpId, int sequenceId, int start, int end) {
|
||||
mCnpId = cnpId;
|
||||
mSequenceId = sequenceId;
|
||||
mStart = start;
|
||||
mEnd = end;
|
||||
}
|
||||
|
||||
public String getCnpId() { return mCnpId; };
|
||||
public int getSequenceId() { return mSequenceId; };
|
||||
public int getStart() { return mStart; };
|
||||
public int getEnd() { return mEnd; };
|
||||
|
||||
private String mCnpId;
|
||||
private int mSequenceId;
|
||||
private int mStart;
|
||||
private int mEnd;
|
||||
}
|
||||
|
||||
private boolean mDebug = false;
|
||||
private boolean mVerbose = false;
|
||||
|
||||
private boolean mReturnBestHits = false;
|
||||
private String mCnpListPath = null;
|
||||
private String mSampleId = null;
|
||||
private String mInputFileListPath = null;
|
||||
private String mOutputDirectory = null;
|
||||
private int mCnpRegionPadding = 0;
|
||||
|
||||
private Alignment mPendingAlignment = null;
|
||||
private int mCnpAlignmentCount = 0;
|
||||
private int mCnpAlignmentLimit = 1000000;
|
||||
private Map<String, List<Alignment>> mCnpAlignmentMap = new LinkedHashMap<String, List<Alignment>>();
|
||||
}
|
||||
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,151 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.cnv.kmer;
|
||||
|
||||
|
||||
import edu.mit.broad.dcp.DistributedAlgorithm;
|
||||
import edu.mit.broad.cnv.util.SequenceIterator;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Distributed algorithm for counting unique kmers.
|
||||
*/
|
||||
public class DistributedKMerCounter
|
||||
extends DistributedAlgorithm
|
||||
{
|
||||
private boolean mDebug = false;
|
||||
private boolean mVerbose = false;
|
||||
private int mK = 0;
|
||||
private List<File> mInputFiles = null;
|
||||
private List<String> mSequenceList = null;
|
||||
private List<Integer> mSequenceOffsetList = null;
|
||||
|
||||
|
||||
public DistributedKMerCounter() {
|
||||
}
|
||||
|
||||
public boolean getDebug() {
|
||||
return mDebug;
|
||||
}
|
||||
|
||||
public void setDebug(boolean value) {
|
||||
mDebug = value;
|
||||
}
|
||||
|
||||
public boolean getVerbose() {
|
||||
return mVerbose;
|
||||
}
|
||||
|
||||
public void setVerbose(boolean value) {
|
||||
mVerbose = value;
|
||||
}
|
||||
|
||||
public int getK() {
|
||||
return mK;
|
||||
}
|
||||
|
||||
public void setK(int value) {
|
||||
mK = value;
|
||||
}
|
||||
|
||||
public List<File> getInputFiles() {
|
||||
return mInputFiles;
|
||||
}
|
||||
|
||||
public void setInputFiles(List<File> value) {
|
||||
mInputFiles = value;
|
||||
}
|
||||
|
||||
public void run()
|
||||
throws Exception {
|
||||
super.run();
|
||||
finish();
|
||||
}
|
||||
|
||||
protected void init()
|
||||
throws Exception {
|
||||
if (getWorkerId() == MASTER) {
|
||||
initMaster();
|
||||
} else {
|
||||
initWorker();
|
||||
}
|
||||
}
|
||||
|
||||
private void initMaster()
|
||||
throws IOException {
|
||||
// Tasks to be amortized
|
||||
report("Scanning sequences ...");
|
||||
scanSequences();
|
||||
report("Scan complete.");
|
||||
}
|
||||
|
||||
private void initWorker() {
|
||||
// Tasks to be amortized
|
||||
}
|
||||
|
||||
protected void start() {
|
||||
// scan genome, divide into chromosomes and optionally segments, distribute calls
|
||||
}
|
||||
|
||||
private void finish() {
|
||||
// merge individual files, write out final results
|
||||
}
|
||||
|
||||
private void scanSequences()
|
||||
throws IOException {
|
||||
List<String> sequenceList = new ArrayList<String>();
|
||||
List<Integer> sequenceOffsetList = new ArrayList<Integer>();
|
||||
SequenceIterator seqIterator = new SequenceIterator(getInputFiles());
|
||||
while (true) {
|
||||
String seqName = seqIterator.getNextSequence();
|
||||
if (seqName == null) {
|
||||
break;
|
||||
}
|
||||
int baseIndex = seqIterator.getBaseIndex() + 1;
|
||||
sequenceList.add(seqName);
|
||||
sequenceOffsetList.add(baseIndex);
|
||||
}
|
||||
mSequenceList = sequenceList;
|
||||
mSequenceOffsetList = sequenceOffsetList;
|
||||
}
|
||||
|
||||
// Currently not used
|
||||
private void loadGenomeOffsets(File file)
|
||||
throws IOException {
|
||||
List<String> sequenceList = new ArrayList<String>();
|
||||
List<Integer> sequenceOffsetList = new ArrayList<Integer>();
|
||||
int baseIndex = 0;
|
||||
LineNumberReader reader = new LineNumberReader(new FileReader(file));
|
||||
while (true) {
|
||||
String line = reader.readLine();
|
||||
if (line == null) {
|
||||
break;
|
||||
}
|
||||
String text = line.trim();
|
||||
if (text.length() == 0 || text.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
String[] fields = text.split("\\s+");
|
||||
if (fields.length != 2) {
|
||||
throw new RuntimeException("Invalid input line: " + line);
|
||||
}
|
||||
int length = Integer.parseInt(fields[1]);
|
||||
sequenceList.add(fields[0]);
|
||||
sequenceOffsetList.add(baseIndex);
|
||||
baseIndex += length;
|
||||
}
|
||||
mSequenceList = sequenceList;
|
||||
mSequenceOffsetList = sequenceOffsetList;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,184 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.cnv.util;
|
||||
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
 * Maps between a linear, genome-wide base index and per-sequence
 * (chromosome name, position) coordinates.
 */
public class GenomeBaseIndex {

    private List<String> mSequenceNames = null;
    private int[] mLengths = null;
    private long[] mOffsets = null;

    // Instances are built only via the read() factories.
    private GenomeBaseIndex() {
    }

    /** Builds an index from a file of space-delimited name/length pairs. */
    public static GenomeBaseIndex read(File file)
        throws IOException {
        Reader reader = new BufferedReader(new FileReader(file));
        try {
            return read(reader);
        } finally {
            reader.close();
        }
    }

    // The input is just a list of space-delimited sequence name and length.
    public static GenomeBaseIndex read(Reader reader)
        throws IOException {
        List<String> names = new ArrayList<String>();
        List<Integer> lengthList = new ArrayList<Integer>();
        BufferedReader in = new BufferedReader(reader);
        for (String line = in.readLine(); line != null; line = in.readLine()) {
            String trimmed = line.trim();
            if (trimmed.length() == 0 || trimmed.startsWith("#")) {
                continue;   // skip blanks and comments
            }
            String[] tokens = trimmed.split("\\s+");
            if (tokens.length < 2) {
                throw new RuntimeException("Invalid input line: " + line);
            }
            int seqLength = Integer.parseInt(tokens[1]);
            if (seqLength <= 0) {
                throw new RuntimeException("Invalid sequence length: " + seqLength);
            }
            names.add(tokens[0]);
            lengthList.add(seqLength);
        }
        int n = lengthList.size();
        int[] lengths = new int[n];
        long[] offsets = new long[n];
        long runningOffset = 0;
        for (int i = 0; i < n; i++) {
            lengths[i] = lengthList.get(i);
            offsets[i] = runningOffset;
            runningOffset += lengths[i];
        }
        GenomeBaseIndex gbi = new GenomeBaseIndex();
        gbi.mSequenceNames = names;
        gbi.mLengths = lengths;
        gbi.mOffsets = offsets;
        return gbi;
    }

    public List<String> getSequenceNames() {
        return mSequenceNames;
    }

    /** True if the named sequence exists in this index. */
    public boolean contains(String seqName) {
        return getSequenceIndex(seqName) >= 0;
    }

    /** Base index of the first base of the sequence, or -1 if unknown. */
    public long getFirstIndex(String seqName) {
        int seq = getSequenceIndex(seqName);
        return (seq < 0) ? -1 : mOffsets[seq];
    }

    /** Base index of the last base of the sequence, or -1 if unknown. */
    public long getLastIndex(String seqName) {
        int seq = getSequenceIndex(seqName);
        return (seq < 0) ? -1 : (mOffsets[seq] + mLengths[seq] - 1);
    }

    /** Length of the named sequence, or 0 if unknown. */
    public int getSequenceLength(String seqName) {
        int seq = getSequenceIndex(seqName);
        return (seq < 0) ? 0 : mLengths[seq];
    }

    /**
     * Converts a (name, position) pair to a linear base index.
     * Returns -1 if the sequence is unknown or the position is past its end.
     */
    public long getBaseIndex(String seqName, int position) {
        int seq = getSequenceIndex(seqName);
        if (seq < 0 || position > mLengths[seq]) {
            return -1;
        }
        // Zero or negative position means last base index.
        int pos = (position < 1) ? mLengths[seq] : position;
        return mOffsets[seq] + pos - 1;
    }

    /** Name of the sequence containing the base index, or null. */
    public String getSequenceName(long baseIndex) {
        int seq = getSequenceIndex(baseIndex);
        return (seq < 0) ? null : mSequenceNames.get(seq);
    }

    /** 1-based position of the base index in its sequence, or 0 if out of range. */
    public int getPosition(long baseIndex) {
        if (baseIndex < 0) {
            // Catch common sign-extension error when packing indexes as ints.
            throw new IllegalArgumentException("Invalid base index: " + baseIndex);
        }
        int seq = getSequenceIndex(baseIndex);
        return (seq < 0) ? 0 : (int) (baseIndex - mOffsets[seq] + 1);
    }

    // Same as getSequenceName, but treat the argument as an unsigned int.
    // This is useful for manipulating/storing indexes for the human
    // genome as 4-byte unsigned ints.
    public String getSequenceNameUnsigned(int baseIndex) {
        return getSequenceName(baseIndex & 0xFFFFFFFFL);
    }

    // Same as getPosition, but treat the argument as an unsigned int.
    // This is useful for manipulating/storing indexes for the human
    // genome as 4-byte unsigned ints.
    public int getPositionUnsigned(int baseIndex) {
        return getPosition(baseIndex & 0xFFFFFFFFL);
    }

    private int getSequenceIndex(String seqName) {
        return mSequenceNames.indexOf(seqName);
    }

    // Linear scan over cumulative sequence extents.
    private int getSequenceIndex(long baseIndex) {
        if (baseIndex < 0) {
            return -1;
        }
        long start = 0;
        for (int i = 0; i < mLengths.length; i++) {
            long end = start + mLengths[i];
            if (baseIndex < end) {
                return i;
            }
            start = end;
        }
        return -1;
    }
}
|
||||
|
|
@ -1,167 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.cnv.util;
|
||||
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Utility class for transforming between a chromsome + position
|
||||
* coordinate system and a binned coordinate system where each
|
||||
* chromosome (separately) is divided into fixed sized bins,
|
||||
* ragged on the right/upper end.
|
||||
*/
|
||||
public class GenomeBinIndex {
|
||||
|
||||
private int mBinSize;
|
||||
private List<String> mSequenceNames;
|
||||
private int[] mSequenceLengths;
|
||||
private int[] mBinOffsets;
|
||||
|
||||
public GenomeBinIndex(GenomeBaseIndex gbi, int binSize) {
|
||||
if (binSize <= 0) {
|
||||
throw new IllegalArgumentException("Illegal bin size: " + binSize);
|
||||
}
|
||||
mBinSize = binSize;
|
||||
mSequenceNames = new ArrayList<String>(gbi.getSequenceNames());
|
||||
int count = mSequenceNames.size();
|
||||
mSequenceLengths = new int[count];
|
||||
mBinOffsets = new int[count];
|
||||
long binOffset = 0; // long to detect overflow
|
||||
for (int i = 0; i < count; i++) {
|
||||
int length = gbi.getSequenceLength(mSequenceNames.get(i));
|
||||
int binCount = (length + binSize - 1) / binSize;
|
||||
mSequenceLengths[i] = length;
|
||||
mBinOffsets[i] = (int) binOffset;
|
||||
binOffset += binCount;
|
||||
}
|
||||
if (binOffset > Integer.MAX_VALUE) {
|
||||
// Check for integer overflow.
|
||||
// This will happen, e.g., with the human genome and a bin size of 1.
|
||||
throw new RuntimeException("Binsize too small: " + binSize);
|
||||
}
|
||||
}
|
||||
|
||||
public int getBinSize() {
|
||||
return mBinSize;
|
||||
}
|
||||
|
||||
public int getBinIndex(String seqName, int position) {
|
||||
int index = getSequenceIndex(seqName);
|
||||
if (index < 0) {
|
||||
return -1;
|
||||
}
|
||||
if (position > mSequenceLengths[index]) {
|
||||
return -1;
|
||||
}
|
||||
if (position < 1) {
|
||||
position = mSequenceLengths[index];
|
||||
}
|
||||
int bin = (position - 1) / mBinSize;
|
||||
return (mBinOffsets[index] + bin);
|
||||
}
|
||||
|
||||
public String getSequenceName(int binIndex) {
|
||||
int index = getSequenceIndex(binIndex);
|
||||
if (index < 0) {
|
||||
return null;
|
||||
}
|
||||
return mSequenceNames.get(index);
|
||||
}
|
||||
|
||||
public int getStartPosition(int binIndex) {
|
||||
int index = getSequenceIndex(binIndex);
|
||||
if (index < 0) {
|
||||
return -1;
|
||||
}
|
||||
int bin = binIndex - mBinOffsets[index];
|
||||
return (bin * mBinSize + 1);
|
||||
}
|
||||
|
||||
public int getEndPosition(int binIndex) {
|
||||
int index = getSequenceIndex(binIndex);
|
||||
if (index < 0) {
|
||||
return -1;
|
||||
}
|
||||
int bin = binIndex - mBinOffsets[index];
|
||||
int position = (bin+1) * mBinSize;
|
||||
position = Math.min(position, mSequenceLengths[index]);
|
||||
return position;
|
||||
}
|
||||
|
||||
public List<String> getSequenceNames() {
|
||||
return mSequenceNames;
|
||||
}
|
||||
|
||||
public int getFirstBin(String seqName) {
|
||||
return getBinIndex(seqName, 1);
|
||||
}
|
||||
|
||||
public int getLastBin(String seqName) {
|
||||
return getBinIndex(seqName, 0);
|
||||
}
|
||||
|
||||
public int getBinCount() {
|
||||
if (mBinOffsets.length == 0) {
|
||||
return 0;
|
||||
}
|
||||
int lastIndex = mBinOffsets.length - 1;
|
||||
int count = mBinOffsets[lastIndex];
|
||||
count += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize;
|
||||
return count;
|
||||
}
|
||||
|
||||
public int getBinCount(String seqName) {
|
||||
int index = getSequenceIndex(seqName);
|
||||
if (index < 0) {
|
||||
return -1;
|
||||
}
|
||||
return ((mSequenceLengths[index] + mBinSize - 1) / mBinSize);
|
||||
}
|
||||
|
||||
public int getSequenceLength(String seqName) {
|
||||
int index = getSequenceIndex(seqName);
|
||||
if (index < 0) {
|
||||
return 0;
|
||||
}
|
||||
return mSequenceLengths[index];
|
||||
}
|
||||
|
||||
private int getSequenceIndex(String seqName) {
|
||||
for (int i = 0; i < mSequenceNames.size(); i++) {
|
||||
if (mSequenceNames.get(i).equals(seqName)) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
private int getSequenceIndex(int binIndex) {
|
||||
if (binIndex < 0) {
|
||||
return -1;
|
||||
}
|
||||
for (int i = 1; i < mBinOffsets.length; i++) {
|
||||
if (mBinOffsets[i] > binIndex) {
|
||||
return i-1;
|
||||
}
|
||||
}
|
||||
int lastIndex = mBinOffsets.length-1;
|
||||
int lastBinIndex = mBinOffsets[lastIndex];
|
||||
lastBinIndex += (mSequenceLengths[lastIndex] + mBinSize - 1) / mBinSize;
|
||||
if (binIndex <= lastBinIndex) {
|
||||
return lastIndex;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,145 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.cnv.util;
|
||||
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
 * Utility class for iterating over fasta files.
 * Also maintains an unsigned base index over the file set.
 */
public class SequenceIterator
{
    private List<File> mInputFiles = null;
    private int mInputFileIndex = 0;
    // Linear index of the last base consumed (-1 before any base is seen).
    private int mBaseIndex = -1;
    private LineNumberReader mCurrentReader = null;
    // Name of a sequence whose header has been read but not yet returned.
    private String mNextSequence = null;
    private String mLineBuffer = null;
    private int mLineBufferIndex = 0;

    public SequenceIterator(File inputFile) {
        mInputFiles = new ArrayList<File>();
        mInputFiles.add(inputFile);
    }

    public SequenceIterator(List<File> inputFiles) {
        mInputFiles = inputFiles;
    }

    /** Closes the current reader (if any) and resets all iteration state. */
    public void close() {
        if (mCurrentReader != null) {
            try {
                mCurrentReader.close();
            } catch (IOException exc) {
                throw new RuntimeException("Error closing reader: " + exc.getMessage(),
                                           exc);
            }
        }
        mCurrentReader = null;
        mInputFiles = null;
        mInputFileIndex = 0;
        mBaseIndex = -1;
        mNextSequence = null;
        mLineBuffer = null;
        mLineBufferIndex = 0;
    }

    /**
     * Advances to the next sequence header, counting any remaining bases of
     * the current sequence along the way.  Returns the sequence name, or
     * null when all input files are exhausted.
     */
    public String getNextSequence()
        throws IOException {

        while (mNextSequence == null) {
            // Account for any unread remainder of the buffered line.
            if (mLineBuffer != null) {
                incrementBaseIndex(mLineBuffer.length() - mLineBufferIndex);
                mLineBuffer = null;
                mLineBufferIndex = 0;
            }
            if (mCurrentReader == null) {
                mCurrentReader = getNextReader();
                if (mCurrentReader == null) {
                    return null;
                }
            }
            String line = mCurrentReader.readLine();
            if (line == null) {
                // End of this file: move on to the next one.
                mCurrentReader.close();
                mCurrentReader = null;
            } else if (line.startsWith(">")) {
                // Header line: ">name [description...]" — keep the name only.
                mNextSequence = line.substring(1).trim().split("\\s+")[0];
            } else {
                incrementBaseIndex(line.length());
            }
        }
        String found = mNextSequence;
        mNextSequence = null;
        return found;
    }

    /**
     * Returns the next base of the current sequence (upper-cased), or 0 at
     * the end of the sequence or the end of the input.
     */
    public char getNextBase()
        throws IOException {

        if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) {
            if (mCurrentReader == null || mNextSequence != null) {
                // No open file, or already positioned at the next header.
                return 0;
            }
            String line = mCurrentReader.readLine();
            if (line == null) {
                mLineBuffer = null;
                mLineBufferIndex = 0;
                mCurrentReader.close();
                mCurrentReader = null;
                return 0;
            }
            if (line.startsWith(">")) {
                // Hit the next header; save it for getNextSequence().
                mNextSequence = line.substring(1).trim().split("\\s+")[0];
                mLineBuffer = null;
                mLineBufferIndex = 0;
                return 0;
            }
            mLineBuffer = line.toUpperCase();
            mLineBufferIndex = 0;
        }
        char base = mLineBuffer.charAt(mLineBufferIndex++);
        incrementBaseIndex(1);
        return base;
    }

    public int getBaseIndex() {
        return mBaseIndex;
    }

    // Opens the next input file, or returns null when none remain.
    private LineNumberReader getNextReader()
        throws IOException {
        if (mInputFileIndex >= mInputFiles.size()) {
            return null;
        }
        return new LineNumberReader(new FileReader(mInputFiles.get(mInputFileIndex++)));
    }

    private void incrementBaseIndex(int amount) {
        // The index is a signed int used as unsigned; fail loudly on wraparound.
        if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) {
            throw new RuntimeException("Base index: 32-bit overflow");
        }
        mBaseIndex += amount;
    }
}
|
||||
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2007 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.dcp;
|
||||
|
||||
/**
 * Processing states of a distributed call
 * (presumably tracked by DistributedAlgorithm — TODO confirm against caller).
 */
public enum CallStatus
{
    PENDING,
    PROCESSING
}
|
||||
|
||||
|
||||
|
|
@ -1,309 +0,0 @@
|
|||
/**
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2006 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.dcp;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
|
||||
/**
 * Utility class to run system commands synchronously and return the output.
 *
 * The interface supports the typical case where you want to return a modest
 * amount of information from the command's standard output or standard error
 * as a string. The caller can override this behavior, however, and provide
 * alternative output destinations if necessary.
 *
 * If setMergeOutput() is true, then this class will attempt to interleave
 * the standard output and standard error streams of the command into one
 * stream (standard output). This may not produce exactly the same results
 * as having the operating system interleave the output, but works well for
 * simple executables that do not heavily intermix stdout and stderr.
 *
 * A typical invocation is:
 * <pre>
 * CommandRunner runner = new CommandRunner();
 * int status = runner.runCommand("ls");
 * if (status == 0) {
 *     System.out.print(runner.getStandardOutputString());
 * }
 * </pre>
 *
 * @author Bob Handsaker
 */
public class CommandRunner {

    private boolean mMergeOutput = false;
    private Writer mStandardOutputDestination = null;
    private Writer mStandardErrorDestination = null;
    private String mStandardOutputString = null;
    private String mStandardErrorString = null;


    /**
     * Default constructor.
     */
    public CommandRunner() {
    }

    /**
     * Get the standard output from the last command as a string.
     *
     * If no command has been run or an explicit output destination
     * was set, then this method returns null.
     */
    public String getStandardOutputString() {
        return mStandardOutputString;
    }

    /**
     * Get the standard error from the last command as a string.
     *
     * If no command has been run or an explicit output destination
     * was set, then this method returns null.
     * Also null if merge mode is enabled (stderr goes to stdout).
     */
    public String getStandardErrorString() {
        return mStandardErrorString;
    }

    /**
     * If true, the command's standard error stream will be interleaved
     * with the command's standard output stream. The standard error
     * stream destination will not be used.
     */
    public boolean getMergeOutput() {
        return mMergeOutput;
    }

    /**
     * If true, the command's standard error stream will be interleaved
     * with the command's standard output stream.
     */
    public void setMergeOutput(boolean value) {
        mMergeOutput = value;
    }

    /**
     * The destination for the command's standard output stream.
     * If null, the standard output will be captured in a string.
     */
    public Writer getStandardOutputDestination() {
        return mStandardOutputDestination;
    }

    /**
     * The destination for the command's standard output stream.
     * If set to null, the standard output will be captured in a string.
     */
    public void setStandardOutputDestination(Writer writer) {
        mStandardOutputDestination = writer;
    }

    /**
     * The destination for the command's standard error stream.
     * If null, the standard error will be captured in a string.
     */
    public Writer getStandardErrorDestination() {
        return mStandardErrorDestination;
    }

    /**
     * The destination for the command's standard error stream.
     * If set to null, the standard error will be captured in a string.
     */
    public void setStandardErrorDestination(Writer writer) {
        mStandardErrorDestination = writer;
    }

    /**
     * Run a command string as a system command.
     *
     * Returns the exit status of the command.
     *
     * When this method is called, the standard output string
     * and standard error string are updated if no alternative output
     * destinations have been set.
     *
     * This method throws a RuntimeException if running the command fails
     * (for example, if there are not enough system resources to spawn
     * the process).
     *
     * NOTE(review): the command is tokenized with a naive split on single
     * spaces — quoted arguments or arguments containing spaces are not
     * supported; use the String[] overload for those.
     *
     * @param command The command string to run.
     * @return Command exit status.
     * @throws RuntimeException If command execution fails.
     */
    public int runCommand(String command)
        throws RuntimeException {
        return runCommand(command.split(" "), null, null);
    }

    /**
     * Run a command string as a system command.
     *
     * Returns the exit status of the command.
     *
     * When this method is called, the standard output string
     * and standard error string are updated if no alternative output
     * destinations have been set.
     *
     * This method throws a RuntimeException if running the command fails
     * (for example, if there are not enough system resources to spawn
     * the process).
     *
     * @param command The command string to run.
     * @param environment The command environment (or null to inherit).
     * @param workingDirectory The working directory (or null to inherit).
     * @return Command exit status.
     * @throws RuntimeException If command execution fails.
     */
    public int runCommand(String command, String[] environment, File workingDirectory)
        throws RuntimeException {
        return runCommand(command.split(" "), environment, workingDirectory);
    }

    /**
     * Run a command string as a system command.
     *
     * Returns the exit status of the command.
     *
     * When this method is called, the standard output string
     * and standard error string are updated if no alternative output
     * destinations have been set.
     *
     * This method throws a RuntimeException if running the command fails
     * (for example, if there are not enough system resources to spawn
     * the process).
     *
     * @param command The command to run (as an array of arguments).
     * @param environment The command environment (or null to inherit).
     * @param workingDirectory The working directory (or null to inherit).
     * @return Command exit status.
     * @throws RuntimeException If command execution fails.
     */
    public int runCommand(String[] command, String[] environment, File workingDirectory)
        throws RuntimeException {

        // Resolve output destinations: capture into strings unless the
        // caller supplied explicit writers; in merge mode stderr shares
        // the stdout writer.
        Writer stdout = mStandardOutputDestination;
        Writer stderr = mStandardErrorDestination;
        if (stdout == null) {
            stdout = new StringWriter();
        }
        if (mMergeOutput) {
            stderr = stdout;
        } else if (stderr == null) {
            stderr = new StringWriter();
        }

        mStandardOutputString = null;
        mStandardErrorString = null;

        int commandStatus = 0;
        try {
            Process process =
                Runtime.getRuntime().exec(command, environment, workingDirectory);
            // Drain stdout/stderr on background threads so the child cannot
            // block on a full pipe buffer.
            StreamHandler stdoutHandler =
                new StreamHandler(process.getInputStream(), stdout);
            StreamHandler stderrHandler =
                new StreamHandler(process.getErrorStream(), stderr);

            commandStatus = process.waitFor();

            // Wait for the streams to drain.
            stdoutHandler.join();
            stderrHandler.join();
        } catch (Exception exc) {
            throw new RuntimeException("Command execution failed: " +
                                       exc.getMessage(),
                                       exc);
        }

        if (mStandardOutputDestination == null) {
            mStandardOutputString = stdout.toString();
        }
        if (mStandardErrorDestination == null && !mMergeOutput) {
            mStandardErrorString = stderr.toString();
        }

        return commandStatus;
    }


    /**
     * Internal class to asynchronously read from the standard output
     * and standard error streams of the command being executed.
     *
     * If you do not handle command output asynchronously, then execution
     * of a command may block in some environments if the program produces
     * too much output. In this case, the call to run the process will
     * never complete.
     */
    private static class StreamHandler extends Thread {

        /**
         * Constructor.
         * Create an instance of this class, which is an asynchronous
         * thread that will consume input from the given input stream
         * and send the output to the given output destination.
         *
         * @param input The input stream to read.
         * @param output The output destination.
         */
        StreamHandler(InputStream input, Writer output) {
            m_input = input;
            m_output = output;
            start();
        }


        /**
         * Standard thread run method.
         * Pipe input from the input source to the output destination
         * until there is no more input left.
         *
         * If an IOException occurs, the thread will make sure all
         * available output has been flushed to the destination and
         * then terminate. The IOException is not propagated.
         */
        public void run() {

            char[] buffer = new char[4096];
            Reader reader =
                new InputStreamReader(new BufferedInputStream(m_input));
            Writer writer = m_output;

            try {
                while (true) {
                    int count = reader.read(buffer);
                    if (count <= 0) {
                        break;
                    }
                    if (writer != null) {
                        // In merge mode both handlers share one writer;
                        // synchronize so writes are not interleaved mid-buffer.
                        synchronized (writer) {
                            writer.write(buffer, 0, count);
                        }
                    }
                }
            } catch (IOException ignore) {
                // Ignore IO exceptions
            } finally {
                try {
                    reader.close();
                } catch (Exception ignore) {
                }
                try {
                    m_output.flush();
                } catch (Exception ignore) {
                }
            }
        }

        private InputStream m_input;
        private Writer m_output;
    }
}
|
||||
|
|
@ -1,618 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2007 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.dcp;
|
||||
|
||||
import edu.mit.broad.dcp.message.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.lang.reflect.Method;
|
||||
import java.net.InetAddress;
|
||||
import java.net.ServerSocket;
|
||||
import java.rmi.registry.*;
|
||||
|
||||
/**
|
||||
* Experimental.
|
||||
*/
|
||||
/**
 * Base class for algorithms that farm work out to a pool of worker
 * processes. The master process starts an RMI registry hosting a
 * DistributedCallServer message queue, launches workers through LSF
 * ("bsub ... runDistributedWorker"), and then master and workers pull
 * DistributedCallMessage entries off the queue and execute them
 * reflectively against the algorithm subclass.
 *
 * Subclasses implement start() (the master-side entry point) and call
 * callDistributed(...) to enqueue named method calls for workers.
 *
 * Experimental.
 */
public abstract class DistributedAlgorithm
    implements Serializable
{
    // Well-known receiver worker ids: ANY lets any (non-master) worker
    // accept the call; MASTER addresses the master process itself.
    public static final Integer ANY = 0;
    public static final Integer MASTER = 1;

    public DistributedAlgorithm() {
    }

    /** Host name of the RMI registry (set when the server starts). */
    public String getServerHost() {
        return mServerHost;
    }

    public void setServerHost(String value) {
        mServerHost = value;
    }

    /** Port of the RMI registry (set when the server starts). */
    public int getServerPort() {
        return mServerPort;
    }

    public void setServerPort(int value) {
        mServerPort = value;
    }

    /** Display name of the algorithm; defaults to the simple class name. */
    public String getAlgorithmName() {
        if (mAlgorithmName != null) {
            return mAlgorithmName;
        } else {
            return getClassName();
        }
    }

    public void setAlgorithmName(String value) {
        mAlgorithmName = value;
    }

    /**
     * Number of LSF workers to launch; a value <= 0 selects
     * single-process (in-process) execution for testing/debugging.
     */
    public int getMaximumWorkerCount() {
        return mMaximumWorkerCount;
    }

    public void setMaximumWorkerCount(int value) {
        mMaximumWorkerCount = value;
    }

    /**
     * Name of LSF queue to use for workers.
     */
    public String getLsfQueue() {
        return mLsfQueue;
    }

    public void setLsfQueue(String value) {
        mLsfQueue = value;
    }

    /**
     * Directory to hold lsf log files.
     */
    public String getLsfLogDirectory() {
        return mLsfLogDirectory;
    }

    public void setLsfLogDirectory(String value) {
        mLsfLogDirectory = value;
    }

    /** Whether workers are started with GC logging (-Xloggc). */
    public boolean getEnableGcLogging() {
        return mEnableGcLogging;
    }

    public void setEnableGcLogging(boolean value) {
        mEnableGcLogging = value;
    }

    /** Logical worker id of this process (MASTER in the master). */
    public Integer getWorkerId() {
        return mWorkerId;
    }

    /** Process id assigned at launch (currently equal to the worker id). */
    public Integer getProcessId() {
        return mProcessId;
    }

    /**
     * Hook run in every process (master and workers) before the worker
     * thread starts; the default implementation does nothing.
     */
    protected void init()
        throws Exception {
    }

    /** Master-side entry point implemented by subclasses. */
    protected abstract void start()
        throws Exception;

    /**
     * Runs the algorithm as the master process: starts the RMI message
     * server, launches workers, invokes start(), and blocks until the
     * message queue drains.
     *
     * @throws IllegalStateException if the algorithm is already running
     */
    public void run()
        throws Exception {

        if (mIsRunning) {
            throw new IllegalStateException("Algorithm is already running");
        }

        mIsRunning = true;
        mWorkerId = MASTER;
        mProcessId = MASTER;

        try {
            startDistributedServer();
            init();
            startWorkerThread();
            startWorkers();
            start();
            waitForCompletion();
        } finally {
            // TBD: More cleanup (shutdown threads, etc.)
            stopDistributedServer();
            mIsRunning = false;
        }
    }

    /**
     * Runs the algorithm as a worker process: connects to the master's
     * RMI server and processes messages until the worker thread exits.
     * Package-private; invoked by the command-line worker driver.
     */
    void runWorker(int workerId, int processId)
        throws Exception {

        if (mIsRunning) {
            throw new IllegalStateException("Algorithm is already running");
        }

        mIsRunning = true;
        mWorkerId = workerId;
        mProcessId = processId;

        try {
            if (openDistributedServer() == null) {
                report("Server " + mServerHost + ":" + mServerPort + " not responding");
                return;
            }
            init();
            startWorkerThread();
            mWorkerThread.join();
        } finally {
            closeDistributedServer();
            mIsRunning = false;
        }
    }

    /**
     * Launches the configured number of LSF workers, or a single
     * in-process worker thread when no worker count is configured.
     */
    private void startWorkers() {
        int workerCount = getMaximumWorkerCount();
        if (workerCount <= 0) {
            // Use single process execution for testing/debugging.
            new InProcessWorker().start();
            return;
        }
        if (workerCount > 1000) {
            throw new RuntimeException("Excessive worker count: " + workerCount);
        }
        for (int i = 0; i < workerCount; i++) {
            // Worker ids start just above MASTER (i.e. at 2).
            Integer workerId = (MASTER + i + 1);
            Integer processId = workerId; // for now
            startWorker(workerId, processId);
        }
    }

    /**
     * Creates an RMI registry on a free port and binds the
     * DistributedCallServer message queue into it (master side).
     */
    private void startDistributedServer() {
        try {
            // Create a server socket to allocate a unique port.
            // There is a window of vulnerability where the port
            // can get reused, but in practice this works ok.
            String serverHost = getCurrentHost();
            ServerSocket socket = new ServerSocket(0);
            int serverPort = socket.getLocalPort();
            socket.close();
            Registry registry = LocateRegistry.createRegistry(serverPort);
            DistributedCallServer server = new DistributedCallServer();
            server.setAlgorithm(this);
            registry.bind("DistributedCallService", server);
            mServerHost = serverHost;
            mServerPort = serverPort;
            mDistributedCallServer = server;
            mDistributedCallService = server;
        } catch (Exception exc) {
            throw wrapException(exc);
        }
    }

    /** Unbinds and stops the RMI message server (master side). */
    private void stopDistributedServer() {
        if (mDistributedCallServer != null) {
            try {
                Registry registry = LocateRegistry.getRegistry(mServerPort);
                registry.unbind("DistributedCallService");
                mDistributedCallServer.stop();
            } catch (Exception exc) {
                throw wrapException(exc);
            }
        }
        mDistributedCallService = null;
        mDistributedCallServer = null;
    }

    /**
     * Looks up the master's message service over RMI (worker side).
     * Returns null if the service is no longer bound (server exited).
     */
    private DistributedCallService openDistributedServer() {
        mDistributedCallService = null;
        try {
            String url = "rmi://" + getServerHost() + ":" + getServerPort() + "/DistributedCallService";
            DistributedCallService server =
                (DistributedCallService) java.rmi.Naming.lookup(url);
            mDistributedCallService = server;
        } catch (java.rmi.NotBoundException exc) {
            // Server has exited
        } catch (Exception exc) {
            throw wrapException(exc);
        }
        return mDistributedCallService;
    }

    /** Drops the worker's reference to the remote service. */
    private void closeDistributedServer() {
        mDistributedCallService = null;
    }

    /**
     * Submits one worker process to LSF via bsub, forwarding the server
     * address and ids on the command line, and our -Xmx setting (plus
     * optional GC logging) through the JAVAOPTS environment variable.
     */
    private void startWorker(Integer workerId, Integer processId) {

        String logFile = "worker_" + processId + "_%J.bsub";
        if (mLsfLogDirectory != null) {
            logFile = mLsfLogDirectory + "/" + logFile;
        }

        List<String> command = new ArrayList<String>();
        command.add("bsub");
        command.add("-o");
        command.add(logFile);
        if (mLsfQueue != null) {
            command.add("-q");
            command.add(mLsfQueue);
        }
        command.add("runDistributedWorker");
        command.add("-serverHost");
        command.add(getServerHost());
        command.add("-serverPort");
        command.add(Integer.toString(getServerPort()));
        command.add("-workerId");
        command.add(Integer.toString(workerId));
        command.add("-processId");
        command.add(Integer.toString(processId));

        // Pass our -Xmx setting along to all workers.
        Map<String, String> environment =
            new LinkedHashMap<String, String>(System.getenv());
        long maxMemory = Runtime.getRuntime().maxMemory();
        long maxKbytes = maxMemory / 1024;
        String memJavaOpt = "-Xmx" + maxKbytes + "K";

        // Enable GC logging if requested
        String gcJavaOpt = null;
        if (mEnableGcLogging) {
            String gcLogFile = "worker_" + processId + ".gc.log";
            if (mLsfLogDirectory != null) {
                gcLogFile = mLsfLogDirectory + "/" + gcLogFile;
            }
            gcJavaOpt = "-Xloggc:" + gcLogFile;
        }

        // NOTE(review): the computed options are applied only when
        // JAVAOPTS is not already set; an existing value wins as-is.
        String javaOpts = environment.get("JAVAOPTS");
        if (javaOpts == null) {
            javaOpts = memJavaOpt;
            if (gcJavaOpt != null) {
                javaOpts = javaOpts + " " + gcJavaOpt;
            }
            environment.put("JAVAOPTS", javaOpts);
        }

        // Log output ourselves (rather than waiting for bsub).
        String workerLogFile = "worker_" + processId + ".log";
        if (mLsfLogDirectory != null) {
            workerLogFile = mLsfLogDirectory + "/" + workerLogFile;
        }
        environment.put("DA_LOG_FILE", workerLogFile);

        CommandRunner runner = new CommandRunner();
        Writer output = new LsfOutputFilter();
        runner.setStandardOutputDestination(output);
        runner.setStandardErrorDestination(output);
        String[] commandArray = command.toArray(new String[command.size()]);
        String[] environmentArray = createEnvironmentArray(environment);
        int status = runner.runCommand(commandArray, environmentArray, null);
        if (status != 0) {
            throw new RuntimeException("Error starting worker: " + status);
        }
    }

    /** Converts an environment map into "KEY=VALUE" entries for exec. */
    private String[] createEnvironmentArray(Map<String, String> map) {
        if (map == null) {
            return null;
        }
        int index = 0;
        String[] array = new String[map.size()];
        for (Map.Entry<String, String> entry : map.entrySet()) {
            array[index++] = entry.getKey() + "=" + entry.getValue();
        }
        return array;
    }

    /** Canonical host name of this machine, for the RMI URL. */
    private String getCurrentHost() {
        try {
            return InetAddress.getLocalHost().getCanonicalHostName();
        } catch (Exception exc) {
            throw wrapException(exc);
        }
    }

    /** Polls (once per second) until the server's message queue is empty. */
    private void waitForCompletion() {
        DistributedCallServer server = mDistributedCallServer;
        while (true) {
            if (server.isQueueEmpty()) {
                break;
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException exc) {
                // ignore
            }
        }
    }

    /** Enqueues a call addressed to any available worker. */
    protected void callDistributed(String methodName, Object... methodArgs) {
        callDistributed(null, methodName, methodArgs);
    }

    /**
     * Enqueues a reflective call to the named method, addressed to the
     * given worker id (null or ANY means any worker may execute it).
     * The arguments travel by RMI serialization.
     */
    protected void callDistributed(Integer workerId, String methodName, Object... methodArgs) {
        if (workerId == null) {
            workerId = ANY;
        }
        try {
            DistributedCallMessage message = new DistributedCallMessage();
            message.setSenderWorkerId(getWorkerId());
            message.setSenderProcessId(getProcessId());
            message.setReceiverWorkerId(workerId);
            message.setMethodName(methodName);
            message.setMethodArgs(methodArgs);
            mDistributedCallService.writeMessage(message);
        } catch (Throwable exc) {
            throw wrapException(exc);
        }
    }

    /** Reflectively invokes the named method on this instance. */
    private void callMethod(String methodName, Object[] methodArgs) {
        try {
            Object target = this;
            Class targetClass = target.getClass();
            Method targetMethod = findMethod(targetClass, methodName);
            if (targetMethod == null) {
                throw new RuntimeException("Cannot find target method: " + methodName);
            }
            targetMethod.invoke(target, methodArgs);
        } catch (Throwable exc) {
            throw wrapException(exc);
        }
    }

    /**
     * Finds the uniquely named declared method, returning null when
     * absent and rejecting overloads (dispatch is by name only).
     * NOTE(review): only methods declared directly on the concrete
     * class are searched; inherited methods are not considered.
     */
    private Method findMethod(Class clazz, String methodName) throws Exception {
        Method result = null;
        Method[] methods = clazz.getDeclaredMethods();
        for (int i = 0; i < methods.length; i++) {
            if (methods[i].getName().equals(methodName)) {
                if (result != null) {
                    throw new RuntimeException("Duplicate method name: " + methodName);
                }
                result = methods[i];
            }
        }
        return result;
    }

    /** Returns exc unchanged if unchecked, else wrapped (cause preserved). */
    private RuntimeException wrapException(Throwable exception) {
        if (exception instanceof RuntimeException) {
            return (RuntimeException) exception;
        } else {
            return new RuntimeException(exception.getMessage(), exception);
        }
    }

    /** Starts the single message-processing thread for this process. */
    private void startWorkerThread() {
        if (mWorkerThread != null) {
            throw new IllegalStateException("WorkerThread is running");
        }
        mWorkerThread = new WorkerThread();
        mWorkerThread.start();
    }

    // NOTE(review): apparently unused, and the exception message is
    // misleading — it fires when the thread is NOT running.
    private void stopWorkerThread() {
        if (mWorkerThread == null) {
            throw new IllegalStateException("WorkerThread is running");
        }
        mWorkerThread.stopThread();
    }

    /**
     * Daemon thread that polls the message service for calls addressed
     * to this worker and executes them. A disconnect from the server is
     * reported and terminates the thread; any other failure exits the
     * whole process.
     */
    private class WorkerThread extends Thread {

        WorkerThread() {
            setDaemon(true);
        }

        public void run() {
            try {
                DistributedCallService service = mDistributedCallService;
                while (true) {
                    if (isInterrupted()) {
                        System.out.println("#DBG: Worker isInterrupted");
                        throw new InterruptedException();
                    }
                    DistributedCallMessage message =
                        service.acceptMessage(getWorkerId(), getProcessId());
                    if (message == null) {
                        // Nothing available; poll again in a second.
                        Thread.sleep(1000);
                    } else {
                        processMessage(message);
                    }
                }
            } catch (InterruptedException exc) {
                // Interruption terminates this thread.
                // System.out.println("#DBG: Worker caught InterruptedException");
            } catch (Throwable exc) {
                if (isDisconnectException(exc)) {
                    report("Server disconnected");
                } else {
                    reportError("Exception in WorkerThread: " + exc.getMessage(), exc);
                    System.exit(1);
                }
            }
            report("WorkerThread terminated");
        }

        /** Interrupts the thread and waits for it to finish. */
        void stopThread() {
            // System.out.println("#DBG: About to interrupt worker...");
            interrupt();
            // System.out.println("#DBG: Joining worker...");
            try {
                join();
            } catch (InterruptedException exc) {
                // ignore
            }
        }

        /** True for the RMI failures that mean the server went away. */
        private boolean isDisconnectException(Throwable exc) {
            if (exc instanceof java.rmi.ConnectException) {
                return true;
            } else if (exc instanceof java.rmi.NoSuchObjectException) {
                return true;
            } else if (exc instanceof java.rmi.UnmarshalException &&
                       exc.getCause() != null &&
                       exc.getCause() instanceof EOFException) {
                return true;
            } else {
                return false;
            }
        }
    }

    /** Validates addressing, runs the call, and always acknowledges it. */
    private void processMessage(DistributedCallMessage message) {
        try {
            Integer workerId = message.getReceiverWorkerId();
            if (workerId == null || !workerId.equals(getWorkerId())) {
                reportError("Invalid worker ID in message: " + message);
                return;
            }
            callMethod(message.getMethodName(), message.getMethodArgs());
        } catch (Throwable exc) {
            reportError("Exception running message: " + message, exc);
        } finally {
            completeMessage(message);
        }
    }

    /** Acknowledges the call so the server removes it from the queue. */
    private void completeMessage(DistributedCallMessage message) {
        try {
            DistributedCallService service = mDistributedCallService;
            service.completeMessage(getWorkerId(), getProcessId(), message.getCallId());
        } catch (Throwable exc) {
            reportError("Exception completing message: " + message, exc);
        }
    }

    /** Writes an informational line tagged with this process's identity. */
    protected void report(String message) {
        String identity =
            getAlgorithmName() + " " +
            getWorkerId() + "/" + getProcessId();
        System.out.println("# " + identity + " : " + message);
    }

    protected void reportError(String message) {
        reportError(message, null);
    }

    /** Writes an error line (and optional stack trace) to stdout. */
    protected void reportError(String message, Throwable exception) {
        String identity =
            getAlgorithmName() + " " +
            getWorkerId() + "/" + getProcessId();
        System.out.println("Error" +
                           " [" + identity + "]" +
                           ": " + message);
        if (exception != null) {
            System.out.println(" with exception: " + exception.getMessage());
            exception.printStackTrace(System.out);
        }
    }

    /** Simple (unqualified) name of the concrete algorithm class. */
    private String getClassName() {
        String name = getClass().getName();
        return name.substring(name.lastIndexOf('.')+1);
    }

    public String toString() {
        StringBuilder builder = new StringBuilder();
        builder.append("DistributedAlgorithm");
        builder.append("(");
        builder.append("" + getAlgorithmName());
        builder.append(",");
        builder.append("" + getWorkerId());
        builder.append(",");
        builder.append("" + getProcessId());
        builder.append(",");
        builder.append("" + getMaximumWorkerCount());
        builder.append(",");
        builder.append("" + getLsfQueue());
        builder.append(",");
        builder.append("" + mIsRunning);
        builder.append(")");
        return builder.toString();
    }

    // This class is used only during in-process execution/testing/debugging.
    private class InProcessWorker extends Thread {

        InProcessWorker() {
            setDaemon(true);
        }

        public void run() {
            report("InProcessWorker starting");
            try {
                // Fetch the algorithm back through RMI so that the
                // in-process worker mimics a real remote worker.
                String serverAddress = getServerHost() + ":" + getServerPort();
                String url = "rmi://" + serverAddress + "/DistributedCallService";
                DistributedCallService server =
                    (DistributedCallService) java.rmi.Naming.lookup(url);
                DistributedAlgorithm algorithm = server.getAlgorithm();
                algorithm.setServerHost(getServerHost());
                algorithm.setServerPort(getServerPort());
                // Runs as worker id 2 (the first id above MASTER).
                algorithm.runWorker(2, 1);
            } catch (Throwable exc) {
                reportError("Exception in InProcessWorker: " + exc.getMessage(), exc);
                System.exit(1);
            }
            report("InProcessWorker terminated");
        }
    }

    /**
     * Writer that prefixes each output line with "# " so launcher/worker
     * output is visually distinguished in the master's log.
     */
    private static class LsfOutputFilter
        extends FilterWriter {

        LsfOutputFilter() {
            super(new PrintWriter(System.out, true));
        }

        public void write(int ch)
            throws IOException {
            if (mAtLineStart) {
                out.write("# ");
                mAtLineStart = false;
            }
            out.write(ch);
            mAtLineStart = (ch == '\n');
        }

        public void write(String s, int off, int len)
            throws IOException {
            write(s.toCharArray(), off, len);
        }

        public void write(char[] a, int off, int len)
            throws IOException {
            // Delegate per character so line-start tracking stays correct.
            for (int i = 0; i < len; i++) {
                write(a[off+i]);
            }
        }

        // True when the next character starts a new output line.
        private boolean mAtLineStart = true;
    }

    // NOTE(review): every field is transient, so none of this state is
    // serialized when the algorithm travels over RMI — presumably
    // intentional (workers reconfigure via setters); confirm.
    private transient int mMaximumWorkerCount = 0;
    private transient String mLsfQueue = null;
    private transient String mLsfLogDirectory = null;
    private transient boolean mEnableGcLogging = false;
    private transient boolean mIsRunning = false;
    private transient int mWorkerId = 0;
    private transient int mProcessId = 0;
    private transient WorkerThread mWorkerThread = null;
    private transient String mAlgorithmName = null;
    private transient String mServerHost = null;
    private transient int mServerPort = 0;
    private transient DistributedCallService mDistributedCallService = null;
    private transient DistributedCallServer mDistributedCallServer = null;
}
|
||||
|
|
@ -1,134 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2007 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.dcp;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Command line driver for distributed worker invocation.
|
||||
*/
|
||||
public class DistributedAlgorithmWorker
|
||||
{
|
||||
public static void main(String[] args)
|
||||
throws Exception {
|
||||
new DistributedAlgorithmWorker().run(args);
|
||||
}
|
||||
|
||||
private void run(String[] args)
|
||||
throws Exception {
|
||||
|
||||
if (!parseArguments(args)) {
|
||||
System.exit(1);
|
||||
}
|
||||
System.out.println("# DistributedAlgorithmWorker");
|
||||
System.out.println("# Started at " + new Date());
|
||||
runDistributedWorker();
|
||||
System.out.println("# Ended at " + new Date());
|
||||
}
|
||||
|
||||
private boolean parseArguments(String[] args) {
|
||||
|
||||
int argpos = 0;
|
||||
int argsleft = 0;
|
||||
|
||||
while (argpos < args.length) {
|
||||
argsleft = args.length - argpos;
|
||||
String arg = args[argpos];
|
||||
if (arg.equals("-serverHost") && argsleft > 1) {
|
||||
argpos++;
|
||||
mServerHost = args[argpos++];
|
||||
} else if (arg.equals("-serverPort") && argsleft > 1) {
|
||||
argpos++;
|
||||
mServerPort = Integer.parseInt(args[argpos++]);
|
||||
} else if (arg.equals("-workerId") && argsleft > 1) {
|
||||
argpos++;
|
||||
mWorkerId = new Integer(args[argpos++]);
|
||||
} else if (arg.equals("-processId") && argsleft > 1) {
|
||||
argpos++;
|
||||
mProcessId = new Integer(args[argpos++]);
|
||||
} else if (arg.equals("-debug")) {
|
||||
argpos++;
|
||||
mDebug = true;
|
||||
continue;
|
||||
} else if (arg.equals("-verbose")) {
|
||||
argpos++;
|
||||
mVerbose = true;
|
||||
continue;
|
||||
} else if (arg.startsWith("-")) {
|
||||
usage();
|
||||
return false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
argsleft = args.length - argpos;
|
||||
if (argsleft != 0) {
|
||||
usage();
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private void usage() {
|
||||
System.out.println("Usage: DistributedWorkerMain ...");
|
||||
System.out.println(" -serverHost <hostname>");
|
||||
System.out.println(" -serverPort <port>");
|
||||
System.out.println(" -workerId <id>");
|
||||
System.out.println(" -processId <id>");
|
||||
System.out.println(" -verbose");
|
||||
System.out.println(" -debug");
|
||||
}
|
||||
|
||||
private void runDistributedWorker()
|
||||
throws Exception {
|
||||
|
||||
DistributedAlgorithm algorithm = null;
|
||||
String serverAddress = getServerHost() + ":" + getServerPort();
|
||||
try {
|
||||
String url = "rmi://" + serverAddress + "/DistributedCallService";
|
||||
DistributedCallService server =
|
||||
(DistributedCallService) java.rmi.Naming.lookup(url);
|
||||
algorithm = server.getAlgorithm();
|
||||
} catch (java.rmi.ConnectException exc) {
|
||||
System.out.println("# Server " + serverAddress + " not responding.");
|
||||
return;
|
||||
}
|
||||
|
||||
algorithm.setServerHost(getServerHost());
|
||||
algorithm.setServerPort(getServerPort());
|
||||
algorithm.runWorker(getWorkerId(), getProcessId());
|
||||
}
|
||||
|
||||
private Integer getWorkerId() {
|
||||
return mWorkerId;
|
||||
}
|
||||
|
||||
private Integer getProcessId() {
|
||||
return mProcessId;
|
||||
}
|
||||
|
||||
private String getServerHost() {
|
||||
return mServerHost;
|
||||
}
|
||||
|
||||
private int getServerPort() {
|
||||
return mServerPort;
|
||||
}
|
||||
|
||||
|
||||
private boolean mDebug = false;
|
||||
private boolean mVerbose = false;
|
||||
private String mServerHost = null;
|
||||
private int mServerPort = 0;
|
||||
private Integer mWorkerId = null;
|
||||
private Integer mProcessId = null;
|
||||
}
|
||||
|
|
@ -1,133 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.dcp;
|
||||
|
||||
|
||||
import edu.mit.broad.dcp.message.*;
|
||||
|
||||
import java.rmi.server.UnicastRemoteObject;
|
||||
import java.util.*;
|
||||
|
||||
/**
 * RMI server object implementing the shared message queue through which
 * DistributedAlgorithm master and worker processes exchange calls.
 * All queue access is synchronized on the queue instance itself.
 */
public class DistributedCallServer
    extends UnicastRemoteObject
    implements DistributedCallService
{
    public DistributedCallServer()
        throws java.rmi.RemoteException {
    }

    /** Installs the algorithm instance handed to connecting workers. */
    public void setAlgorithm(DistributedAlgorithm algorithm) {
        mAlgorithm = algorithm;
    }

    public DistributedAlgorithm getAlgorithm() {
        return mAlgorithm;
    }

    /**
     * Enqueues a call: marks it PENDING, assigns a fresh call id, and
     * places broadcast messages (receiver id 0 == ANY) at the back of
     * the queue but worker-addressed messages at the front.
     *
     * @return the newly assigned call id
     */
    public long writeMessage(DistributedCallMessage message) {
        message.setCallStatus(CallStatus.PENDING);
        message.setCallId(generateCallId());
        if (message.getReceiverWorkerId().equals(0)) {
            synchronized (mMessageQueue) {
                mMessageQueue.addLast(message);
            }
        } else {
            synchronized (mMessageQueue) {
                mMessageQueue.addFirst(message);
            }
        }
        return message.getCallId();
    }

    /**
     * Hands the next eligible PENDING message to the calling worker,
     * marking it PROCESSING and recording the receiver, or returns null
     * when nothing is available. Messages addressed to worker 0 (ANY)
     * are only given to workers with id > 1, i.e. never to the master.
     *
     * @throws IllegalArgumentException if either id is not positive
     */
    public DistributedCallMessage acceptMessage(int workerId, int processId) {
        if (workerId <= 0) {
            throw new IllegalArgumentException("Invalid worker ID: " + workerId);
        }
        if (processId <= 0) {
            throw new IllegalArgumentException("Invalid process ID: " + processId);
        }
        synchronized (mMessageQueue) {
            Iterator<DistributedCallMessage> iterator = mMessageQueue.iterator();
            while (iterator.hasNext()) {
                DistributedCallMessage message = iterator.next();
                if (message.getCallStatus() != CallStatus.PENDING) {
                    continue;
                }
                int receiverId = message.getReceiverWorkerId();
                if (receiverId == workerId ||
                    (receiverId == 0 && workerId > 1)) {
                    message.setCallStatus(CallStatus.PROCESSING);
                    message.setReceiverWorkerId(workerId);
                    message.setReceiverProcessId(processId);
                    return message;
                }
            }
        }

        return null;
    }

    /**
     * Removes a finished call from the queue after validating that it is
     * in state PROCESSING and owned by the acknowledging worker/process.
     *
     * @throws IllegalArgumentException if any id is invalid or unknown
     * @throws IllegalStateException if ownership or state checks fail
     */
    public void completeMessage(int workerId, int processId, long callId) {
        if (workerId <= 0) {
            throw new IllegalArgumentException("Invalid worker ID: " + workerId);
        }
        if (processId <= 0) {
            throw new IllegalArgumentException("Invalid process ID: " + processId);
        }
        if (callId <= 0) {
            throw new IllegalArgumentException("Invalid call ID: " + callId);
        }
        synchronized (mMessageQueue) {
            Iterator<DistributedCallMessage> iterator = mMessageQueue.iterator();
            while (iterator.hasNext()) {
                DistributedCallMessage message = iterator.next();
                if (message.getCallId().longValue() == callId) {
                    if (message.getCallStatus() != CallStatus.PROCESSING) {
                        throw new IllegalStateException("Call #" + callId + " not in state PROCESSING");
                    }
                    if (!message.getReceiverWorkerId().equals(workerId)) {
                        throw new IllegalStateException("Call #" + callId + " assigned to worker " + message.getReceiverWorkerId() + " not worker " + workerId);
                    }
                    if (!message.getReceiverProcessId().equals(processId)) {
                        throw new IllegalStateException("Call #" + callId + " assigned to process " + message.getReceiverProcessId() + " not process " + processId);
                    }
                    iterator.remove();
                    return;
                }
            }
        }

        throw new IllegalArgumentException("Unrecognized call ID " + callId);
    }

    /** True when no messages (pending or processing) remain queued. */
    public boolean isQueueEmpty() {
        synchronized (mMessageQueue) {
            return mMessageQueue.isEmpty();
        }
    }

    /** Unexports this object from the RMI runtime (non-forcing unexport). */
    public void stop() {
        try {
            UnicastRemoteObject.unexportObject(this, false);
        } catch (java.rmi.NoSuchObjectException exc) {
            throw new RuntimeException("Exception unexporting object: " + exc.getMessage(),
                                       exc);
        }
    }

    /** Returns the next call id; ids start at 1 and only increase. */
    private synchronized long generateCallId() {
        return ++mCallIdGenerator;
    }

    private long mCallIdGenerator = 0;
    private DistributedAlgorithm mAlgorithm = null;
    // Guarded by synchronized (mMessageQueue) in every accessor above.
    private LinkedList<DistributedCallMessage> mMessageQueue =
        new LinkedList<DistributedCallMessage>();
}
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.dcp;
|
||||
|
||||
import edu.mit.broad.dcp.message.*;
|
||||
|
||||
/**
 * Remote (RMI) interface exposed by DistributedCallServer: a simple
 * distributed call queue plus access to the algorithm instance in play.
 */
public interface DistributedCallService
    extends java.rmi.Remote
{
    /** Returns the algorithm instance that workers should execute. */
    public DistributedAlgorithm getAlgorithm()
        throws java.rmi.RemoteException;
    /** Enqueues a call message; returns its server-assigned call id. */
    public long writeMessage(DistributedCallMessage message)
        throws java.rmi.RemoteException;
    /** Claims the next message for the given worker, or null if none. */
    public DistributedCallMessage acceptMessage(int workerId, int processId)
        throws java.rmi.RemoteException;
    /** Acknowledges completion of a previously accepted call. */
    public void completeMessage(int workerId, int processId, long callId)
        throws java.rmi.RemoteException;
}
|
||||
|
|
@ -1,90 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2007 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.dcp.message;
|
||||
|
||||
import edu.mit.broad.dcp.CallStatus;
|
||||
|
||||
public class DistributedCallMessage
|
||||
extends DistributedMessage
|
||||
{
|
||||
public DistributedCallMessage() {
|
||||
}
|
||||
|
||||
public Long getCallId() {
|
||||
return mCallId;
|
||||
}
|
||||
|
||||
public void setCallId(Long value) {
|
||||
mCallId = value;
|
||||
}
|
||||
|
||||
public CallStatus getCallStatus() {
|
||||
return mCallStatus;
|
||||
}
|
||||
|
||||
public void setCallStatus(CallStatus value) {
|
||||
mCallStatus = value;
|
||||
}
|
||||
|
||||
public String getMethodName() {
|
||||
return mMethodName;
|
||||
}
|
||||
|
||||
public void setMethodName(String value) {
|
||||
mMethodName = value;
|
||||
}
|
||||
|
||||
public Object[] getMethodArgs() {
|
||||
return mMethodArgs;
|
||||
}
|
||||
|
||||
public void setMethodArgs(Object[] value) {
|
||||
mMethodArgs = value;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("DistributedCallMessage");
|
||||
builder.append("(");
|
||||
builder.append("" + getSenderWorkerId());
|
||||
builder.append(",");
|
||||
builder.append("" + getSenderProcessId());
|
||||
builder.append(",");
|
||||
builder.append("" + getReceiverWorkerId());
|
||||
builder.append(",");
|
||||
builder.append("" + getReceiverProcessId());
|
||||
builder.append(",");
|
||||
builder.append("" + mCallId);
|
||||
builder.append(",");
|
||||
builder.append("" + mCallStatus);
|
||||
builder.append(",");
|
||||
builder.append("" + mMethodName);
|
||||
builder.append(",");
|
||||
if (mMethodArgs == null) {
|
||||
builder.append("" + mMethodArgs);
|
||||
} else {
|
||||
builder.append("[");
|
||||
for (int i = 0; i < mMethodArgs.length; i++) {
|
||||
if (i > 0) {
|
||||
builder.append(",");
|
||||
}
|
||||
builder.append("" + mMethodArgs[i]);
|
||||
}
|
||||
builder.append("]");
|
||||
}
|
||||
builder.append(")");
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public Long mCallId;
|
||||
public CallStatus mCallStatus;
|
||||
public String mMethodName;
|
||||
public Object[] mMethodArgs;
|
||||
}
|
||||
|
|
@ -1,54 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2007 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.dcp.message;
|
||||
|
||||
|
||||
/**
 * Base class for messages passed between distributed processes.
 * Carries sender and receiver addressing as worker id / process id
 * pairs. Fields are public but normally accessed via the accessors.
 */
public class DistributedMessage
{
    public DistributedMessage() {
    }

    /** Worker id of the sending process (null if never set). */
    public Integer getSenderWorkerId() {
        return this.mSenderWorkerId;
    }

    public void setSenderWorkerId(Integer value) {
        this.mSenderWorkerId = value;
    }

    /** Process id of the sending process (null if never set). */
    public Integer getSenderProcessId() {
        return this.mSenderProcessId;
    }

    public void setSenderProcessId(Integer value) {
        this.mSenderProcessId = value;
    }

    /** Worker id the message is addressed to (null if never set). */
    public Integer getReceiverWorkerId() {
        return this.mReceiverWorkerId;
    }

    public void setReceiverWorkerId(Integer value) {
        this.mReceiverWorkerId = value;
    }

    /** Process id of the receiving process (null if never set). */
    public Integer getReceiverProcessId() {
        return this.mReceiverProcessId;
    }

    public void setReceiverProcessId(Integer value) {
        this.mReceiverProcessId = value;
    }

    public Integer mSenderWorkerId;
    public Integer mSenderProcessId;
    public Integer mReceiverWorkerId;
    public Integer mReceiverProcessId;
}
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard;
|
||||
|
||||
/**
 * Basic Picard runtime exception that, for now, adds no behavior beyond
 * {@link RuntimeException}; it exists so Picard code can throw and catch
 * its own exception type.
 *
 * @author Kathleen Tibbetts
 */
public class PicardException extends RuntimeException {

    /**
     * @param message detail message describing the error
     */
    public PicardException(String message) {
        super(message);
    }

    /**
     * @param message   detail message describing the error
     * @param throwable underlying cause of this exception
     */
    public PicardException(String message, Throwable throwable) {
        super(message, throwable);
    }
}
|
||||
|
|
@ -1,97 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.aligner;
|
||||
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Abstract base class for use by <code>Aligner</code> implementations. Provides a constructor and
|
||||
* accessors for common inputs and outputs.
|
||||
*
|
||||
* @author Kathleen Tibbetts
|
||||
*/
|
||||
public abstract class AbstractBaseAligner implements Aligner {
|
||||
|
||||
private final Stringency stringency; // The stringency of the alignment
|
||||
private final File readsBamFile; // The BAM file containing the read data
|
||||
private final String outputPrefix; // The directory and file name prefix for outputs
|
||||
private final String referenceFileDir; // The directory where the reference file can be found
|
||||
private final int clipPoints[]; // The clip points to use
|
||||
private final Integer expectedInsertSize; // Expected insert size; null for non-paired-end lanes
|
||||
private final Integer readsToAlign; // The number of reads to align (all if null)
|
||||
private final boolean pairedReads; // Whether this is a paired-end run
|
||||
private final int readLength;
|
||||
// Parameters specific to the Aligner implementation being used
|
||||
private final Map<String, String> customParametersMap;
|
||||
|
||||
/**
|
||||
* Constructor that sets every parameter.
|
||||
*
|
||||
* @param stringency the stringency of the alignment
|
||||
* @param readsBamFile the BAM file containing the reads
|
||||
* @param outputPrefix the directory and filename prefix for output
|
||||
* @param referenceFileDir the directory where the reference file is located
|
||||
* @param clipPoints the clip points
|
||||
* @param expectedInsertSize the expected insert size (null for non-PE lanes)
|
||||
* @param readsToAlign the number of reads to align
|
||||
* @param customParametersMap parameters specific to the Aligner implementation
|
||||
*/
|
||||
public AbstractBaseAligner(Stringency stringency, File readsBamFile, String outputPrefix,
|
||||
String referenceFileDir, int clipPoints[], Integer expectedInsertSize,
|
||||
Integer readsToAlign, Map<String, String> customParametersMap,
|
||||
boolean pairedReads, int readLength) {
|
||||
|
||||
// First, a little validation
|
||||
if (clipPoints != null && clipPoints.length != 4) {
|
||||
throw new IllegalArgumentException("Length of clipPoints array argument must be 4.");
|
||||
}
|
||||
IoUtil.assertFileIsReadable(readsBamFile);
|
||||
|
||||
this.stringency = stringency;
|
||||
this.readsBamFile = readsBamFile;
|
||||
this.outputPrefix = outputPrefix;
|
||||
this.referenceFileDir = referenceFileDir;
|
||||
this.clipPoints = clipPoints != null ? clipPoints : new int[4];
|
||||
this.expectedInsertSize = expectedInsertSize;
|
||||
this.readsToAlign = readsToAlign;
|
||||
this.customParametersMap = customParametersMap;
|
||||
this.pairedReads = pairedReads;
|
||||
this.readLength = readLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method for deleting a list of files, to be used by the
|
||||
* cleanup method of sub-classes
|
||||
*
|
||||
* @param files the list of files to delete
|
||||
*/
|
||||
protected final void deleteFiles(List<File> files) {
|
||||
for (File f : files) {
|
||||
f.delete();
|
||||
}
|
||||
}
|
||||
|
||||
// Accessors
|
||||
protected final Stringency getStringency() { return stringency; }
|
||||
protected final File getReadsBamFile() { return readsBamFile; }
|
||||
protected final String getOutputPrefix() { return outputPrefix; }
|
||||
protected final String getReferenceFileDir() { return referenceFileDir; }
|
||||
protected final int[] getClipPoints() { return clipPoints; }
|
||||
protected final Integer getExpectedInsertSize() { return expectedInsertSize; }
|
||||
protected final Integer getReadsToAlign() { return readsToAlign; }
|
||||
protected final Map<String, String> getCustomParametersMap() { return customParametersMap; }
|
||||
protected final boolean isPairedReads() { return pairedReads; }
|
||||
protected final int getReadLength() { return readLength; }
|
||||
}
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.aligner;
|
||||
|
||||
/**
 * API for aligners. Clients must call these methods in order, as each depends on
 * the previous one, but they may call them multiple times and need not call them all.
 * This allows steps to be rerun and also lets the caller review intermediate files
 * when troubleshooting.
 *
 * @author Kathleen Tibbetts
 */
public interface Aligner {

    /**
     * Alignment stringency options.
     * (Lowercase constant names kept for compatibility with existing callers.)
     */
    enum Stringency { low, high }

    /**
     * Prepares all the necessary inputs for the alignment process from a BAM file of read data.
     */
    void prepareInputs();

    /**
     * Does the alignment and produces output in the underlying form of the aligner.
     */
    void align();

    /**
     * Converts the output of the aligner to BAM format.
     */
    void prepareOutput();

    /**
     * Cleans up intermediate files (the files created by and for the underlying aligner by the
     * prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file.
     */
    void cleanup();
}
|
||||
|
|
@ -1,319 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.aligner.maq;
|
||||
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
import edu.mit.broad.sam.util.BinaryCodec;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.filter.*;
|
||||
import edu.mit.broad.picard.util.PeekableIterator;
|
||||
import edu.mit.broad.picard.util.Log;
|
||||
import edu.mit.broad.picard.sam.ReservedTagConstants;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Class to take unmapped reads in BAM file format and create Maq binary fastq format file(s) --
|
||||
* one or two of them, depending on whether it's a paired-end read. This relies on the unmapped
|
||||
* BAM file having all paired reads together in order.
|
||||
*/
|
||||
public class BamToBfqWriter {
|
||||
|
||||
private final File bamFile;
|
||||
private final String outputPrefix;
|
||||
private boolean pairedReads = false;
|
||||
private int wrote = 0;
|
||||
private int increment = 1;
|
||||
private int chunk = 0;
|
||||
private BinaryCodec codec1;
|
||||
private BinaryCodec codec2;
|
||||
private final Log log = Log.getInstance(BamToBfqWriter.class);
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param bamFile the BAM file to read from
|
||||
* @param outputPrefix the directory and file prefix for the binary fastq files
|
||||
* @param total the total number of records that should be written, drawn evenly
|
||||
* from throughout the file (null for all).
|
||||
* @param chunk the maximum number of records taht should be written to any one file
|
||||
* @param pairedReads whether these reads are from a paired-end run
|
||||
*/
|
||||
public BamToBfqWriter(File bamFile, String outputPrefix, Integer total, Integer chunk, boolean pairedReads) {
|
||||
this.bamFile = bamFile;
|
||||
this.outputPrefix = outputPrefix;
|
||||
this.pairedReads = pairedReads;
|
||||
if (total != null) {
|
||||
double writeable = (double)countWritableRecords();
|
||||
this.increment = (int)Math.floor(writeable/total.doubleValue());
|
||||
}
|
||||
if (chunk != null) {
|
||||
this.chunk = chunk;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param bamFile the BAM file to read from
|
||||
* @param outputPrefix the directory and file prefix for the binary fastq files
|
||||
* @param pairedReads whether these reads are from a paired-end run
|
||||
*/
|
||||
public BamToBfqWriter(File bamFile, String outputPrefix, boolean pairedReads) {
|
||||
this(bamFile, outputPrefix, null, null, pairedReads);
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the binary fastq file(s) to the output directory
|
||||
*/
|
||||
public void writeBfqFiles() {
|
||||
|
||||
Iterator<SAMRecord> iterator = (new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator();
|
||||
|
||||
// Filter out noise reads and reads that fail the quality filter
|
||||
TagFilter tagFilter = new TagFilter(ReservedTagConstants.XN, 1);
|
||||
FailsVendorReadQualityFilter qualityFilter = new FailsVendorReadQualityFilter();
|
||||
|
||||
if (!pairedReads) {
|
||||
writeSingleEndBfqs(iterator, Arrays.asList(tagFilter, qualityFilter));
|
||||
codec1.close();
|
||||
}
|
||||
else {
|
||||
writePairedEndBfqs(iterator, tagFilter, qualityFilter);
|
||||
codec1.close();
|
||||
codec2.close();
|
||||
}
|
||||
log.info("Wrote " + wrote + " bfq records.");
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Path for writing bfqs for paired-end reads
|
||||
*
|
||||
* @param iterator the iterator witht he SAM Records to write
|
||||
* @param tagFilter the filter for noise reads
|
||||
* @param qualityFilter the filter for PF reads
|
||||
*/
|
||||
private void writePairedEndBfqs(Iterator<SAMRecord> iterator, TagFilter tagFilter,
|
||||
FailsVendorReadQualityFilter qualityFilter) {
|
||||
// Open the codecs for writing
|
||||
int fileIndex = 0;
|
||||
initializeNextBfqFiles(fileIndex++);
|
||||
|
||||
int records = 0;
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
SAMRecord first = iterator.next();
|
||||
if (!iterator.hasNext()) {
|
||||
throw new PicardException("Mismatched number of records in " + this.bamFile.getAbsolutePath());
|
||||
}
|
||||
SAMRecord second = iterator.next();
|
||||
if (!second.getReadName().equals(first.getReadName()) ||
|
||||
first.getFirstOfPairFlag() == second.getFirstOfPairFlag()) {
|
||||
throw new PicardException("Unmatched read pairs in " + this.bamFile.getAbsolutePath() +
|
||||
": " + first.getReadName() + ", " + second.getReadName() + ".");
|
||||
}
|
||||
|
||||
// If both are noise reads, filter them out
|
||||
if (tagFilter.filterOut(first) && tagFilter.filterOut(second)) {
|
||||
// skip it
|
||||
}
|
||||
// If either fails to pass filter, then exclude them as well
|
||||
else if (qualityFilter.filterOut(first) || qualityFilter.filterOut(second)) {
|
||||
// skip it
|
||||
}
|
||||
// Otherwise, write them out
|
||||
else {
|
||||
records++;
|
||||
if (records % increment == 0) {
|
||||
first.setReadName(first.getReadName() + "#0/1");
|
||||
writeFastqRecord(first.getFirstOfPairFlag() ? codec1 : codec2, first);
|
||||
second.setReadName(second.getReadName() + "#0/2");
|
||||
writeFastqRecord(second.getFirstOfPairFlag() ? codec1 : codec2, second);
|
||||
wrote++;
|
||||
if (wrote % 1000000 == 0) {
|
||||
log.info(wrote + " records written.");
|
||||
}
|
||||
if (chunk > 0 && wrote % chunk == 0) {
|
||||
initializeNextBfqFiles(fileIndex++);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Path for writing bfqs for single-end reads
|
||||
*
|
||||
* @param iterator the iterator witht he SAM Records to write
|
||||
* @param filters the list of filters to be applied
|
||||
*/
|
||||
private void writeSingleEndBfqs(Iterator<SAMRecord> iterator, List<SamRecordFilter> filters) {
|
||||
|
||||
// Open the codecs for writing
|
||||
int fileIndex = 0;
|
||||
initializeNextBfqFiles(fileIndex++);
|
||||
|
||||
int records = 0;
|
||||
|
||||
FilteringIterator it = new FilteringIterator(iterator, new AggregateFilter(filters));
|
||||
while (it.hasNext()) {
|
||||
SAMRecord record = it.next();
|
||||
records++;
|
||||
if (records % increment == 0) {
|
||||
|
||||
writeFastqRecord(codec1, record);
|
||||
wrote++;
|
||||
if (wrote % 1000000 == 0) {
|
||||
log.info(wrote + " records processed.");
|
||||
}
|
||||
if (chunk > 0 && wrote % chunk == 0) {
|
||||
initializeNextBfqFiles(fileIndex++);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Closes any the open bfq file(s), if any, and opens the new one(s)
|
||||
*
|
||||
* @param fileIndex the index (counter) of the files to write
|
||||
*/
|
||||
private void initializeNextBfqFiles(int fileIndex) {
|
||||
// Close the codecs if they were writing before
|
||||
if (codec1 != null) {
|
||||
codec1.close();
|
||||
if (pairedReads) {
|
||||
codec2.close();
|
||||
}
|
||||
}
|
||||
|
||||
// Open new file, using the fileIndex.
|
||||
File bfq1 = getOutputFile(this.outputPrefix , 1, fileIndex);
|
||||
codec1 = new BinaryCodec(IoUtil.openFileForWriting(bfq1));
|
||||
log.info("Now writing to file " + bfq1.getAbsolutePath());
|
||||
if (pairedReads) {
|
||||
File bfq2 = getOutputFile(this.outputPrefix , 2, fileIndex);
|
||||
codec2 = new BinaryCodec(IoUtil.openFileForWriting(bfq2));
|
||||
log.info("Now writing to file " + bfq2.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes out a SAMRecord in Maq fastq format
|
||||
*
|
||||
* @param codec the code to write to
|
||||
* @param rec the SAMRecord to write
|
||||
*/
|
||||
private void writeFastqRecord(BinaryCodec codec, SAMRecord rec) {
|
||||
|
||||
// Writes the length of the read name and then the name (null-terminated)
|
||||
codec.writeString(rec.getReadName(), true, true);
|
||||
|
||||
char seqs[] = rec.getReadString().toCharArray();
|
||||
char quals[] = rec.getBaseQualityString().toCharArray();
|
||||
|
||||
// Write the length of the sequence
|
||||
codec.writeInt(seqs.length);
|
||||
|
||||
// Calculate and write the sequence and qualities
|
||||
byte seqsAndQuals[] = new byte[seqs.length];
|
||||
|
||||
for (int i = 0; i < seqs.length; i++) {
|
||||
int quality = Math.min(quals[i]-33, 63);
|
||||
int base;
|
||||
switch(seqs[i]) {
|
||||
case 'A':
|
||||
case 'a':
|
||||
base = 0;
|
||||
break;
|
||||
case 'C':
|
||||
case 'c':
|
||||
base = 1;
|
||||
break;
|
||||
case 'G':
|
||||
case 'g':
|
||||
base = 2;
|
||||
break;
|
||||
case 'T':
|
||||
case 't':
|
||||
base = 3;
|
||||
break;
|
||||
case 'N':
|
||||
case 'n':
|
||||
case '.':
|
||||
base = 0;
|
||||
quality = 0;
|
||||
break;
|
||||
default:
|
||||
throw new PicardException("Unknown base when writing bfq file: " + seqs[i]);
|
||||
}
|
||||
seqsAndQuals[i] = (byte) (base << 6 | quality);
|
||||
}
|
||||
codec.writeBytes(seqsAndQuals);
|
||||
}
|
||||
|
||||
private int countWritableRecords() {
|
||||
int count = 0;
|
||||
PeekableIterator<SAMRecord> it = new PeekableIterator<SAMRecord>((new SAMFileReader(IoUtil.openFileForReading(this.bamFile))).iterator());
|
||||
if (!this.pairedReads) {
|
||||
// Filter out noise reads and reads that fail the quality filter
|
||||
List<SamRecordFilter> filters = new ArrayList<SamRecordFilter>();
|
||||
filters.add(new TagFilter(ReservedTagConstants.XN, 1));
|
||||
filters.add(new FailsVendorReadQualityFilter());
|
||||
FilteringIterator itr = new FilteringIterator(it, new AggregateFilter(filters));
|
||||
while (itr.hasNext()) {
|
||||
itr.next();
|
||||
count++;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (it.hasNext()) {
|
||||
SAMRecord first = it.next();
|
||||
SAMRecord second = it.next();
|
||||
// If both are noise reads, filter them out
|
||||
if (first.getAttribute(ReservedTagConstants.XN) != null &&
|
||||
second.getAttribute(ReservedTagConstants.XN) != null) {
|
||||
// skip it
|
||||
}
|
||||
// If either fails to pass filter, then exclude them as well
|
||||
else if (first.getReadFailsVendorQualityCheckFlag() || second.getReadFailsVendorQualityCheckFlag() ) {
|
||||
// skip it
|
||||
}
|
||||
// Otherwise, write them out
|
||||
else {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
it.close();
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs the name for the output file and returns the file
|
||||
*
|
||||
* @param outputPrefix the directory and file prefix for the output bfq file
|
||||
* @param read whether this is the file for the first or second read
|
||||
* @return a new File object for the bfq file.
|
||||
*/
|
||||
private File getOutputFile(String outputPrefix, int read, int index) {
|
||||
File result = new File(outputPrefix + "." + index + "." + read + ".bfq");
|
||||
IoUtil.assertFileIsWritable(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,357 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.aligner.maq;
|
||||
|
||||
import edu.mit.broad.sam.*;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
import edu.mit.broad.sam.util.BinaryCodec;
|
||||
import edu.mit.broad.sam.util.StringUtil;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.util.SamPairUtil;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Reads a Maq map file and returns an iterator of SAMRecords and a populated header.
 *
 * IMPORTANT! Even though the reads in the map file are in coordinate order, this iterator
 * will not necessarily return them in that order. For paired reads, both will be
 * returned only after *both* records have been seen.
 *
 * @author Kathleen Tibbetts
 */
public class MapFileIterator implements CloseableIterator<SAMRecord> {

    // Maq flag values (see the Maq map-file documentation)
    public static final int MATE_UNMAPPED_FLAG = 64;
    public static final int READ_UNMAPPED_FLAG = 192;

    private static final int READ_NAME_LENGTH = 36;   // Fixed-width read-name field in a map record
    private static final int MAP_FORMAT = -1;          // Expected format marker at the head of a map file
    private static final int MAX_READ_LENGTH = 128;

    // Index-to-base lookup for the 2-bit base codes in the map file
    private static final byte ACGT[] = {'A', 'C', 'G', 'T'};

    public static final String PROGRAM_RECORD = "0";

    private long recordCount = 0L;      // Number of records in the current map file
    private int recordsRead = 0;        // Records consumed from the current map file
    private BinaryCodec mapCodec;       // Reader over the current map file
    private final SAMFileHeader header;
    private final boolean pairedReads;
    private final boolean jumpingLibrary;
    // Records ready to be returned by next()
    private final List<SAMRecord> next = new ArrayList<SAMRecord>();
    // Paired records seen once, keyed by read name, awaiting their mate
    private final Map<String, SAMRecord> pending = new HashMap<String, SAMRecord>();
    // Map files not yet consumed
    private final List<File> mapFiles = new LinkedList<File>();

    /**
     * Constructor.  Opens the map file, reads the record count and header from it,
     * creates the SAMFileHeader, and queues up the first read.
     *
     * @param commandLine    The command line used to invoke Maq (for the header)
     * @param pairedReads    Whether this is a paired-end run
     * @param jumpingLibrary Whether this is a jumping library (affects proper-pair determination)
     * @param mapFile        The Maq map file(s) to read; at least one is required
     * @throws IllegalArgumentException if no map file is provided
     */
    public MapFileIterator(String commandLine, boolean pairedReads, boolean jumpingLibrary, File... mapFile) {
        if (mapFile.length == 0) {
            throw new IllegalArgumentException("At least one map file must be provided.");
        }
        mapFiles.addAll(Arrays.asList(mapFile));

        this.pairedReads = pairedReads;
        this.jumpingLibrary = jumpingLibrary;

        header = new SAMFileHeader();
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        SAMProgramRecord program = new SAMProgramRecord(PROGRAM_RECORD);
        program.setProgramVersion(MaqConstants.getProgramVersion());
        program.setCommandLine(commandLine);
        header.addProgramRecord(program);

        queueNextMapFile();
    }

    /**
     * Queues up the next map file: closes the current codec, opens the next file,
     * validates its format marker, reads its sequence dictionary and record count,
     * and queues the first record.
     *
     * @return true if there's another map file to iterate over
     * @throws PicardException if the file's format marker is not MAP_FORMAT
     */
    private boolean queueNextMapFile() {

        // Close the old file
        if (mapCodec != null) {
            mapCodec.close();
        }

        // If there are no more map files, return false
        if (mapFiles.size() == 0) {
            return false;
        }

        // Otherwise, open the next file and reset the recordsRead count
        mapCodec = new BinaryCodec(new BufferedInputStream(IoUtil.openFileForReading(mapFiles.remove(0))));
        int format = mapCodec.readInt();
        if (format != MAP_FORMAT) {
            mapCodec.close();
            throw new PicardException("Unrecognized Maq map file format: " + format);
        }
        recordsRead = 0;


        // Read the sequences out of the map file and set them on the header
        int sequenceCount = mapCodec.readInt();
        List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>();
        for (int i = 0; i < sequenceCount; i++) {
            int length = mapCodec.readInt();
            // Read the sequence name, trimming off the null terminator
            sequences.add(new SAMSequenceRecord(mapCodec.readString(length).substring(0, length-1)));
        }
        if (header.getSequences() == null || header.getSequences().size() == 0) {
            header.setSequences(sequences);
        }
        else {
            // TODO: Check that the sequences match and throw an exception if they don't
        }
        recordCount = mapCodec.readLong();

        readNext();
        return true;
    }

    /**
     * Closes the BinaryCodec reading the map file.
     */
    public void close() {
        mapCodec.close();
    }

    /**
     * @return true if the iteration has more elements
     */
    public boolean hasNext() {
        return next.size() > 0;
    }

    /**
     * @return the next SAMRecord in the iteration
     * @throws NoSuchElementException if this is called when hasNext() returns false
     */
    public SAMRecord next() {
        if (!hasNext()) {
            throw new NoSuchElementException("No more elements in this iteration");
        }
        SAMRecord result = next.remove(0);
        readNext();
        return result;
    }

    /**
     * Reads the next element from the map file. If we are done with it, we put it in the <code>next</code>
     * list; if we are waiting to see its mate, we put it in the <code>pending</code> map. Calls itself
     * repeatedly until there is at least one element in <code>next</code>.
     */
    private void readNext() {

        // If there's already a record queued up, just return
        if (next.size() > 0) {
            return;
        }

        // If we've read all there is, then any remaining records in the pending map should be returned.
        // If this is not a PE run, then the pending map will be empty and we're done.
        if (recordsRead == recordCount) {
            if (pending.size() > 0) {
                StringBuffer sb = new StringBuffer();
                for (String item : pending.keySet()) {
                    sb.append(item).append("\n");
                }
                throw new PicardException("MapFileIterator pending map should have been empty but contained " +
                        "the following records: " + sb.toString());
            }
            queueNextMapFile();
            return;
        }

        // Otherwise, we read until there is at least one record in the <code>next</code> list
        readMapRecord();
        if (next.size() == 0) {
            readNext();
        }
    }

    /**
     * Reads one record from the map file and throws it onto the pending map or the next list,
     * depending on whether we have already seen its mate.
     *
     * The field-by-field reads below must stay in this exact order — they mirror
     * the binary layout of a Maq map record.
     */
    private void readMapRecord() {

        // Now that we've got all the data from the binary file, write a SAMRecord and add it to
        // the new BAM file
        SAMRecord record = new SAMRecord();
        record.setAttribute(SAMTag.PG.toString(), PROGRAM_RECORD);
        record.setReadPairedFlag(this.pairedReads);

        // Packed bases+qualities; the last byte of the on-disk field is the
        // single-end mapping quality, read separately below.
        byte seqsAndQuals[] = new byte[MAX_READ_LENGTH-1];
        mapCodec.readBytes(seqsAndQuals);

        byte singleEndMappingQualityOrIndelLength = mapCodec.readByte();

        // the length of the read
        int readLength = mapCodec.readUByte();
        setSeqsAndQuals(seqsAndQuals, readLength, record);

        // the final mapping quality (unless <code>flag</code> below is 130, then it is the
        // position of the indel (or 0 if no indel)
        int mappingQuality = mapCodec.readUByte();

        // mismatches in the 28bp (higher 4 bits) and mismatches (lower 4 bits) -- skipped
        mapCodec.readUByte();
        // sum of errors of the best hit -- skipped
        mapCodec.readUByte();
        // counts of all 0- and 1-mismatch hits on the reference -- skipped
        mapCodec.readUByte();
        mapCodec.readUByte();

        // A bitwise flag. See the Maq docs for its full meaning
        int flag = mapCodec.readUByte();

        // the lower mapQ of the two ends (equals map_qual if unpaired); if flag is 130: mapQ of its mate
        int altQual = mapCodec.readUByte();

        // Index of the sequence for this read
        record.setReferenceIndex((int)mapCodec.readUInt(), getHeader());

        // Start position and strand: low bit is the strand, remaining bits the 0-based position
        long pos = mapCodec.readUInt();
        int startPos = ((int)((pos>>1)& 0x7FFFFFFF)) + 1;
        record.setAlignmentStart(startPos);
        record.setReadNegativeStrandFlag((pos&1) == 1);

        // offset of the mate (zero if unpaired, or two ends mapped to different chr) -- skipped
        mapCodec.readInt();

        // The read name (fixed-width, space-padded)
        byte nameBytes[] = new byte[READ_NAME_LENGTH];
        mapCodec.readBytes(nameBytes);
        String name = StringUtil.bytesToString(nameBytes).trim();
        if (this.pairedReads) {
            // Maq appends /1 and /2 to paired read names; decode and strip the suffix
            if (name.endsWith("/1")) {
                record.setFirstOfPairFlag(true);
                record.setSecondOfPairFlag(false);
            }
            else if (name.endsWith("/2")) {
                record.setFirstOfPairFlag(false);
                record.setSecondOfPairFlag(true);
            }
            else {
                throw new PicardException("Unrecognized ending for paired read name: " + name);
            }
            name = name.substring(0, name.length()-2);
        }
        record.setReadName(name);


        if (flag != 130 || singleEndMappingQualityOrIndelLength == 0) { // No indel
            record.setCigarString(readLength + "M");
            record.setMappingQuality(mappingQuality);
        }
        else { // Indel: when flag is 130, mappingQuality holds the indel position and
               // singleEndMappingQualityOrIndelLength its signed length (+ = insertion, - = deletion)
            int indelPos = mappingQuality;
            String cigar = indelPos + "M" + Math.abs(singleEndMappingQualityOrIndelLength);
            int remaining = readLength - indelPos;
            if (singleEndMappingQualityOrIndelLength > 0) {
                cigar += "I" + (remaining - singleEndMappingQualityOrIndelLength) + "M";
            }
            else {
                cigar += "D" + remaining + "M";
            }
            record.setCigarString(cigar);
            // In the docs, it looks like there is a mapping quality for the mate; do we use that?
            record.setMappingQuality(altQual);
        }

        if (!pairedReads) {
            record.setProperPairFlag(false);
            next.add(record);
        }
        else {
            record.setMateUnmappedFlag(flag == MATE_UNMAPPED_FLAG);
            // If we have already seen this record's mate, finish both and queue them;
            // otherwise hold this record until the mate shows up.
            SAMRecord mate = pending.remove(record.getReadName());

            if (mate != null) {
                boolean proper = SamPairUtil.isProperPair(record, mate, jumpingLibrary);
                record.setProperPairFlag(proper);
                mate.setProperPairFlag(proper);

                SamPairUtil.setMateInfo(record, mate);

                int insertSize = SamPairUtil.computeInsertSize(record, mate);
                record.setInferredInsertSize(insertSize);
                mate.setInferredInsertSize(insertSize);

                // Only emit a read whose mate is mapped -- NOTE(review): the conditions look
                // swapped (each read is gated on the *other* read's mate-unmapped flag); confirm
                // this is intentional.
                if (!mate.getMateUnmappedFlag()) {
                    next.add(record);
                }
                if (!record.getMateUnmappedFlag()) {
                    next.add(mate);
                }
            }
            else {
                pending.put(record.getReadName(), record);
            }
        }

        // TODO: Figure out what do do about noise reads long-term
        // Note that it is possible that we have lost a "Noise read" annotation at this point. Since
        // we try to map a pair if only one of the reads is classified as "noise", then for any paired
        // reads where one was a noise read and one was not, we will lose the noise annotation on the
        // one noisy read. We have discussed either re-doing the noise evaluation here, modifying the
        // read name to carry the noise flag through Maq, or changing what reads we give to Maq.

        recordsRead++;

    }

    /**
     * Decodes the sequence and the qualities and sets them on the SAMRecord.
     * Each byte holds the base in its high 2 bits and the quality in its low 6 bits;
     * an all-zero byte denotes a no-call ('N').
     *
     * @param seqsAndQuals the packed bases and qualities
     * @param readLength   the length of the read
     * @param sam          the SAMRecord to populate
     */
    private void setSeqsAndQuals(byte seqsAndQuals[], int readLength, SAMRecord sam) {
        byte sequence[] = new byte[readLength];
        byte qualities[] = new byte[readLength];
        for (int i = 0; i < readLength; i++) {
            byte b = seqsAndQuals[i];
            qualities[i] = (byte)(b & 0x3F);
            if (b == 0) {
                sequence[i] = 'N';
            }
            else {
                sequence[i] = ACGT[(seqsAndQuals[i] >> 6) & 3];
            }
        }
        sam.setReadBases(sequence);
        sam.setBaseQualities(qualities);
    }

    /**
     * @throws UnsupportedOperationException -- not implemented
     */
    public void remove() {
        throw new UnsupportedOperationException("remove() not supported in MapFileIterator");
    }

    /** @return the SAMFileHeader built from the map file's sequence dictionary */
    public SAMFileHeader getHeader() { return header; }
}
|
||||
|
|
@ -1,211 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.aligner.maq;
|
||||
|
||||
import edu.mit.broad.picard.aligner.Aligner;
|
||||
import edu.mit.broad.picard.aligner.AbstractBaseAligner;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.util.Log;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FilenameFilter;
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Maq implementation of the Aligner interface.
 *
 * Drives the external Maq executable: writes BFQ inputs from a BAM, runs
 * "maq map" over each BFQ (or BFQ pair), and merges/renames the resulting
 * .map files into a single output map file.
 */
public class MaqAligner extends AbstractBaseAligner implements Aligner {

    // Constants related to Maq output files
    public static final String MAQ_MAP_SUFFIX = ".out.aln.map";
    public static final String MAQ_LOG_SUFFIX = ".out.map.log";

    // Internal constant for multi-plexing lane data (reads per BFQ chunk)
    private static final int READ_CHUNK_SIZE = 2000000;

    // Suffix of the binary reference file Maq aligns against
    public static final String REFERENCE_FILE_SUFFIX = ".bfa";

    private final Log log = Log.getInstance(MaqAligner.class);

    // Semicolon-separated record of every Maq command issued by this aligner
    private String commandLine = null;


    /**
     * Constructor that sets every parameter. All other constructors delegate to this one.
     *
     * @param stringency the stringency of the alignment
     * @param readsBamFile the BAM file containing the reads
     * @param outputPrefix the directory and filename prefix for output
     * @param referenceFileDir the directory where the reference file is located
     * @param clipPoints the clip points
     * @param expectedInsertSize the expected insert size (null for non-PE lanes)
     * @param readsToAlign the number of reads to align
     * @param customParametersMap parameters specific to the Aligner implementation
     * @param pairedReads whether this is a paired-end run
     * @param readLength the read length
     */
    public MaqAligner(Stringency stringency, File readsBamFile, String outputPrefix,
                      String referenceFileDir, int clipPoints[], Integer expectedInsertSize,
                      Integer readsToAlign, Map<String, String> customParametersMap,
                      boolean pairedReads, int readLength) {
        super(stringency, readsBamFile, outputPrefix, referenceFileDir, clipPoints,
                expectedInsertSize, readsToAlign, customParametersMap, pairedReads, readLength);
    }

    /**
     * Prepares all the necessary inputs for the alignment process from a BAM file of read data.
     * Writes Maq BFQ files, chunked at READ_CHUNK_SIZE reads, via BamToBfqWriter.
     */
    public void prepareInputs() {
        log.info("Preparing Maq inputs.");
        BamToBfqWriter writer = new BamToBfqWriter(this.getReadsBamFile(), this.getOutputPrefix(),
                this.getReadsToAlign(), READ_CHUNK_SIZE, isPairedReads());
        writer.writeBfqFiles();
    }

    /**
     * Does the alignment and produces output in the underlying form of the aligner.
     *
     * Builds a "maq map" command per BFQ chunk (per pair of chunks when paired),
     * runs each sequentially, then merges (or renames) the per-chunk .map files
     * into a single map file named from the output prefix and stringency.
     */
    public void align() {
        log.info("Running Maq alignment.");

        // Temporary hack until we get the multi-tasking code from Seva
        List<String> mapFileNames = new ArrayList<String>(); // All map files that we will merge together at the end

        // Seed Maq's RNG explicitly so runs are reproducible.
        String maqParams = MaqConstants.SWITCH_RANDOM_SEED + " " + MaqConstants.DEFAULT_RANDOM_SEED;

        if (this.getStringency() == Stringency.high) {
            // NOTE(review): getExpectedInsertSize() is documented as null for non-PE
            // lanes, which would NPE in the expression below -- presumably high
            // stringency is only used with paired data; confirm with callers.
            maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " + Math.round(
                    this.getExpectedInsertSize() * MaqConstants.HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER);
            maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " +
                    MaqConstants.HIGH_STRINGENCY_SUM_MISMATCHES;
        }
        else {
            maqParams += " " + MaqConstants.SWITCH_MAX_OUTER_DISTANCE + " " +
                    MaqConstants.LOW_STRINGENCY_MAX_OUTER_DISTANCE;
            // For low stringency, get at least 30 bases and then let half of what's remaining mismatch
            int maxMisMatches = (this.getReadLength() - 30)/2;
            maqParams += " " + MaqConstants.SWITCH_SUM_MISMATCHES + " " +
                    (maxMisMatches * MaqConstants.LOW_STRINGENCY_QUALITY_FOR_MISMATCHES);
        }

        // Pick the first .bfa file found in the reference directory.
        // NOTE(review): assumes at least one .bfa exists; an empty directory would
        // throw ArrayIndexOutOfBoundsException here. Confirm this precondition.
        String referenceFile = new File(this.getReferenceFileDir()).listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.endsWith(REFERENCE_FILE_SUFFIX);
            }
        })[0].getAbsolutePath();

        ProcessBuilder builder;

        // Map the bfq files, individually or in pairs (TreeSet ordering keeps
        // read-1/read-2 chunk files adjacent so it.next() pairs them up).
        SortedSet<File> bfqs = new TreeSet<File>(this.getBfqFiles());
        for (Iterator<File> it = bfqs.iterator(); it.hasNext();) {

            String read1bfq = it.next().getAbsolutePath();
            String read2bfq = (this.isPairedReads()) ? it.next().getAbsolutePath() : "";

            // Strip the extension plus two preceding characters to recover the
            // per-chunk base name -- presumably the read-number marker written by
            // BamToBfqWriter; TODO confirm against its naming scheme.
            String outputFileBase = read1bfq.substring(0, read1bfq.lastIndexOf('.')-2);
            String mapFile = outputFileBase + MAQ_MAP_SUFFIX;
            String logFile = outputFileBase + MAQ_LOG_SUFFIX;

            String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " + MaqConstants.MAP_COMMAND +
                    " " + maqParams + " " + mapFile + " " + referenceFile + " " + read1bfq + " " + read2bfq +
                    " 2> " + logFile;
            setCommandLine(getCommandLine() == null ? command : getCommandLine() + ";" + command);
            log.info("Executing command: " + command);
            try {
                // NOTE(review): ProcessBuilder does not invoke a shell, so the
                // trailing "2> <logfile>" tokens are passed to maq as literal
                // arguments rather than redirecting stderr; split(" ") also breaks
                // on paths containing spaces. Confirm whether a shell wrapper was
                // intended here.
                builder = new ProcessBuilder(command.split(" "));
                Process p = builder.start();
                p.waitFor(); // NOTE(review): the process exit code is not checked
            }
            catch (Exception e) {
                throw new PicardException("Error starting Maq process", e);
            }

            mapFileNames.add(mapFile);
        }

        // If there's more than one map file, then merge them.
        String finalFileName = this.getOutputPrefix() + "." + this.getStringency() + MAQ_MAP_SUFFIX;
        if (mapFileNames.size() > 1) {
            String command = MaqConstants.MAQ_HOME + MaqConstants.MAQ_COMMAND + " " +
                    MaqConstants.MERGE_COMMAND + " " + finalFileName;
            for (String name : mapFileNames) {
                command += " " + name;
            }
            setCommandLine(getCommandLine() == null ? command : getCommandLine() + ";" + command);
            log.info("Executing command: " + command);

            try {
                // Same caveats as above: no shell, naive whitespace split, exit
                // code unchecked.
                builder = new ProcessBuilder(command.split(" "));
                Process p = builder.start();
                p.waitFor();
            }
            catch (Exception e) {
                throw new PicardException("Error starting Maq process", e);
            }
        }
        else { // Otherwise rename the single map file so we can find it later
            File f = new File(mapFileNames.get(0));
            if (!f.renameTo(new File(finalFileName))) {
                throw new PicardException("Error renaming " + f.getAbsolutePath() + " to " + finalFileName);
            }
        }
    }

    /**
     * Converts the output of the aligner to BAM format.
     * Currently a stub -- no conversion is performed yet.
     */
    public void prepareOutput() {
        log.info("Preparing output from Maq alignment.");
        // TODO: MaqToBam
    }

    /**
     * Cleans up intermediate files (the files created in by and for the underlying aligner by the
     * prepareInputs() and align() methods. Does not clean up the original source files or the final BAM file.
     */
    public void cleanup() {
        log.info("Cleaning up Maq intermediate files.");
        this.deleteFiles(getBfqFiles());
        // NOTE(review): deletion of the map/log files is deliberately disabled below;
        // confirm whether they are still needed downstream before re-enabling.
        // this.deleteFiles(getMaqAlignmentFiles());
    }

    /**
     * Returns a list of zero to two BFQ files, depending on whether they are there
     * and whether it was a paired-end run or not
     *
     * NOTE(review): this actually returns every ".bfq" file in the output directory,
     * which may be more than two when the lane was chunked, and it assumes the
     * output prefix contains a "/" separator. Confirm both assumptions.
     *
     * @return a list of BFQ files
     */
    private List<File> getBfqFiles() {
        File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/")));
        return Arrays.asList(dir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.endsWith(".bfq");
            }
        }));
    }

    /**
     * Returns the Maq map files
     *
     * Scans the output directory (derived from the output prefix, which must
     * contain a "/") for Maq .map and .log files.
     *
     * @return a list of Maq .map files
     */
    private List<File> getMaqAlignmentFiles() {
        File dir = new File(this.getOutputPrefix().substring(0, this.getOutputPrefix().lastIndexOf("/")));
        return Arrays.asList(dir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                // TODO: Add the text files if we do not read the binary map files
                return name.endsWith(MAQ_MAP_SUFFIX) || name.endsWith(MAQ_LOG_SUFFIX);
            }
        }));
    }

    /** @return the accumulated, semicolon-separated Maq command line(s), or null if none run yet. */
    public String getCommandLine() { return commandLine; }
    public void setCommandLine(String commandLine) { this.commandLine = commandLine; }
}
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.aligner.maq;
|
||||
|
||||
/**
 * Utility class to hold Maq-related constants (program name, location, switches, etc).
 * Not instantiable: constants and a version accessor only.
 */
public final class MaqConstants {

    // Utility class: prevent instantiation and subclassing.
    private MaqConstants() {
    }

    // General Maq constants
    public static final String PROGRAM_NAME = "Maq";
    public static final String PROGRAM_VERSION = "0.7.1";
    // NOTE: absolute path to the Maq installation; environment-specific.
    public static final String MAQ_HOME = "/seq/dirseq/maq-0.7.1/";

    // Command-related constants
    public static final String MAQ_COMMAND = "maq";
    public static final String MAP_COMMAND = "map";
    public static final String MERGE_COMMAND = "mapmerge";

    // Constants related to Maq map switches
    // Switch for the maximum summed mismatch quality, per the constant names below.
    public static final String SWITCH_SUM_MISMATCHES = "-e";
    public static final int HIGH_STRINGENCY_SUM_MISMATCHES = 100;
    public static final int LOW_STRINGENCY_QUALITY_FOR_MISMATCHES = 30;

    // Switch for the maximum outer distance of a read pair.
    public static final String SWITCH_MAX_OUTER_DISTANCE = "-a";
    public static final int LOW_STRINGENCY_MAX_OUTER_DISTANCE = 1500;
    public static final double HIGH_STRINGENCY_MAX_OUTER_DISTANCE_MULTIPLIER = 1.5d;

    // Switch for the random seed; 0 is used so runs are reproducible.
    public static final String SWITCH_RANDOM_SEED = "-s";
    public static final int DEFAULT_RANDOM_SEED = 0;

    /** @return the Maq version these constants target. */
    public static String getProgramVersion() { return PROGRAM_VERSION; }
}
|
||||
|
|
@ -1,125 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.aligner.maq;
|
||||
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.util.StringSortingCollectionFactory;
|
||||
import edu.mit.broad.picard.util.Log;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.sam.util.SortingCollection;
|
||||
import edu.mit.broad.sam.util.BinaryCodec;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
import edu.mit.broad.sam.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.BufferedInputStream;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
/**
 * Class to write a BAM file that includes the results from a Maq .map file along with the unaligned
 * reads from the original BAM file.
 *
 * Information on the meaning of the elements of the map file is drawn from the Maq documentation
 * on this page: http://maq.sourceforge.net/maqmap_format.shtml
 */
public class MaqMapMerger {

    private final File mapFile;        // Maq .map output to read alignments from
    private final File sourceBamFile;  // original BAM containing every read handed to Maq
    private final File targetBamFile;  // merged BAM to be written
    private final boolean pairedReads; // whether the lane was paired-end
    private final Log log = Log.getInstance(MaqMapMerger.class);
    private String commandLine = null; // passed through to the MapFileIterator
    // NOTE(review): never read or written in this class -- candidate for removal; confirm.
    private List<SAMSequenceRecord> sequences = new ArrayList<SAMSequenceRecord>();


    /**
     * Constructor
     *
     * @param mapFile The Maq map file to parse
     * @param sourceBamFile The BAM file that was used as the input to the Maq aligner, which will
     * include info on all the reads that did not map
     * @param targetBamFile The file to which to write the merged output
     * @param pairedReads whether the lane was paired-end
     */
    public MaqMapMerger(File mapFile, File sourceBamFile, File targetBamFile, boolean pairedReads) {
        IoUtil.assertFileIsReadable(mapFile);
        IoUtil.assertFileIsReadable(sourceBamFile);
        IoUtil.assertFileIsWritable(targetBamFile);
        this.mapFile = mapFile;
        this.sourceBamFile = sourceBamFile;
        this.targetBamFile = targetBamFile;
        this.pairedReads = pairedReads;
    }

    /**
     * Merges the alignment from the map file with the remaining records from the source BAM file.
     *
     * First writes every aligned record from the map file (collecting their read names),
     * then appends the source-BAM records whose names were not seen among the alignments.
     */
    public void mergeAlignment() {
        log.info("Processing map file: " + mapFile.getAbsolutePath());
        // Write the header
        MapFileIterator it = new MapFileIterator(getCommandLine(), this.pairedReads, false, this.mapFile);
        SAMFileHeader header = it.getHeader();
        SAMFileWriter writer = new SAMFileWriterFactory().makeBAMWriter(header, false, targetBamFile);

        // Write the alignments
        SortingCollection<String> readNames = writeAlignments(it, writer);

        // We're done with the map file, so close it
        it.close();
        writeUnalignedReads(writer, readNames.iterator());

        // Now close the writer
        writer.close();
    }


    /**
     * Copies records from the source BAM to the writer, skipping any record whose
     * read name matches the next name from the (sorted) aligned-name iterator.
     *
     * NOTE(review): the skip logic advances through both streams in lockstep, so it
     * only skips correctly if each aligned name is encountered in the source BAM in
     * the same relative order as the name iterator produces it -- confirm that the
     * source BAM ordering satisfies this.
     *
     * @param writer       destination for the unaligned records
     * @param nameIterator sorted names of reads already written as alignments
     */
    private void writeUnalignedReads(SAMFileWriter writer, CloseableIterator<String> nameIterator) {

        int skipCount = 0;
        SAMFileReader reader = new SAMFileReader(IoUtil.openFileForReading(this.sourceBamFile));
        CloseableIterator<SAMRecord> bamRecords = reader.iterator();

        String readName = nameIterator.hasNext() ? nameIterator.next() : null;
        while(bamRecords.hasNext()) {
            SAMRecord rec = bamRecords.next();
            if (rec.getReadName().equals(readName)) {
                // skip it and pull the next name off the name iterator
                readName = nameIterator.hasNext() ? nameIterator.next() : null;
                skipCount++;
            }
            else {
                writer.addAlignment(rec);
            }
        }
        System.out.println("Skipped " + skipCount + " already-aligned records.");
        bamRecords.close();
        nameIterator.close();
    }

    /**
     * Writes every record from the map-file iterator to the writer, collecting the
     * read names into a sorting collection for the later skip pass.
     *
     * @param iterator source of aligned SAMRecords
     * @param writer   destination BAM writer
     * @return the collected read names, to be iterated in sorted order
     */
    private SortingCollection<String> writeAlignments(MapFileIterator iterator, SAMFileWriter writer) {

        int wrote = 0;
        SortingCollection<String> readNames = StringSortingCollectionFactory.newCollection();
        while (iterator.hasNext()) {
            SAMRecord record = iterator.next();
            readNames.add(record.getReadName());
            writer.addAlignment(record);
            wrote++;
        }
        System.out.println("Wrote " + wrote + " alignment records.");
        return readNames;
    }

    // Command line recorded into the output (via the MapFileIterator's header construction).
    public void setCommandLine(String commandLine) { this.commandLine = commandLine; }
    public String getCommandLine() { return this.commandLine; }
}
|
||||
|
|
@ -1,133 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.aligner.maq;
|
||||
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.aligner.Aligner;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
import java.util.HashMap;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* CommandLineProgram to generate to invoke BustardToBamWriter
|
||||
*
|
||||
* @author Kathleen Tibbetts
|
||||
*/
|
||||
public class RunMaq extends CommandLineProgram {
|
||||
private static final String PROGRAM_VERSION = "1.0";
|
||||
|
||||
// The following attributes define the command-line arguments
|
||||
@Usage
|
||||
public String USAGE =
|
||||
"Usage: " + getClass().getName() + " [options]\n\n" +
|
||||
"Invoke the Maq aligner.\n" +
|
||||
"Version: " + PROGRAM_VERSION +"\n";
|
||||
|
||||
@Option(shortName="I", doc="The BAM file to parse.", optional=true)
|
||||
public File INPUT;
|
||||
@Option(shortName="O", doc="The directory and file prefix for all output.", optional=false)
|
||||
public String OUTPUT;
|
||||
@Option(shortName="L", doc="The read length.", optional=false)
|
||||
public Integer READ_LENGTH;
|
||||
@Option(shortName="S", doc="Stringency of the alignment.", optional=true)
|
||||
public Aligner.Stringency STRINGENCY;
|
||||
@Option(shortName="R", doc="Directory where the reference file is located.", optional=true)
|
||||
public String REFERENCE;
|
||||
@Option(shortName="C", doc="Clip points for the alignment.", optional=true, minElements=0, maxElements=4)
|
||||
public List<Integer> CLIP_POINT = new ArrayList<Integer>();
|
||||
@Option(shortName="E", doc="Expected insert size.", optional=true)
|
||||
public Integer EXPECTED_INSERT_SIZE;
|
||||
@Option(doc="Whether this is a paired-end run.", optional=false)
|
||||
public Boolean PE;
|
||||
@Option(shortName="NUM", doc="Number of reads to align (null = all).", optional=true)
|
||||
public Integer READS_TO_ALIGN;
|
||||
@Option(shortName="CUSTOM", doc="Custom parameter in the form name=value.", optional=true)
|
||||
public List<String> CUSTOM_PARAMETER = new ArrayList<String>();
|
||||
@Option(shortName="PREP", doc="Whether to prepare inputs for the alignement.", optional=true)
|
||||
public Boolean PREPARE = true;
|
||||
@Option(doc="Whether to do the alignement.", optional=true)
|
||||
public Boolean ALIGN = true;
|
||||
@Option(shortName="BAM", doc="Whether to generate a BAM file from the alignment output.", optional=true)
|
||||
public Boolean BAM_OUTPUT = true;
|
||||
@Option(doc="Whether to clean up intermediate input and output.", optional=true)
|
||||
public Boolean CLEANUP = true;
|
||||
|
||||
protected int doWork() {
|
||||
int clipPoints[] = null;
|
||||
if (CLIP_POINT != null) {
|
||||
clipPoints = new int[4];
|
||||
int index=0;
|
||||
for (Integer i : CLIP_POINT) {
|
||||
clipPoints[index++] = i;
|
||||
}
|
||||
}
|
||||
Map<String, String> params = null;
|
||||
if (CUSTOM_PARAMETER != null) {
|
||||
params = new HashMap<String, String>();
|
||||
for (String param : CUSTOM_PARAMETER) {
|
||||
String nameAndVal[] = param.split("=");
|
||||
params.put(nameAndVal[0], nameAndVal[1]);
|
||||
}
|
||||
}
|
||||
Aligner aligner = new MaqAligner(STRINGENCY, INPUT, OUTPUT, REFERENCE, clipPoints,
|
||||
EXPECTED_INSERT_SIZE, READS_TO_ALIGN, params, PE, READ_LENGTH);
|
||||
if (PREPARE) {
|
||||
aligner.prepareInputs();
|
||||
}
|
||||
if (ALIGN) {
|
||||
aligner.align();
|
||||
}
|
||||
if (BAM_OUTPUT) {
|
||||
aligner.prepareOutput();
|
||||
}
|
||||
if (CLEANUP) {
|
||||
aligner.cleanup();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* This is kind of a mess. Almost everything is optional, since you don't have to do all of the steps in the
|
||||
* alignement.
|
||||
* @return
|
||||
*/
|
||||
protected boolean customCommandLineValidation() {
|
||||
if (PREPARE) {
|
||||
if( INPUT == null) {
|
||||
System.err.println("ERROR: INPUT must be specified when preparing inputs for the alignment.");
|
||||
return false;
|
||||
}
|
||||
if (CLIP_POINT.size() != 0 && CLIP_POINT.size() != 4) {
|
||||
System.err.println("ERROR: You must supply either 0 or 4 values for CLIP_POINT: " + CLIP_POINT.size());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (ALIGN) {
|
||||
if (STRINGENCY == null) {
|
||||
System.err.println("ERROR: STRINGENCY must be specified when doing an alignment.");
|
||||
return false;
|
||||
}
|
||||
if (REFERENCE == null) {
|
||||
System.err.println("ERROR: REFERENCE must be specified when doing an alignment.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static void main(String[] argv) {
|
||||
System.exit(new RunMaq().instanceMain(argv));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.cmdline;
|
||||
|
||||
/**
 * Unchecked exception thrown when a command line cannot be parsed.
 */
public class CommandLineParseException extends RuntimeException {

    /** Constructs an exception with neither a detail message nor a cause. */
    public CommandLineParseException() {
        super();
    }

    /** Constructs an exception with the given detail message. */
    public CommandLineParseException(final String message) {
        super(message);
    }

    /** Constructs an exception with the given detail message and underlying cause. */
    public CommandLineParseException(final String message, final Throwable cause) {
        super(message, cause);
    }

    /** Constructs an exception wrapping the given underlying cause. */
    public CommandLineParseException(final Throwable cause) {
        super(cause);
    }
}
|
||||
|
|
@ -1,638 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.cmdline;
|
||||
|
||||
import java.io.*;
|
||||
import java.lang.reflect.Constructor;
|
||||
import java.lang.reflect.Field;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.lang.reflect.ParameterizedType;
|
||||
import java.lang.reflect.Type;
|
||||
import java.util.*;
|
||||
|
||||
import edu.mit.broad.picard.util.StringUtil;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
|
||||
/**
|
||||
* Annotation-driven utility for parsing command-line arguments, checking for errors, and producing usage message.
|
||||
*
|
||||
* This class supports options of the form KEY=VALUE, plus positional arguments. Positional arguments must not contain
|
||||
* an equal sign lest they be mistaken for a KEY=VALUE pair.
|
||||
*
|
||||
* The caller must supply an object that both defines the command line and has the parsed options set into it.
|
||||
* For each possible KEY=VALUE option, there must be a public data member annotated with @Option. The KEY name is
|
||||
* the name of the data member. An abbreviated name may also be specified with the shortName attribute of @Option.
|
||||
* If the data member is a List<T>, then the option may be specified multiple times. The type of the data member,
|
||||
* or the type of the List element must either have a ctor T(String), or must be an Enum. List options must
|
||||
* be initialized by the caller with some kind of list. Any other option that is non-null is assumed to have the given
|
||||
* value as a default. If an option has no default value, and does not have the optional attribute of @Option set,
|
||||
* is required. For List options, minimum and maximum number of elements may be specified in the @Option annotation.
|
||||
*
|
||||
* A single List data member may be annotated with the @PositionalArguments. This behaves similarly to a Option
|
||||
* with List data member: the caller must initialize the data member, the type must be constructable from String, and
|
||||
* min and max number of elements may be specified. If no @PositionalArguments annotation appears in the object,
|
||||
* then it is an error for the command line to contain positional arguments.
|
||||
*
|
||||
* A single String public data member may be annotated with @Usage. This string, if present, is used to
|
||||
* construct the usage message. Details about the possible options are automatically appended to this string.
|
||||
* If @Usage does not appear, a boilerplate usage message is used.
|
||||
*/
|
||||
public class CommandLineParser {
|
||||
// For formatting option section of usage message.
|
||||
private static final int OPTION_COLUMN_WIDTH = 30;
|
||||
private static final int DESCRIPTION_COLUMN_WIDTH = 50;
|
||||
|
||||
private static final Boolean[] TRUE_FALSE_VALUES = {Boolean.TRUE, Boolean.FALSE};
|
||||
|
||||
// Use these if no @Usage annotation
|
||||
private static final String defaultUsagePreamble = "Usage: program [options...]\n";
|
||||
private static final String defaultUsagePreambleWithPositionalArguments =
|
||||
"Usage: program [options...] [positional-arguments...]\n";
|
||||
private static final String OPTIONS_FILE = "OPTIONS_FILE";
|
||||
|
||||
/**
|
||||
* A typical command line program will call this to get the beginning of the usage message,
|
||||
* and then append a description of the program, like this:
|
||||
*
|
||||
* \@Usage(programVersion=PROGRAM_VERSION)
|
||||
* public String USAGE = CommandLineParser.getStandardUsagePreamble(getClass()) + "Frobnicates the freebozzle."
|
||||
*/
|
||||
public static String getStandardUsagePreamble(Class mainClass) {
|
||||
return "USAGE: " + mainClass.getName() + " [options]\n\n";
|
||||
}
|
||||
|
||||
// This is the object that the caller has provided that contains annotations,
|
||||
// and into which the values will be assigned.
|
||||
private final Object callerOptions;
|
||||
|
||||
private String usagePreamble;
|
||||
// null if no @PositionalArguments annotation
|
||||
private Field positionalArguments;
|
||||
private int minPositionalArguments;
|
||||
private int maxPositionalArguments;
|
||||
|
||||
// List of all the data members with @Option annotation
|
||||
private final List<OptionDefinition> optionDefinitions = new ArrayList<OptionDefinition>();
|
||||
|
||||
// Maps long name, and short name, if present, to an option definition that is
|
||||
// also in the optionDefinitions list.
|
||||
private final Map<String, OptionDefinition> optionMap = new HashMap<String, OptionDefinition>();
|
||||
|
||||
// For printing error messages when parsing command line.
|
||||
private PrintStream messageStream;
|
||||
|
||||
// In case implementation wants to get at arg for some reason.
|
||||
private String[] argv;
|
||||
|
||||
|
||||
/**
|
||||
* This attribute is here just to facilitate printing usage for OPTIONS_FILE
|
||||
*/
|
||||
public File IGNORE_THIS_PROPERTY;
|
||||
|
||||
/**
 * Prepare for parsing command line arguments, by validating annotations.
 * @param callerOptions This object contains annotations that define the acceptable command-line options,
 * and ultimately will receive the settings when a command line is parsed.
 */
public CommandLineParser(final Object callerOptions) {
    this.callerOptions = callerOptions;

    // Scan every public field of the caller for our annotations. The checks are
    // independent rather than chained, mirroring the fact that each annotation
    // is processed on its own.
    for (final Field member : this.callerOptions.getClass().getFields()) {
        if (member.getAnnotation(PositionalArguments.class) != null) {
            handlePositionalArgumentAnnotation(member);
        }
        if (member.getAnnotation(Usage.class) != null) {
            handleUsageAnnotation(member);
        }
        if (member.getAnnotation(Option.class) != null) {
            handleOptionAnnotation(member);
        }
    }

    // No @Usage annotation found: fall back to the appropriate boilerplate preamble.
    if (usagePreamble == null) {
        usagePreamble = (positionalArguments == null)
                ? defaultUsagePreamble
                : defaultUsagePreambleWithPositionalArguments;
    }
}
|
||||
|
||||
/**
 * Print a usage message based on the options object passed to the ctor.
 * @param stream Where to write the usage message.
 */
public void usage(final PrintStream stream) {
    stream.print(usagePreamble);
    if (!optionDefinitions.isEmpty()) {
        stream.println("\nOptions:\n");
        for (final OptionDefinition optionDefinition : optionDefinitions) {
            printOptionUsage(stream, optionDefinition);
        }
    }
    // Synthesize a pseudo option definition for OPTIONS_FILE (which is handled
    // specially in parseOptions rather than via an annotated field) purely so its
    // usage text prints in the same format as the real options. The dummy
    // IGNORE_THIS_PROPERTY field exists only to satisfy the OptionDefinition ctor.
    final Field fileField;
    try {
        fileField = getClass().getField("IGNORE_THIS_PROPERTY");
    } catch (NoSuchFieldException e) {
        // The field is declared on this class, so this cannot happen in practice.
        throw new PicardException("Should never happen", e);
    }
    final OptionDefinition optionsFileOptionDefinition =
            new OptionDefinition(fileField, OPTIONS_FILE, "",
            "File of OPTION_NAME=value pairs. No positional parameters allowed. Unlike command-line options, " +
            "unrecognized options are ignored. " + "A single-valued option set in an options file may be overridden " +
            "by a subsequent command-line option. " +
            "A line starting with '#' is considered a comment.", false, true, 0, Integer.MAX_VALUE, null, new String[0]);
    printOptionUsage(stream, optionsFileOptionDefinition);
}
|
||||
|
||||
/**
|
||||
* Parse command-line options, and store values in callerOptions object passed to ctor.
|
||||
* @param messageStream Where to write error messages.
|
||||
* @param args Command line tokens.
|
||||
* @return true if command line is valid.
|
||||
*/
|
||||
public boolean parseOptions(final PrintStream messageStream, final String[] args) {
|
||||
this.argv = args;
|
||||
this.messageStream = messageStream;
|
||||
for (final String arg: args) {
|
||||
if (arg.equals("-h") || arg.equals("--help")) {
|
||||
usage(messageStream);
|
||||
return false;
|
||||
}
|
||||
final String[] pair = arg.split("=", 2);
|
||||
if (pair.length == 2) {
|
||||
if (pair[0].equals(OPTIONS_FILE)) {
|
||||
if (!parseOptionsFile(pair[1])) {
|
||||
messageStream.println();
|
||||
usage(messageStream);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (!parseOption(pair[0], pair[1], false)) {
|
||||
messageStream.println();
|
||||
usage(messageStream);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else if (!parsePositionalArgument(arg)) {
|
||||
messageStream.println();
|
||||
usage(messageStream);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!checkNumArguments()) {
|
||||
messageStream.println();
|
||||
usage(messageStream);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* After command line has been parsed, make sure that all required options have values, and that
|
||||
* lists with minimum # of elements have sufficient.
|
||||
* @return true if valid
|
||||
*/
|
||||
private boolean checkNumArguments() {
|
||||
try {
|
||||
for (final OptionDefinition optionDefinition : optionDefinitions) {
|
||||
StringBuilder mutextOptionNames = new StringBuilder();
|
||||
for (String mutexOption : optionDefinition.mutuallyExclusive) {
|
||||
OptionDefinition mutextOptionDef = optionMap.get(mutexOption);
|
||||
if (mutextOptionDef != null && mutextOptionDef.hasBeenSet) {
|
||||
mutextOptionNames.append(" ").append(mutextOptionDef.name);
|
||||
}
|
||||
}
|
||||
if (optionDefinition.hasBeenSet && mutextOptionNames.length() > 0) {
|
||||
messageStream.println("ERROR: Option '" + optionDefinition.name +
|
||||
"' cannot be used in conjunction with option(s)" +
|
||||
mutextOptionNames.toString());
|
||||
return false;
|
||||
}
|
||||
if (optionDefinition.isCollection) {
|
||||
final Collection c = (Collection)optionDefinition.field.get(callerOptions);
|
||||
if (c.size() < optionDefinition.minElements) {
|
||||
messageStream.println("ERROR: Option '" + optionDefinition.name + "' must be specified at least " +
|
||||
optionDefinition.minElements + " times.");
|
||||
return false;
|
||||
}
|
||||
} else if (!optionDefinition.optional && !optionDefinition.hasBeenSet && mutextOptionNames.length() == 0) {
|
||||
messageStream.print("ERROR: Option '" + optionDefinition.name + "' is required");
|
||||
if (optionDefinition.mutuallyExclusive.isEmpty()) {
|
||||
messageStream.println(".");
|
||||
} else {
|
||||
messageStream.println(" unless any of " + optionDefinition.mutuallyExclusive + " are specified.");
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (positionalArguments != null) {
|
||||
final Collection c = (Collection)positionalArguments.get(callerOptions);
|
||||
if (c.size() < minPositionalArguments) {
|
||||
messageStream.println("ERROR: At least " + minPositionalArguments +
|
||||
" positional arguments must be specified.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} catch (IllegalAccessException e) {
|
||||
// Should never happen because lack of publicness has already been checked.
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean parsePositionalArgument(final String stringValue) {
|
||||
if (positionalArguments == null) {
|
||||
messageStream.println("ERROR: Invalid argument '" + stringValue + "'.");
|
||||
return false;
|
||||
}
|
||||
final Object value;
|
||||
try {
|
||||
value = constructFromString(getUnderlyingType(positionalArguments), stringValue);
|
||||
} catch (CommandLineParseException e) {
|
||||
messageStream.println("ERROR: " + e.getMessage());
|
||||
return false;
|
||||
}
|
||||
final Collection c;
|
||||
try {
|
||||
c = (Collection)positionalArguments.get(callerOptions);
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
if (c.size() >= maxPositionalArguments) {
|
||||
messageStream.println("ERROR: No more than " + maxPositionalArguments +
|
||||
" positional arguments may be specified on the command line.");
|
||||
return false;
|
||||
}
|
||||
c.add(value);
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean parseOption(String key, final String stringValue, final boolean optionsFile) {
|
||||
key = key.toUpperCase();
|
||||
final OptionDefinition optionDefinition = optionMap.get(key);
|
||||
if (optionDefinition == null) {
|
||||
if (optionsFile) {
|
||||
// Silently ignore unrecognized option from options file
|
||||
return true;
|
||||
}
|
||||
messageStream.println("ERROR: Unrecognized option: " + key);
|
||||
return false;
|
||||
}
|
||||
if (!optionDefinition.isCollection) {
|
||||
if (optionDefinition.hasBeenSet && !optionDefinition.hasBeenSetFromOptionsFile) {
|
||||
messageStream.println("ERROR: Option '" + key + "' cannot be specified more than once.");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
final Object value;
|
||||
try {
|
||||
value = constructFromString(getUnderlyingType(optionDefinition.field), stringValue);
|
||||
} catch (CommandLineParseException e) {
|
||||
messageStream.println("ERROR: " + e.getMessage());
|
||||
return false;
|
||||
}
|
||||
try {
|
||||
if (optionDefinition.isCollection) {
|
||||
final Collection c = (Collection)optionDefinition.field.get(callerOptions);
|
||||
if (c.size() >= optionDefinition.maxElements) {
|
||||
messageStream.println("ERROR: Option '" + key + "' cannot be used more than " +
|
||||
optionDefinition.maxElements + " times.");
|
||||
return false;
|
||||
}
|
||||
c.add(value);
|
||||
} else {
|
||||
optionDefinition.field.set(callerOptions, value);
|
||||
optionDefinition.hasBeenSet = true;
|
||||
optionDefinition.hasBeenSetFromOptionsFile = optionsFile;
|
||||
}
|
||||
} catch (IllegalAccessException e) {
|
||||
// Should never happen because we only iterate through public fields.
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parsing of options from file is looser than normal. Any unrecognized options are
|
||||
* ignored, and a single-valued option that is set in a file may be overridden by a
|
||||
* subsequent appearance of that option.
|
||||
* A line that starts with '#' is ignored.
|
||||
* @param optionsFile
|
||||
* @return false if a fatal error occurred
|
||||
*/
|
||||
private boolean parseOptionsFile(final String optionsFile) {
|
||||
try {
|
||||
final BufferedReader reader = new BufferedReader(new FileReader(optionsFile));
|
||||
String line;
|
||||
while ((line = reader.readLine()) != null) {
|
||||
if (line.startsWith("#")) {
|
||||
continue;
|
||||
}
|
||||
final String[] pair = line.split("=", 2);
|
||||
if (pair.length == 2) {
|
||||
if (!parseOption(pair[0], pair[1], true)) {
|
||||
messageStream.println();
|
||||
usage(messageStream);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
messageStream.println("Strange line in OPTIONS_FILE " + optionsFile + ": " + line);
|
||||
usage(messageStream);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
reader.close();
|
||||
return true;
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new PicardException("I/O error loading OPTIONS_FILE=" + optionsFile, e);
|
||||
}
|
||||
}
|
||||
|
||||
private void printOptionUsage(final PrintStream stream, final OptionDefinition optionDefinition) {
|
||||
final String type = getUnderlyingType(optionDefinition.field).getSimpleName();
|
||||
String optionLabel = optionDefinition.name + "=" + type;
|
||||
stream.print(optionLabel);
|
||||
if (optionDefinition.shortName.length() > 0) {
|
||||
stream.println();
|
||||
}
|
||||
if (optionDefinition.shortName.length() > 0) {
|
||||
optionLabel = optionDefinition.shortName + "=" + type;
|
||||
stream.print(optionLabel);
|
||||
}
|
||||
int numSpaces = OPTION_COLUMN_WIDTH - optionLabel.length();
|
||||
if (optionLabel.length() > OPTION_COLUMN_WIDTH) {
|
||||
stream.println();
|
||||
numSpaces = OPTION_COLUMN_WIDTH;
|
||||
}
|
||||
printSpaces(stream, numSpaces);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
if (optionDefinition.doc.length() > 0) {
|
||||
sb.append(optionDefinition.doc);
|
||||
sb.append(" ");
|
||||
}
|
||||
if (optionDefinition.optional && !optionDefinition.isCollection) {
|
||||
sb.append("Default value: ");
|
||||
sb.append(optionDefinition.defaultValue);
|
||||
sb.append(". ");
|
||||
} else if (!optionDefinition.isCollection){
|
||||
sb.append("Required. ");
|
||||
}
|
||||
Object[] enumConstants = getUnderlyingType(optionDefinition.field).getEnumConstants();
|
||||
if (enumConstants == null && getUnderlyingType(optionDefinition.field) == Boolean.class) {
|
||||
enumConstants = TRUE_FALSE_VALUES;
|
||||
}
|
||||
if (enumConstants != null) {
|
||||
sb.append("Possible values: {");
|
||||
for (int i = 0; i < enumConstants.length; ++i) {
|
||||
if (i > 0) {
|
||||
sb.append(", ");
|
||||
}
|
||||
sb.append(enumConstants[i].toString());
|
||||
}
|
||||
sb.append("} ");
|
||||
}
|
||||
if (optionDefinition.isCollection) {
|
||||
if (optionDefinition.minElements == 0) {
|
||||
if (optionDefinition.maxElements == Integer.MAX_VALUE) {
|
||||
sb.append("This option may be specified 0 or more times.");
|
||||
} else {
|
||||
sb.append("This option must be specified no more than " + optionDefinition.maxElements + "times.");
|
||||
}
|
||||
} else if (optionDefinition.maxElements == Integer.MAX_VALUE) {
|
||||
sb.append("This option must be specified at least " + optionDefinition.minElements + " times.");
|
||||
} else {
|
||||
sb.append("This option may be specified between " + optionDefinition.minElements +
|
||||
" and " + optionDefinition.maxElements + " times.");
|
||||
}
|
||||
}
|
||||
if (!optionDefinition.mutuallyExclusive.isEmpty()) {
|
||||
sb.append(" Cannot be used in conjuction with option(s)");
|
||||
for (String option : optionDefinition.mutuallyExclusive) {
|
||||
OptionDefinition mutextOptionDefinition = optionMap.get(option);
|
||||
sb.append(" ").append(mutextOptionDefinition.name);
|
||||
if (mutextOptionDefinition.shortName.length() > 0) {
|
||||
sb.append(" (").append(mutextOptionDefinition.shortName).append(")");
|
||||
}
|
||||
}
|
||||
}
|
||||
final String wrappedDescription = StringUtil.wordWrap(sb.toString(), DESCRIPTION_COLUMN_WIDTH);
|
||||
final String[] descriptionLines = wrappedDescription.split("\n");
|
||||
for (int i = 0; i < descriptionLines.length; ++i) {
|
||||
if (i > 0) {
|
||||
printSpaces(stream, OPTION_COLUMN_WIDTH);
|
||||
}
|
||||
stream.println(descriptionLines[i]);
|
||||
}
|
||||
stream.println();
|
||||
}
|
||||
|
||||
private void printSpaces(final PrintStream stream, final int numSpaces) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < numSpaces; ++i) {
|
||||
sb.append(" ");
|
||||
}
|
||||
stream.print(sb);
|
||||
}
|
||||
|
||||
private void handleOptionAnnotation(final Field field) {
|
||||
try {
|
||||
final Option optionAnnotation = field.getAnnotation(Option.class);
|
||||
final boolean isCollection = isCollectionField(field);
|
||||
if (isCollection) {
|
||||
if (optionAnnotation.maxElements() == 0) {
|
||||
throw new CommandLineParserDefinitionException("@Option member " + field.getName() +
|
||||
"has maxElements = 0");
|
||||
}
|
||||
if (optionAnnotation.minElements() > optionAnnotation.maxElements()) {
|
||||
throw new CommandLineParserDefinitionException("In @Option member " + field.getName() +
|
||||
", minElements cannot be > maxElements");
|
||||
}
|
||||
}
|
||||
if (!canBeMadeFromString(getUnderlyingType(field))) {
|
||||
throw new CommandLineParserDefinitionException("@Option member " + field.getName() +
|
||||
" must have a String ctor or be an enum");
|
||||
}
|
||||
|
||||
final OptionDefinition optionDefinition = new OptionDefinition(field,
|
||||
field.getName(),
|
||||
optionAnnotation.shortName(),
|
||||
optionAnnotation.doc(), optionAnnotation.optional() || (field.get(callerOptions) != null),
|
||||
isCollection, optionAnnotation.minElements(),
|
||||
optionAnnotation.maxElements(), field.get(callerOptions),
|
||||
optionAnnotation.mutex());
|
||||
|
||||
for (String option : optionAnnotation.mutex()) {
|
||||
OptionDefinition mutextOptionDef = optionMap.get(option);
|
||||
if (mutextOptionDef != null) {
|
||||
mutextOptionDef.mutuallyExclusive.add(field.getName());
|
||||
}
|
||||
}
|
||||
if (optionMap.containsKey(optionDefinition.name)) {
|
||||
throw new CommandLineParserDefinitionException(optionDefinition.name + " has already been used");
|
||||
}
|
||||
optionMap.put(optionDefinition.name, optionDefinition);
|
||||
if (optionDefinition.shortName.length() > 0) {
|
||||
if (optionMap.containsKey(optionDefinition.shortName)) {
|
||||
throw new CommandLineParserDefinitionException(optionDefinition.shortName + " has already been used");
|
||||
}
|
||||
optionMap.put(optionDefinition.shortName, optionDefinition);
|
||||
}
|
||||
optionDefinitions.add(optionDefinition);
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new CommandLineParserDefinitionException(field.getName() +
|
||||
" must have public visibility to have @Option annotation");
|
||||
}
|
||||
}
|
||||
|
||||
private void handleUsageAnnotation(final Field field) {
|
||||
if (usagePreamble != null) {
|
||||
throw new CommandLineParserDefinitionException
|
||||
("@Usage cannot be used more than once in an option class.");
|
||||
}
|
||||
try {
|
||||
usagePreamble = (String)field.get(callerOptions);
|
||||
final Usage usageAnnotation = field.getAnnotation(Usage.class);
|
||||
if (usageAnnotation.programVersion().length() > 0) {
|
||||
usagePreamble += "Version: " + usageAnnotation.programVersion() + "\n";
|
||||
}
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new CommandLineParserDefinitionException("@Usage data member must be public");
|
||||
} catch (ClassCastException e) {
|
||||
throw new CommandLineParserDefinitionException
|
||||
("@Usage can only be applied to a String data member.");
|
||||
}
|
||||
}
|
||||
|
||||
private void handlePositionalArgumentAnnotation(final Field field) {
|
||||
if (positionalArguments != null) {
|
||||
throw new CommandLineParserDefinitionException
|
||||
("@PositionalArguments cannot be used more than once in an option class.");
|
||||
}
|
||||
positionalArguments = field;
|
||||
if (!isCollectionField(field)) {
|
||||
throw new CommandLineParserDefinitionException("@PositionalArguments must be applied to a Collection");
|
||||
}
|
||||
|
||||
if (!canBeMadeFromString(getUnderlyingType(field))) {
|
||||
throw new CommandLineParserDefinitionException("@PositionalParameters member " + field.getName() +
|
||||
"does not have a String ctor");
|
||||
}
|
||||
|
||||
final PositionalArguments positionalArgumentsAnnotation = field.getAnnotation(PositionalArguments.class);
|
||||
minPositionalArguments = positionalArgumentsAnnotation.minElements();
|
||||
maxPositionalArguments = positionalArgumentsAnnotation.maxElements();
|
||||
if (minPositionalArguments > maxPositionalArguments) {
|
||||
throw new CommandLineParserDefinitionException("In @PositionalArguments, minElements cannot be > maxElements");
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isCollectionField(final Field field) {
|
||||
try {
|
||||
field.getType().asSubclass(Collection.class);
|
||||
return true;
|
||||
} catch (ClassCastException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private Class getUnderlyingType(final Field field) {
|
||||
if (isCollectionField(field)) {
|
||||
final ParameterizedType clazz = (ParameterizedType)(field.getGenericType());
|
||||
final Type[] genericTypes = clazz.getActualTypeArguments();
|
||||
if (genericTypes.length != 1) {
|
||||
throw new CommandLineParserDefinitionException("Strange collection type for field " + field.getName());
|
||||
}
|
||||
return (Class)genericTypes[0];
|
||||
|
||||
} else {
|
||||
return field.getType();
|
||||
}
|
||||
}
|
||||
|
||||
// True if clazz is an enum, or if it has a ctor that takes a single String argument.
|
||||
private boolean canBeMadeFromString(final Class clazz) {
|
||||
if (clazz.isEnum()) {
|
||||
return true;
|
||||
}
|
||||
try {
|
||||
clazz.getConstructor(String.class);
|
||||
return true;
|
||||
} catch (NoSuchMethodException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private Object constructFromString(final Class clazz, final String s) {
|
||||
try {
|
||||
if (clazz.isEnum()) {
|
||||
try {
|
||||
return Enum.valueOf(clazz, s);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new CommandLineParseException("'" + s + "' is not a valid value for " +
|
||||
clazz.getSimpleName() + ".", e);
|
||||
}
|
||||
}
|
||||
final Constructor ctor = clazz.getConstructor(String.class);
|
||||
return ctor.newInstance(s);
|
||||
} catch (NoSuchMethodException e) {
|
||||
// Shouldn't happen because we've checked for presence of ctor
|
||||
throw new CommandLineParseException(e);
|
||||
} catch (InstantiationException e) {
|
||||
throw new CommandLineParseException("Abstract class '" + clazz.getSimpleName() +
|
||||
"'cannot be used for an option value type.", e);
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new CommandLineParseException("String constructor for option value type '" + clazz.getSimpleName() +
|
||||
"' must be public.", e);
|
||||
} catch (InvocationTargetException e) {
|
||||
throw new CommandLineParseException("Problem constructing " + clazz.getSimpleName() + " from the string '" + s + "'.",
|
||||
e.getCause());
|
||||
}
|
||||
}
|
||||
|
||||
public String[] getArgv() {
|
||||
return argv;
|
||||
}
|
||||
|
||||
private class OptionDefinition {
|
||||
final Field field;
|
||||
final String name;
|
||||
final String shortName;
|
||||
final String doc;
|
||||
final boolean optional;
|
||||
final boolean isCollection;
|
||||
final int minElements;
|
||||
final int maxElements;
|
||||
final String defaultValue;
|
||||
boolean hasBeenSet = false;
|
||||
boolean hasBeenSetFromOptionsFile = false;
|
||||
Set<String> mutuallyExclusive;
|
||||
|
||||
private OptionDefinition(final Field field, final String name, final String shortName, final String doc, final boolean optional, final boolean collection,
|
||||
final int minElements, final int maxElements, final Object defaultValue, String[] mutuallyExclusive) {
|
||||
this.field = field;
|
||||
this.name = name.toUpperCase();
|
||||
this.shortName = shortName.toUpperCase();
|
||||
this.doc = doc;
|
||||
this.optional = optional;
|
||||
isCollection = collection;
|
||||
this.minElements = minElements;
|
||||
this.maxElements = maxElements;
|
||||
if (defaultValue != null) {
|
||||
this.defaultValue = defaultValue.toString();
|
||||
} else {
|
||||
this.defaultValue = "null";
|
||||
}
|
||||
this.mutuallyExclusive = new HashSet<String>(Arrays.asList(mutuallyExclusive));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.cmdline;
|
||||
|
||||
/**
 * Thrown when the declaration of an options class is malformed -- for example a
 * bad {@code Option}, {@code Usage} or {@code PositionalArguments} annotation --
 * as opposed to a bad command line supplied by the user.
 */
public class CommandLineParserDefinitionException extends RuntimeException {
    public CommandLineParserDefinitionException() {
        super();
    }

    public CommandLineParserDefinitionException(final String s) {
        super(s);
    }

    public CommandLineParserDefinitionException(final String s, final Throwable throwable) {
        super(s, throwable);
    }

    public CommandLineParserDefinitionException(final Throwable throwable) {
        super(throwable);
    }
}
|
||||
|
|
@ -1,141 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.cmdline;
|
||||
|
||||
import edu.mit.broad.picard.util.Log;
|
||||
import edu.mit.broad.picard.util.StringUtil;
|
||||
import edu.mit.broad.picard.metrics.Header;
|
||||
import edu.mit.broad.picard.metrics.StringHeader;
|
||||
import edu.mit.broad.picard.metrics.MetricsFile;
|
||||
import edu.mit.broad.picard.metrics.MetricBase;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Abstract class to facilitate writing command-line programs.
|
||||
*
|
||||
* To use:
|
||||
*
|
||||
* 1. Extend this class with a concrete class that has data members annotated with @Option, @PositionalArguments
|
||||
* and/or @Usage annotations.
|
||||
*
|
||||
* 2. If there is any custom command-line validation, override customCommandLineValidation(). When this method is
|
||||
* called, the command line has been parsed and set into the data members of the concrete class.
|
||||
*
|
||||
* 3. Implement a method doWork(). This is called after successful comand-line processing. The value it returns is
|
||||
* the exit status of the program. It is assumed that the concrete class emits any appropriate error message before
|
||||
* returning non-zero. doWork() may throw unchecked exceptions, which are caught and reported appropriately.
|
||||
*
|
||||
* 4. Implement the following static method in the concrete class:
|
||||
*
|
||||
* public static void main(String[] argv) {
|
||||
System.exit(new MyConcreteClass().instanceMain(argv));
|
||||
}
|
||||
|
||||
|
||||
*/
|
||||
public abstract class CommandLineProgram {
|
||||
|
||||
@Option
|
||||
public File TMP_DIR = new File(System.getProperty("java.io.tmpdir"), System.getProperty("user.name"));
|
||||
|
||||
@Option(doc = "Control verbosity of logging")
|
||||
public Log.LogLevel VERBOSITY = Log.LogLevel.INFO;
|
||||
|
||||
@Option(doc = "Whether to suppress job-summary info on System.out")
|
||||
public Boolean QUIET = false;
|
||||
|
||||
private final String standardUsagePreamble = CommandLineParser.getStandardUsagePreamble(getClass());
|
||||
|
||||
/**
|
||||
* Initialized in parseArgs. Subclasses may want to access this to do
|
||||
* their own validation, and then print usage using clp.
|
||||
*/
|
||||
protected CommandLineParser clp;
|
||||
|
||||
private final List<Header> defaultHeaders = new ArrayList<Header>();
|
||||
|
||||
/**
|
||||
* Do the work after command line has been parsed.
|
||||
* RuntimeException may be thrown by this method, and are reported appropriately.
|
||||
* @return program exit status.
|
||||
*/
|
||||
protected abstract int doWork();
|
||||
|
||||
public int instanceMain(final String[] argv) {
|
||||
// Build the default headers
|
||||
final Date startDate = new Date();
|
||||
final String cmdline = getClass().getName() + " " + StringUtil.join(" ", argv);
|
||||
this.defaultHeaders.add(new StringHeader(cmdline));
|
||||
this.defaultHeaders.add(new StringHeader("Started on: " + startDate));
|
||||
|
||||
if (!parseArgs(argv)) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
Log.setGlobalLogLevel(VERBOSITY);
|
||||
|
||||
if (!TMP_DIR.exists()) {
|
||||
// Intentially not checking the return value, because it may be that the program does not
|
||||
// need a tmp_dir. If this fails, the problem will be discovered downstream.
|
||||
TMP_DIR.mkdir();
|
||||
}
|
||||
System.setProperty("java.io.tmpdir", TMP_DIR.getAbsolutePath());
|
||||
if (!QUIET) {
|
||||
System.out.println("[" + new Date() + "] " + cmdline);
|
||||
}
|
||||
final int ret = doWork();
|
||||
if (!QUIET) {
|
||||
System.out.println("[" + new Date() + "] " + getClass().getName() + " done.");
|
||||
System.out.println("Runtime.totalMemory()=" + Runtime.getRuntime().totalMemory());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Put any custom command-line validation in an override of this method.
|
||||
* clp is initialized at this point and can be used to print usage and access argv.
|
||||
* Any options set by command-line parser can be validated.
|
||||
* @return true if command line is valid.
|
||||
*/
|
||||
protected boolean customCommandLineValidation() {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return true if command line is valid
|
||||
*/
|
||||
protected boolean parseArgs(final String[] argv) {
|
||||
clp = new CommandLineParser(this);
|
||||
final boolean ret = clp.parseOptions(System.err, argv);
|
||||
if (!ret) {
|
||||
return false;
|
||||
}
|
||||
return customCommandLineValidation();
|
||||
}
|
||||
|
||||
/** Gets a MetricsFile with default headers already written into it. */
|
||||
protected <A extends MetricBase,B extends Comparable> MetricsFile<A,B> getMetricsFile() {
|
||||
final MetricsFile<A,B> file = new MetricsFile<A,B>();
|
||||
for (final Header h : this.defaultHeaders) {
|
||||
file.addHeader(h);
|
||||
}
|
||||
|
||||
return file;
|
||||
}
|
||||
|
||||
public String getStandardUsagePreamble() {
|
||||
return standardUsagePreamble;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,39 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.cmdline;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/** Static helpers for command-line programs: common split patterns and checked file access. */
public class CommandLineUtils {
    /** Regex for splitting on spaces. */
    public static final Pattern SPACE_SPLITTER = Pattern.compile(" ");

    /** Regex for splitting on tabs. */
    public static final Pattern TAB_SPLITTER = Pattern.compile("\\t");

    /** Utility class: not instantiable.  (Fixed: previously had an implicit public ctor.) */
    private CommandLineUtils() {
    }

    /** Checks that a file exists and is readable, and then returns a buffered reader for it. */
    public static BufferedReader getReader(final File file) throws IOException {
        return new BufferedReader(new InputStreamReader(getInputStream(file)));
    }

    /** Checks that a file exists and is readable, and then returns an input stream for it. */
    public static InputStream getInputStream(final File file) throws IOException {
        if (!file.exists()) {
            throw new RuntimeException("Specified file does not exist: " + file);
        }

        if (!file.canRead()) {
            throw new RuntimeException("Specified file is not readable: " + file);
        }

        return new FileInputStream(file);
    }
}
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.cmdline;
|
||||
|
||||
import java.lang.annotation.Documented;
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Used to annotate which fields of a CommandLineProgram are options given at the command line.
 * If a command line call looks like "cmd option=foo x=y bar baz" the CommandLineProgram
 * would have annotations on fields to handle the values of option and x.  All options
 * must be given in name=value form on the command line.  The java type of the option
 * is inferred from the type of the field, or from the generic type of the collection
 * if the option may appear more than once.  The type must be an enum or
 * have a constructor with a single String parameter.
 *
 * @author Alec Wysoker
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
@Documented
public @interface Option {
    /** An abbreviated name, usable on the command line in place of the field name. */
    String shortName() default "";

    /** Text that appears for this option in text describing usage of the command line program. */
    String doc() default "";

    /**
     * If set to false, an exception will be thrown if the option is not specified.
     * If two options are mutually exclusive and both have optional=false it will be
     * interpreted as one or the other being required, and an exception is thrown only
     * if neither is specified.
     */
    boolean optional() default false;

    /**
     * Array of option names that cannot be used in conjunction with this one.
     * If two options are mutually exclusive and both have optional=false it will be
     * interpreted as one OR the other being required, and an exception is thrown only
     * if neither is specified.
     */
    String[] mutex() default {};

    /** The minimum number of times that this option is required. */
    int minElements() default 0;

    /** The maximum number of times this option is allowed. */
    int maxElements() default Integer.MAX_VALUE;
}
|
||||
|
|
@ -1,38 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.cmdline;
|
||||
|
||||
import java.lang.annotation.Documented;
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Used to annotate which field of a CommandLineProgram stores parameters given at the
 * command line that are not options.  A field with this annotation must be a Collection
 * (and probably should be a List if order is important).
 * If a command line call looks like "cmd option=foo x=y bar baz", the values "bar" and
 * "baz" are added to the annotated collection.  The java type of the arguments is
 * inferred from the generic type of the collection.  The type must be an enum or
 * have a constructor with a single String parameter.
 *
 * @author Alec Wysoker
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
@Documented
public @interface PositionalArguments {
    /** The minimum number of arguments required. */
    int minElements() default 0;

    /** The maximum number of arguments allowed. */
    int maxElements() default Integer.MAX_VALUE;
}
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.cmdline;
|
||||
|
||||
import java.lang.annotation.Documented;
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Annotates the field whose String value is displayed as the preamble of a
 * usage message.
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.FIELD)
@Documented
public @interface Usage {
    /** Optional program version appended to the usage preamble. */
    String programVersion() default "";
}
|
||||
|
|
@ -1,62 +0,0 @@
|
|||
package edu.mit.broad.picard.directed;
|
||||
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.util.BasicTextFileParser;
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
import edu.mit.broad.picard.util.FormatUtil;
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Converts an arachne style map file to the new interval list format.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class ArachneMapToIntervalList extends CommandLineProgram {
|
||||
@Option(shortName="M", doc="The path to an archne style map file") public File MAP;
|
||||
@Option(shortName="SD", doc="A sequence dictionary in SAM or BAM format") public File SEQUENCE_DICTIONARY;
|
||||
@Option(shortName="O", doc="The output file to write the interval list to") public File OUTPUT;
|
||||
@Option(shortName="P", doc="Prefix to use when generating names") public String PREFIX;
|
||||
|
||||
/** Stock main method. */
|
||||
public static void main(String[] argv) {
|
||||
System.exit(new ArachneMapToIntervalList().instanceMain(argv));
|
||||
}
|
||||
|
||||
protected int doWork() {
|
||||
IoUtil.assertFileIsReadable(MAP);
|
||||
IoUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
|
||||
IoUtil.assertFileIsWritable(OUTPUT);
|
||||
|
||||
SAMFileReader sam = new SAMFileReader(SEQUENCE_DICTIONARY);
|
||||
SAMFileHeader header = sam.getFileHeader();
|
||||
List<SAMSequenceRecord> seqs = header.getSequences();
|
||||
IntervalList list = new IntervalList(header);
|
||||
|
||||
BasicTextFileParser parser = new BasicTextFileParser(true, 3, MAP);
|
||||
FormatUtil format = new FormatUtil();
|
||||
int i=1;
|
||||
|
||||
while (parser.hasNext()) {
|
||||
String[] fields = parser.next();
|
||||
int seqIndex = format.parseInt(fields[0]);
|
||||
int start = format.parseInt(fields[1]) + 1;
|
||||
int end = format.parseInt(fields[2]) + 1;
|
||||
String seq = seqs.get(seqIndex).getSequenceName();
|
||||
|
||||
Interval interval = new Interval(seq, start, end, false, PREFIX + "_" + i++);
|
||||
list.add(interval);
|
||||
}
|
||||
|
||||
list.sort();
|
||||
list.write(OUTPUT);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
package edu.mit.broad.picard.directed;
|
||||
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.metrics.MetricsFile;
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Calculates a set of HS metrics from a sam or bam file.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class CalculateHsMetrics extends CommandLineProgram {
|
||||
@Usage public final String USAGE =
|
||||
"Calculates a set of Hybrid Selection specific metrics from an aligned SAM" +
|
||||
"or BAM file.";
|
||||
@Option(shortName="BI") public File BAIT_INTERVALS;
|
||||
@Option(shortName="TI") public File TARGET_INTERVALS;
|
||||
@Option(shortName="I") public File INPUT;
|
||||
@Option(shortName="M") public File METRICS_FILE;
|
||||
|
||||
/** Stock main method. */
|
||||
public static void main(String[] argv) {
|
||||
System.exit(new CalculateHsMetrics().instanceMain(argv));
|
||||
}
|
||||
|
||||
/**
|
||||
* Asserts that files are readable and writable and then fires off an
|
||||
* HsMetricsCalculator instance to do the real work.
|
||||
*/
|
||||
protected int doWork() {
|
||||
IoUtil.assertFileIsReadable(BAIT_INTERVALS);
|
||||
IoUtil.assertFileIsReadable(TARGET_INTERVALS);
|
||||
IoUtil.assertFileIsReadable(INPUT);
|
||||
IoUtil.assertFileIsWritable(METRICS_FILE);
|
||||
|
||||
HsMetricsCalculator calculator = new HsMetricsCalculator(BAIT_INTERVALS, TARGET_INTERVALS);
|
||||
SAMFileReader sam = new SAMFileReader(INPUT);
|
||||
calculator.analyze(sam.iterator());
|
||||
|
||||
MetricsFile<HsMetrics, Integer> metrics = getMetricsFile();
|
||||
metrics.addMetric(calculator.getMetrics());
|
||||
|
||||
metrics.write(METRICS_FILE);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,52 +0,0 @@
|
|||
package edu.mit.broad.picard.directed;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.util.BitSet;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
 * Utility class to store coordinates of interest in per-sequence bitmasks.
 *
 * Bitmasks are keyed by sequence (contig) index in a sorted map so that
 * {@link #getMaxContig()} can simply return the last key.
 */
public class GenomeMask {

    // if memory usage becomes a problem... this could be changed to a SparseBitSet
    // http://java.sun.com/developer/onlineTraining/collections/magercises/BitSet/index.html
    private SortedMap<Integer, BitSet> data = new TreeMap<Integer, BitSet>();


    public GenomeMask() {
    }

    /** Returns true if the given position on the given contig is set in the mask. */
    public boolean get(int contig, int position) {
        final BitSet mask = this.data.get(contig);
        if (mask == null) {
            return false;
        }
        return mask.get(position);
    }

    /** Returns the BitSet for the given contig, or null if none exists. */
    public BitSet get(int contig) {
        return this.data.get(contig);
    }

    /**
     * Get an existing BitSet for the given contig, or create one if not already present. This is
     * useful when initializing a GenomeMask from an external source.
     * @param contig which BitSet
     * @param numBits if there was not already a BitSet for this contig, one is created and initialized to this size.
     * @return the BitSet for the given contig, creating one if necessary
     */
    public BitSet getOrCreate(int contig, int numBits) {
        BitSet mask = this.data.get(contig);
        if (mask == null) {
            mask = new BitSet(numBits);
            this.data.put(contig, mask);
        }
        return mask;
    }

    /** Returns the highest contig index present in the mask. */
    public int getMaxContig() {
        return this.data.lastKey();
    }
}
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.directed;
|
||||
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.BitSet;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Create a GenomeMask from an IntervalList or a file containing an IntervalList
|
||||
*/
|
||||
public class GenomeMaskFactory {
|
||||
|
||||
public GenomeMask makeGenomeMaskFromIntervalList(IntervalList intervalList) {
|
||||
if (intervalList.getHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
|
||||
intervalList.sort();
|
||||
}
|
||||
List<Interval> uniqueIntervals = intervalList.getUniqueIntervals();
|
||||
GenomeMask ret = new GenomeMask();
|
||||
|
||||
SAMFileHeader samHeader = intervalList.getHeader();
|
||||
|
||||
for (Interval interval : uniqueIntervals) {
|
||||
// TODO: Maybe figure out more intelligently how big the bitset might be?
|
||||
BitSet bitSet = ret.getOrCreate(samHeader.getSequenceIndex(interval.getSequence()), interval.getEnd() + 1);
|
||||
bitSet.set(interval.getStart(), interval.getEnd() + 1);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
public GenomeMask makeGenomeMaskFromIntervalList(File intervalListFile) {
|
||||
IoUtil.assertFileIsReadable(intervalListFile);
|
||||
IntervalList intervalList = IntervalList.fromFile(intervalListFile);
|
||||
return makeGenomeMaskFromIntervalList(intervalList);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
package edu.mit.broad.picard.directed;
|
||||
|
||||
import edu.mit.broad.picard.metrics.MetricBase;
|
||||
|
||||
/**
 * The set of metrics captured that are specific to a hybrid selection analysis.
 * Fields are public and UPPER_CASE; raw counters are filled in externally and
 * the ratio fields are derived via {@link #calculateDerivedMetrics()}.
 *
 * @author Tim Fennell
 */
public class HsMetrics extends MetricBase {
    /** The name of the bait set used in the hybrid selection. */
    public String BAIT_SET;

    /** The number of bases in the reference genome used for alignment. */
    public long GENOME_SIZE;

    /** The number of bases which have one or more baits on top of them. */
    public long BAIT_TERRITORY;

    /** The unique number of target bases in the experiment where target is usually exons etc. */
    public long TARGET_TERRITORY;

    /** Target territory / bait territory. 1 == perfectly efficient, 0.5 = half of baited bases are not target. */
    public double BAIT_DESIGN_EFFICIENCY;

    /** The total number of reads in the SAM or BAM file examined. */
    public int TOTAL_READS;

    /** The number of reads that pass the vendor's filter. */
    public int PF_READS;

    /** The number of PF reads that are not marked as duplicates. */
    public int PF_UNIQUE_READS;

    /** PF reads / total reads. The percent of reads passing filter. */
    public double PCT_PF_READS;

    /** PF Unique Reads / Total Reads. */
    public double PCT_PF_UQ_READS;

    /** The number of PF reads that are aligned with mapping score > 0 to the reference genome. */
    public int PF_READS_ALIGNED;

    /** PF Reads Aligned / PF Unique Reads. (Note: the denominator is unique PF reads, not all PF reads.) */
    public double PCT_PF_READS_ALIGNED;

    /** The number of bases in the PF aligned reads that are mapped to a reference base. Accounts for clipping and gaps. */
    public int PF_BASES_ALIGNED;

    /** The number of PF aligned bases that mapped to a baited region of the genome. */
    public long ON_BAIT_BASES;

    /** The number of PF aligned bases that mapped to within a fixed interval of a baited region, but not on a baited region. */
    public long NEAR_BAIT_BASES;

    /** The number of PF aligned bases that mapped to neither on or near a bait. */
    public long OFF_BAIT_BASES;

    /** The number of PF aligned bases that mapped to a targeted region of the genome. */
    public long ON_TARGET_BASES;

    /** On+Near Bait Bases / PF Bases Aligned. */
    public double PCT_SELECTED_BASES;

    /** The percentage of aligned PF bases that mapped neither on or near a bait. */
    public double PCT_OFF_BAIT;

    /** The percentage of on+near bait bases that are on as opposed to near. */
    public double ON_BAIT_VS_SELECTED;

    /** The mean coverage of all baits in the experiment. */
    public double MEAN_BAIT_COVERAGE;

    /** The mean coverage of targets that received at least coverage depth = 2 at one base. */
    public double MEAN_TARGET_COVERAGE;

    /** The fold by which the baited region has been amplified above genomic background. */
    public double FOLD_ENRICHMENT;

    /** The number of targets that did not reach coverage=2 over any base. */
    public double ZERO_CVG_TARGETS_PCT;

    /**
     * The fold over-coverage necessary to raise 80% of bases in "non-zero-cvg" targets to
     * the mean coverage level in those targets.
     */
    public double FOLD_80_BASE_PENALTY;


    /**
     * Calculates the metrics in this class that can be derived from other metrics in the class.
     *
     * NOTE(review): all divisions below produce NaN/Infinity when their denominators are
     * zero (e.g. an empty input file gives TOTAL_READS == 0) — confirm callers tolerate that.
     */
    public void calculateDerivedMetrics() {
        BAIT_DESIGN_EFFICIENCY = (double) TARGET_TERRITORY / (double) BAIT_TERRITORY;

        PCT_PF_READS = PF_READS / (double) TOTAL_READS;
        PCT_PF_UQ_READS = PF_UNIQUE_READS / (double) TOTAL_READS;
        PCT_PF_READS_ALIGNED = PF_READS_ALIGNED / (double) PF_UNIQUE_READS;

        // All aligned PF bases, partitioned into on/near/off bait.
        double denominator = (ON_BAIT_BASES + NEAR_BAIT_BASES + OFF_BAIT_BASES);

        PCT_SELECTED_BASES = (ON_BAIT_BASES + NEAR_BAIT_BASES) / denominator;
        PCT_OFF_BAIT = OFF_BAIT_BASES / denominator;
        ON_BAIT_VS_SELECTED = ON_BAIT_BASES / (double) (ON_BAIT_BASES + NEAR_BAIT_BASES);
        MEAN_BAIT_COVERAGE = ON_BAIT_BASES / (double) BAIT_TERRITORY;
        FOLD_ENRICHMENT = (ON_BAIT_BASES/ denominator) / ((double) BAIT_TERRITORY / GENOME_SIZE);
    }
}
|
||||
|
|
@ -1,207 +0,0 @@
|
|||
package edu.mit.broad.picard.directed;
|
||||
|
||||
import edu.mit.broad.picard.util.*;
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
import edu.mit.broad.sam.AlignmentBlock;
|
||||
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
/**
 * Calculates HS metrics for a given SAM or BAM file. Requires the input of a list of
 * target intervals and a list of bait intervals. Can be invoked either on an entire
 * iterator of SAMRecords or be passed SAMRecords one at a time.
 *
 * @author Tim Fennell
 */
public class HsMetricsCalculator {
    // What is considered "near" to the bait
    private static final int NEAR_BAIT_DISTANCE = 250;
    private static final Log log = Log.getInstance(HsMetricsCalculator.class);

    // Holds file names and other parameter related junk
    // NOTE(review): 'sam' is never assigned anywhere in this class and stays null — dead field?
    private SAMFileReader sam;
    private File baitFile;
    private File targetFile;
    private IntervalList baits;
    private IntervalList targets;

    // Overlap detector for finding overlaps between reads and the experimental targets
    private OverlapDetector<Interval> targetDetector = new OverlapDetector<Interval>(0,0);

    // Overlap detector for finding overlaps between the reads and the baits (and the near bait space)
    private OverlapDetector<Interval> baitDetector = new OverlapDetector<Interval>(-NEAR_BAIT_DISTANCE,0);

    // A Map to accumulate per-target coverage, keyed by the ORIGINAL (unmerged) target
    // intervals. NOTE(review): lookups in analyze() use the merged intervals returned by
    // the detector; if merging ever combines targets the lookup could return null — verify.
    private Map<Interval, Coverage> coverageByTarget = new HashMap<Interval, Coverage>();

    // Accumulated metrics; finished off and returned by getMetrics().
    private HsMetrics metrics = new HsMetrics();

    /**
     * Constructor that parses the squashed reference to genome reference file and stores the
     * information in a map for later use.
     *
     * @param baits   interval list file describing the bait regions
     * @param targets interval list file describing the target regions
     */
    public HsMetricsCalculator(File baits, File targets) {
        this.baitFile = baits;
        this.targetFile = targets;
        this.baits = IntervalList.fromFile(baits);
        this.targets = IntervalList.fromFile(targets);

        // Bait set name is the bait file name truncated at the first '.'
        this.metrics.BAIT_SET = baits.getName();
        int tmp = this.metrics.BAIT_SET.indexOf(".");
        if (tmp > 0) {
            this.metrics.BAIT_SET = this.metrics.BAIT_SET.substring(0, tmp);
        }

        // Territory sizes are computed over the merged (unique) intervals.
        List<Interval> uniqueBaits = this.baits.getUniqueIntervals();
        this.baitDetector.addAll(uniqueBaits, uniqueBaits);
        this.metrics.BAIT_TERRITORY = Interval.countBases(uniqueBaits);

        List<Interval> uniqueTargets = this.targets.getUniqueIntervals();
        this.targetDetector.addAll(uniqueTargets, uniqueTargets);
        this.metrics.TARGET_TERRITORY = Interval.countBases(uniqueTargets);

        // Genome size is the sum of the sequence lengths in the bait list's header.
        for (SAMSequenceRecord seq : this.baits.getHeader().getSequences()) {
            this.metrics.GENOME_SIZE += seq.getSequenceLength();
        }

        // Populate the coverage by target map
        for (Interval target : this.targets.getIntervals()) {
            this.coverageByTarget.put(target, new Coverage(target, 0));
        }
    }

    /** Iterates over all records in the file and collects metrics. */
    public void analyze(Iterator<SAMRecord> records) {
        int i = 0;
        while (records.hasNext()) {
            analyze(records.next());

            // Progress logging only; one line per million records.
            if (++i % 1000000 == 0) {
                log.info("Processed " + i + " records so far.");
            }
        }
    }

    /**
     * Adds information about an individual SAMRecord to the statistics.
     * The read is dropped progressively: non-primary records are ignored entirely,
     * vendor-failed reads count only toward TOTAL_READS, duplicates stop at PF_READS,
     * and unmapped / zero-mapping-quality reads stop at PF_UNIQUE_READS.
     */
    public void analyze(SAMRecord rec) {
        // Just plain avoid records that are marked as not-primary
        if (rec.getNotPrimaryAlignmentFlag()) return;

        this.metrics.TOTAL_READS += 1;

        // Check for PF reads
        if (rec.getReadFailsVendorQualityCheckFlag()) {
            return;
        }
        else {
            ++this.metrics.PF_READS;
        }

        // Check for reads that are marked as duplicates
        if (rec.getDuplicateReadFlag()) {
            return;
        }
        else {
            ++this.metrics.PF_UNIQUE_READS;
        }

        // Don't bother with reads that didn't align uniquely
        if (rec.getReadUnmappedFlag() || rec.getMappingQuality() == 0) {
            return;
        }

        this.metrics.PF_READS_ALIGNED += 1;
        for (AlignmentBlock block : rec.getAlignmentBlocks()) {
            this.metrics.PF_BASES_ALIGNED += block.getLength();
        }

        // Interval spanning the whole alignment, used to query both detectors.
        Interval read = new Interval(rec.getReferenceName(), rec.getAlignmentStart(), rec.getAlignmentEnd());

        // Find the target overlaps
        Collection<Interval> targets = this.targetDetector.getOverlaps(read);
        if (targets != null && !targets.isEmpty()) {
            for (Interval target : targets) {
                Coverage coverage = this.coverageByTarget.get(target);

                // Walk every aligned reference position and count those inside the target.
                for (AlignmentBlock block : rec.getAlignmentBlocks()) {
                    int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength());
                    for (int pos=block.getReferenceStart(); pos<=end; ++ pos) {
                        if (pos >= target.getStart() && pos <= target.getEnd()) {
                            ++this.metrics.ON_TARGET_BASES;
                            coverage.addBase(pos - target.getStart());
                        }
                    }
                }
            }
        }

        // Now do the bait overlaps
        int mappedBases = 0;
        for (AlignmentBlock block : rec.getAlignmentBlocks()) mappedBases += block.getLength();
        Collection<Interval> baits = this.baitDetector.getOverlaps(read);
        int onBaitBases = 0;

        if (baits != null && !baits.isEmpty()) {
            for (Interval bait : baits) {
                for (AlignmentBlock block : rec.getAlignmentBlocks()) {
                    int end = CoordMath.getEnd(block.getReferenceStart(), block.getLength());

                    for (int pos=block.getReferenceStart(); pos<=end; ++pos) {
                        if (pos >= bait.getStart() && pos <= bait.getEnd()) ++onBaitBases;
                    }
                }
            }

            // Any mapped base of a bait-overlapping read that is not ON a bait is
            // counted as NEAR (the detector already includes the near-bait slop).
            this.metrics.ON_BAIT_BASES += onBaitBases;
            this.metrics.NEAR_BAIT_BASES += (mappedBases - onBaitBases);
        }
        else {
            this.metrics.OFF_BAIT_BASES += mappedBases;
        }

    }

    /** Calculates a few last summary metrics and then returns the metrics calculated. */
    public HsMetrics getMetrics() {
        this.metrics.calculateDerivedMetrics();
        calculateTargetCoverageMetrics();
        return this.metrics;
    }

    /**
     * Calculates how much additional sequencing is needed to raise 80% of bases to the mean for the lane.
     *
     * NOTE(review): if no target has any coverage, basesConsidered is 0 and
     * indexOf80thPercentile equals depths.length, which would throw
     * ArrayIndexOutOfBoundsException — confirm this path cannot occur in practice.
     */
    private void calculateTargetCoverageMetrics() {
        short[] depths = new short[(int) this.metrics.TARGET_TERRITORY]; // may not use entire array
        int zeroCoverageTargets = 0;
        int depthIndex = 0;
        double totalCoverage = 0;
        int basesConsidered = 0;

        // Flatten the per-target depth arrays of covered targets into one array.
        for (Coverage c : this.coverageByTarget.values()) {
            if (!c.hasCoverage()) {
                ++zeroCoverageTargets;
                continue;
            }

            final short[] targetDepths = c.getDepths();
            basesConsidered += targetDepths.length;

            for (short depth : targetDepths) {
                depths[depthIndex++] = depth;
                totalCoverage += depth;
            }
        }

        this.metrics.MEAN_TARGET_COVERAGE = totalCoverage / basesConsidered;

        // Sort the array (ASCENDING) and then find the base the coverage value that lies at the 80%
        // line, which is actually at 20% into the array now. The unused (zero-filled) head of the
        // array sorts first, so the window of real values starts at depths.length - basesConsidered.
        Arrays.sort(depths);
        int indexOf80thPercentile = (depths.length - basesConsidered) + (int) (basesConsidered * 0.2);
        int coverageAt80thPercentile = depths[indexOf80thPercentile];
        this.metrics.FOLD_80_BASE_PENALTY = this.metrics.MEAN_TARGET_COVERAGE / coverageAt80thPercentile;
        this.metrics.ZERO_CVG_TARGETS_PCT = zeroCoverageTargets / (double) this.targets.getIntervals().size();
    }
}
|
||||
|
|
@ -1,240 +0,0 @@
|
|||
package edu.mit.broad.picard.directed;
|
||||
|
||||
import edu.mit.broad.picard.util.Interval;
|
||||
import edu.mit.broad.picard.util.FormatUtil;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.SAMTextHeaderCodec;
|
||||
import edu.mit.broad.sam.util.StringLineReader;
|
||||
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
/**
|
||||
* Represents a list of intervals against a reference sequence that can be written to
|
||||
* and read from a file. The file format is relatively simple and reflects the SAM
|
||||
* alignment format to a degree.
|
||||
*
|
||||
* A SAM style header must be present in the file which lists the sequence records
|
||||
* against which the intervals are described. After the header the file then contains
|
||||
* records one per line in text format with the following values tab-separated:
|
||||
* - Sequence name
|
||||
* - Start position (1-based)
|
||||
* - End position (1-based, end inclusive)
|
||||
* - Strand (either + or -)
|
||||
* - Interval name (an, ideally unique, name for the interval)
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class IntervalList implements Iterable<Interval> {
|
||||
private SAMFileHeader header;
|
||||
private List<Interval> intervals = new ArrayList<Interval>();
|
||||
|
||||
/** Constructs a new interval list using the supplied header information. */
|
||||
public IntervalList(SAMFileHeader header) {
|
||||
if (header == null) {
|
||||
throw new IllegalArgumentException("SAMFileHeader must be supplied.");
|
||||
}
|
||||
this.header = header;
|
||||
}
|
||||
|
||||
/** Gets the header (if there is one) for the interval list. */
|
||||
public SAMFileHeader getHeader() { return header; }
|
||||
|
||||
/** Returns an iterator over the intervals. */
|
||||
public Iterator<Interval> iterator() { return this.intervals.iterator(); }
|
||||
|
||||
/** Adds an interval to the list of intervals. */
|
||||
public void add(Interval interval) { this.intervals.add(interval); }
|
||||
|
||||
/** Sorts the internal collection of intervals by coordinate. */
|
||||
public void sort() {
|
||||
Collections.sort(this.intervals, new IntervalCoordinateComparator(this.header));
|
||||
this.header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
|
||||
}
|
||||
|
||||
/** Gets the set of intervals as held internally. */
|
||||
public List<Interval> getIntervals() {
|
||||
return Collections.unmodifiableList(this.intervals);
|
||||
}
|
||||
|
||||
/**
|
||||
* Merges the list of intervals and then reduces them down where regions overlap
|
||||
* or are directly adjacent to one another. During this process the "merged" interval
|
||||
* will retain the strand and name of the 5' most interval merged.
|
||||
*
|
||||
* @return the set of unique intervals condensed from the contained intervals
|
||||
*/
|
||||
public List<Interval> getUniqueIntervals() {
|
||||
List<Interval> unique = new ArrayList<Interval>();
|
||||
ListIterator<Interval> iterator = this.intervals.listIterator();
|
||||
Interval previous = iterator.next();
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
Interval next = iterator.next();
|
||||
if (previous.intersects(next) || previous.abuts(next)) {
|
||||
previous = new Interval(previous.getSequence(),
|
||||
previous.getStart(),
|
||||
Math.max(previous.getEnd(), next.getEnd()),
|
||||
previous.isNegativeStrand(),
|
||||
previous.getName());
|
||||
}
|
||||
else {
|
||||
unique.add(previous);
|
||||
previous = next;
|
||||
}
|
||||
}
|
||||
|
||||
if (previous != null) unique.add(previous);
|
||||
|
||||
return unique;
|
||||
}
|
||||
|
||||
/** Gets the (potentially redundant) sum of the length of the intervals in the list. */
|
||||
public long getBaseCount() {
|
||||
return Interval.countBases(this.intervals);
|
||||
}
|
||||
|
||||
/** Gets the count of unique bases represented by the intervals in the list. */
|
||||
public long getUniqueBaseCount() {
|
||||
return Interval.countBases(getUniqueIntervals());
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses an interval list from a file.
|
||||
* @param file the file containing the intervals
|
||||
* @return an IntervalList object that contains the headers and intervals from the file
|
||||
*/
|
||||
public static IntervalList fromFile(File file) {
|
||||
BufferedReader in = new BufferedReader(new InputStreamReader(IoUtil.openFileForReading(file)));
|
||||
|
||||
try {
|
||||
// Setup a reader and parse the header
|
||||
StringBuilder builder = new StringBuilder(4096);
|
||||
String line = null;
|
||||
|
||||
while ((line = in.readLine()) != null) {
|
||||
if (line.startsWith("@")) {
|
||||
builder.append(line).append('\n');
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (builder.length() == 0) {
|
||||
throw new IllegalStateException("Interval list file must contain header: " + file.getAbsolutePath());
|
||||
}
|
||||
|
||||
StringLineReader headerReader = new StringLineReader(builder.toString());
|
||||
SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
|
||||
IntervalList list = new IntervalList(codec.decode(headerReader, file));
|
||||
|
||||
// Then read in the intervals
|
||||
FormatUtil format = new FormatUtil();
|
||||
do {
|
||||
if (line.trim().length() == 0) continue; // skip over blank lines
|
||||
|
||||
// Make sure we have the right number of fields
|
||||
String fields[] = line.split("\t");
|
||||
if (fields.length != 5) {
|
||||
throw new PicardException("Invalid interval record contains " +
|
||||
fields.length + " fields: " + line);
|
||||
}
|
||||
|
||||
// Then parse them out
|
||||
String seq = fields[0];
|
||||
int start = format.parseInt(fields[1]);
|
||||
int end = format.parseInt(fields[2]);
|
||||
|
||||
boolean negative;
|
||||
if (fields[3].equals("-")) negative = true;
|
||||
else if (fields[3].equals("+")) negative = false;
|
||||
else throw new IllegalArgumentException("Invalid strand field: " + fields[3]);
|
||||
|
||||
String name = fields[4];
|
||||
|
||||
Interval interval = new Interval(seq, start, end, negative, name);
|
||||
list.intervals.add(interval);
|
||||
}
|
||||
while ((line = in.readLine()) != null);
|
||||
|
||||
return list;
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("Error parsing interval list file: " + file.getAbsolutePath(), ioe);
|
||||
}
|
||||
finally {
|
||||
try { in.close(); } catch (Exception e) { /* do nothing */ }
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes out the list of intervals to the supplied file.
|
||||
* @param file a file to write to. If exists it will be overwritten.
|
||||
*/
|
||||
public void write(File file) {
|
||||
try {
|
||||
BufferedWriter out = new BufferedWriter(new OutputStreamWriter(IoUtil.openFileForWriting(file)));
|
||||
FormatUtil format = new FormatUtil();
|
||||
|
||||
// Write out the header
|
||||
if (this.header != null) {
|
||||
SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
|
||||
codec.encode(out, this.header);
|
||||
}
|
||||
|
||||
// Write out the intervals
|
||||
for (Interval interval : this) {
|
||||
out.write(interval.getSequence());
|
||||
out.write('\t');
|
||||
out.write(format.format(interval.getStart()));
|
||||
out.write('\t');
|
||||
out.write(format.format(interval.getEnd()));
|
||||
out.write('\t');
|
||||
out.write(interval.isPositiveStrand() ? '+' : '-');
|
||||
out.write('\t');
|
||||
out.write(interval.getName());
|
||||
out.newLine();
|
||||
}
|
||||
|
||||
out.flush();
|
||||
out.close();
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("Error writing out interval list to file: " + file.getAbsolutePath(), ioe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Comparator that orders intervals based on their sequence index, by coordinate
|
||||
* then by strand and finally by name.
|
||||
*/
|
||||
class IntervalCoordinateComparator implements Comparator<Interval> {
|
||||
private SAMFileHeader header;
|
||||
|
||||
/** Constructs a comparator using the supplied sequence header. */
|
||||
IntervalCoordinateComparator(SAMFileHeader header) {
|
||||
this.header = header;
|
||||
}
|
||||
|
||||
public int compare(Interval lhs, Interval rhs) {
|
||||
int lhsIndex = this.header.getSequenceIndex(lhs.getSequence());
|
||||
int rhsIndex = this.header.getSequenceIndex(rhs.getSequence());
|
||||
int retval = lhsIndex - rhsIndex;
|
||||
|
||||
if (retval == 0) retval = lhs.getStart() - rhs.getStart();
|
||||
if (retval == 0) retval = lhs.getEnd() - rhs.getEnd();
|
||||
if (retval == 0) {
|
||||
if (lhs.isPositiveStrand() && rhs.isNegativeStrand()) retval = -1;
|
||||
else if (lhs.isNegativeStrand() && rhs.isPositiveStrand()) retval = 1;
|
||||
}
|
||||
if (retval == 0) {
|
||||
retval = lhs.getName().compareTo(rhs.getName());
|
||||
}
|
||||
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.filter;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Aggregates multiple filters and provides a method for applying them all to a given record with
|
||||
* one method call.
|
||||
*/
|
||||
public class AggregateFilter implements SamRecordFilter {
|
||||
|
||||
private final List<SamRecordFilter> filters;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
* @param filters the list of filters that this Aggregator applies
|
||||
*/
|
||||
public AggregateFilter(List<SamRecordFilter> filters) {
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether a SAMRecord matches this filter
|
||||
*
|
||||
* @param record the SAMRecord to evaluate
|
||||
* @return true if the SAMRecord matches at least one filter, otherwise false
|
||||
*/
|
||||
public boolean filterOut(SAMRecord record) {
|
||||
for (SamRecordFilter filter : filters) {
|
||||
if (filter.filterOut(record)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,28 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.filter;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
/**
 * Filter for filtering out reads that do not pass the quality filter
 * (i.e. reads with the vendor quality-check-failed SAM flag set).
 */
public class FailsVendorReadQualityFilter implements SamRecordFilter {

    /**
     * Determines whether a SAMRecord matches this filter
     *
     * @param record the SAMRecord to evaluate
     * @return true if the SAMRecord matches the filter, otherwise false
     */
    public boolean filterOut(SAMRecord record) {
        // Delegates directly to the record's vendor quality-check flag.
        return record.getReadFailsVendorQualityCheckFlag();
    }
}
|
||||
|
|
@ -1,94 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.filter;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
import edu.mit.broad.picard.util.CloserUtil;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Filtering Iterator which takes a filter and an iterator and iterates
|
||||
* through only those records which are not rejected by the filter.
|
||||
*
|
||||
* @author Kathleen Tibbetts
|
||||
*/
|
||||
public class FilteringIterator implements CloseableIterator<SAMRecord> {
|
||||
|
||||
private final Iterator<SAMRecord> iterator;
|
||||
private final SamRecordFilter filter;
|
||||
private SAMRecord next = null;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param iterator the backing iterator
|
||||
* @param filter the filter (which may be a FilterAggregator)
|
||||
*/
|
||||
public FilteringIterator(Iterator<SAMRecord> iterator, SamRecordFilter filter) {
|
||||
this.iterator = iterator;
|
||||
this.filter = filter;
|
||||
next = getNextRecord();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the iteration has more elements.
|
||||
*
|
||||
* @return true if the iteration has more elements. Otherwise returns false.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
return next != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next element in the iteration.
|
||||
*
|
||||
* @return the next element in the iteration
|
||||
* @throws java.util.NoSuchElementException
|
||||
*/
|
||||
public SAMRecord next() {
|
||||
if (next == null) {
|
||||
throw new NoSuchElementException("Iterator has no more elements.");
|
||||
}
|
||||
SAMRecord result = next;
|
||||
next = getNextRecord();
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Required method for Iterator API.
|
||||
*
|
||||
* @throws UnsupportedOperationException
|
||||
*/
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Remove() not supported by FilteringIterator");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
CloserUtil.close(iterator);
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the next record from the underlying iterator that passes the filter
|
||||
*
|
||||
* @return SAMRecord the next filter-passing record
|
||||
*/
|
||||
private SAMRecord getNextRecord() {
|
||||
while (iterator.hasNext()) {
|
||||
SAMRecord record = iterator.next();
|
||||
if (!filter.filterOut(record)) {
|
||||
return record;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.filter;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
/**
|
||||
* API for filtering SAMRecords
|
||||
*/
|
||||
public interface SamRecordFilter {
|
||||
|
||||
/**
|
||||
* Determines whether a SAMRecord matches this filter
|
||||
*
|
||||
* @param record the SAMRecord to evaluate
|
||||
* @return true if the SAMRecord matches the filter, otherwise false
|
||||
*/
|
||||
public boolean filterOut(SAMRecord record);
|
||||
}
|
||||
|
|
@ -1,37 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.filter;
|
||||
|
||||
import edu.mit.broad.picard.util.SequenceUtil;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
/**
|
||||
* Filter to determine whether a read is "noisy" due to a poly-A run that is a sequencing artifact.
|
||||
* Currently we filter out only reads that are composed entirely of As.
|
||||
*/
|
||||
public class SolexaNoiseFilter implements SamRecordFilter {
|
||||
|
||||
/**
|
||||
* Determines whether a SAMRecord matches this filter
|
||||
*
|
||||
* @param record the SAMRecord to evaluate
|
||||
* @return true if the SAMRecord matches the filter, otherwise false
|
||||
*/
|
||||
public boolean filterOut(SAMRecord record) {
|
||||
byte sequence[] = record.getReadBases();
|
||||
for (byte base : sequence) {
|
||||
if (base != 'A' && base != 'a' &&
|
||||
!SequenceUtil.isNoCall(base)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,56 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.filter;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Filter class for matching tag attributes in SAMRecords
|
||||
*/
|
||||
public class TagFilter implements SamRecordFilter {
|
||||
|
||||
private final String tag; // The key of the tag to match
|
||||
private final List<Object> values; // The list of matching values
|
||||
|
||||
/**
|
||||
* Constructor for a single value
|
||||
*
|
||||
* @param tag the key of the tag to match
|
||||
* @param value the value to match
|
||||
*/
|
||||
public TagFilter(String tag, Object value) {
|
||||
this.tag = tag;
|
||||
this.values = Arrays.asList(value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructor for multiple values
|
||||
*
|
||||
* @param tag the key of the tag to match
|
||||
* @param values the matching values
|
||||
*/
|
||||
public TagFilter(String tag, List<Object> values) {
|
||||
this.tag = tag;
|
||||
this.values = values;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether a SAMRecord matches this filter
|
||||
*
|
||||
* @param record the SAMRecord to evaluate
|
||||
* @return true if the SAMRecord matches the filter, otherwise false
|
||||
*/
|
||||
public boolean filterOut(SAMRecord record) {
|
||||
return values.contains(record.getAttribute(tag));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
|
||||
package edu.mit.broad.picard.genotype;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
|
||||
/**
|
||||
* Generic exception thrown by GELI format machinery.
|
||||
*
|
||||
* @author Doug Voet
|
||||
*/
|
||||
public class GeliException extends PicardException {
|
||||
|
||||
public GeliException(String message, Throwable throwable) {
|
||||
super(message, throwable);
|
||||
}
|
||||
|
||||
public GeliException(String message) {
|
||||
super(message);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,20 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
|
||||
package edu.mit.broad.picard.genotype;
|
||||
|
||||
/**
 * Misc constants for GELI format
 *
 * @author Doug Voet
 */
public interface GeliFileConstants {
    // Magic bytes at the start of every GELI file ("GELI"; all ASCII, so the
    // platform-charset getBytes() is effectively deterministic here).
    // NOTE(review): arrays stay mutable even as constants -- callers must not modify this.
    // Interface fields are implicitly public static final, so the modifiers are omitted.
    byte[] GELI_MAGIC = "GELI".getBytes();
}
|
||||
|
|
@ -1,103 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.genotype;
|
||||
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.util.BlockCompressedInputStream;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
import edu.mit.broad.sam.util.RuntimeIOException;
|
||||
|
||||
|
||||
/**
 * Class for reading GELI (GEnotype LIkelihood) files.
 *
 * @author Doug Voet
 */
public class GeliFileReader implements Iterable<GenotypeLikelihoods>
{
    // Delegate that does the actual decoding; set to null once close() is called.
    private ReaderImplementation mReader = null;

    /**
     * Internal interface for SAM/BAM file reader implementations.
     * Implemented as an abstract class to enforce better access control.
     */
    static abstract class ReaderImplementation {
        abstract SAMFileHeader getFileHeader();
        abstract CloseableIterator<GenotypeLikelihoods> getIterator();
        abstract void close();
    }


    /**
     * Opens a reader over an arbitrary input stream (non-seekable).
     *
     * @param stream block-compressed GELI data; wrapped in a BufferedInputStream if needed
     * @throws GeliException if the stream is not valid block-compressed (BGZF) data
     * @throws RuntimeIOException wrapping any underlying IOException
     */
    public GeliFileReader(final InputStream stream) {
        try {
            final BufferedInputStream bufferedStream = toBufferedStream(stream);
            if (isValidGELIFile(bufferedStream)) {
                mReader = new GeliFileReaderImplementation(bufferedStream);
            } else {
                throw new GeliException("Unrecognized file format");
            }
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /**
     * Opens a reader over a file.  The file is probed via a temporary buffered
     * stream (closed in both branches), then reopened by the seekable
     * file-based implementation.
     *
     * @param file block-compressed GELI file
     * @throws GeliException if the file is not valid block-compressed (BGZF) data
     * @throws RuntimeIOException wrapping any underlying IOException
     */
    public GeliFileReader(final File file) {
        try {
            final BufferedInputStream bufferedStream =
                new BufferedInputStream(new FileInputStream(file));
            if (isValidGELIFile(bufferedStream)) {
                bufferedStream.close();
                final GeliFileReaderImplementation reader = new GeliFileReaderImplementation(file);
                mReader = reader;
            } else {
                bufferedStream.close();
                throw new GeliException("Unrecognized file format");
            }
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /** Closes the underlying reader; subsequent calls are no-ops. */
    public void close() {
        if (mReader != null) {
            mReader.close();
        }
        mReader = null;
    }

    /** @return the SAM-style header parsed from the GELI file */
    public SAMFileHeader getFileHeader() {
        return mReader.getFileHeader();
    }

    /**
     * @return an iterator over all likelihood records.  The delegate permits only
     *         one outstanding iterator at a time (see GeliFileReaderImplementation).
     */
    public CloseableIterator<GenotypeLikelihoods> iterator() {
        return mReader.getIterator();
    }

    // A valid GELI file is simply a valid block-compressed (BGZF) stream; the
    // GELI magic bytes are checked later when the header is actually read.
    private boolean isValidGELIFile(final InputStream stream)
        throws IOException {
        return BlockCompressedInputStream.isValidFile(stream);
    }

    // Returns the stream itself if already buffered, else wraps it.
    private BufferedInputStream toBufferedStream(final InputStream stream) {
        if (stream instanceof BufferedInputStream) {
            return (BufferedInputStream) stream;
        } else {
            return new BufferedInputStream(stream);
        }
    }
}
|
||||
|
|
@ -1,189 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever.
|
||||
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
|
||||
* or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.genotype;
|
||||
|
||||
|
||||
import java.io.DataInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.LineNumberReader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||
import edu.mit.broad.sam.SAMTextHeaderCodec;
|
||||
import edu.mit.broad.sam.util.BinaryCodec;
|
||||
import edu.mit.broad.sam.util.BlockCompressedInputStream;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
import edu.mit.broad.sam.util.StringLineReader;
|
||||
|
||||
/**
 * Internal class for reading GELI files.
 */
class GeliFileReaderImplementation extends GeliFileReader.ReaderImplementation {

    // True when constructed from a File (stream supports seeking back to the records).
    private boolean mIsSeekable = false;
    // Codec layered over the decompressed stream; null once close() has been called.
    private BinaryCodec mStream = null;
    private final BlockCompressedInputStream mCompressedInputStream;
    private SAMFileHeader mFileHeader = null;
    // Virtual file offset of the first likelihood record (meaningful only when seekable).
    private long mFirstRecordPointer = 0;
    // The single outstanding iterator, if any; guards against overlapping iteration.
    private CloseableIterator<GenotypeLikelihoods> mCurrentIterator = null;


    /**
     * Constructs a non-seekable reader over a stream and consumes the header immediately.
     *
     * @param stream block-compressed GELI data
     * @throws IOException if the header cannot be read
     */
    GeliFileReaderImplementation(final InputStream stream)
        throws IOException {
        mIsSeekable = false;
        mCompressedInputStream = new BlockCompressedInputStream(stream);
        mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
        readHeader(null);
    }

    /**
     * Constructs a seekable reader over a file, consumes the header, and records
     * where the likelihood records start so iteration can be restarted.
     *
     * @param file block-compressed GELI file
     * @throws IOException if the header cannot be read
     */
    GeliFileReaderImplementation(final File file)
        throws IOException {
        mIsSeekable = true;
        mCompressedInputStream = new BlockCompressedInputStream(file);
        mStream = new BinaryCodec(new DataInputStream(mCompressedInputStream));
        readHeader(file);
        mFirstRecordPointer = mCompressedInputStream.getFilePointer();
    }

    /** Closes the underlying codec and drops the cached header. */
    void close() {
        if (mStream != null) {
            mStream.close();
        }
        mStream = null;
        mFileHeader = null;
    }

    SAMFileHeader getFileHeader() {
        return mFileHeader;
    }

    /**
     * Returns an iterator over the likelihood records.  For seekable readers the
     * stream is first rewound to the start of the records, so each call re-reads
     * from the beginning.
     *
     * @throws IllegalStateException if the reader is closed, or a previously
     *         returned iterator has not been closed yet
     */
    CloseableIterator<GenotypeLikelihoods> getIterator() {
        if (mStream == null) {
            throw new IllegalStateException("File reader is closed");
        }
        if (mCurrentIterator != null) {
            throw new IllegalStateException("Iteration in progress");
        }
        if (mIsSeekable) {
            try {
                mCompressedInputStream.seek(mFirstRecordPointer);
            } catch (IOException exc) {
                throw new RuntimeException(exc.getMessage(), exc);
            }
        }
        mCurrentIterator = new GELIFileIterator();
        return mCurrentIterator;
    }

    /**
     * Reads the GELI header: the magic bytes, the SAM-style text header, and the
     * binary sequence dictionary (which duplicates the text header's sequences).
     *
     * @param file used only for error messages; null for stream-based readers
     * @throws IOException if the magic bytes do not match
     * @throws GeliException if binary and text sequence counts disagree
     */
    private void readHeader(final File file)
        throws IOException {

        final byte[] buffer = new byte[4];
        mStream.readBytes(buffer);
        if (!Arrays.equals(buffer, GeliFileConstants.GELI_MAGIC)) {
            throw new IOException("Invalid GELI file header");
        }

        final int headerTextLength = mStream.readInt();
        final String textHeader = mStream.readString(headerTextLength);
        mFileHeader = new SAMTextHeaderCodec().decode(new StringLineReader(textHeader),
                file);

        final int sequenceCount = mStream.readInt();
        if (sequenceCount != mFileHeader.getSequences().size()) {
            throw new GeliException("Number of sequences in text header (" + mFileHeader.getSequences().size() +
                    ") != number of sequences in binary header (" + sequenceCount + ") for file " + file);
        }
        for (int i = 0; i < sequenceCount; i++) {
            // Return value is discarded: the binary records are consumed only to
            // advance the stream past them.  The commented-out checks below would
            // validate them against the text header.
            readSequenceRecord(file);
//            final SAMSequenceRecord sequenceRecord = mFileHeader.getSequence(i);
//            if (!sequenceRecord.getSequenceName().equals(binarySequenceRecord.getSequenceName())) {
//                throw new GELIException("For sequence " + i + ", text and binary have different names in file " +
//                        file);
//            }
//            if (sequenceRecord.getSequenceLength() != binarySequenceRecord.getSequenceLength()) {
//                throw new GELIException("For sequence " + i + ", text and binary have different lengths in file " +
//                        file);
//            }
        }
    }

    /**
     * Reads one binary sequence record: name length, name, null terminator, length.
     *
     * @param file used only for error messages
     * @return the decoded sequence record (currently ignored by the caller)
     * @throws GeliException if the name length field indicates an empty name
     */
    private SAMSequenceRecord readSequenceRecord(final File file) {
        final int nameLength = mStream.readInt();
        if (nameLength <= 1) {
            throw new GeliException("Invalid BAM file header: missing sequence name in file " + file);
        }
        final String sequenceName = mStream.readString(nameLength - 1);
        // Skip the null terminator
        mStream.readByte();
        final int sequenceLength = mStream.readInt();
        final SAMSequenceRecord record = new SAMSequenceRecord(sequenceName);
        record.setSequenceLength(sequenceLength);
        return record;
    }

    /**
     * Iterator over likelihood records.  Reads one record ahead so hasNext() is
     * cheap; closing it only releases the slot held in the enclosing reader.
     */
    private class GELIFileIterator
        implements CloseableIterator<GenotypeLikelihoods> {

        // Look-ahead record; null signals end of stream.
        private GenotypeLikelihoods mNextRecord = null;
        private final GenotypeLikelihoodsCodec likelihoodsCodec = new GenotypeLikelihoodsCodec();


        GELIFileIterator() {
            this(true);
        }

        GELIFileIterator(final boolean advance) {
            likelihoodsCodec.setInputStream(mStream.getInputStream());
            if (advance) {
                advance();
            }
        }

        /** Releases this iterator's claim on the reader; does not close the stream. */
        public void close() {
            if (this != mCurrentIterator) {
                throw new IllegalStateException("Attempt to close non-current iterator");
            }
            mCurrentIterator = null;
        }

        public boolean hasNext() {
            return (mNextRecord != null);
        }

        // NOTE(review): when exhausted this returns null rather than throwing
        // NoSuchElementException -- confirm all callers check hasNext() first.
        public GenotypeLikelihoods next() {
            final GenotypeLikelihoods result = mNextRecord;
            advance();
            return result;
        }

        /** @throws UnsupportedOperationException always */
        public void remove() {
            throw new UnsupportedOperationException("Not supported: remove");
        }

        // Pre-fetches the next record, wrapping IO failures as unchecked.
        void advance() {
            try {
                mNextRecord = getNextRecord();
            } catch (IOException exc) {
                throw new RuntimeException(exc.getMessage(), exc);
            }
        }

        GenotypeLikelihoods getNextRecord()
            throws IOException {
            return likelihoodsCodec.decode();
        }
    }
}
|
||||
|
|
@ -1,168 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.genotype;
|
||||
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.StringWriter;
|
||||
|
||||
import edu.mit.broad.picard.genotype.GenotypeLikelihoods.GenotypeLikelihoodsComparator;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||
import edu.mit.broad.sam.SAMTextHeaderCodec;
|
||||
import edu.mit.broad.sam.SAMFileHeader.SortOrder;
|
||||
import edu.mit.broad.sam.util.BinaryCodec;
|
||||
import edu.mit.broad.sam.util.BlockCompressedOutputStream;
|
||||
import edu.mit.broad.sam.util.SortingCollection;
|
||||
|
||||
/**
 * Class for writing GELI (GEnotype LIkelihood) files.
 */
public class GeliFileWriter {
    // Cap on records buffered in memory before the SortingCollection spills to disk.
    private static final int MAX_RECORDS_IN_RAM = 1000000;
    // Output is always coordinate-sorted; setHeader() stamps this onto the header.
    private SAMFileHeader.SortOrder sortOrder = SortOrder.coordinate;
    private SAMFileHeader header;
    // Used only in the non-presorted case to sort records before writing.
    private SortingCollection<GenotypeLikelihoods> likelihoodsSorter;

    // These two fields are for validating presorted records.
    private GenotypeLikelihoods prevLikelihoods;
    private GenotypeLikelihoodsComparator presortedComparator;

    // If true, records passed to addGenotypeLikelihoods are already in the order specified by sortOrder
    private boolean presorted;
    protected final BinaryCodec outputBinaryCodec;
    // Record codec; attached lazily to the output stream on the first write.
    private GenotypeLikelihoodsCodec genotypeLikelihoodsCodec = null;

    /**
     * Creates a writer that sorts records itself before writing.
     *
     * @param path the output file (block-compressed)
     */
    public GeliFileWriter(final File path) {
        this(path, false);
    }

    /**
     * @param path      the output file (block-compressed)
     * @param presorted if true, the caller promises to add records already in sort
     *                  order; the writer validates the order and writes straight through
     */
    public GeliFileWriter(final File path, boolean presorted) {
        outputBinaryCodec = new BinaryCodec(new DataOutputStream(new BlockCompressedOutputStream(path)));
        outputBinaryCodec.setOutputFileName(path.toString());
        this.presorted = presorted;
    }

    /**
     * Must be called before addGenotypeLikelihoods.  Stamps the sort order onto
     * the header, writes the header to the output, and prepares either the
     * presorted-order validator or the sorting collection.
     * @param header
     */
    public void setHeader(final SAMFileHeader header)
    {
        this.header = header;
        header.setSortOrder(sortOrder);
        final StringWriter headerTextBuffer = new StringWriter();
        new SAMTextHeaderCodec().encode(headerTextBuffer, header);
        final String headerText = headerTextBuffer.toString();

        writeHeader(headerText);

        if (presorted) {
            presortedComparator = makeComparator();
        } else if (!sortOrder.equals(SAMFileHeader.SortOrder.unsorted)) {
            likelihoodsSorter = SortingCollection.newInstance(GenotypeLikelihoods.class,
                    new GenotypeLikelihoodsCodec(), makeComparator(), MAX_RECORDS_IN_RAM);
        }
    }

    protected SAMFileHeader getHeader() {
        return header;
    }

    // Single place that defines the record ordering (reference index, then position).
    private GenotypeLikelihoodsComparator makeComparator() {
        return new GenotypeLikelihoodsComparator();
    }

    /**
     * Queues or writes one record.  Presorted writers validate the order and
     * write immediately; otherwise the record is buffered in the sorting
     * collection until close().
     */
    public void addGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods)
    {
        if (presorted) {
            assertPresorted(genotypeLikelihoods);
            writeGenotypeLikelihoods(genotypeLikelihoods);
        } else {
            likelihoodsSorter.add(genotypeLikelihoods);
        }
    }

    // Throws if the new record sorts before the previously added one.
    private void assertPresorted(final GenotypeLikelihoods genotypeLikelihoods) {
        if (prevLikelihoods != null) {
            if (presortedComparator.compare(prevLikelihoods, genotypeLikelihoods) > 0) {
                throw new IllegalArgumentException("GenotypeLikelihoods added out of order in GELIFileWriterImpl.addGenotypeLikelihoods for " +
                        getFilename() + ". Sort order is " + this.sortOrder + ". Offending records are at ["
                        + prevLikelihoods.getReferenceIndex() + ":" + prevLikelihoods.getPosition() + "] and ["
                        + genotypeLikelihoods.getReferenceIndex() + ":" + genotypeLikelihoods.getPosition() + "]");
            }
        }
        prevLikelihoods = genotypeLikelihoods;
    }

    /** Flushes any buffered (sorted) records and closes the output. */
    public final void close()
    {
        if (likelihoodsSorter != null) {
            for (final GenotypeLikelihoods genotypeLikelihoods : likelihoodsSorter) {
                writeGenotypeLikelihoods(genotypeLikelihoods);
            }
            likelihoodsSorter.cleanup();
        }
        finish();
    }

    // Lazily attaches the record codec to the output stream on first write.
    private void prepareToWriteAlignments() {
        if (genotypeLikelihoodsCodec == null) {
            genotypeLikelihoodsCodec = new GenotypeLikelihoodsCodec();
            genotypeLikelihoodsCodec.setOutputStream(outputBinaryCodec.getOutputStream());
        }
    }

    /**
     * Writes the record to disk. Sort order has been taken care of by the time
     * this method is called.
     * @param genotypeLikelihoods the record to encode
     */
    protected void writeGenotypeLikelihoods(GenotypeLikelihoods genotypeLikelihoods) {
        prepareToWriteAlignments();
        genotypeLikelihoodsCodec.encode(genotypeLikelihoods);
    }

    /**
     * Write the header to disk. Header object is available via getHeader().
     * NOTE(review): the length written is textHeader.length() (char count) while
     * the bytes written come from textHeader.getBytes() (platform charset); these
     * differ if the header ever contains non-ASCII text -- confirm headers are ASCII-only.
     * @param textHeader for convenience if the implementation needs it.
     */
    protected void writeHeader(final String textHeader) {
        outputBinaryCodec.writeBytes(GeliFileConstants.GELI_MAGIC);

        // calculate and write the length of the SAM file header text and the header text
        outputBinaryCodec.writeInt(textHeader.length());
        outputBinaryCodec.writeBytes(textHeader.getBytes());

        // write the sequences binarily. This is redundant with the text header
        outputBinaryCodec.writeInt(getHeader().getSequences().size());
        for (final SAMSequenceRecord sequenceRecord: getHeader().getSequences()) {
            // name length includes the null terminator written below
            outputBinaryCodec.writeInt(sequenceRecord.getSequenceName().length() + 1);
            outputBinaryCodec.writeBytes(sequenceRecord.getSequenceName().getBytes());
            outputBinaryCodec.writeByte(0);
            outputBinaryCodec.writeInt(sequenceRecord.getSequenceLength());
        }
    }

    /**
     * Do any required flushing here.
     */
    protected void finish() {
        outputBinaryCodec.close();
    }

    /**
     * For producing error messages.
     * @return Output filename, or null if there isn't one.
     */
    protected String getFilename() {
        return outputBinaryCodec.getOutputFileName();
    }
}
|
||||
|
|
@ -1,164 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
|
||||
package edu.mit.broad.picard.genotype;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* Data object for Genotype Likelihoods. One object represents one row in a GELI file.
|
||||
*
|
||||
* @author Doug Voet
|
||||
*/
|
||||
public class GenotypeLikelihoods {
|
||||
/** this is a guess at how much memory an instance of this object occupies */
|
||||
public static final int OBJECT_SIZE_BYTES = 150;
|
||||
|
||||
public static final int AA_GENOTYPE = 0;
|
||||
public static final int AC_GENOTYPE = 1;
|
||||
public static final int AG_GENOTYPE = 2;
|
||||
public static final int AT_GENOTYPE = 3;
|
||||
public static final int CC_GENOTYPE = 4;
|
||||
public static final int CG_GENOTYPE = 5;
|
||||
public static final int CT_GENOTYPE = 6;
|
||||
public static final int GG_GENOTYPE = 7;
|
||||
public static final int GT_GENOTYPE = 8;
|
||||
public static final int TT_GENOTYPE = 9;
|
||||
|
||||
private static final char[][] GENOTYPES = {
|
||||
"AA".toCharArray(),
|
||||
"AC".toCharArray(),
|
||||
"AG".toCharArray(),
|
||||
"AT".toCharArray(),
|
||||
"CC".toCharArray(),
|
||||
"CG".toCharArray(),
|
||||
"CT".toCharArray(),
|
||||
"GG".toCharArray(),
|
||||
"GT".toCharArray(),
|
||||
"TT".toCharArray()
|
||||
};
|
||||
|
||||
/** compares first by reference index then by position */
|
||||
public static class GenotypeLikelihoodsComparator implements Comparator<GenotypeLikelihoods> {
|
||||
@Override
|
||||
public int compare(GenotypeLikelihoods thing1, GenotypeLikelihoods thing2) {
|
||||
long refCompare = thing1.referenceIndex - thing2.referenceIndex;
|
||||
if (refCompare == 0) {
|
||||
long posCompare = thing1.position - thing2.position;
|
||||
return (int) posCompare;
|
||||
} else {
|
||||
return (int) refCompare;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private long referenceIndex;
|
||||
private long position;
|
||||
private byte referenceBase;
|
||||
private int numReads;
|
||||
private short maxMappingQuality;
|
||||
private float[] likelihoods = new float[10];
|
||||
private byte bestLikelihoodIndex = -1; // stored as byte to reduce memory footprint
|
||||
private byte secondBestLikelihoodIndex = -1; // stored as byte to reduce memory footprint
|
||||
|
||||
public static int getLikelihoodIndex(char[] genotype) {
|
||||
char first = Character.isLowerCase(genotype[0]) ? Character.toUpperCase(genotype[0]) : genotype[0];
|
||||
char second = Character.isLowerCase(genotype[1]) ? Character.toUpperCase(genotype[1]) : genotype[1];
|
||||
if (first > second) {
|
||||
char temp = first;
|
||||
first = second;
|
||||
second = temp;
|
||||
}
|
||||
for (int i=0; i<GENOTYPES.length; i++) {
|
||||
if (first == GENOTYPES[i][0] && second == GENOTYPES[i][1]) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("Unknown genotype string [" + new String(genotype) +
|
||||
"], any pair of ACTG case insensitive is acceptable");
|
||||
}
|
||||
|
||||
public float getLikelihood(int genotype) {
|
||||
return likelihoods[genotype];
|
||||
}
|
||||
|
||||
public void setLikelihood(int genotype, float value) {
|
||||
likelihoods[genotype] = value;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
builder.append("referenc ").append(referenceIndex).append(":").append(position);
|
||||
builder.append(", ref base ").append((char) referenceBase);
|
||||
builder.append(", #reads ").append(numReads);
|
||||
builder.append(", quality ").append(maxMappingQuality);
|
||||
builder.append(" [");
|
||||
for (int i=0; i<likelihoods.length; i++) {
|
||||
builder.append(GENOTYPES[i]).append(":").append(likelihoods[i]).append(" ");
|
||||
}
|
||||
builder.append("]");
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + Arrays.hashCode(likelihoods);
|
||||
result = prime * result + maxMappingQuality;
|
||||
result = prime * result + numReads;
|
||||
result = prime * result + (int) (position ^ (position >>> 32));
|
||||
result = prime * result + referenceBase;
|
||||
result = prime * result + (int) (referenceIndex ^ (referenceIndex >>> 32));
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
GenotypeLikelihoods other = (GenotypeLikelihoods) obj;
|
||||
if (!Arrays.equals(likelihoods, other.likelihoods))
|
||||
return false;
|
||||
if (maxMappingQuality != other.maxMappingQuality)
|
||||
return false;
|
||||
if (numReads != other.numReads)
|
||||
return false;
|
||||
if (position != other.position)
|
||||
return false;
|
||||
if (referenceBase != other.referenceBase)
|
||||
return false;
|
||||
if (referenceIndex != other.referenceIndex)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public long getReferenceIndex() { return referenceIndex; }
|
||||
public void setReferenceIndex(long sequenceIndex) { this.referenceIndex = sequenceIndex; }
|
||||
public long getPosition() { return position; }
|
||||
public void setPosition(long position) { this.position = position; }
|
||||
public byte getReferenceBase() { return referenceBase; }
|
||||
public void setReferenceBase(byte referenceBase) { this.referenceBase = referenceBase; }
|
||||
public int getNumReads() { return numReads; }
|
||||
public void setNumReads(int numReads) { this.numReads = numReads; }
|
||||
public short getMaxMappingQuality() { return maxMappingQuality; }
|
||||
public void setMaxMappingQuality(short maxMappingQuality) { this.maxMappingQuality = maxMappingQuality; }
|
||||
float[] getLikelihoods() { return likelihoods; }
|
||||
public int getBestLikelihoodIndex() { return bestLikelihoodIndex; }
|
||||
public void setBestLikelihoodIndex(int bestLikelihoodIndex) { this.bestLikelihoodIndex = (byte) bestLikelihoodIndex; }
|
||||
public int getSecondBestLikelihoodIndex() { return secondBestLikelihoodIndex; }
|
||||
public void setSecondBestLikelihoodIndex(int secondBestLikelihoodIndex) { this.secondBestLikelihoodIndex = (byte) secondBestLikelihoodIndex; }
|
||||
}
|
||||
|
|
@ -1,126 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.genotype;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
|
||||
import edu.mit.broad.sam.util.BinaryCodec;
|
||||
import edu.mit.broad.sam.util.RuntimeEOFException;
|
||||
import edu.mit.broad.sam.util.SortingCollection;
|
||||
|
||||
public class GenotypeLikelihoodsCodec implements SortingCollection.Codec<GenotypeLikelihoods> {
|
||||
private static final int SIG_FIG_MULTIPLIER = 100;
|
||||
private static final short BLOCK_SIZE = 12 + 10 * 4;
|
||||
|
||||
private OutputStream os;
|
||||
private InputStream is;
|
||||
private BinaryCodec binaryCodec;
|
||||
|
||||
/** Returns a new genotype likelihood codec. */
|
||||
public SortingCollection.Codec<GenotypeLikelihoods> clone() {
|
||||
return new GenotypeLikelihoodsCodec();
|
||||
}
|
||||
|
||||
/**
|
||||
* Write object to OutputStream.
|
||||
*
|
||||
* @param genotypeLikelihoods what to write
|
||||
*/
|
||||
public void encode(final GenotypeLikelihoods genotypeLikelihoods) {
|
||||
this.binaryCodec.writeShort(BLOCK_SIZE);
|
||||
this.binaryCodec.writeUInt(genotypeLikelihoods.getReferenceIndex());
|
||||
this.binaryCodec.writeUInt(genotypeLikelihoods.getPosition());
|
||||
this.binaryCodec.writeByte(genotypeLikelihoods.getReferenceBase());
|
||||
this.binaryCodec.writeUShort(genotypeLikelihoods.getNumReads());
|
||||
this.binaryCodec.writeByte(genotypeLikelihoods.getMaxMappingQuality());
|
||||
|
||||
for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) {
|
||||
writeLikelihood(genotypeLikelihoods.getLikelihoods()[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the next record from the input stream and convert into a java object.
|
||||
*
|
||||
* @return null if no more records. Should throw exception if EOF is encountered in the middle of
|
||||
* a record.
|
||||
*/
|
||||
public GenotypeLikelihoods decode() {
|
||||
int recordLength = 0;
|
||||
try {
|
||||
recordLength = this.binaryCodec.readShort();
|
||||
} catch (RuntimeEOFException e) {
|
||||
return null;
|
||||
}
|
||||
if (recordLength != BLOCK_SIZE) {
|
||||
throw new GeliException("Invalid record length: " + recordLength);
|
||||
}
|
||||
|
||||
final GenotypeLikelihoods genotypeLikelihoods = new GenotypeLikelihoods();
|
||||
genotypeLikelihoods.setReferenceIndex(this.binaryCodec.readUInt());
|
||||
genotypeLikelihoods.setPosition(this.binaryCodec.readUInt());
|
||||
genotypeLikelihoods.setReferenceBase(this.binaryCodec.readByte());
|
||||
genotypeLikelihoods.setNumReads(this.binaryCodec.readUShort());
|
||||
genotypeLikelihoods.setMaxMappingQuality(this.binaryCodec.readByte());
|
||||
|
||||
int bestIndex = -1;
|
||||
int secondBestIndex = -1;
|
||||
for (int i = 0; i < genotypeLikelihoods.getLikelihoods().length; i++) {
|
||||
float likelihood = readLikelihood();
|
||||
genotypeLikelihoods.getLikelihoods()[i] = likelihood;
|
||||
|
||||
if (bestIndex == -1 || genotypeLikelihoods.getLikelihood(bestIndex) < likelihood) {
|
||||
secondBestIndex = bestIndex;
|
||||
bestIndex = i;
|
||||
} else if (secondBestIndex == -1 || genotypeLikelihoods.getLikelihood(secondBestIndex) < likelihood) {
|
||||
secondBestIndex = i;
|
||||
}
|
||||
}
|
||||
genotypeLikelihoods.setBestLikelihoodIndex(bestIndex);
|
||||
genotypeLikelihoods.setSecondBestLikelihoodIndex(secondBestIndex);
|
||||
|
||||
return genotypeLikelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* Where to write encoded output
|
||||
*
|
||||
* @param os
|
||||
*/
|
||||
public void setOutputStream(final OutputStream os) {
|
||||
this.os = os;
|
||||
this.binaryCodec = new BinaryCodec(os);
|
||||
}
|
||||
|
||||
/**
|
||||
* Where to read encoded input from
|
||||
*
|
||||
* @param is
|
||||
*/
|
||||
public void setInputStream(final InputStream is) {
|
||||
this.is = is;
|
||||
this.binaryCodec = new BinaryCodec(is);
|
||||
}
|
||||
|
||||
private void writeLikelihood(float likelihood) {
|
||||
float shiftedLikelihood = likelihood * SIG_FIG_MULTIPLIER;
|
||||
this.binaryCodec.writeInt((int) Math.round(shiftedLikelihood));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return
|
||||
*/
|
||||
private float readLikelihood() {
|
||||
float likelihood = (float) this.binaryCodec.readInt() / SIG_FIG_MULTIPLIER;
|
||||
return likelihood;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,192 +0,0 @@
|
|||
package edu.mit.broad.picard.genotype.caller;
|
||||
|
||||
import edu.mit.broad.picard.sam.SamLocusIterator;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.util.SortedSet;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Base class for AlleleCallers. Handles efficient access to the reference, output of data to a
|
||||
* standard file format, and application of priors
|
||||
*/
|
||||
public abstract class AbstractAlleleCaller {
|
||||
// writer for output
|
||||
private final BufferedWriter writer;
|
||||
|
||||
// for providing access to reference data
|
||||
private final ReferenceSequenceFile referenceSequenceFile;
|
||||
private final SAMFileHeader samHeader;
|
||||
private ReferenceSequence referenceSequence;
|
||||
|
||||
public AbstractAlleleCaller(final File reference, final SAMFileHeader samHeader, final BufferedWriter writer) {
|
||||
this.writer = writer;
|
||||
this.referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(reference);
|
||||
this.samHeader = samHeader;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* emit allele calls to the writer specified in the constructor
|
||||
*
|
||||
* @param li Locus to call
|
||||
*/
|
||||
public void callAlleles(final SamLocusIterator.LocusInfo li) throws IOException {
|
||||
|
||||
|
||||
cacheReferenceSequence(li.getSequenceIndex());
|
||||
|
||||
final char ref = Character.toUpperCase((char)(referenceSequence.getBases()[li.getPosition() - 1] & 0xff));
|
||||
|
||||
|
||||
// delegate to the specific implementation
|
||||
final SortedSet<GenotypeTheory> likelihoods = call(ref, li.getBasesAsString(), li.getQualities());
|
||||
|
||||
|
||||
final GenotypeTheory bestTheory = likelihoods.first();
|
||||
GenotypeTheory nextBestTheory = null;
|
||||
GenotypeTheory refTheory = null;
|
||||
final String refString = new String(new char[]{ref,ref});
|
||||
final DiploidGenotype refGenotype = DiploidGenotype.valueOf(refString);
|
||||
|
||||
|
||||
final StringBuilder theoryString = new StringBuilder();
|
||||
int k=0;
|
||||
for(final GenotypeTheory t : likelihoods) {
|
||||
if (k == 1) { nextBestTheory = t; }
|
||||
if (t.getGenotype() == refGenotype) { refTheory = t; }
|
||||
|
||||
theoryString.append(t.getGenotype())
|
||||
.append(":")
|
||||
.append(String.format("%.2f",t.getLikelihood()))
|
||||
.append(" ");
|
||||
k++;
|
||||
}
|
||||
|
||||
final double btnb = bestTheory.getLikelihood() - nextBestTheory.getLikelihood();
|
||||
final double btr = bestTheory.getLikelihood() - refTheory.getLikelihood();
|
||||
|
||||
final DiploidGenotype gt = likelihoods.first().getGenotype();
|
||||
|
||||
final String type;
|
||||
if (!gt.isHet() && gt.getAllele1() == ref) {
|
||||
type = "homozygous";
|
||||
} else if (!gt.isHet() && gt.getAllele1() != ref) {
|
||||
type = "homozygous-SNP";
|
||||
} else {
|
||||
type = "heterozygous-SNP";
|
||||
}
|
||||
|
||||
final String bases = li.getBasesAsString();
|
||||
int a = 0,c = 0,g = 0,t = 0;
|
||||
for(int i=0; i<bases.length(); i++) {
|
||||
if (bases.charAt(i) == 'A') { a++; }
|
||||
else if (bases.charAt(i) == 'C') { c++; }
|
||||
else if (bases.charAt(i) == 'G') { g++; }
|
||||
else if (bases.charAt(i) == 'T') { t++; }
|
||||
else { throw new RuntimeException("Unknown Base " + bases.charAt(i)); }
|
||||
}
|
||||
|
||||
writer.write(
|
||||
li.getSequenceIndex() + ":" +
|
||||
(li.getPosition()-1) + " " + // arachne output is 0-based
|
||||
ref + " " +
|
||||
gt + " " +
|
||||
String.format("%f %f", btnb,btr) + " " +
|
||||
type + " " +
|
||||
"A:" + a + " " +
|
||||
"C:" + c + " " +
|
||||
"G:" + g + " " +
|
||||
"T:" + t + " " +
|
||||
bases.length() + " " +
|
||||
"0 1 1 " + // used prior, is alignable, bait present
|
||||
theoryString
|
||||
);
|
||||
|
||||
|
||||
writer.write("\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure that the referenceSequence member points to the sequenceIndex-th sequence. Note that
|
||||
* this is not random access. It is required that current sequenceIndex is >= the arg in the previous
|
||||
* call to this method.
|
||||
*/
|
||||
private void cacheReferenceSequence(int sequenceIndex) {
|
||||
if (referenceSequence != null && referenceSequence.getContigIndex() == sequenceIndex) {
|
||||
return;
|
||||
}
|
||||
referenceSequence = null;
|
||||
for(referenceSequence = referenceSequenceFile.nextSequence();
|
||||
referenceSequence != null;
|
||||
referenceSequence = referenceSequenceFile.nextSequence()) {
|
||||
// Sanity check the sequence names against the sequence dictionary while scanning through.
|
||||
if (!referenceSequence.getName().equals(samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName())) {
|
||||
throw new PicardException("Sequence name mismatch at sequence index " + referenceSequence.getContigIndex() +
|
||||
": " + referenceSequence.getName() + " != " +
|
||||
samHeader.getSequence(referenceSequence.getContigIndex()).getSequenceName());
|
||||
}
|
||||
if (referenceSequence.getContigIndex() == sequenceIndex) {
|
||||
break;
|
||||
}
|
||||
if (referenceSequence.getContigIndex() > sequenceIndex) {
|
||||
throw new PicardException("Never found reference sequence with index " + sequenceIndex);
|
||||
}
|
||||
}
|
||||
if (referenceSequence == null) {
|
||||
throw new PicardException("Reference sequence with index " + sequenceIndex + " was not found");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Override this to implement a concrete genotype caller
|
||||
* @param ref the reference base
|
||||
* @param bases each element in the String is the base at current locus for a given read
|
||||
* @param quals same length as bases. the ith element corresponds to the ith element of bases.
|
||||
* @return
|
||||
*/
|
||||
abstract protected SortedSet<GenotypeTheory> call(char ref, String bases, List<Byte> quals);
|
||||
|
||||
|
||||
/**
|
||||
* Apply a general population-based prior to the likelihood:
|
||||
* <ul>
|
||||
* <li>ref is .999</li>
|
||||
* <li>het is 10^-3</li>
|
||||
* <li>homozygous, non-reference is 10^-5</li>
|
||||
*
|
||||
* @param ref reference allele
|
||||
* @return prior, given the reference and genotype alleles
|
||||
*/
|
||||
protected double getPrior(final char ref, final DiploidGenotype gt) {
|
||||
final double prior;
|
||||
if (gt.isHom() && gt.getAllele1() == ref) {
|
||||
prior = 0.999; // reference
|
||||
} else {
|
||||
if (gt.getAllele1() != ref && gt.getAllele2() != ref) {
|
||||
prior = 0.00001; // neither base is reference
|
||||
} else {
|
||||
prior = 0.001; // het, one base is reference
|
||||
}
|
||||
}
|
||||
return prior;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------
|
||||
// Helper methods below this point...
|
||||
// --------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
public boolean isHet(final String alleles) {
|
||||
return (alleles.charAt(0) != (alleles.charAt(1)));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -1,93 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.genotype.caller;
|
||||
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
import edu.mit.broad.picard.directed.GenomeMaskFactory;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.picard.sam.SamLocusIterator;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Call genotypes given a SAM file of aligned reads, reference sequences, and optionally a target map.
|
||||
*/
|
||||
public class CallGenotypes extends CommandLineProgram {
|
||||
// Usage and parameters
|
||||
@Usage(programVersion="1.0") public String USAGE = "Basic Allele Caller\n";
|
||||
@Option(shortName="I", doc="SAM or BAM file for calling") public File INPUT_FILE;
|
||||
@Option(shortName="O", doc="Allele Call output GELI file") public File OUTPUT_FILE;
|
||||
@Option(shortName="R", doc="Reference fasta or fasta.gz file") public File REF_FILE;
|
||||
@Option(shortName="T", doc="IntervalList-format target map file", optional = true) public File TARGET_FILE;
|
||||
@Option(shortName="Q", doc="Minimum quality score threshold to use in allele calling", optional = true) public Integer QUAL_SCORE_THRESHOLD;
|
||||
|
||||
|
||||
/** Required main method implementation. */
|
||||
public static void main(final String[] argv) {
|
||||
System.exit(new CallGenotypes().instanceMain(argv));
|
||||
}
|
||||
|
||||
|
||||
protected int doWork() {
|
||||
try {
|
||||
final BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_FILE));
|
||||
|
||||
final SAMFileReader samReader = getSamReader(INPUT_FILE);
|
||||
|
||||
// TODO -- parameterize, or create separate executables...
|
||||
// AbstractAlleleCaller caller = new FlatQualityAlleleCaller(reference, writer);
|
||||
final AbstractAlleleCaller caller = new QualityScoreAlleleCaller(REF_FILE, samReader.getFileHeader(), writer);
|
||||
final long startTime = System.currentTimeMillis();
|
||||
|
||||
final SamLocusIterator sli = new SamLocusIterator(samReader.iterator());
|
||||
|
||||
if (TARGET_FILE != null) {
|
||||
sli.setGenomeMask(new GenomeMaskFactory().makeGenomeMaskFromIntervalList(TARGET_FILE));
|
||||
}
|
||||
|
||||
if (QUAL_SCORE_THRESHOLD != null) {
|
||||
System.out.println("Masking out bases with < Q"+QUAL_SCORE_THRESHOLD);
|
||||
sli.setQualityScoreCutoff(QUAL_SCORE_THRESHOLD);
|
||||
}
|
||||
|
||||
for (final SamLocusIterator.LocusInfo li : sli) {
|
||||
if (li != null) caller.callAlleles(li);
|
||||
}
|
||||
|
||||
final long elapsed = System.currentTimeMillis() - startTime;
|
||||
System.out.println("Completed in " + elapsed + "ms");
|
||||
|
||||
writer.flush();
|
||||
writer.close();
|
||||
} catch (IOException ioe) {
|
||||
throw new RuntimeException(ioe);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private SAMFileReader getSamReader(final File samFile) {
|
||||
final SAMFileReader samReader = new SAMFileReader(samFile);
|
||||
|
||||
// ensure the file is sorted
|
||||
if (samReader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
|
||||
System.out.println("SAM Files must be coordinate-sorted, this is " + samReader.getFileHeader().getSortOrder());
|
||||
System.exit(1);
|
||||
}
|
||||
|
||||
return samReader;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
package edu.mit.broad.picard.genotype.caller;
|
||||
|
||||
public enum DiploidGenotype {
|
||||
AA('A','A'),
|
||||
AC('A','C'),
|
||||
AG('A','G'),
|
||||
AT('A','T'),
|
||||
CC('C','C'),
|
||||
CG('C','G'),
|
||||
CT('C','T'),
|
||||
GG('G','G'),
|
||||
GT('G','T'),
|
||||
TT('T','T');
|
||||
|
||||
private final char allele1;
|
||||
private final char allele2;
|
||||
|
||||
private DiploidGenotype(final char allele1, final char allele2) {
|
||||
this.allele1 = allele1;
|
||||
this.allele2 = allele2;
|
||||
}
|
||||
|
||||
public char getAllele1() { return allele1; }
|
||||
public char getAllele2() { return allele2; }
|
||||
public boolean isHet() { return this.allele1 != this.allele2; }
|
||||
public boolean isHom() { return this.allele1 == this.allele2; }
|
||||
}
|
||||
|
|
@ -1,76 +0,0 @@
|
|||
package edu.mit.broad.picard.genotype.caller;
|
||||
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
import static java.lang.Math.*;
|
||||
|
||||
|
||||
/**
|
||||
* Bayesian-based allele caller using flat qualities and a 1e-3 error rate, based on CRD algorithm
|
||||
*/
|
||||
public class FlatQualityAlleleCaller extends AbstractAlleleCaller {
|
||||
|
||||
public FlatQualityAlleleCaller(final File fastbReference, SAMFileHeader samHeader, final BufferedWriter writer) {
|
||||
super(fastbReference, samHeader, writer);
|
||||
}
|
||||
|
||||
|
||||
protected SortedSet<GenotypeTheory> call(final char ref, final String bases, final List<Byte> quals) {
|
||||
final float eps = 1e-3f;
|
||||
|
||||
// count up the base by nucleotide and put them into a map
|
||||
final int depth = bases.length();
|
||||
int a = 0,c = 0,g = 0,t = 0;
|
||||
for(int i=0; i< bases.length(); i++) {
|
||||
if (bases.charAt(i) == 'A') { a++; }
|
||||
else if (bases.charAt(i) == 'C') { c++; }
|
||||
else if (bases.charAt(i) == 'G') { g++; }
|
||||
else if (bases.charAt(i) == 'T') { t++; }
|
||||
else { throw new RuntimeException("Unknown Base " + bases.charAt(i)); }
|
||||
}
|
||||
|
||||
final Map<Character, Integer> counts = new HashMap<Character, Integer>();
|
||||
counts.put('A', a);
|
||||
counts.put('C', c);
|
||||
counts.put('G', g);
|
||||
counts.put('T', t);
|
||||
|
||||
|
||||
// for each of the 10 theories, calculate the likelihood
|
||||
final SortedSet<GenotypeTheory> results = new TreeSet<GenotypeTheory>();
|
||||
for(final DiploidGenotype theory : DiploidGenotype.values()) {
|
||||
final double likelihood;
|
||||
final char allele1 = theory.getAllele1();
|
||||
final char allele2 = theory.getAllele2();
|
||||
|
||||
if (!theory.isHet()) {
|
||||
likelihood = log10(1-eps)*counts.get(allele1) + log10(eps)*(depth - counts.get(allele1));
|
||||
} else {
|
||||
final int major_allele_counts;
|
||||
final int minor_allele_counts;
|
||||
if (counts.get(allele1) > counts.get(allele2)) {
|
||||
major_allele_counts = counts.get(allele1);
|
||||
minor_allele_counts = counts.get(allele2);
|
||||
} else {
|
||||
major_allele_counts = counts.get(allele2);
|
||||
minor_allele_counts = counts.get(allele1);
|
||||
}
|
||||
|
||||
likelihood = log10(0.5 - (eps/2.0) )*major_allele_counts +
|
||||
log10(0.5 - (eps/2.0) )*minor_allele_counts +
|
||||
log10(eps)*(depth - major_allele_counts - minor_allele_counts);
|
||||
}
|
||||
|
||||
final double prior = getPrior(ref, theory);
|
||||
results.add(new GenotypeTheory(theory, likelihood + log10(prior)));
|
||||
}
|
||||
|
||||
|
||||
return results;
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -1,46 +0,0 @@
|
|||
package edu.mit.broad.picard.genotype.caller;
|
||||
|
||||
/**
|
||||
* Datastructure to hold a single genotype along with a likelihood.
|
||||
*/
|
||||
public class GenotypeTheory implements Comparable<GenotypeTheory> {
|
||||
private DiploidGenotype genotype;
|
||||
private double likelihood;
|
||||
|
||||
public GenotypeTheory(final DiploidGenotype genotype, final double likelihood) {
|
||||
this.genotype = genotype;
|
||||
this.likelihood = likelihood;
|
||||
}
|
||||
|
||||
public DiploidGenotype getGenotype() {
|
||||
return genotype;
|
||||
}
|
||||
|
||||
public void setGenotype(final DiploidGenotype genotype) {
|
||||
this.genotype = genotype;
|
||||
}
|
||||
|
||||
public double getLikelihood() {
|
||||
return likelihood;
|
||||
}
|
||||
|
||||
public void setLikelihood(final double likelihood) {
|
||||
this.likelihood = likelihood;
|
||||
}
|
||||
|
||||
/**
|
||||
* Genotype Theories are sorted first by descending likelihood (ie
|
||||
* the GenotypeTheory with biggest likelihood comes first). Ties are
|
||||
* broken by lexical sorting of the genotypes themselves
|
||||
*
|
||||
*/
|
||||
public int compareTo(final GenotypeTheory other) {
|
||||
if (this.getLikelihood() == other.getLikelihood()) {
|
||||
return this.getGenotype().compareTo(other.getGenotype());
|
||||
} else if (this.getLikelihood() > other.getLikelihood()) {
|
||||
return -1;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
package edu.mit.broad.picard.genotype.caller;
|
||||
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
|
||||
import java.util.*;
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Bayesian-based allele caller using quality scores, based on CRD algorithm
|
||||
*/
|
||||
public class QualityScoreAlleleCaller extends AbstractAlleleCaller {
|
||||
|
||||
public QualityScoreAlleleCaller(final File fastbReference, SAMFileHeader samHeader, final BufferedWriter writer) {
|
||||
super(fastbReference, samHeader, writer);
|
||||
}
|
||||
|
||||
protected SortedSet<GenotypeTheory> call(final char ref, final String bases, final List<Byte> quals) {
|
||||
|
||||
// for each of the 10 theories, calculate the likelihood using quality scores
|
||||
final SortedSet<GenotypeTheory> results = new TreeSet<GenotypeTheory>();
|
||||
for(final DiploidGenotype theory : DiploidGenotype.values()) {
|
||||
double likelihood = 0;
|
||||
|
||||
for(int i=0; i<bases.length(); i++) {
|
||||
final char base = bases.charAt(i);
|
||||
final byte qual = quals.get(i);
|
||||
|
||||
if (theory.isHom()) {
|
||||
if (base == theory.getAllele1() || base == theory.getAllele2()) {
|
||||
likelihood += getOneMinusQual(qual);
|
||||
} else {
|
||||
// the real math would be
|
||||
// likelihood += log10(pow(10,(qual/-10.0)));
|
||||
// but it simplifies to
|
||||
likelihood += qual/-10.0;
|
||||
}
|
||||
} else {
|
||||
if (base == theory.getAllele1() || base == theory.getAllele2()) {
|
||||
likelihood += getOneHalfMinusQual(qual);
|
||||
} else {
|
||||
// the real math would be
|
||||
// likelihood += log10(pow(10,(qual/-10.0)));
|
||||
// but it simplifies to
|
||||
likelihood += qual/-10.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final double prior = getPrior(ref, theory);
|
||||
results.add(new GenotypeTheory(theory, likelihood + log10(prior)));
|
||||
}
|
||||
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private static final double[] oneMinusData = new double[Byte.MAX_VALUE];
|
||||
{
|
||||
for(int qual=0; qual < Byte.MAX_VALUE; qual++) {
|
||||
oneMinusData[qual] = log10(1.0 - pow(10,(qual/-10.0)));
|
||||
}
|
||||
}
|
||||
private double getOneMinusQual(final byte qual) {
|
||||
return oneMinusData[qual];
|
||||
}
|
||||
|
||||
private static final double[] oneHalfMinusData = new double[Byte.MAX_VALUE];
|
||||
{
|
||||
for(int qual=0; qual < Byte.MAX_VALUE; qual++) {
|
||||
oneHalfMinusData[qual] = log10(0.5-pow(10,(qual/-10.0))/2.0);
|
||||
}
|
||||
}
|
||||
|
||||
private double getOneHalfMinusQual(final byte qual) {
|
||||
return oneHalfMinusData[qual];
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,257 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
import edu.mit.broad.picard.util.PasteParser;
|
||||
import edu.mit.broad.picard.util.FormatUtil;
|
||||
import edu.mit.broad.picard.util.BasicTextFileParser;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FilenameFilter;
|
||||
import java.io.Closeable;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Class to parse the data in an Illumina Bustard directory and return an iterator over that data, in order
|
||||
* by tile.
|
||||
*
|
||||
* @author Kathleen Tibbetts
|
||||
*/
|
||||
public class BustardFileParser implements Iterator<BustardReadData>, Iterable<BustardReadData>, Closeable {
|
||||
|
||||
private final File bustardDirectory;
|
||||
private final int lane;
|
||||
private final boolean pairedEnd;
|
||||
private PasteParser parser;
|
||||
private BustardReadData next = null;
|
||||
private final FormatUtil formatter = new FormatUtil();
|
||||
private boolean iterating = false;
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param bustardDirectory directory where the Bustard files can be located
|
||||
* @param lane the lane to parse
|
||||
* @param pairedEnd whether this is a paired-end run
|
||||
*/
|
||||
public BustardFileParser(File bustardDirectory, int lane, boolean pairedEnd) {
|
||||
this.bustardDirectory = bustardDirectory;
|
||||
this.lane = lane;
|
||||
this.pairedEnd = pairedEnd;
|
||||
initialize();
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the relevant files in the bustardDirectory, sorts them, and puts them into the
|
||||
* <code>sortedFiles</code> iterator. Does some basic sanity checking to ensure that some files
|
||||
* are found and that they are the expected multiple for paired-end or not.
|
||||
*
|
||||
*/
|
||||
private void initialize()
|
||||
{
|
||||
final String qseq1Regex = "s_" + lane + "_1_\\d{4}_qseq.txt(.gz)?";
|
||||
final String qseq2Regex = "s_" + lane + "_2_\\d{4}_qseq.txt(.gz)?";
|
||||
final String intensityRegex = "s_" + lane + "_\\d{4}_sig2.txt(.gz)?";
|
||||
|
||||
File read1files[] = bustardDirectory.listFiles( new FilenameFilter() {
|
||||
public boolean accept(File dir, String name) {
|
||||
return name.matches(qseq1Regex);
|
||||
}
|
||||
});
|
||||
|
||||
File read2files[] = bustardDirectory.listFiles( new FilenameFilter() {
|
||||
public boolean accept(File dir, String name) {
|
||||
return name.matches(qseq2Regex);
|
||||
}
|
||||
});
|
||||
|
||||
File intensityFiles[] = bustardDirectory.listFiles( new FilenameFilter() {
|
||||
public boolean accept(File dir, String name) {
|
||||
return name.matches(intensityRegex);
|
||||
}
|
||||
});
|
||||
|
||||
// Some basic sanity checking on file counts
|
||||
if (read1files.length == 0 && read2files.length == 0 && intensityFiles.length == 0) {
|
||||
throw new PicardException("No Bustard files found in " +
|
||||
bustardDirectory.getAbsolutePath() + " for lane " + lane);
|
||||
}
|
||||
if (pairedEnd) {
|
||||
if (read1files.length != read2files.length || read2files.length != intensityFiles.length) {
|
||||
throw new PicardException("Incorrect number of Bustard files found in " +
|
||||
bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Found " +
|
||||
read1files.length + " read 1 qseq files, " + read2files.length + " read 2 " +
|
||||
"qseq files, and " + intensityFiles.length + " sig2 files. There should be " +
|
||||
"the same number of each type of file");
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (read1files.length != intensityFiles.length) {
|
||||
throw new PicardException("Incorrect number of Bustard files found in " +
|
||||
bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Found " +
|
||||
read1files.length + " qseq files and " + intensityFiles.length + " sig2 files, " +
|
||||
"which should be equal.");
|
||||
}
|
||||
if (read2files.length > 0) {
|
||||
throw new PicardException("Read 2 Bustard files found in " +
|
||||
bustardDirectory.getAbsolutePath() + " for lane " + lane + ". Lane " +
|
||||
" was specified as a non-PE run, and so should not have any read 2 data.");
|
||||
}
|
||||
}
|
||||
|
||||
// Sort each set of reads and create a text parser for it
|
||||
SortedSet<File> sortedRead1 = new TreeSet<File>(new BustardFilenameComparator());
|
||||
sortedRead1.addAll(Arrays.asList(read1files));
|
||||
read1files = sortedRead1.toArray(read1files);
|
||||
BasicTextFileParser read1Parser = new BasicTextFileParser(true, read1files);
|
||||
|
||||
SortedSet<File> sortedIntensity = new TreeSet<File>(new BustardFilenameComparator());
|
||||
sortedIntensity.addAll(Arrays.asList(intensityFiles));
|
||||
intensityFiles = sortedIntensity.toArray(intensityFiles);
|
||||
BasicTextFileParser intensityParser = new BasicTextFileParser(true, intensityFiles);
|
||||
|
||||
// And create a paste parser for all of them
|
||||
if (pairedEnd) {
|
||||
SortedSet<File> sortedRead2 = new TreeSet<File>(new BustardFilenameComparator());
|
||||
sortedRead2.addAll(Arrays.asList(read2files));
|
||||
read2files = sortedRead2.toArray(read2files);
|
||||
BasicTextFileParser read2Parser = new BasicTextFileParser(true, read2files);
|
||||
|
||||
parser = new PasteParser(read1Parser, read2Parser, intensityParser);
|
||||
}
|
||||
else {
|
||||
parser = new PasteParser(read1Parser, intensityParser);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses the next line from the parser and constructs a BustardReadData object from it
|
||||
* The first 11 fields are the read1 data, the second 11 are the read2 data, and the remaining
|
||||
* values are the intensities data. Note that the first four values in the intensity file
|
||||
* are not intensities but rather lane, tiles, x, and y for the given cluster.
|
||||
*
|
||||
* @param validate whether to check that the expected number of intensity values are returned
|
||||
* @return a fully populated BustardReadData object
|
||||
*/
|
||||
private BustardReadData readNext(boolean validate) {
|
||||
if (!parser.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
String data[][] = parser.next();
|
||||
String machine = data[0][0];
|
||||
int run = formatter.parseInt(data[0][1]);
|
||||
int lane = formatter.parseInt(data[0][2]);
|
||||
int tile = formatter.parseInt(data[0][3]);
|
||||
int x = formatter.parseInt(data[0][4]);
|
||||
int y = formatter.parseInt(data[0][5]);
|
||||
String firstSeq = data[0][8];
|
||||
String firstQual = data[0][9];
|
||||
boolean pf = formatter.parseInt(data[0][10]) == 1;
|
||||
String secondSeq = null;
|
||||
String secondQual = null;
|
||||
|
||||
int intensityIndex = 1;
|
||||
if (pairedEnd) {
|
||||
secondSeq = data[1][8];
|
||||
secondQual = data[1][9];
|
||||
intensityIndex = 2;
|
||||
}
|
||||
|
||||
int numIntensities = firstSeq.length() * (pairedEnd ? 2 : 1);
|
||||
|
||||
// Sanity check since some of those files look a little weird
|
||||
if (validate) {
|
||||
int remaining = data[intensityIndex].length - 4;
|
||||
if ((remaining % 4 != 0) || (remaining/4) != numIntensities) {
|
||||
throw new PicardException("Unexpected number of intensity fields for " + machine + "/" + run +
|
||||
"/" + lane + "/" + tile + ": " + remaining);
|
||||
}
|
||||
}
|
||||
|
||||
double intensities[][] = new double[numIntensities][4];
|
||||
int intensityArrayIndex = 4;
|
||||
for (int i = 0; i < numIntensities; i++) {
|
||||
for (int j = 0; j < 4; j++) {
|
||||
intensities[i][j] = formatter.parseDouble(data[intensityIndex][intensityArrayIndex++]);
|
||||
}
|
||||
}
|
||||
|
||||
return new BustardReadData(
|
||||
machine, run, lane, tile, firstSeq, firstQual, secondSeq, secondQual, pf, intensities, x, y);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an iterator over a set of elements of type BustardReadData.
|
||||
*
|
||||
* @return an iterator over a set of elements of type BustardReadData
|
||||
*/
|
||||
public Iterator<BustardReadData> iterator() {
|
||||
if (iterating) {
|
||||
throw new IllegalStateException("iterator() method can only be called once, before the" +
|
||||
"first call to hasNext()");
|
||||
}
|
||||
next = readNext(true);
|
||||
iterating = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the iteration has more elements.
|
||||
*
|
||||
* @return true if the iteration has more elements. Otherwise returns false.
|
||||
*/
|
||||
public boolean hasNext() {
|
||||
if (!iterating) {
|
||||
next = readNext(true);
|
||||
iterating = true;
|
||||
}
|
||||
return next != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the next element in the iteration.
|
||||
*
|
||||
* @return the next element in the iteration
|
||||
* @throws java.util.NoSuchElementException
|
||||
*/
|
||||
public BustardReadData next() {
|
||||
|
||||
if (!hasNext()) {
|
||||
throw new NoSuchElementException("Iteration has no more elements.");
|
||||
}
|
||||
|
||||
BustardReadData result = next;
|
||||
next = readNext(false);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 * Required method for Iterator API. Removal is not supported by this
 * read-only parser.
 *
 * @throws UnsupportedOperationException always
 */
public void remove() {
    throw new UnsupportedOperationException("Remove() not supported.");
}
|
||||
|
||||
/**
 * Closes the underlying PasteParser. Safe to call when no parser was
 * ever constructed (the null check guards that case).
 */
public void close() {
    if (parser != null) {
        parser.close();
    }
}
|
||||
|
||||
/** @return the lane whose data this parser reads */
public int getLane() { return this.lane; }

/** @return true if this lane's data is from a paired-end run */
public boolean isPairedEnd() { return this.pairedEnd; }
|
||||
}
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
 * Comparator for getting Bustard files in "sorted" order for use by the BustardFileParser.
 * Expected order is by lane in ascending order, then by tile in ascending order, then:
 *      the read 1 qseq file
 *      the read 2 qseq file
 *      the sig2 file
 *
 * IMPORTANT: Currently this class expects to receive ONLY qseq and sig2 files.
 *
 * @author Kathleen Tibbetts
 */
public class BustardFilenameComparator implements Comparator<File> {

    /**
     * Compares its two arguments for order. Returns a negative integer, zero, or a positive integer as
     * the first argument is less than, equal to, or greater than the second.
     *
     * Fixes relative to the previous revision:
     *  - The comparison loop now starts at index 0, so lane participates in the ordering
     *    as documented (it was previously skipped).
     *  - A missing read number (sig2 files carry none) no longer causes a
     *    NullPointerException; absent read numbers sort before present ones.
     *
     * @param file1 the first file to compare
     * @param file2 the second file to compare
     * @return a negative integer, zero, or a positive integer as
     * the first argument is less than, equal to, or greater than the second.
     */
    public int compare(File file1, File file2)
    {
        Integer parts1[] = parseFileNameParts(file1.getName());
        Integer parts2[] = parseFileNameParts(file2.getName());

        // Compare lane, tile, type, and read number, in that order
        for (int i = 0; i < parts1.length; i++)
        {
            final Integer left = parts1[i];
            final Integer right = parts2[i];
            if (left == null && right == null) {
                continue;               // e.g. two sig2 files: neither has a read number
            }
            if (left == null) {
                return -1;              // absent sorts before present
            }
            if (right == null) {
                return 1;
            }
            if (!left.equals(right)) {
                return left.compareTo(right);
            }
        }
        return 0;
    }

    /**
     * Utility method that returns an array of integers that represent, in order,
     * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any)
     * represented by the given file name
     *
     * @param name the file name to dissect (e.g. "s_1_2_0001_qseq.txt" or "s_1_0001_sig2.txt")
     * @return an array of integers that represent, in order,
     * the lane, tile, type (0 for qseq files, 1 for sig2 files), and read (if any)
     * represented by the given file name; the read slot is null for sig2 files
     */
    private Integer[] parseFileNameParts(String name)
    {
        Integer parts[] = new Integer[4]; // Lane, tile, type, read
        String src[] = name.split("_");
        parts[0] = Integer.valueOf(src[1]); // Lane is always the second part
        if (src[2].length() == 4) { // Tile is 3rd or fourth
            parts[1] = Integer.valueOf(src[2]);
        }
        else {
            parts[1] = Integer.valueOf(src[3]);
        }
        parts[2] = (src[src.length-1].equals("qseq.txt")) ? 0 : 1; // qseq tests are lower
        if (src[2].length() == 1) { // read is last
            parts[3] = Integer.valueOf(src[2]);
        }
        return parts;
    }
}
|
||||
|
|
@ -1,128 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
/**
 * Holds all the Bustard-level data we need (so far) about an individual read.
 * Instances are immutable aside from the mutable intensity array they expose.
 *
 * @author Kathleen Tibbetts
 */
public class BustardReadData {

    // NOTE(review): PADDING is not referenced anywhere in this class; candidate for removal.
    private static final String PADDING ="00000";

    final private String machineName;
    final private int runNumber;
    final private int laneNumber;
    final private int tileNumber;
    final private String firstReadSequence;
    final private String firstReadQualities;    // Solexa-encoded quality string
    final private String secondReadSequence;    // null for non-paired-end runs
    final private String secondReadQualities;   // null for non-paired-end runs
    final private boolean pf;                   // whether the read passes the vendor quality filter
    final private double intensities[][];       // one row of 4 channel values per base
    final private int xCoordinate;
    final private int yCoordinate;
    // Translates Solexa quality scores to the Phred scale
    private final SolexaQualityConverter converter = new SolexaQualityConverter();


    /**
     * Constructor that takes everything to populate this object
     *
     * @param machineName the sequencing machine name
     * @param runNumber the run number
     * @param laneNumber the lane number
     * @param tileNumber the tile number
     * @param firstReadSequence bases of the first read
     * @param firstReadQualities Solexa-encoded qualities of the first read
     * @param secondReadSequence bases of the second read (null if unpaired)
     * @param secondReadQualities Solexa-encoded qualities of the second read (null if unpaired)
     * @param pf whether the read passes the vendor quality filter
     * @param intensities per-base channel intensities
     * @param xCoordinate cluster x coordinate
     * @param yCoordinate cluster y coordinate
     */
    public BustardReadData(String machineName, int runNumber, int laneNumber, int tileNumber,
                           String firstReadSequence, String firstReadQualities,
                           String secondReadSequence, String secondReadQualities,
                           boolean pf, double[][] intensities, int xCoordinate, int yCoordinate ) {

        this.machineName = machineName;
        this.runNumber = runNumber;
        this.laneNumber = laneNumber;
        this.tileNumber = tileNumber;
        this.firstReadSequence = firstReadSequence;
        this.firstReadQualities = firstReadQualities;
        this.secondReadSequence = secondReadSequence;
        this.secondReadQualities = secondReadQualities;
        this.pf = pf;
        this.intensities = intensities;
        this.xCoordinate = xCoordinate;
        this.yCoordinate = yCoordinate;
    }

    // TODO: Finalize read name -- ask Tim
    /**
     * Composes a name for this read from its values.
     * Note: the run number is currently not part of the name (see TODO above).
     *
     * @return the read name in machine:lane:tile:x:y form
     */
    public String getReadName() {
        return this.machineName + ":" + this.laneNumber + ":" + this.tileNumber +
               ":" + this.xCoordinate + ":" + this.yCoordinate;
    }

    /**
     * Gets Phred-style qualitites for the first read
     *
     * @return the String of qualities
     */
    public String getFirstReadPhredQualities() {
        return decodeSolexaQualitiesToPhred(getFirstReadQualities());
    }

    /**
     * Gets Phred-style qualitites for the second read.
     * Will NPE for unpaired runs, where the second read qualities are null.
     *
     * @return the String of qualities
     */
    public String getSecondReadPhredQualities() {
        return decodeSolexaQualitiesToPhred(getSecondReadQualities());
    }

    /**
     * Converts a string of Solexa qualities to a Phred-style quality String
     *
     * @param qualities the Solexa qualities to decode
     * @return the String of Phred qualities
     */
    private String decodeSolexaQualitiesToPhred(String qualities) {
        StringBuilder sb = new StringBuilder();
        for (char c : qualities.toCharArray()) {
            // Quality char is phred score + 33
            sb.append((char)(converter.solexaToPhred((byte)c)+33));
        }
        return sb.toString();
    }

    public String getMachineName() { return machineName; }
    public int getRunNumber() { return runNumber; }
    public int getLaneNumber() { return laneNumber; }
    public int getTileNumber() { return tileNumber; }
    public String getFirstReadSequence() { return firstReadSequence; }
    public String getFirstReadQualities() { return firstReadQualities; }
    public String getSecondReadSequence() { return secondReadSequence; }
    public String getSecondReadQualities() { return secondReadQualities; }
    public double[][] getIntensities() { return intensities; }
    public boolean isPf() { return pf; }
    public int getXCoordinate() { return xCoordinate; }
    public int getYCoordinate() { return yCoordinate; }

}
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
|
||||
/**
 * CommandLineProgram that invokes BustardToSamWriter to convert the data in an
 * Illumina Bustard directory to a BAM file.
 *
 * @author Kathleen Tibbetts
 */
public class BustardToSam extends CommandLineProgram {
    // The following attributes define the command-line arguments
    @Usage(programVersion="1.0")
    public String USAGE =
            "Usage: " + getClass().getName() + " [options]\n\n" +
            "Generate a BAM binary file from data in an illumina Bustard directory.\n";

    @Option(shortName = "B", doc = "Bustard directory to parse. ")
    public File BUSTARD_DIRECTORY;

    @Option(shortName = "F", doc = "The flowcell. ")
    public String FLOWCELL;

    @Option(shortName = "L", doc = "The lane for which to parse data. ")
    public Integer LANE;

    @Option(shortName = "P", doc = "Whether the lane was a paired-end run. ")
    public Boolean PE;

    @Option(shortName = "O", doc = "The directory for the binary output file. ")
    public File OUTPUT;

    /**
     * Parses the requested lane of the Bustard directory and writes it as a BAM file.
     *
     * @return 0 on success (errors surface as runtime exceptions)
     */
    @Override
    protected int doWork() {
        BustardToSamWriter writer = new BustardToSamWriter(
                new BustardFileParser(BUSTARD_DIRECTORY, LANE, PE), OUTPUT, FLOWCELL);
        writer.writeBamFile();
        return 0;
    }

    /** Command-line entry point; exits with the program's return code. */
    public static void main(String[] argv) {
        System.exit(new BustardToSam().instanceMain(argv));
    }

}
|
||||
|
|
@ -1,138 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
import edu.mit.broad.sam.*;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.util.Log;
|
||||
import edu.mit.broad.picard.filter.AggregateFilter;
|
||||
import edu.mit.broad.picard.filter.SamRecordFilter;
|
||||
import edu.mit.broad.picard.filter.SolexaNoiseFilter;
|
||||
import edu.mit.broad.picard.sam.ReservedTagConstants;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Writes the data from a BustardFileParser to a BAM file
|
||||
*/
|
||||
public class BustardToSamWriter {
|
||||
|
||||
private final BustardFileParser parser;
|
||||
private SAMFileWriter writer;
|
||||
private final File outputFile;
|
||||
private AggregateFilter filters;
|
||||
private int recordsWritten = 0;
|
||||
private Log log = Log.getInstance(BustardToSamWriter.class);
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param parser The parser for the Bustard data
|
||||
* @param outputDirectory The directory in which to write the BAM file
|
||||
* @param flowcell The flowcell from which the data is drawn
|
||||
*/
|
||||
public BustardToSamWriter(BustardFileParser parser, File outputDirectory, String flowcell) {
|
||||
this.parser = parser;
|
||||
this.outputFile = getOutputFile(outputDirectory, flowcell);
|
||||
initializeFilters();
|
||||
}
|
||||
|
||||
/**
|
||||
* Alternate constructor for testing
|
||||
*
|
||||
* @param parser The parser for the Bustard data
|
||||
* @param outputFile The directory in which to write the BAM file
|
||||
*/
|
||||
BustardToSamWriter(BustardFileParser parser, File outputFile) {
|
||||
this.parser = parser;
|
||||
this.outputFile = outputFile;
|
||||
initializeFilters();
|
||||
}
|
||||
|
||||
private void initializeFilters() {
|
||||
filters = new AggregateFilter(Arrays.asList(
|
||||
(SamRecordFilter)new SolexaNoiseFilter()
|
||||
));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Writes all data from the BustardFileParser to a BAM file
|
||||
*/
|
||||
public void writeBamFile() {
|
||||
SAMFileHeader header = new SAMFileHeader();
|
||||
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
writer = new SAMFileWriterFactory().makeBAMWriter(header, false, outputFile);
|
||||
|
||||
while (parser.hasNext()) {
|
||||
BustardReadData brd = parser.next();
|
||||
|
||||
SAMRecord sam = createSamRecord(brd, true);
|
||||
writer.addAlignment(sam);
|
||||
this.recordsWritten++;
|
||||
|
||||
if (parser.isPairedEnd()) {
|
||||
SAMRecord sam2 = createSamRecord(brd, false);
|
||||
writer.addAlignment(sam2);
|
||||
this.recordsWritten++;
|
||||
}
|
||||
|
||||
}
|
||||
writer.close();
|
||||
|
||||
log.info("Wrote " + this.recordsWritten + " read records to BAM file " +
|
||||
this.outputFile.getAbsolutePath());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a SAMRecord from Bustard data
|
||||
*
|
||||
* @param brd The BustardReadData to use in populating the SAMRecord
|
||||
* @param isFirstRead whether this is the first read of a pair
|
||||
* @return SAMRecord fully populated SAMRecord
|
||||
*/
|
||||
private SAMRecord createSamRecord(BustardReadData brd, boolean isFirstRead) {
|
||||
SAMRecord sam = new SAMRecord();
|
||||
sam.setReadName(brd.getReadName());
|
||||
sam.setReadString(isFirstRead ? brd.getFirstReadSequence() : brd.getSecondReadSequence());
|
||||
sam.setBaseQualityString(isFirstRead ? brd.getFirstReadPhredQualities() : brd.getSecondReadPhredQualities());
|
||||
|
||||
// Flag values
|
||||
sam.setReadPairedFlag(parser.isPairedEnd());
|
||||
sam.setReadUmappedFlag(true);
|
||||
sam.setReadFailsVendorQualityCheckFlag(!brd.isPf());
|
||||
sam.setMateUnmappedFlag(true);
|
||||
if (parser.isPairedEnd()) {
|
||||
sam.setFirstOfPairFlag(isFirstRead);
|
||||
sam.setSecondOfPairFlag(!isFirstRead);
|
||||
}
|
||||
|
||||
if (filters.filterOut(sam)) {
|
||||
sam.setAttribute(ReservedTagConstants.XN, 1);
|
||||
}
|
||||
return sam;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs the name for the output file, determines whether it is writeable,
|
||||
* and returns the file
|
||||
*
|
||||
* @param outputDirectory the directory in which to write the BAM file
|
||||
* @param flowcell the flowcell from which the data is drawn
|
||||
* @return a new File object for the BAM file.
|
||||
*/
|
||||
private File getOutputFile(File outputDirectory, String flowcell) {
|
||||
File result = new File(outputDirectory.getAbsolutePath() + "/" +
|
||||
flowcell + "." + parser.getLane() + ".unmapped.bam");
|
||||
IoUtil.assertFileIsWritable(result);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,235 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
import edu.mit.broad.picard.util.PasteParser;
|
||||
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Iterator;
|
||||
import java.util.Arrays;
|
||||
import java.util.regex.Pattern;
|
||||
import java.text.ParsePosition;
|
||||
import java.text.NumberFormat;
|
||||
|
||||
/**
 * Parse the pair of files (eland_extended.txt and export.txt) that correspond to an end of a Gerald run for a lane.
 * The two files are walked in lockstep via a PasteParser; each call to next() consumes one
 * line from each and merges them into a single GeraldAlignment.
 */
public class GeraldParser implements Iterable<GeraldParser.GeraldAlignment>, CloseableIterator<GeraldParser.GeraldAlignment> {
    // Minimum number of tab-separated fields expected per eland_extended.txt line
    private static final int EXPECTED_ELAND_FIELDS = 4;
    // Regex used to split apart multiple alignments in the eland output
    private static final Pattern ALIGN_SPLITTER = Pattern.compile("\\,+");

    // export.txt constants (zero-based column indices)
    private static final int PASSING_FILTER_COLUMN = 21;
    private static final int QUALITIES_COLUMN = 9;
    private static final int REQUIRED_EXPORT_COLUMNS = PASSING_FILTER_COLUMN + 1;

    // Locale-aware integer parsing; also used to find where digits end in the alignment field
    private final NumberFormat integerFormat = NumberFormat.getIntegerInstance();

    private final SquashedCoordinateMap geraldToArachne;
    private final PasteParser pasteParser;
    private final File elandExtended;   // retained for error messages
    private final File export;          // retained for error messages
    private boolean iteratorCalled = false;
    // Lookup table mapping Solexa quality bytes to Phred quality bytes
    private final byte[] solexaToPhredQualityConverter = new SolexaQualityConverter().getSolexaToPhredConversionTable();

    /**
     * @param geraldToArachne for converting btw Gerald coordinate and genomic coordinate
     * @param elandExtended the eland_extended.txt file for this end
     * @param export the export.txt file for this end
     */
    public GeraldParser(final SquashedCoordinateMap geraldToArachne, final File elandExtended, final File export) {
        this.geraldToArachne = geraldToArachne;
        this.elandExtended = elandExtended;
        this.export = export;
        final TabbedTextFileParser[] parsers = {
            new TabbedTextFileParser(false, elandExtended),
            new TabbedTextFileParser(false, export)
        };
        pasteParser = new PasteParser(parsers);
    }

    /**
     * Returns this object as its own single-use iterator.
     *
     * @throws IllegalStateException if called more than once
     */
    public Iterator<GeraldAlignment> iterator() {
        if (iteratorCalled) {
            throw new IllegalStateException("iterator() cannot be called more than once on a GeraldParser instance.");
        }
        iteratorCalled = true;
        return this;
    }

    /** Closes the underlying PasteParser (and through it, both input files). */
    public void close() {
        pasteParser.close();
    }

    /** @return true if both underlying files have at least one more line */
    public boolean hasNext() {
        return pasteParser.hasNext();
    }

    /**
     * Reads one line from each input file and merges them into a GeraldAlignment.
     * Primary-placement fields are populated only when the eland record contains
     * exactly one alignment (and it is not the "-" no-alignment marker); otherwise
     * they remain at their defaults.
     *
     * @return the merged alignment record
     */
    public GeraldAlignment next() {
        final GeraldAlignment ret = new GeraldAlignment();
        // fields[0] = eland_extended.txt columns, fields[1] = export.txt columns
        final String[][] fields = pasteParser.next();

        // Parse eland_extended.txt fields
        final String[] elandExtendedFields = fields[0];
        if (elandExtendedFields.length < EXPECTED_ELAND_FIELDS) {
            throw new PicardException("Not enough fields in file: " + elandExtended);
        }

        // Field 0 carries a leading marker character (presumably ">" -- TODO confirm)
        ret.readName = elandExtendedFields[0].substring(1);
        ret.readBases = elandExtendedFields[1];
        ret.readLength = ret.readBases.length();
        // Field 2 is "zero:one:two" mismatch placement counts; anything else is ignored
        final String[] alignCounts = elandExtendedFields[2].split(":");
        if (alignCounts.length == 3) {
            ret.zeroMismatchPlacements = Short.parseShort(alignCounts[0]);
            ret.oneMismatchPlacements = Short.parseShort(alignCounts[1]);
            ret.twoMismatchPlacements = Short.parseShort(alignCounts[2]);
        }

        // Field 3 is a comma-separated list of alignments, or "-" for none
        final String[] alignments = ALIGN_SPLITTER.split(elandExtendedFields[3]);
        if (alignments.length == 1 && !"-".equals(alignments[0])) {
            // Alignment looks like "<file>.<contig>:<start><orientation><mismatches>"
            final int lastDot = alignments[0].lastIndexOf(".");
            final int colon = alignments[0].indexOf(':');

            final String tmp = alignments[0].substring(colon + 1);
            final ParsePosition pos = new ParsePosition(0);
            final long start = integerFormat.parse(tmp, pos).longValue();
            if (pos.getIndex() == 0) {
                // No digits consumed => malformed record
                throw new RuntimeException("Problem parsing eland extended alignment record: " + Arrays.toString(elandExtendedFields));
            }

            // Translate squashed-reference coordinates to genomic coordinates in place
            final SimpleMapping m = new SimpleMapping(alignments[0].substring(lastDot+1, colon).trim(),
                    start, start + ret.readLength - 1, null);
            geraldToArachne.convertToArachneCoords(m);
            ret.primaryChrom = m.getSequenceName();
            ret.primaryStart = m.getStartPos();
            ret.primaryStop = m.getEndPos();
            // The character right after the digits is the orientation; the rest is the mismatch string
            ret.orientation = tmp.substring(pos.getIndex(), pos.getIndex() + 1);
            ret.mismatchString = tmp.substring(pos.getIndex() + 1);

            // Count the mismatches in the alignment
            for (int i=pos.getIndex(); i<tmp.length(); ++i) {
                final char ch = tmp.charAt(i);
                if (ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T') {
                    ret.primaryMismatches += 1;
                }
            }
        }

        final String[] exportFields = fields[1];
        // Parse export.txt fields
        if (exportFields.length < REQUIRED_EXPORT_COLUMNS) {
            throw new RuntimeException("Not enough columns in _export.txt file " + export);
        }
        if (exportFields[PASSING_FILTER_COLUMN].equals("Y")) {
            ret.passingFilter = true;
        } else if (exportFields[PASSING_FILTER_COLUMN].equals("N")) {
            ret.passingFilter = false;
        } else {
            throw new RuntimeException("Strange value for PF column in _export.txt file " + export + ": '" +
                    exportFields[PASSING_FILTER_COLUMN] + "'.");
        }
        ret.phredQualities = exportFields[QUALITIES_COLUMN].getBytes();
        decodeSolexaQualitiesToPhred(ret.phredQualities);



        return ret;
    }

    /** @throws UnsupportedOperationException always; this iterator is read-only */
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /** Decodes an array of Solexa quality bytes into Phred-scale quality bytes.
     * Decode in place in order to avoid extra object allocation */
    private void decodeSolexaQualitiesToPhred(final byte[] solexaQuals) {
        for (int i=0; i<solexaQuals.length; ++i) {
            solexaQuals[i] = solexaToPhredQualityConverter[solexaQuals[i]];
        }

    }

    /**
     * Merged view of one read's eland_extended.txt and export.txt records.
     * Populated only by GeraldParser.next(); exposes read-only accessors.
     */
    public class GeraldAlignment {
        // From eland_extended.txt
        private String readName = null;
        private String readBases = null;
        private int readLength = 0;
        private short zeroMismatchPlacements = 0;
        private short oneMismatchPlacements = 0;
        private short twoMismatchPlacements = 0;
        // Primary placement fields; remain null/0 when there is no unique alignment
        private String primaryChrom = null;
        private long primaryStart = 0;
        private long primaryStop = 0;
        private String orientation = null;
        private short primaryMismatches = 0;
        private String mismatchString = null;

        // from export.txt
        private boolean passingFilter;
        private byte[] phredQualities;

        public String getMismatchString() {
            return mismatchString;
        }

        public short getOneMismatchPlacements() {
            return oneMismatchPlacements;
        }

        public String getOrientation() {
            return orientation;
        }

        public boolean isPassingFilter() {
            return passingFilter;
        }

        public byte[] getPhredQualities() {
            return phredQualities;
        }

        public String getPrimaryChrom() {
            return primaryChrom;
        }

        public short getPrimaryMismatches() {
            return primaryMismatches;
        }

        public long getPrimaryStart() {
            return primaryStart;
        }

        public long getPrimaryStop() {
            return primaryStop;
        }

        public String getReadBases() {
            return readBases;
        }

        public int getReadLength() {
            return readLength;
        }

        public String getReadName() {
            return readName;
        }

        public short getTwoMismatchPlacements() {
            return twoMismatchPlacements;
        }

        public short getZeroMismatchPlacements() {
            return zeroMismatchPlacements;
        }
    }
}
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
 * Given a Gerald directory, create a GeraldParser for one end or both ends as appropriate.
 */
public class GeraldParserFactory {

    // A Map of squashed reference chunk to reference genome sequence/chromosome. The chunk is represented as
    // a mapping (sequence=chunk file, startPos=offset into chunk file).
    private final SquashedCoordinateMap geraldToArachne;
    private final File geraldDir;
    private final int lane;

    /**
     * @param geraldDir the Gerald output directory to read from
     * @param lane the lane for which parsers will be built
     * @param squashedMapFile file defining the squashed-to-genomic coordinate mapping
     */
    public GeraldParserFactory(final File geraldDir, final int lane, final File squashedMapFile) {
        this.geraldDir = geraldDir;
        this.lane = lane;
        geraldToArachne = new SquashedCoordinateMap(squashedMapFile);
    }

    /** Attempts to determine if an analysis on a lane is PE or single. */
    public boolean isPairedRun() {
        // A per-read eland_query file (s_<lane>_1_...) implies paired-end; an
        // un-numbered one implies a fragment run; neither existing is an error.
        if (new File(geraldDir, "s_" + lane + "_1_eland_query.txt").exists()) return true;
        else if (new File(geraldDir, "s_" + lane + "_eland_query.txt").exists()) return false;

        throw new IllegalStateException("Could not determine if gerald run is PE or fragment.");
    }

    /**
     * Builds the "s_<lane>_" or "s_<lane>_<read>_" file name prefix.
     *
     * @param readNumber the read number, or null for unpaired runs
     */
    private String makeLanePrefix(final Integer readNumber) {
        return "s_" + lane + "_" + (readNumber == null ? "" : readNumber + "_");

    }

    /**
     * @param readNumber 1 == first end of pair; 2 == second end of pair; null == unpaired
     * @return a GeraldParser for the given end
     */
    public GeraldParser makeParser(final Integer readNumber) {
        final File elandExtendedFile = new File(geraldDir, makeLanePrefix(readNumber) + "eland_extended.txt");
        final File exportFile = new File(geraldDir, makeLanePrefix(readNumber) + "export.txt");
        IoUtil.assertFileIsReadable(elandExtendedFile);
        IoUtil.assertFileIsReadable(exportFile);
        return new GeraldParser(geraldToArachne, elandExtendedFile, exportFile);
    }

}
|
||||
|
|
@ -1,348 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2008 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Iterator;
|
||||
|
||||
import edu.mit.broad.picard.util.*;
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.cmdline.CommandLineParser;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.sam.SAMFileWriter;
|
||||
import edu.mit.broad.sam.SAMFileWriterFactory;
|
||||
import edu.mit.broad.sam.SAMProgramRecord;
|
||||
import edu.mit.broad.sam.SAMReadGroupRecord;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
/**
|
||||
* Read alignments for a lane (paired or unpaired) from Gerald directory and write to SAM file.
|
||||
*/
|
||||
/**
 * Read alignments for a lane (paired or unpaired) from a Gerald directory and
 * write them to a SAM or BAM file, coordinate sorted.
 *
 * For paired lanes the end-1 and end-2 parsers are iterated in lockstep so that
 * both ends of a pair are in hand when each SAMRecord is built.
 */
public class GeraldToSam extends CommandLineProgram {

    // These are all written to the SAM header
    private static final String DEFAULT_CN = "broad";     // default sequencing center (CN)
    private static final String DEFAULT_PL = "illumina";  // default platform (PL)
    private static final String PROGRAM_VERSION = "1.0";
    private static final String READ_GROUP_ID = "0";      // single read group per output file
    private static final String PROGRAM_RECORD_ID = "0";  // single @PG record per output file
    private static final String UNKNOWN_SAMPLE = "N/A";   // default SM value when none given

    private static final Log log = Log.getInstance(GeraldToSam.class);

    // The following attributes define the command-line arguments
    @Usage(programVersion=PROGRAM_VERSION)
    public String USAGE =
        getStandardUsagePreamble() +
        "Read Gerald alignments for the given lane, and write in SAM format, coordinate sorted.\n";

    @Option(shortName = "G", doc = "Location of Gerald files.")
    public File GERALD_DIR;

    @Option(shortName = "L")
    public Integer LANE;

    @Option(shortName = "M", doc = "Translates from Gerald alignment coordinates to genomic coordinates.")
    public File SQUASHED_MAP;

    @Option(shortName = "D", doc = "Input SAM or BAM file defining the names, sizes and order of the reference contig, " +
            "and other reference metadata.")
    public File SEQUENCE_DICT;

    @Option(shortName = "O", doc = "SAM or BAM file to be written (file extension determines format).")
    public File OUTPUT;

    @Option(doc = "Populates SM field of read group. Use pool name when a pool is being sequenced. " +
            "If any other read group fields are specified, then this is required.")
    public String SAMPLE = UNKNOWN_SAMPLE;

    @Option(doc = "Populates LB field of read group.")
    public String LIBRARY;

    @Option(doc = "Populates DS field of read group.", optional = true)
    public String DESCRIPTION;

    @Option(doc = "Flowcell.lane. Populates PU field of read group.")
    public String RUN;

    @Option(doc = "Predicted median insert size (may be different from the actual median insert size. " +
            "Populates the PI field of read group.", optional = true)
    public Integer PI;

    @Option(doc = "Sequencing center that produced the reads. Populates CN field of read group.")
    public String CN = DEFAULT_CN;

    @Option(doc = "Date the run was produced. Populates the DT field of read group.")
    public Date RUN_DATE;

    @Option(doc = "Platform/technology used to produce the reads. Populates the PL field of read group")
    public String PL = DEFAULT_PL;

    @Option(shortName = "JUMPING", doc = "True if this is a jumping library")
    public Boolean JUMPING_LIBRARY = Boolean.FALSE;

    @Option(doc = "String to put in the PG:CL header field. If not present, the GeraldToSam command line is put there",
            optional = true)
    public String ALIGNMENT_COMMAND;

    @Option(doc = "Write no more than this number of alignment records. Default: Write all the alignment records",
            optional = true)
    public Integer MAX_ALIGNMENTS;

    // Sink for the coordinate-sorted output; created in doWork, closed there too.
    private SAMFileWriter writer;
    // Header under construction; package-private, presumably for test access -- TODO confirm.
    SAMFileHeader header;
    // True when the Gerald run is paired-end; set in writeAlignments.
    private boolean paired;


    public static void main(final String[] argv) {
        System.exit(new GeraldToSam().instanceMain(argv));
    }

    /**
     * Builds the header, opens the writer, streams all alignments, and closes.
     * @return 0 (process exit status) on success
     */
    @Override
    public int doWork() {
        makeHeader(clp.getArgv());
        // "false" => records are not presorted; the writer sorts to coordinate order.
        writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(header, false, OUTPUT);
        writeAlignments();
        writer.close();
        return 0;
    }

    /**
     * If any of the read group options are specified on the command line, then SAMPLE must be specified.
     * This is currently not doing anything because SAMPLE has a non-null default value.
     * @return false if there is a problem with the command line
     */
    @Override
    protected boolean customCommandLineValidation() {
        if (SAMPLE == null &&
            (LIBRARY != null || DESCRIPTION != null || RUN != null || PI != null || !CN.equals(DEFAULT_CN)
                || RUN_DATE != null || !PL.equals(DEFAULT_PL)
            )) {
            System.err.println("SAMPLE must be specified if any read group options are used.");
            clp.usage(System.err);
            return false;
        }
        return true;
    }

    /**
     * Create the SAMFileHeader given the cmd-line args: sort order, @PG record,
     * sequence dictionary (copied from SEQUENCE_DICT), and the single read group.
     * @param argv original command line; recorded in PG:CL when ALIGNMENT_COMMAND is absent
     */
    private void makeHeader(final String[] argv) {
        header = new SAMFileHeader();
        header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
        final SAMProgramRecord programRecord = new SAMProgramRecord(PROGRAM_RECORD_ID);
        programRecord.setProgramVersion(PROGRAM_VERSION);
        String commandLine = ALIGNMENT_COMMAND;
        if (commandLine == null) {
            commandLine = StringUtil.join(" ", argv);
        }
        programRecord.setCommandLine(commandLine);
        header.addProgramRecord(programRecord);

        // The reference contigs come verbatim from the dictionary file's own header.
        // NOTE(review): the SAMFileReader opened here is never closed -- confirm acceptable.
        final SAMFileReader sequenceDictionary = new SAMFileReader(SEQUENCE_DICT);
        final SAMFileHeader sequenceDictionaryHeader = sequenceDictionary.getFileHeader();
        header.setSequences(sequenceDictionaryHeader.getSequences());

        // SAMPLE defaults to UNKNOWN_SAMPLE, so in practice this branch is always taken.
        if (SAMPLE != null) {
            final SAMReadGroupRecord readGroup = new SAMReadGroupRecord(READ_GROUP_ID);
            final List<SAMReadGroupRecord> readGroups = new ArrayList<SAMReadGroupRecord>();
            readGroups.add(readGroup);
            readGroup.setSample(SAMPLE);
            if (LIBRARY != null) {
                readGroup.setLibrary(LIBRARY);
            }
            setRGAttributeIfNotNull(readGroup, DESCRIPTION, "DS");
            setRGAttributeIfNotNull(readGroup, RUN, "PU");
            setRGAttributeIfNotNull(readGroup, PI, SAMReadGroupRecord.PREDICTED_MEDIAN_INSERT_SIZE_TAG);
            setRGAttributeIfNotNull(readGroup, CN, "CN");
            setRGAttributeIfNotNull(readGroup, RUN_DATE, SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG);
            setRGAttributeIfNotNull(readGroup, PL, "PL");
            header.setReadGroups(readGroups);
        }
    }

    /** Sets key=value on the read group, silently skipping null values. */
    private void setRGAttributeIfNotNull(final SAMReadGroupRecord readGroup, final Object value, final String key) {
        if (value == null) {
            return;
        }
        readGroup.setAttribute(key, value);
    }

    /**
     * Iterate through the Gerald output and write alignments. eland_extended.txt and export.txt are
     * iterated together using PasteParser. If paired end lane, then two PasteParsers are iterated in tandem,
     * so that mate info is available when a SAMRecord is created.
     * Honors MAX_ALIGNMENTS (counted as reads-or-pairs) and asserts both ends exhaust
     * together when no cap is set.
     */
    private void writeAlignments() {
        final GeraldParserFactory geraldParserFactory = new GeraldParserFactory(GERALD_DIR, LANE, SQUASHED_MAP);
        paired = geraldParserFactory.isPairedRun();
        // Unpaired runs pass null (no end number); paired runs parse end 1 here.
        final GeraldParser firstEndIterator = geraldParserFactory.makeParser(paired ? 1: null);
        GeraldParser secondEndIterator = null;
        if (paired) {
            secondEndIterator = geraldParserFactory.makeParser(2);
        }
        int numAlignmentsOrPairsWritten = 0;
        while (firstEndIterator.hasNext()) {
            final GeraldParser.GeraldAlignment firstEnd = firstEndIterator.next();
            GeraldParser.GeraldAlignment secondEnd = null;
            if (paired) {
                // The two per-end files must stay in lockstep; a short second file is corrupt input.
                hasNextAssert(secondEndIterator);
                secondEnd = secondEndIterator.next();
            }
            final SAMRecord firstEndAlignment = createSAMRecordFromGerald(firstEnd);
            SAMRecord secondEndAlignment = null;
            if (paired) {
                secondEndAlignment = createSAMRecordFromGerald(secondEnd);
                // Each record carries its mate's position/strand, not its own.
                setMateInfo(secondEndAlignment, firstEnd);
                setMateInfo(firstEndAlignment, secondEnd);
                secondEndAlignment.setSecondOfPairFlag(true);
                firstEndAlignment.setFirstOfPairFlag(true);
                final boolean properPair = SamPairUtil.isProperPair(firstEndAlignment, secondEndAlignment, JUMPING_LIBRARY);
                firstEndAlignment.setProperPairFlag(properPair);
                secondEndAlignment.setProperPairFlag(properPair);
                // By convention the two ends carry insert sizes of opposite sign.
                int insertSize = SamPairUtil.computeInsertSize(firstEndAlignment, secondEndAlignment);
                firstEndAlignment.setInferredInsertSize(insertSize);
                secondEndAlignment.setInferredInsertSize(-insertSize);
            }

            writer.addAlignment(firstEndAlignment);
            if (secondEndAlignment != null) {
                writer.addAlignment(secondEndAlignment);
            }
            ++numAlignmentsOrPairsWritten;
            if (MAX_ALIGNMENTS != null && numAlignmentsOrPairsWritten >= MAX_ALIGNMENTS) {
                break;
            }
            if (numAlignmentsOrPairsWritten % 500000 == 0) {
                log.info("Loaded " + numAlignmentsOrPairsWritten + " reads");
            }
        }
        // Only sanity-check exhaustion when we did not stop early via MAX_ALIGNMENTS.
        if (MAX_ALIGNMENTS == null) {
            noMoreAssert(firstEndIterator);
            if (paired) {
                noMoreAssert(secondEndIterator);
            }
        }
        log.info("Done loading " + numAlignmentsOrPairsWritten + " reads");
    }

    /**
     * Write into the samRecord the mate info from the mate gerald alignment.
     * A mate whose chromosome is null is recorded as unmapped.
     */
    private void setMateInfo(final SAMRecord samRecord, final GeraldParser.GeraldAlignment mateGeraldAlignment) {
        final boolean isMapped = mateGeraldAlignment.getPrimaryChrom() != null;
        if (isMapped) {
            samRecord.setMateReferenceName(mateGeraldAlignment.getPrimaryChrom());
            samRecord.setMateAlignmentStart((int)mateGeraldAlignment.getPrimaryStart());
            samRecord.setMateNegativeStrandFlag(isNegativeStrand(mateGeraldAlignment));
        } else {
            samRecord.setMateReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
            samRecord.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
            samRecord.setMateUnmappedFlag(true);
        }
    }

    /**
     * @return true for Gerald orientation "R" (reverse), false for "F" (forward)
     * @throws RuntimeException for any other orientation value
     */
    private boolean isNegativeStrand(final GeraldParser.GeraldAlignment alignment) {
        final String orientation = alignment.getOrientation();
        if (orientation.equals("F")) {
            return false;
        } else if (orientation.equals("R")) {
            return true;
        } else {
            throw new RuntimeException("Strange orientation in eland_extended file");
        }
    }

    /**
     * Converts one Gerald alignment to a SAMRecord: name (with /1 or /2 suffix
     * stripped), flags, bases/qualities (reverse-complemented for minus-strand
     * reads), position, trivial CIGAR, and RG/PG attributes. Mate fields and
     * pair flags are set separately by the caller.
     */
    private SAMRecord createSAMRecordFromGerald(final GeraldParser.GeraldAlignment alignment) {
        final SAMRecord samRecord = new SAMRecord();
        // Consider an alignment with a negative start (i.e. that hangs off the beginning of the contig)
        // to be unmapped.
        final boolean isMapped = alignment.getPrimaryChrom() != null && alignment.getPrimaryStart() >= 0;

        // Strip trailing end-of-pair suffix so both mates share one read name.
        String readName = alignment.getReadName();
        if (readName.endsWith("/1") || readName.endsWith("/2")) {
            readName = readName.substring(0, readName.length() - 2);
        }
        samRecord.setReadName(readName);

        // Set all the flags
        samRecord.setReadPairedFlag(paired);
        // (sic: "Umapped" matches the SAM library method name of this vintage)
        samRecord.setReadUmappedFlag(!isMapped);
        if (isMapped) {
            samRecord.setReadNegativeStrandFlag(isNegativeStrand(alignment));
        }
        // For now we are only taking the primary alignment
        samRecord.setNotPrimaryAlignmentFlag(false);
        // Bases are stored in forward-strand orientation in SAM.
        String readBases = alignment.getReadBases();
        if (samRecord.getReadNegativeStrandFlag()) {
            readBases = SequenceUtil.reverseComplement(readBases);
        }
        samRecord.setReadString(readBases);
        final byte[] phredQualities = alignment.getPhredQualities();
        if (isMapped && samRecord.getReadNegativeStrandFlag()) {
            ArrayUtil.reverseArray(phredQualities);
        }
        samRecord.setBaseQualities(phredQualities);
        if (isMapped) {
            /*
            if ("23".equals(geraldReferenceName)) {
                geraldReferenceName = "X";
            } else if ("24".equals(geraldReferenceName)) {
                geraldReferenceName = "Y";
            }
            return REFERENCE_PREFIX + geraldReferenceName;
            */
            samRecord.setReferenceName(alignment.getPrimaryChrom());
            samRecord.setAlignmentStart((int)alignment.getPrimaryStart());
            samRecord.setMappingQuality(SAMRecord.UNKNOWN_MAPPING_QUALITY);
            // CIGAR is trivial because there are no indels or clipping in Gerald
            final String cigar = Integer.toString(alignment.getReadLength()) + "M";
            samRecord.setCigarString(cigar);
            // We've decided not to bother with this, and just load the reference
            // if we want to determine mismatches.
            // samRecord.setAttribute("MD", alignment.getMismatchString());
        } else {
            samRecord.setReferenceName(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
            samRecord.setAlignmentStart(SAMRecord.NO_ALIGNMENT_START);
            samRecord.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY);
            samRecord.setCigarString(SAMRecord.NO_ALIGNMENT_CIGAR);
        }

        if (SAMPLE != null) {
            // There is a read group (id = READ_GROUP_ID)
            samRecord.setAttribute("RG", READ_GROUP_ID);
        }

        samRecord.setAttribute("PG", PROGRAM_RECORD_ID);
        return samRecord;
    }

    /** Fails fast if the iterator is exhausted when a record is still expected. */
    private void hasNextAssert(final Iterator iterator) {
        if (!iterator.hasNext()) {
            throw new RuntimeException("gerald output file ends unexpectedly.");

        }
    }

    /** Fails fast if the iterator still has records when it should be exhausted. */
    private void noMoreAssert(final Iterator iterator) {
        if (iterator.hasNext()) {
            throw new RuntimeException("gerald output file has more lines than expected.");
        }
    }

}
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
import edu.mit.broad.sam.util.CoordMath;
|
||||
|
||||
class SimpleMapping implements Comparable<SimpleMapping> {
|
||||
String arachneIndex;
|
||||
long startPos;
|
||||
long endPos;
|
||||
String sequenceName;
|
||||
|
||||
public SimpleMapping(final String arachneIndex, final long startPos, final long endPos, final String sequenceName) {
|
||||
this.arachneIndex = arachneIndex;
|
||||
this.startPos = startPos;
|
||||
this.endPos = endPos;
|
||||
this.sequenceName = sequenceName;
|
||||
|
||||
if (this.endPos < this.startPos) throw new IllegalArgumentException("startPos must be less than endPos!");
|
||||
}
|
||||
|
||||
public String getArachneIndex() {
|
||||
return arachneIndex;
|
||||
}
|
||||
|
||||
public void setArachneIndex(final String arachneIndex) {
|
||||
this.arachneIndex = arachneIndex;
|
||||
}
|
||||
|
||||
public long getStartPos() {
|
||||
return startPos;
|
||||
}
|
||||
|
||||
public void setStartPos(final long startPos) {
|
||||
this.startPos = startPos;
|
||||
}
|
||||
|
||||
public long getEndPos() {
|
||||
return endPos;
|
||||
}
|
||||
|
||||
public void setEndPos(final long endPos) {
|
||||
this.endPos = endPos;
|
||||
}
|
||||
|
||||
public String getSequenceName() {
|
||||
return sequenceName;
|
||||
}
|
||||
|
||||
public void setSequenceName(final String sequenceName) {
|
||||
this.sequenceName = sequenceName;
|
||||
}
|
||||
|
||||
public SimpleMapping intersection(final SimpleMapping other) {
|
||||
if (this.intersects(other)) {
|
||||
return new SimpleMapping(this.getArachneIndex(),
|
||||
(this.getStartPos() >= other.getStartPos())?this.getStartPos():other.getStartPos(),
|
||||
(this.getEndPos() <= other.getEndPos())?this.getEndPos():other.getEndPos(), this.getSequenceName());
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
public boolean intersects(final SimpleMapping other) {
|
||||
return (this.getArachneIndex().equals(other.getArachneIndex()) &&
|
||||
CoordMath.overlaps(this.getStartPos(), this.getEndPos(), other.getStartPos(), other.getEndPos()));
|
||||
}
|
||||
|
||||
public long length() {
|
||||
return CoordMath.getLength(startPos, endPos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sort based on sequence.compareTo, then start pos, then end pos
|
||||
* with null objects coming lexically last
|
||||
*/
|
||||
public int compareTo(final SimpleMapping that) {
|
||||
if (that == null) return -1; // nulls last
|
||||
|
||||
int result = this.getArachneIndex().compareTo(that.getArachneIndex());
|
||||
if (result == 0) {
|
||||
if (this.getStartPos() == that.getStartPos()) {
|
||||
result = ((int) (this.getEndPos() - that.getEndPos()));
|
||||
} else {
|
||||
result = ((int) (this.getStartPos() - that.getStartPos()));
|
||||
}
|
||||
}
|
||||
|
||||
// normalize to -1, 0, 1
|
||||
if (result > 1) result = 1;
|
||||
else if (result < -1) result = -1;
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean equals(final SimpleMapping that) {
|
||||
return (this.compareTo(that) == 0);
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int result;
|
||||
result = arachneIndex.hashCode();
|
||||
result = 31 * result + (int) (startPos ^ (startPos >>> 32));
|
||||
result = 31 * result + (int) (endPos ^ (endPos >>> 32));
|
||||
return result;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getArachneIndex() + ":" + getStartPos() + "-" + getEndPos();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
/**
|
||||
* Optimized method for converting Solexa ASCII qualities into Phred scores.
|
||||
* Pre-computes all values in order to eliminate repeated computation.
|
||||
*/
|
||||
/**
 * Optimized method for converting Solexa ASCII qualities into Phred scores.
 * Pre-computes all 256 values at construction in order to eliminate repeated
 * computation.
 */
public class SolexaQualityConverter {

    /**
     * This value is added to a Solexa quality score to make it printable ASCII.
     * (Was a mutable non-final static; made final since it is a true constant.)
     */
    private static final int SOLEXA_ADDEND = 64;

    /**
     * Mapping from ASCII value in Gerald export file to phred score.
     * Entries below SOLEXA_ADDEND are 0 because they are not valid Solexa qualities.
     */
    private final byte[] phredScore = new byte[256];

    /** Builds the full 256-entry lookup table once. */
    public SolexaQualityConverter() {
        for (int i = 0; i < SOLEXA_ADDEND; ++i) {
            phredScore[i] = 0;
        }
        for (int i = SOLEXA_ADDEND; i < phredScore.length; ++i) {
            phredScore[i] = decodeSolexaQualityToPhred(i);
        }
    }


    /**
     * Converts a solexa character quality into a phred numeric quality:
     * phred = round(10 * log10(1 + 10^((solexa - 64) / 10))).
     */
    private byte decodeSolexaQualityToPhred(final int solexaQuality) {
        return (byte) Math.round(10d * Math.log10(1d+Math.pow(10d, (solexaQuality - SOLEXA_ADDEND)/10d)));
    }

    /**
     * Convert a solexa quality ASCII character into a phred score via table lookup.
     * Valid inputs are >= 64 (so non-negative as a byte); lower values yield 0.
     */
    public byte solexaToPhred(final byte solexaQuality) {
        return phredScore[solexaQuality];
    }

    /**
     * @return a byte array that can be indexed by Solexa ASCII quality, with value
     * of corresponding Phred score. Elements 0-63 are invalid because Solexa qualities
     * should all be >= 64. Do not modify this array -- it is the converter's live table!
     */
    public byte[] getSolexaToPhredConversionTable() {
        return phredScore;
    }
}
|
||||
|
|
@ -1,75 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.illumina;
|
||||
|
||||
import edu.mit.broad.sam.util.CoordMath;
|
||||
import edu.mit.broad.picard.cmdline.CommandLineUtils;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
import java.io.File;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
|
||||
public class SquashedCoordinateMap {
|
||||
private final Map<SimpleMapping, String> geraldToArachne = new HashMap<SimpleMapping, String>();
|
||||
private long genomeSize;
|
||||
|
||||
public SquashedCoordinateMap(final File squashedMapFile) {
|
||||
try {
|
||||
final BufferedReader in = CommandLineUtils.getReader(squashedMapFile);
|
||||
String line;
|
||||
genomeSize = 0;
|
||||
|
||||
while ((line = in.readLine()) != null) {
|
||||
final String[] fields = CommandLineUtils.SPACE_SPLITTER.split(line);
|
||||
final String arachneIndex = fields[0].trim().intern();
|
||||
final String squashedRefIndex = fields[1].trim().intern();
|
||||
final long squashedStart = Long.parseLong(fields[2]);
|
||||
final long length = Long.parseLong(fields[3]);
|
||||
final String sequenceName = fields[4];
|
||||
|
||||
final SimpleMapping mapping = new SimpleMapping(squashedRefIndex, squashedStart,
|
||||
CoordMath.getEnd(squashedStart, length), sequenceName);
|
||||
geraldToArachne.put(mapping, arachneIndex);
|
||||
|
||||
genomeSize += length;
|
||||
}
|
||||
|
||||
in.close();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/* Converts a read's mapping from Gerald's vretarded space to arachne index + coords. */
|
||||
public void convertToArachneCoords(final SimpleMapping read) {
|
||||
if (this.geraldToArachne == null || this.geraldToArachne.isEmpty()) {
|
||||
throw new IllegalStateException("Cannot invoke convertToArachneCoords before parseSquashedMapFile");
|
||||
}
|
||||
|
||||
for (final Map.Entry<SimpleMapping,String> entry : this.geraldToArachne.entrySet()) {
|
||||
final SimpleMapping chunk = entry.getKey();
|
||||
if (chunk.intersects(read)) {
|
||||
read.setArachneIndex(entry.getValue());
|
||||
read.setStartPos( read.getStartPos() - chunk.getStartPos() );
|
||||
read.setEndPos( read.getEndPos() - chunk.getStartPos() );
|
||||
read.setSequenceName(chunk.getSequenceName());
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
throw new RuntimeException("Could not convert read: " + read);
|
||||
}
|
||||
|
||||
long getGenomeSize() {
|
||||
return genomeSize;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
|
||||
package edu.mit.broad.picard.importer.genotype;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.File;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.sam.util.BinaryCodec;
|
||||
|
||||
/**
|
||||
*
|
||||
*
|
||||
* @author Doug Voet
|
||||
*/
|
||||
public class BedFileReader implements Closeable {
|
||||
private static final int LOWEST_2_BIT_MASK = 3; // binary 11
|
||||
private static final short BED_MAGIC_NUMBER = 7020;
|
||||
// private static final short BED_MAGIC_NUMBER = Short.parseShort("0110110000011011", 2);
|
||||
|
||||
public static final byte MODE_INDIVIDUAL_MAJOR = 0;
|
||||
public static final byte MODE_SNP_MAJOR = 1;
|
||||
|
||||
public static final byte GENOTYPE_AA = 0; // binary 00
|
||||
public static final byte GENOTYPE_NO_CALL = 1; // binary 01
|
||||
public static final byte GENOTYPE_AB = 2; // binary 10
|
||||
public static final byte GENOTYPE_BB = 3; // binary 11
|
||||
|
||||
private final byte mode;
|
||||
private final BinaryCodec codec;
|
||||
private byte currentBlock;
|
||||
private int genotypeCount = 0;
|
||||
|
||||
public BedFileReader(File bedFile) {
|
||||
this.codec = new BinaryCodec(bedFile, false);
|
||||
short fileMagicNumber = this.codec.readShort();
|
||||
if (fileMagicNumber != BED_MAGIC_NUMBER) {
|
||||
this.codec.close();
|
||||
throw new PicardException("Given file [" + bedFile.getAbsolutePath() +
|
||||
"] is not in bed file format... magic number does not match");
|
||||
}
|
||||
this.mode = codec.readByte();
|
||||
}
|
||||
|
||||
public byte getMode() {
|
||||
return mode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
this.codec.close();
|
||||
}
|
||||
|
||||
public byte nextGenotype() {
|
||||
// there are 4 genotypes per byte so get a new byte every 4 genotypes read
|
||||
if (this.genotypeCount++ % 4 == 0) {
|
||||
this.currentBlock = this.codec.readByte();
|
||||
}
|
||||
|
||||
// the 2 lowest order bits of currentBlock are the next genotype, pop them off
|
||||
byte genotype = (byte) (LOWEST_2_BIT_MASK & this.currentBlock);
|
||||
this.currentBlock >>>= 2;
|
||||
|
||||
return genotype;
|
||||
}
|
||||
|
||||
/**
|
||||
* Call this method when moving on to the next individual (in indiv-major mode) or next
|
||||
* snp (in snp-major mode).
|
||||
*/
|
||||
public void dropRemainingBlock() {
|
||||
this.genotypeCount = 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,371 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
|
||||
package edu.mit.broad.picard.importer.genotype;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
import edu.mit.broad.picard.genotype.GeliFileWriter;
|
||||
import edu.mit.broad.picard.genotype.GenotypeLikelihoods;
|
||||
import edu.mit.broad.picard.genotype.GenotypeLikelihoodsCodec;
|
||||
import edu.mit.broad.picard.genotype.GenotypeLikelihoods.GenotypeLikelihoodsComparator;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.util.BasicTextFileParser;
|
||||
import edu.mit.broad.picard.util.Log;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||
import edu.mit.broad.sam.SAMTextHeaderCodec;
|
||||
import edu.mit.broad.sam.util.AsciiLineReader;
|
||||
import edu.mit.broad.sam.util.SortingCollection;
|
||||
|
||||
/**
|
||||
* Converts a BED/BIM/FAM file trio to a number of GELI files (1 per individual).
|
||||
* BED files come in 2 formats, individual-major and snp-major. The former lists all SNPs for the
|
||||
* first individual then all SNPs for the second individual, etc. The latter list all individuals
|
||||
* for first SNP then all individuals for second SNP, etc. The order for snps is dictated by
|
||||
* the bim file and the order for individuals is dictated by the fam file.
|
||||
* <p>
|
||||
* See <a href="http://pngu.mgh.harvard.edu/~purcell/plink/binary.shtml">this page</a> for details
|
||||
* of the format.
|
||||
*
|
||||
* @author Doug Voet
|
||||
*/
|
||||
public class BedToGeli extends CommandLineProgram {
|
||||
static final float LIKELIHOOD = 500;
|
||||
private static final Log log = Log.getInstance(BedToGeli.class);
|
||||
|
||||
@Usage(programVersion="1.0")
|
||||
public final String USAGE = "";
|
||||
|
||||
@Option(doc="The bed file name.", mutex="BFILE")
|
||||
public File BED;
|
||||
|
||||
@Option(doc="The bim file name.", mutex="BFILE")
|
||||
public File BIM;
|
||||
|
||||
@Option(doc="The fam file name.", mutex="BFILE")
|
||||
public File FAM;
|
||||
|
||||
@Option(doc="The root file name of the bed, bim & fam files.", mutex={"BED", "BIM", "FAM"})
|
||||
public String BFILE;
|
||||
|
||||
@Option(doc="The directory to write the output GELI files", shortName="D")
|
||||
public File OUTPUT_DIR;
|
||||
|
||||
@Option(doc="Set to 'true' if the family name should be included in the output file names, default false",
|
||||
shortName="F",
|
||||
optional=true)
|
||||
public Boolean USE_FAMILY = Boolean.FALSE;
|
||||
|
||||
@Option(doc="Name of file containing sequence dictionary to embed in new GELI files",
|
||||
shortName="DICT")
|
||||
public File SEQUENCE_DICTIONARY;
|
||||
|
||||
private List<SNP> snpCache;
|
||||
private List<String> geliFileNames;
|
||||
private List<SAMSequenceRecord> sequenceDictionary;
|
||||
private Map<String, Byte> referenceIndexes;
|
||||
|
||||
public static void main(String[] argv) {
|
||||
System.exit(new BedToGeli().instanceMain(argv));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int doWork() {
|
||||
populateFileNames();
|
||||
IoUtil.assertFileIsReadable(this.BED);
|
||||
IoUtil.assertFileIsReadable(this.BIM);
|
||||
IoUtil.assertFileIsReadable(this.FAM);
|
||||
IoUtil.assertFileIsReadable(this.SEQUENCE_DICTIONARY);
|
||||
IoUtil.assertDirectoryIsWritable(this.OUTPUT_DIR);
|
||||
|
||||
populateSequenceDictionary();
|
||||
|
||||
BedFileReader bedReader = new BedFileReader(this.BED);
|
||||
if (bedReader.getMode() == BedFileReader.MODE_INDIVIDUAL_MAJOR) {
|
||||
log.debug("Detected BED file in individual-major mode");
|
||||
parseIndividualMajor(bedReader);
|
||||
} else {
|
||||
log.debug("Detected BED file in snp-major mode");
|
||||
parseSnpMajor(bedReader);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* loads the SEQUENCE_DICTIONARY file
|
||||
*/
|
||||
private void populateSequenceDictionary() {
|
||||
try {
|
||||
final SAMFileHeader header = new SAMTextHeaderCodec().decode(new AsciiLineReader(new FileInputStream(this.SEQUENCE_DICTIONARY)), null);
|
||||
this.sequenceDictionary = header.getSequences();
|
||||
|
||||
this.referenceIndexes = new HashMap<String, Byte>();
|
||||
for (byte i = 0; i < sequenceDictionary.size(); i++) {
|
||||
this.referenceIndexes.put(sequenceDictionary.get(i).getSequenceName().intern(), i);
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new PicardException("Unexpected exception", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void parseIndividualMajor(BedFileReader bedReader) {
|
||||
cacheSnps();
|
||||
BasicTextFileParser famReader = new BasicTextFileParser(true, this.FAM);
|
||||
for (String[] famFields : famReader) {
|
||||
GeliFileWriter geliWriter = getGeliFileWriter(getGeliFileName(famFields[0], famFields[1]), false);
|
||||
for (SNP snp : this.snpCache) {
|
||||
GenotypeLikelihoods genotypeLikelihoods = constructGenotypeLikelihoods(
|
||||
bedReader, snp);
|
||||
if (genotypeLikelihoods != null) {
|
||||
geliWriter.addGenotypeLikelihoods(genotypeLikelihoods);
|
||||
}
|
||||
}
|
||||
bedReader.dropRemainingBlock();
|
||||
geliWriter.close();
|
||||
}
|
||||
famReader.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return null if for a no-call or the snp has no position on the genome
|
||||
*/
|
||||
private char[] getNextGenotype(BedFileReader bedReader, SNP snp) {
|
||||
char[] genotype = new char[2];
|
||||
byte genotypeCode = bedReader.nextGenotype();
|
||||
if (snp == null) {
|
||||
// unplaced marker... we need to read the genotype off the reader so we don't lose
|
||||
// our place, but we cannot put the marker in the geli file.
|
||||
return null;
|
||||
}
|
||||
switch (genotypeCode) {
|
||||
case BedFileReader.GENOTYPE_AA:
|
||||
genotype[0] = (char) snp.getAllele1();
|
||||
genotype[1] = (char) snp.getAllele1();
|
||||
break;
|
||||
case BedFileReader.GENOTYPE_AB:
|
||||
genotype[0] = (char) snp.getAllele1();
|
||||
genotype[1] = (char) snp.getAllele2();
|
||||
break;
|
||||
case BedFileReader.GENOTYPE_BB:
|
||||
genotype[0] = (char) snp.getAllele2();
|
||||
genotype[1] = (char) snp.getAllele2();
|
||||
break;
|
||||
case BedFileReader.GENOTYPE_NO_CALL:
|
||||
// don't record a genotype likelihood for a no call
|
||||
return null;
|
||||
default:
|
||||
throw new PicardException("Unknown genotype code: " + Integer.toBinaryString(genotypeCode));
|
||||
}
|
||||
return genotype;
|
||||
}
|
||||
|
||||
private void cacheSnps() {
|
||||
BasicTextFileParser bimReader = null;
|
||||
try {
|
||||
bimReader = new BasicTextFileParser(true, this.BIM);
|
||||
this.snpCache = new LinkedList<SNP>();
|
||||
for (String[] bimFields : bimReader) {
|
||||
SNP snp = constructSnp(bimFields);
|
||||
snpCache.add(snp);
|
||||
}
|
||||
} finally {
|
||||
try {
|
||||
bimReader.close();
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private SNP constructSnp(String[] bimFields) {
|
||||
byte referenceIndex = getReferenceIndex(bimFields[0]);
|
||||
if (referenceIndex == -1) {
|
||||
return null;
|
||||
}
|
||||
SNP snp = new SNP(
|
||||
referenceIndex,
|
||||
Integer.parseInt(bimFields[3]),
|
||||
bimFields[4].toUpperCase().getBytes()[0],
|
||||
bimFields[5].toUpperCase().getBytes()[0]);
|
||||
return snp;
|
||||
}
|
||||
|
||||
/**
|
||||
* determines the index in the sequence dictionary for the given chromosome
|
||||
*/
|
||||
private byte getReferenceIndex(String chromosome) {
|
||||
final String referenceName;
|
||||
int chromosomeNumber;
|
||||
try {
|
||||
chromosomeNumber = Integer.parseInt(chromosome);
|
||||
} catch (NumberFormatException e) {
|
||||
chromosomeNumber = -1;
|
||||
}
|
||||
|
||||
if (chromosomeNumber >= 1 && chromosomeNumber <= 22) {
|
||||
referenceName = ("chr" + chromosome).intern();
|
||||
} else if (chromosomeNumber == 26 || chromosome.equalsIgnoreCase("MT")) {
|
||||
referenceName = "chrM";
|
||||
} else if (chromosomeNumber == 23 || chromosomeNumber == 25 ||
|
||||
chromosome.equalsIgnoreCase("XY") || chromosome.equalsIgnoreCase("X")) {
|
||||
referenceName = "chrX";
|
||||
} else if (chromosomeNumber == 24 || chromosome.equalsIgnoreCase("Y")) {
|
||||
referenceName = "chrY";
|
||||
} else {
|
||||
// unplaced marker
|
||||
return -1;
|
||||
}
|
||||
|
||||
Byte referenceIndex = this.referenceIndexes.get(referenceName);
|
||||
if (referenceIndex == null) {
|
||||
throw new PicardException("Reference sequence [" + referenceName + "] not found in sequence dictionary");
|
||||
}
|
||||
return referenceIndex;
|
||||
}
|
||||
|
||||
private void cacheGELIFileNames() {
|
||||
BasicTextFileParser famReader = null;
|
||||
try {
|
||||
famReader = new BasicTextFileParser(true, this.FAM);
|
||||
this.geliFileNames = new LinkedList<String>();
|
||||
for (String[] fields : famReader) {
|
||||
this.geliFileNames.add(getGeliFileName(fields[0], fields[1]));
|
||||
}
|
||||
} finally {
|
||||
try {
|
||||
famReader.close();
|
||||
} catch (Exception e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void parseSnpMajor(BedFileReader bedReader) {
|
||||
cacheGELIFileNames();
|
||||
BasicTextFileParser bimReader = new BasicTextFileParser(true, this.BIM);
|
||||
Map<String, SortingCollection<GenotypeLikelihoods>> likelihoodsByFile =
|
||||
new HashMap<String, SortingCollection<GenotypeLikelihoods>>(
|
||||
(int) Math.ceil(this.geliFileNames.size() * 1.34));
|
||||
|
||||
int maxRecordsInRam = calculateMaxRecordsInRam();
|
||||
for (String geliFileName : this.geliFileNames) {
|
||||
likelihoodsByFile.put(geliFileName, SortingCollection.newInstance(
|
||||
GenotypeLikelihoods.class,
|
||||
new GenotypeLikelihoodsCodec(),
|
||||
new GenotypeLikelihoodsComparator(),
|
||||
maxRecordsInRam));
|
||||
}
|
||||
|
||||
for (String[] bimFields : bimReader) {
|
||||
for (String fileName : this.geliFileNames) {
|
||||
SNP snp = constructSnp(bimFields);
|
||||
GenotypeLikelihoods genotypeLikelihoods = constructGenotypeLikelihoods(
|
||||
bedReader, snp);
|
||||
if (genotypeLikelihoods != null) {
|
||||
likelihoodsByFile.get(fileName).add(genotypeLikelihoods);
|
||||
}
|
||||
}
|
||||
bedReader.dropRemainingBlock();
|
||||
}
|
||||
bimReader.close();
|
||||
|
||||
writeGeliFiles(likelihoodsByFile);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return
|
||||
*/
|
||||
private int calculateMaxRecordsInRam() {
|
||||
Runtime.getRuntime().gc();
|
||||
double memoryToUse = Runtime.getRuntime().maxMemory() * .8; // use up to 80%
|
||||
int objectCountLimit = (int) (memoryToUse / GenotypeLikelihoods.OBJECT_SIZE_BYTES);
|
||||
return objectCountLimit / this.geliFileNames.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @param likelihoodsByFile
|
||||
*/
|
||||
private void writeGeliFiles(
|
||||
Map<String, SortingCollection<GenotypeLikelihoods>> likelihoodsByFile) {
|
||||
|
||||
for (Map.Entry<String, SortingCollection<GenotypeLikelihoods>> entry : likelihoodsByFile.entrySet()) {
|
||||
GeliFileWriter fileWriter = getGeliFileWriter(entry.getKey(), true);
|
||||
for (GenotypeLikelihoods likelihoods : entry.getValue()) {
|
||||
fileWriter.addGenotypeLikelihoods(likelihoods);
|
||||
}
|
||||
fileWriter.close();
|
||||
}
|
||||
}
|
||||
|
||||
private GeliFileWriter getGeliFileWriter(
|
||||
String fileName, boolean presorted) {
|
||||
File geliFile = new File(this.OUTPUT_DIR, fileName);
|
||||
GeliFileWriter fileWriter = new GeliFileWriter(geliFile, presorted);
|
||||
SAMFileHeader header = new SAMFileHeader();
|
||||
header.setAttribute(SAMFileHeader.VERSION_TAG, "1.0");
|
||||
header.setSequences(this.sequenceDictionary);
|
||||
fileWriter.setHeader(header);
|
||||
return fileWriter;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param bedReader
|
||||
* @param snp
|
||||
* @return
|
||||
*/
|
||||
private GenotypeLikelihoods constructGenotypeLikelihoods(
|
||||
BedFileReader bedReader, SNP snp) {
|
||||
char[] genotype = getNextGenotype(bedReader, snp);
|
||||
if (genotype == null) {
|
||||
// no call or unplaced marker
|
||||
return null;
|
||||
}
|
||||
|
||||
GenotypeLikelihoods genotypeLikelihoods = new GenotypeLikelihoods();
|
||||
genotypeLikelihoods.setLikelihood(
|
||||
GenotypeLikelihoods.getLikelihoodIndex(genotype),
|
||||
LIKELIHOOD);
|
||||
genotypeLikelihoods.setReferenceIndex(snp.getReferenceIndex());
|
||||
genotypeLikelihoods.setPosition(snp.getPosition());
|
||||
return genotypeLikelihoods;
|
||||
}
|
||||
|
||||
/**
|
||||
* populates bed/bim/fam if bfile option is used
|
||||
*/
|
||||
private void populateFileNames() {
|
||||
if (this.BFILE != null) {
|
||||
this.BED = new File(this.BFILE + ".bed");
|
||||
this.BIM = new File(this.BFILE + ".bim");
|
||||
this.FAM = new File(this.BFILE + ".fam");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the appropriate name taking into account this.USE_FAMILY
|
||||
*/
|
||||
private String getGeliFileName(String family, String individual) {
|
||||
StringBuilder fileName = new StringBuilder(individual).append(".geli");
|
||||
if (this.USE_FAMILY) {
|
||||
fileName.insert(0, "_").insert(0, family);
|
||||
}
|
||||
return fileName.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
|
||||
package edu.mit.broad.picard.importer.genotype;
|
||||
|
||||
/**
|
||||
* data class for storing snp info
|
||||
*
|
||||
* @author Doug Voet
|
||||
*/
|
||||
public class SNP {
|
||||
private final byte referenceIndex;
|
||||
private final int position;
|
||||
private final byte allele1;
|
||||
private final byte allele2;
|
||||
|
||||
public SNP(byte chromosome, int position, byte allele1, byte allele2) {
|
||||
this.referenceIndex = chromosome;
|
||||
this.position = position;
|
||||
this.allele1 = allele1;
|
||||
this.allele2 = allele2;
|
||||
}
|
||||
|
||||
public byte getReferenceIndex() { return referenceIndex; }
|
||||
public int getPosition() { return position; }
|
||||
public byte getAllele1() { return allele1; }
|
||||
public byte getAllele2() { return allele2; }
|
||||
}
|
||||
|
|
@ -1,183 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.io;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
|
||||
/**
|
||||
* A class for utility methods that wrap or aggregate functionality in Java IO.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class IoUtil {
|
||||
/**
|
||||
* Checks that a file is non-null, exists, is not a directory and is readable. If any
|
||||
* condition is false then a runtime exception is thrown.
|
||||
*
|
||||
* @param file the file to check for readability
|
||||
*/
|
||||
public static void assertFileIsReadable(File file) {
|
||||
if (file == null) {
|
||||
throw new IllegalArgumentException("Cannot check readability of null file.");
|
||||
} else if (!file.exists()) {
|
||||
throw new PicardException("Cannot read non-existent file: " + file.getAbsolutePath());
|
||||
}
|
||||
else if (file.isDirectory()) {
|
||||
throw new PicardException("Cannot read file because it is a directory: " + file.getAbsolutePath());
|
||||
}
|
||||
else if (!file.canRead()) {
|
||||
throw new PicardException("File exists but is not readable: " + file.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks that a file is non-null, and is either extent and writable, or non-existent but
|
||||
* that the parent directory exists and is writable. If any
|
||||
* condition is false then a runtime exception is thrown.
|
||||
*
|
||||
* @param file the file to check for writability
|
||||
*/
|
||||
public static void assertFileIsWritable(File file) {
|
||||
if (file == null) {
|
||||
throw new IllegalArgumentException("Cannot check readability of null file.");
|
||||
} else if (!file.exists()) {
|
||||
// If the file doesn't exist, check that it's parent directory does and is writable
|
||||
File parent = file.getAbsoluteFile().getParentFile();
|
||||
if (!parent.exists()) {
|
||||
throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " +
|
||||
"Neither file nor parent directory exist.");
|
||||
}
|
||||
else if (!parent.isDirectory()) {
|
||||
throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " +
|
||||
"File does not exist and parent is not a directory.");
|
||||
}
|
||||
else if (!parent.canWrite()) {
|
||||
throw new PicardException("Cannot write file: " + file.getAbsolutePath() + ". " +
|
||||
"File does not exist and parent directory is not writable..");
|
||||
}
|
||||
}
|
||||
else if (file.isDirectory()) {
|
||||
throw new PicardException("Cannot write file because it is a directory: " + file.getAbsolutePath());
|
||||
}
|
||||
else if (!file.canWrite()) {
|
||||
throw new PicardException("File exists but is not writable: " + file.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks that a directory is non-null, extent, writable and a directory
|
||||
* otherwise a runtime exception is thrown.
|
||||
*
|
||||
* @param dir the dir to check for writability
|
||||
*/
|
||||
public static void assertDirectoryIsWritable(File dir) {
|
||||
if (dir == null) {
|
||||
throw new IllegalArgumentException("Cannot check readability of null file.");
|
||||
}
|
||||
else if (!dir.exists()) {
|
||||
throw new PicardException("Directory does not exist: " + dir.getAbsolutePath());
|
||||
}
|
||||
else if (!dir.isDirectory()) {
|
||||
throw new PicardException("Cannot write to directory because it is not a directory: " + dir.getAbsolutePath());
|
||||
}
|
||||
else if (!dir.canWrite()) {
|
||||
throw new PicardException("Directory exists but is not writable: " + dir.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a file for reading, decompressing it if necessary
|
||||
*
|
||||
* @param file The file to open
|
||||
* @return the input stream to read from
|
||||
*/
|
||||
public static InputStream openFileForReading(File file) {
|
||||
|
||||
try {
|
||||
if (file.getName().endsWith(".gz") ||
|
||||
file.getName().endsWith(".bfq") ||
|
||||
file.getName().endsWith(".map")) {
|
||||
return new GZIPInputStream(new FileInputStream(file));
|
||||
}
|
||||
//TODO: Other compression formats
|
||||
else {
|
||||
return new FileInputStream(file);
|
||||
}
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("File not found: " + file.getName(), ioe);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a file for writing, overwriting the file if it already exists
|
||||
*
|
||||
* @param file the file to write to
|
||||
* @return the output stream to write to
|
||||
*/
|
||||
public static OutputStream openFileForWriting(File file) {
|
||||
return openFileForWriting(file, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Opens a file for writing
|
||||
*
|
||||
* @param file the file to write to
|
||||
* @param append whether to append to the file if it already exists (we overwrite it if false)
|
||||
* @return the output stream to write to
|
||||
*/
|
||||
public static OutputStream openFileForWriting(File file, boolean append) {
|
||||
|
||||
try {
|
||||
if (file.getName().endsWith(".gz") ||
|
||||
file.getName().endsWith(".bfq") ||
|
||||
file.getName().endsWith(".map")) {
|
||||
return new GZIPOutputStream(new FileOutputStream(file, append));
|
||||
}
|
||||
//TODO: Other compression formats
|
||||
else {
|
||||
return new FileOutputStream(file, append);
|
||||
}
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("Error opening file for writing: " + file.getName(), ioe);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method to copy the contents of input to output. The caller is responsible for
|
||||
* opening and closing both streams.
|
||||
*
|
||||
* @param input contents to be copied
|
||||
* @param output destination
|
||||
*/
|
||||
public static void copyStream(InputStream input, OutputStream output) {
|
||||
try {
|
||||
byte[] buffer = new byte[1024];
|
||||
int bytesRead = 0;
|
||||
while((bytesRead = input.read(buffer)) > 0) {
|
||||
output.write(buffer, 0, bytesRead);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new PicardException("Exception copying stream", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
|
||||
package edu.mit.broad.picard.metrics;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
public class AggregateMetricCollector<T extends MetricBase> implements MetricCollector<T> {
|
||||
private final MetricCollector<T>[] collectors;
|
||||
|
||||
public AggregateMetricCollector(MetricCollector<T>... collectors) {
|
||||
if (collectors.length == 0) {
|
||||
throw new IllegalArgumentException("Must supply at least one collector.");
|
||||
}
|
||||
this.collectors = collectors;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addRecord(SAMRecord record) {
|
||||
for (MetricCollector<T> collector : this.collectors) {
|
||||
collector.addRecord(record);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onComplete() {
|
||||
for (MetricCollector<T> collector : this.collectors) {
|
||||
collector.onComplete();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setMetrics(T metrics) {
|
||||
for (MetricCollector<T> collector : this.collectors) {
|
||||
collector.setMetrics(metrics);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public T getMetrics() {
|
||||
return this.collectors[0].getMetrics();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
package edu.mit.broad.picard.metrics;
|
||||
|
||||
/**
|
||||
* A header for a metrics file. A header simply consists of a type and some arbitrary
|
||||
* data, but must be able to turn itself into a String and parse it's data back out
|
||||
* of that String at a later date.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public interface Header {
|
||||
/** Converts the header to a String for persisting to a file. */
|
||||
public String toString();
|
||||
|
||||
/** Parses the data contained in the String version of the header. */
|
||||
public void parse(String in);
|
||||
|
||||
}
|
||||
|
|
@ -1,77 +0,0 @@
|
|||
package edu.mit.broad.picard.metrics;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.util.FormatUtil;
|
||||
|
||||
import java.lang.reflect.Field;
|
||||
|
||||
/**
|
||||
* A base class from which all Metric classes should inherit.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class MetricBase {
|
||||
/**
|
||||
* An equals method that checks equality by asserting that the classes are of the exact
|
||||
* same type and that all public fields are equal.
|
||||
*
|
||||
* @param o an instance to compare to
|
||||
* @return true if they are equal, false otherwise
|
||||
*/
|
||||
public boolean equals(Object o) {
|
||||
if (o == null) return false;
|
||||
if (o.getClass() != getClass()) return false;
|
||||
|
||||
// Loop through all the fields and check that they are either
|
||||
// null in both objects or equal in both objects
|
||||
for (Field f : getClass().getFields()) {
|
||||
try {
|
||||
Object lhs = f.get(this);
|
||||
Object rhs = f.get(o);
|
||||
|
||||
if (lhs == null) {
|
||||
if (rhs == null) {
|
||||
// keep going
|
||||
}
|
||||
else if (rhs != null) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (lhs.equals(rhs)) {
|
||||
// keep going
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (IllegalAccessException iae) {
|
||||
throw new PicardException("Could not read field " + f.getName() + " from a " + getClass().getSimpleName());
|
||||
}
|
||||
}
|
||||
|
||||
// If we got this far all the fields are equal
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Converts the metric class to a human readable string. */
|
||||
public String toString() {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
FormatUtil formatter = new FormatUtil();
|
||||
|
||||
for (Field f : getClass().getFields()) {
|
||||
try {
|
||||
buffer.append(f.getName());
|
||||
buffer.append("\t");
|
||||
buffer.append(formatter.format(f.get(this)));
|
||||
buffer.append("\n");
|
||||
}
|
||||
catch (IllegalAccessException iae) {
|
||||
throw new PicardException("Could not read field " + f.getName() + " from a " + getClass().getSimpleName());
|
||||
}
|
||||
}
|
||||
|
||||
return buffer.toString();
|
||||
}
|
||||
}
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
package edu.mit.broad.picard.metrics;
|
||||
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
/**
|
||||
* Interface for objects that collect metrics about SAMRecords.
|
||||
*/
|
||||
public interface MetricCollector<T extends MetricBase> {
|
||||
T getMetrics();
|
||||
|
||||
/** Called after collector is constructed to populate the metrics object. */
|
||||
void setMetrics(T metrics);
|
||||
|
||||
/**
|
||||
* Called when collection is complete. Implementations can do any calculations
|
||||
* that must wait until all records are visited at this time.
|
||||
*/
|
||||
void onComplete();
|
||||
|
||||
/**
|
||||
* Visitor method called to have the record considered by the collector.
|
||||
*/
|
||||
void addRecord(SAMRecord record);
|
||||
}
|
||||
|
|
@ -1,370 +0,0 @@
|
|||
package edu.mit.broad.picard.metrics;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.Writer;
|
||||
import java.lang.reflect.Field;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.util.FormatUtil;
|
||||
import edu.mit.broad.picard.util.Histogram;
|
||||
import edu.mit.broad.picard.util.StringUtil;
|
||||
|
||||
/**
|
||||
* Contains a set of metrics that can be written to a file and parsed back
|
||||
* again. The set of metrics is composed of zero or more instances of a class,
|
||||
* BEAN, that extends {@link MetricBase} (all instances must be of the same type)
|
||||
* and may optionally include a histogram of data.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class MetricsFile<BEAN extends MetricBase, HKEY extends Comparable> {
|
||||
public static final String MAJOR_HEADER_PREFIX = "## ";
|
||||
public static final String MINOR_HEADER_PREFIX = "# ";
|
||||
public static final String SEPARATOR = "\t";
|
||||
public static final String HISTO_HEADER = "## HISTOGRAM\t";
|
||||
public static final String METRIC_HEADER = "## METRICS CLASS\t";
|
||||
|
||||
private List<Header> headers = new ArrayList<Header>();
|
||||
private List<BEAN> metrics = new ArrayList<BEAN>();
|
||||
private Histogram<HKEY> histogram;
|
||||
|
||||
/** Adds a header to the collection of metrics. */
|
||||
public void addHeader(Header h) { this.headers.add(h); }
|
||||
|
||||
/** Returns the list of headers. */
|
||||
public List<Header> getHeaders() { return Collections.unmodifiableList(this.headers); }
|
||||
|
||||
/** Adds a bean to the collection of metrics. */
|
||||
public void addMetric(BEAN bean) { this.metrics.add(bean); }
|
||||
|
||||
/** Returns the list of headers. */
|
||||
public List<BEAN> getMetrics() { return Collections.unmodifiableList(this.metrics); }
|
||||
|
||||
/** Returns the histogram contained in the metrics file if any. */
|
||||
public Histogram<HKEY> getHistogram() { return histogram; }
|
||||
|
||||
/** Sets the histogram contained in the metrics file. */
|
||||
public void setHistogram(Histogram<HKEY> histogram) { this.histogram = histogram; }
|
||||
|
||||
/** Returns the list of headers with the specified type. */
|
||||
public List<Header> getHeaders(Class<? extends Header> type) {
|
||||
List<Header> tmp = new ArrayList<Header>();
|
||||
for (Header h : this.headers) {
|
||||
if (h.getClass().equals(type)) {
|
||||
tmp.add(h);
|
||||
}
|
||||
}
|
||||
|
||||
return tmp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes out the metrics file to the supplied file. The file is written out
|
||||
* headers first, metrics second and histogram third.
|
||||
*
|
||||
* @param f a File into which to write the metrics
|
||||
*/
|
||||
public void write(File f) {
|
||||
FileWriter w = null;
|
||||
try {
|
||||
w = new FileWriter(f);
|
||||
write(w);
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("Could not write metrics to file: " + f.getAbsolutePath(), ioe);
|
||||
}
|
||||
finally {
|
||||
if (w != null) {
|
||||
try {
|
||||
w.close();
|
||||
} catch (IOException e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes out the metrics file to the supplied writer. The file is written out
|
||||
* headers first, metrics second and histogram third.
|
||||
*
|
||||
* @param w a Writer into which to write the metrics
|
||||
*/
|
||||
public void write(Writer w) {
|
||||
try {
|
||||
FormatUtil formatter = new FormatUtil();
|
||||
BufferedWriter out = new BufferedWriter(w);
|
||||
printHeaders(out);
|
||||
out.newLine();
|
||||
|
||||
printBeanMetrics(out, formatter);
|
||||
out.newLine();
|
||||
|
||||
printHistogram(out, formatter);
|
||||
out.newLine();
|
||||
out.flush();
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("Could not write metrics file.", ioe);
|
||||
}
|
||||
}
|
||||
|
||||
/** Prints the headers into the provided PrintWriter. */
|
||||
private void printHeaders(BufferedWriter out) throws IOException {
|
||||
for (Header h : this.headers) {
|
||||
out.append(MAJOR_HEADER_PREFIX);
|
||||
out.append(h.getClass().getName());
|
||||
out.newLine();
|
||||
out.append(MINOR_HEADER_PREFIX);
|
||||
out.append(h.toString());
|
||||
out.newLine();
|
||||
}
|
||||
}
|
||||
|
||||
/** Prints each of the metrics entries into the provided PrintWriter. */
|
||||
private void printBeanMetrics(BufferedWriter out, FormatUtil formatter) throws IOException {
|
||||
if (this.metrics.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Write out a header row with the type of the metric class
|
||||
out.append(METRIC_HEADER + getBeanType().getName());
|
||||
out.newLine();
|
||||
|
||||
// Write out the column headers
|
||||
Field[] fields = getBeanType().getFields();
|
||||
final int fieldCount = fields.length;
|
||||
|
||||
for (int i=0; i<fieldCount; ++i) {
|
||||
out.append(fields[i].getName());
|
||||
if (i < fieldCount - 1) {
|
||||
out.append(MetricsFile.SEPARATOR);
|
||||
}
|
||||
else {
|
||||
out.newLine();
|
||||
}
|
||||
}
|
||||
|
||||
// Write out each of the data rows
|
||||
for (BEAN bean : this.metrics) {
|
||||
for (int i=0; i<fieldCount; ++i) {
|
||||
try {
|
||||
Object value = fields[i].get(bean);
|
||||
out.append(StringUtil.assertCharactersNotInString(formatter.format(value), '\t', '\n'));
|
||||
|
||||
if (i < fieldCount - 1) {
|
||||
out.append(MetricsFile.SEPARATOR);
|
||||
}
|
||||
else {
|
||||
out.newLine();
|
||||
}
|
||||
}
|
||||
catch (IllegalAccessException iae) {
|
||||
throw new PicardException("Could not read property " + fields[i].getName()
|
||||
+ " from class of type " + bean.getClass());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out.flush();
|
||||
}
|
||||
|
||||
/** Prints the histogram if one is present. */
|
||||
private void printHistogram(BufferedWriter out, FormatUtil formatter) throws IOException {
|
||||
if (this.histogram == null || this.histogram.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Add a header for the histogram key type
|
||||
out.append(HISTO_HEADER + this.histogram.keySet().iterator().next().getClass().getName());
|
||||
out.newLine();
|
||||
|
||||
if (this.histogram != null) {
|
||||
out.append(StringUtil.assertCharactersNotInString(this.histogram.getBinLabel(), '\t', '\n'));
|
||||
out.append(SEPARATOR);
|
||||
out.append(StringUtil.assertCharactersNotInString(this.histogram.getValueLabel(), '\t', '\n'));
|
||||
out.newLine();
|
||||
|
||||
for (Histogram<HKEY>.Bin bin : this.histogram.values()) {
|
||||
out.append(StringUtil.assertCharactersNotInString(formatter.format(bin.getId()), '\t', '\n'));
|
||||
out.append(MetricsFile.SEPARATOR);
|
||||
out.append(formatter.format(bin.getValue()));
|
||||
out.newLine();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Gets the type of the metrics bean being used. */
|
||||
private Class<?> getBeanType() {
|
||||
if (this.metrics == null || this.metrics.isEmpty()) {
|
||||
return null;
|
||||
} else {
|
||||
return this.metrics.get(0).getClass();
|
||||
}
|
||||
}
|
||||
|
||||
/** Reads the Metrics in from the given reader. */
|
||||
public void read(Reader r) {
|
||||
BufferedReader in = new BufferedReader(r);
|
||||
FormatUtil formatter = new FormatUtil();
|
||||
String line = null;
|
||||
|
||||
try {
|
||||
// First read the headers
|
||||
Header header = null;
|
||||
boolean inHeader = true;
|
||||
while ((line = in.readLine()) != null && inHeader) {
|
||||
line = line.trim();
|
||||
// A blank line signals the end of the headers, otherwise parse out
|
||||
// the header types and values and build the headers.
|
||||
if ("".equals(line)) {
|
||||
inHeader = false;
|
||||
}
|
||||
else if (line.startsWith(MAJOR_HEADER_PREFIX)) {
|
||||
if (header != null) {
|
||||
throw new IllegalStateException("Consecutive header class lines encountered.");
|
||||
}
|
||||
|
||||
String className = line.substring(MAJOR_HEADER_PREFIX.length()).trim();
|
||||
try {
|
||||
header = (Header) Class.forName(className).newInstance();
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new PicardException("Error load and/or instantiating an instance of " + className, e);
|
||||
}
|
||||
}
|
||||
else if (line.startsWith(MINOR_HEADER_PREFIX)) {
|
||||
if (header == null) {
|
||||
throw new IllegalStateException("Header class must precede header value:" + line);
|
||||
}
|
||||
header.parse(line.substring(MINOR_HEADER_PREFIX.length()));
|
||||
this.headers.add(header);
|
||||
header = null;
|
||||
}
|
||||
else {
|
||||
throw new PicardException("Illegal state. Found following string in metrics file header: " + line);
|
||||
}
|
||||
}
|
||||
|
||||
// Then read the metrics if there are any
|
||||
while (!line.startsWith(MAJOR_HEADER_PREFIX)) {
|
||||
line = in.readLine().trim();
|
||||
}
|
||||
if (line.startsWith(METRIC_HEADER)) {
|
||||
// Get the metric class from the header
|
||||
String className = line.split(SEPARATOR)[1];
|
||||
Class<?> type = null;
|
||||
try {
|
||||
type = Class.forName(className);
|
||||
}
|
||||
catch (ClassNotFoundException cnfe) {
|
||||
throw new PicardException("Could not locate class with name " + className, cnfe);
|
||||
}
|
||||
|
||||
// Read the next line with the column headers
|
||||
String[] fieldNames = in.readLine().split(SEPARATOR);
|
||||
Field[] fields = new Field[fieldNames.length];
|
||||
for (int i=0; i<fieldNames.length; ++i) {
|
||||
try {
|
||||
fields[i] = type.getField(fieldNames[i]);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new PicardException("Could not get field with name " + fieldNames[i] +
|
||||
" from class " + type.getName());
|
||||
}
|
||||
}
|
||||
|
||||
// Now read the values
|
||||
while ((line = in.readLine()) != null) {
|
||||
line = line.trim();
|
||||
if ("".equals(line)) {
|
||||
break;
|
||||
}
|
||||
else {
|
||||
String[] values = line.split(SEPARATOR);
|
||||
BEAN bean = null;
|
||||
|
||||
try { bean = (BEAN) type.newInstance(); }
|
||||
catch (Exception e) { throw new PicardException("Error instantiating a " + type.getName(), e); }
|
||||
|
||||
|
||||
for (int i=0; i<fields.length; ++i) {
|
||||
Object value = null;
|
||||
if (values[i] != null && values[i].length() > 0) {
|
||||
value = formatter.parseObject(values[i], fields[i].getType());
|
||||
}
|
||||
|
||||
try { fields[i].set(bean, value); }
|
||||
catch (Exception e) {
|
||||
throw new PicardException("Error setting field " + fields[i].getName() +
|
||||
" on class of type " + type.getName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
this.metrics.add(bean);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Then read the histogram if it is present
|
||||
while (line != null && !line.startsWith(MAJOR_HEADER_PREFIX)) {
|
||||
line = in.readLine();
|
||||
}
|
||||
if (line != null && line.startsWith(HISTO_HEADER)) {
|
||||
// Get the key type of the histogram
|
||||
String keyClassName = line.split(SEPARATOR)[1].trim();
|
||||
Class<?> keyClass = null;
|
||||
|
||||
try { keyClass = Class.forName(keyClassName); }
|
||||
catch (ClassNotFoundException cnfe) { throw new PicardException("Could not load class with name " + keyClassName); }
|
||||
|
||||
// Read the next line with the bin and value labels
|
||||
String[] labels = in.readLine().split(SEPARATOR);
|
||||
this.histogram = new Histogram(labels[0], labels[1]);
|
||||
|
||||
// Read the entries in the histogram
|
||||
while ((line = in.readLine()) != null && !"".equals(line)) {
|
||||
String[] fields = line.trim().split(SEPARATOR);
|
||||
HKEY key = (HKEY) formatter.parseObject(fields[0], keyClass);
|
||||
double value = formatter.parseDouble(fields[1]);
|
||||
this.histogram.increment(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("Could not read metrics from reader.", ioe);
|
||||
}
|
||||
}
|
||||
|
||||
/** Checks that the headers, metrics and histogram are all equal. */
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (getClass() != o.getClass()) {
|
||||
return false;
|
||||
}
|
||||
MetricsFile that = (MetricsFile) o;
|
||||
|
||||
if (!this.headers.equals(that.headers)) {
|
||||
return false;
|
||||
}
|
||||
if (!this.metrics.equals(that.metrics)) {
|
||||
return false;
|
||||
}
|
||||
if (this.histogram == null && that.histogram == null) {
|
||||
return true;
|
||||
} else if (this.histogram != null) {
|
||||
return this.histogram.equals(that.histogram);
|
||||
} else if (that.histogram != null) {
|
||||
return that.histogram.equals(this.histogram);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,43 +0,0 @@
|
|||
package edu.mit.broad.picard.metrics;
|
||||
|
||||
import edu.mit.broad.picard.util.StringUtil;
|
||||
|
||||
/**
|
||||
* A simple header who's data type is a single String. Should not be used for anything other
|
||||
* than comments or descriptive text.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class StringHeader implements Header {
|
||||
private String value;
|
||||
|
||||
/** Default constructor. */
|
||||
public StringHeader() {}
|
||||
|
||||
/** Constructor that uses the supplied value as the value of the header. */
|
||||
public StringHeader(String value) {
|
||||
setValue(value);
|
||||
}
|
||||
|
||||
public void parse(String in) { value = in.trim(); }
|
||||
public String toString() { return value; }
|
||||
|
||||
public String getValue() { return value; }
|
||||
public void setValue(String value) { this.value = StringUtil.assertCharactersNotInString(value, '\n'); }
|
||||
|
||||
/** Checks equality on the value of the header. */
|
||||
public boolean equals(Object o) {
|
||||
if (o != null && o instanceof StringHeader) {
|
||||
StringHeader that = (StringHeader) o;
|
||||
if (this.value == null) {
|
||||
return that.value == null;
|
||||
}
|
||||
else {
|
||||
return this.value.equals(that.value);
|
||||
}
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
package edu.mit.broad.picard.metrics;
|
||||
|
||||
import edu.mit.broad.picard.util.StringUtil;
|
||||
|
||||
/**
|
||||
* Header that stores information about the version of some piece of software or
|
||||
* data used to create the metrics file. Payload consists of a name or description
|
||||
* of the versioned item and a version string.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class VersionHeader implements Header {
|
||||
private String versionedItem;
|
||||
private String versionString;
|
||||
|
||||
public void parse(String in) {
|
||||
String[] fields = in.split("\t");
|
||||
this.versionedItem = fields[0];
|
||||
this.versionString = fields[1];
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return this.versionedItem + "\t" + this.versionString;
|
||||
}
|
||||
|
||||
public String getVersionedItem() { return versionedItem; }
|
||||
public void setVersionedItem(String versionedItem) {
|
||||
this.versionedItem = StringUtil.assertCharactersNotInString(versionedItem, '\t', '\n');
|
||||
}
|
||||
|
||||
public String getVersionString() { return versionString; }
|
||||
public void setVersionString(String versionString) {
|
||||
this.versionString = StringUtil.assertCharactersNotInString(versionString, '\t', '\n');
|
||||
}
|
||||
|
||||
/** Equals method that checks that both the item and version string are equal. */
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (o == null || getClass() != o.getClass()) return false;
|
||||
|
||||
VersionHeader that = (VersionHeader) o;
|
||||
|
||||
if (versionString != null ? !versionString.equals(that.versionString) : that.versionString != null)
|
||||
return false;
|
||||
if (versionedItem != null ? !versionedItem.equals(that.versionedItem) : that.versionedItem != null)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,148 +0,0 @@
|
|||
package edu.mit.broad.picard.quality;
|
||||
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
||||
import edu.mit.broad.picard.variation.DbSnpFileReader;
|
||||
import edu.mit.broad.picard.util.Log;
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.sam.SAMFileWriter;
|
||||
import edu.mit.broad.sam.SAMFileWriterFactory;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Command line program to calibrate quality scores using alignment and dbsnp data. Calibrates
|
||||
* qualities cycle by cycle and separately for reads one and two in a pair. Bases that fall
|
||||
* within dbSNP loci are ignored otherwise the empircal mismatch rate is calculated for
|
||||
* each quality at each cycle and used to calculate the calibrated quality value.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class CalibrateQualityScores extends CommandLineProgram {
|
||||
@Option(shortName="A", doc="A file of aligned reads in SAM or BAM format")
|
||||
public File ALIGNED_SAM;
|
||||
|
||||
@Option(shortName="I", doc="A SAM or BAM file to rewrite with calibrated qualities. If omitted ALIGNED_SAM is used.", optional=true)
|
||||
public File INPUT;
|
||||
|
||||
@Option(shortName="O", doc="The SAM or BAM file to write with updated qualities.")
|
||||
public File OUTPUT;
|
||||
|
||||
@Option(shortName="R", doc="Reference sequence file")
|
||||
public File REFERENCE;
|
||||
|
||||
@Option(shortName="SNP", doc="Binary file of dbSNP information", optional=true)
|
||||
public File DBSNP_FILE;
|
||||
|
||||
@Option(shortName="TABLE", doc="A file to output the calibration table(s) to.")
|
||||
public File CALIBRATION_TABLE_OUT;
|
||||
|
||||
@Option(doc="Optional limit to the number of aligned reads that should be procesed", optional=true)
|
||||
public Integer READ_LIMIT = -1;
|
||||
|
||||
/** Stock main method for a command line program. */
|
||||
public static void main(String[] argv) {
|
||||
System.exit(new CalibrateQualityScores().instanceMain(argv));
|
||||
}
|
||||
|
||||
/**
|
||||
* Main method for the program. Checks that all input files are present and
|
||||
* readable and that the output file can be written to. Then loads up all the
|
||||
* data and calibrates the quality scores and proceeds to write an output file
|
||||
* with calibrated quality scores instead of the input quality scores.
|
||||
*/
|
||||
protected int doWork() {
|
||||
final Log log = Log.getInstance(getClass());
|
||||
|
||||
// Some quick parameter checking
|
||||
if (INPUT == null) INPUT = ALIGNED_SAM;
|
||||
|
||||
IoUtil.assertFileIsReadable(ALIGNED_SAM);
|
||||
IoUtil.assertFileIsReadable(REFERENCE);
|
||||
IoUtil.assertFileIsReadable(INPUT);
|
||||
IoUtil.assertFileIsWritable(OUTPUT);
|
||||
IoUtil.assertFileIsWritable(CALIBRATION_TABLE_OUT);
|
||||
|
||||
log.info("Reading input files and calculating calibration matrices.");
|
||||
|
||||
// Load things up and calculate the quality score calibrations
|
||||
SAMFileReader sam = new SAMFileReader(ALIGNED_SAM);
|
||||
ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE);
|
||||
DbSnpFileReader dbsnp = null;
|
||||
|
||||
if (DBSNP_FILE != null) {
|
||||
IoUtil.assertFileIsReadable(DBSNP_FILE);
|
||||
dbsnp = new DbSnpFileReader(DBSNP_FILE);
|
||||
}
|
||||
|
||||
QualityScoreCalibrator calibrator = new QualityScoreCalibrator(sam, ref, dbsnp);
|
||||
calibrator.calibrate(READ_LIMIT);
|
||||
|
||||
// Dump the calibration tables
|
||||
log.info("Writing out calibration table.");
|
||||
PrintStream stream = new PrintStream(IoUtil.openFileForWriting(CALIBRATION_TABLE_OUT));
|
||||
stream.println("Read 1 Calibration Table:");
|
||||
print(stream, calibrator.getRead1Matrix().getCalibratedQualities());
|
||||
|
||||
if (!calibrator.getRead2Matrix().isEmpty()) {
|
||||
stream.println();
|
||||
stream.println("Read 2 Calibration Table:");
|
||||
print(stream, calibrator.getRead2Matrix().getCalibratedQualities());
|
||||
}
|
||||
|
||||
// And then load up the input and rewrite with calibrated qualities
|
||||
log.info("Writing file with calibrated qualities.");
|
||||
SAMFileReader in = new SAMFileReader(INPUT);
|
||||
SAMFileWriter out = new SAMFileWriterFactory().makeSAMOrBAMWriter(in.getFileHeader(), true, OUTPUT);
|
||||
|
||||
for (SAMRecord rec : in) {
|
||||
byte[] quals = rec.getBaseQualities();
|
||||
byte[] calibrated = new byte[quals.length];
|
||||
QualityScoreMatrix matrix = rec.getFirstOfPairFlag() ? calibrator.getRead1Matrix() : calibrator.getRead2Matrix();
|
||||
|
||||
for (int i=0; i<quals.length; ++i) {
|
||||
calibrated[i] = (byte) matrix.getCalibratedQuality(i+1, quals[i]);
|
||||
}
|
||||
|
||||
rec.setBaseQualities(calibrated);
|
||||
out.addAlignment(rec);
|
||||
}
|
||||
|
||||
out.close();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/** Static helper method to dump a calibration matrix to the screen for debugging. */
|
||||
private void print(PrintStream out, int[][] matrix) {
|
||||
int maxY = 0;
|
||||
for (int x=0; x<matrix.length; ++x) {
|
||||
if (matrix[x] != null) {
|
||||
maxY = Math.max(maxY, matrix[x].length - 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Print out the header row
|
||||
for (int i=0;i<=maxY; ++i) {
|
||||
out.print(i + "\t");
|
||||
}
|
||||
out.println();
|
||||
|
||||
// Now print out the data cycle by cycle
|
||||
for (int cycle=1; cycle<matrix.length; ++cycle) {
|
||||
out.print(cycle + "\t");
|
||||
|
||||
int[] quals = matrix[cycle];
|
||||
|
||||
for (int qual=1; qual<quals.length; ++qual) {
|
||||
out.print(quals[qual] + "\t");
|
||||
}
|
||||
out.println();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,155 +0,0 @@
|
|||
package edu.mit.broad.picard.quality;
|
||||
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.AlignmentBlock;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
import edu.mit.broad.picard.variation.DbSnpFileReader;
|
||||
import edu.mit.broad.picard.variation.KnownVariant;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.util.CoordMath;
|
||||
import edu.mit.broad.picard.util.Histogram;
|
||||
import edu.mit.broad.picard.util.SequenceUtil;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.BitSet;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* Takes a set of aligned reads with qualities and determines the empirical quality
|
||||
* score for each of the bins.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class QualityScoreCalibrator {
|
||||
private final SAMFileReader sam;
|
||||
private final ReferenceSequenceFile ref;
|
||||
private final DbSnpFileReader dbsnp;
|
||||
|
||||
private QualityScoreMatrix read1Matrix;
|
||||
private QualityScoreMatrix read2Matrix;
|
||||
|
||||
/**
|
||||
* Constructs a calibrator that will read records from the specified SAMFileReader
|
||||
* and compare them the supplied reference. Optionally takes a set of known variants
|
||||
* who's positions will be excluded during calibration.
|
||||
*
|
||||
* @param sam the set of SAM records to use to calibrate qualities
|
||||
* @param ref the reference sequence against which the records were aligned
|
||||
* @param dbsnp the (optional) set of dbsnp positions to mask during calibration
|
||||
*/
|
||||
public QualityScoreCalibrator(SAMFileReader sam, ReferenceSequenceFile ref, DbSnpFileReader dbsnp) {
|
||||
this.sam = sam;
|
||||
this.dbsnp = dbsnp;
|
||||
this.ref = ref;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates calibrated quality scores using at most the specified number of aligned
|
||||
* reads. If the end of the file is hit first then fewer reads will be used.
|
||||
*
|
||||
* @param readLimit the number of aligned reads to use if the file contains more
|
||||
*/
|
||||
public void calibrate(final int readLimit) {
|
||||
ReferenceSequence reference = null;
|
||||
SAMFileHeader header = this.sam.getFileHeader();
|
||||
CloseableIterator<SAMRecord> samIterator = this.sam.iterator();
|
||||
SAMRecord read = samIterator.next();
|
||||
int readsProcessed = 0;
|
||||
|
||||
// Quality score matrixes for reads 1 and 2 separately
|
||||
this.read1Matrix = new QualityScoreMatrix();
|
||||
this.read2Matrix = new QualityScoreMatrix();
|
||||
|
||||
|
||||
refloop: while ((reference = this.ref.nextSequence()) != null) {
|
||||
final byte[] refBases = reference.getBases();
|
||||
final BitSet snps = getDbSnpMask(reference);
|
||||
|
||||
while (read != null && read.getReferenceIndex(header) == reference.getContigIndex()) {
|
||||
if (!read.getReadUnmappedFlag() && !read.getNotPrimaryAlignmentFlag()) {
|
||||
final QualityScoreMatrix matrix = read.getFirstOfPairFlag() ? this.read1Matrix : this.read2Matrix;
|
||||
final byte[] readBases = read.getReadBases();
|
||||
final byte[] qualities = read.getBaseQualities();
|
||||
|
||||
for (AlignmentBlock block : read.getAlignmentBlocks()) {
|
||||
final int readIndex = block.getReadStart() - 1;
|
||||
final int refIndex = block.getReferenceStart() - 1;
|
||||
final int length = block.getLength();
|
||||
|
||||
for (int i=0; i<length; ++i) {
|
||||
// Skip dbSNP loci
|
||||
if (snps.get(refIndex+i+1)) continue;
|
||||
|
||||
final int readBaseIndex = readIndex+i;
|
||||
boolean match = SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex+i]);
|
||||
int cycle = CoordMath.getCycle(
|
||||
read.getReadNegativeStrandFlag(), readBases.length, readBaseIndex);
|
||||
matrix.addObservation(cycle, qualities[readBaseIndex], !match);
|
||||
}
|
||||
}
|
||||
|
||||
if (readLimit > 0 && ++readsProcessed >= readLimit) {
|
||||
break refloop;
|
||||
}
|
||||
}
|
||||
|
||||
// Advance the sam iterator
|
||||
if (samIterator.hasNext()) {
|
||||
read = samIterator.next();
|
||||
}
|
||||
else {
|
||||
read = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.read1Matrix.computeCalibratedQualities();
|
||||
if (!this.read2Matrix.isEmpty()) this.read2Matrix.computeCalibratedQualities();
|
||||
}
|
||||
|
||||
/** Gets the calibration matrix for the first read. */
|
||||
public QualityScoreMatrix getRead1Matrix() { return read1Matrix; }
|
||||
|
||||
/** Gets the calibration matrix for the second read. May be empty if there was no second read data. */
|
||||
public QualityScoreMatrix getRead2Matrix() { return read2Matrix; }
|
||||
|
||||
/**
|
||||
* Returns a BitSet that denotes whether a dbSNP entry is present at each
|
||||
* base in the reference sequence. The set is reference.length() + 1 so that
|
||||
* it can be indexed by 1-based reference base. True means dbSNP present,
|
||||
* false means no dbSNP present.
|
||||
*/
|
||||
private BitSet getDbSnpMask(ReferenceSequence reference) {
|
||||
int index = reference.getContigIndex();
|
||||
BitSet bits = new BitSet(reference.length() + 1);
|
||||
|
||||
/* Just return an all false bit set if we don't have dbsnp data. */
|
||||
if (this.dbsnp == null) {
|
||||
return bits;
|
||||
}
|
||||
|
||||
/* Read off the next contig's worth of data. */
|
||||
while (this.dbsnp.hasNext()) {
|
||||
KnownVariant variant = this.dbsnp.peek();
|
||||
|
||||
if (variant.getSequenceIndex() < index) {
|
||||
this.dbsnp.next();
|
||||
}
|
||||
else if (variant.getSequenceIndex() == index) {
|
||||
variant = this.dbsnp.next();
|
||||
|
||||
for (int i=variant.getStartPos(); i<=variant.getEndPos(); ++i) {
|
||||
bits.set(i, true);
|
||||
}
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return bits;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,133 +0,0 @@
|
|||
package edu.mit.broad.picard.quality;
|
||||
|
||||
import edu.mit.broad.picard.util.Histogram;
|
||||
|
||||
import java.util.TreeMap;
|
||||
import java.util.Map;
|
||||
import java.util.SortedMap;
|
||||
|
||||
/**
|
||||
* <p>Holds all the information necessary to perform quality score calibration for a single
|
||||
* end/read for a lane or run of sequencing. General usage is to construct an instance
|
||||
* an call {@link #addObservation(int, int, boolean)} repeatedly and when all input data
|
||||
* is consumed call {@link #computeCalibratedQualities()}.</p>
|
||||
*
|
||||
* <p>Once this is done then {@link #getCalibratedQualities()} can be called to get a matrix
|
||||
* of quality score calibrations by cycle and input quality. However it is preferred to call
|
||||
* {@link #getCalibratedQuality(int, int)} which will attempt to infer the correct value in the
|
||||
* case that the input quality was not observed in the training data.</p>
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class QualityScoreMatrix {
|
||||
// Maps by cycle, histograms by quality
|
||||
private SortedMap<Integer, Histogram<Integer>> observations = new TreeMap<Integer, Histogram<Integer>>();
|
||||
private SortedMap<Integer, Histogram<Integer>> errors = new TreeMap<Integer, Histogram<Integer>>();
|
||||
|
||||
private int[][] calibratedQualities = null;
|
||||
|
||||
/**
|
||||
* Adds an observation to the matrix.
|
||||
* @param cycle the cycle in the read (1-based)
|
||||
* @param quality the uncalibrated quality
|
||||
* @param error true if the base did not match the reference, false otherwise
|
||||
*/
|
||||
public void addObservation(int cycle, int quality, boolean error) {
|
||||
Histogram<Integer> obs = this.observations.get(cycle);
|
||||
if (obs == null) {
|
||||
obs = new Histogram<Integer>();
|
||||
this.observations.put(cycle, obs);
|
||||
}
|
||||
obs.increment(quality);
|
||||
|
||||
if (error) {
|
||||
Histogram<Integer> errs = this.errors.get(cycle);
|
||||
if (errs == null) {
|
||||
errs = new Histogram<Integer>();
|
||||
this.errors.put(cycle, errs);
|
||||
}
|
||||
errs.increment(quality);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes the input observations so far and builds a matrix of input cycle and
|
||||
* uncalibrated quality to calibrated quality value.
|
||||
*/
|
||||
public void computeCalibratedQualities() {
|
||||
this.calibratedQualities = new int[this.observations.lastKey() + 1][];
|
||||
|
||||
for (int cycle=1; cycle<this.calibratedQualities.length; ++cycle) {
|
||||
Histogram<Integer> obs = this.observations.get(cycle);
|
||||
Histogram<Integer> err = this.errors.get(cycle);
|
||||
|
||||
this.calibratedQualities[cycle] = new int[obs.lastKey() + 1];
|
||||
|
||||
for (Integer qual : obs.keySet()) {
|
||||
double o = obs.get(qual).getValue();
|
||||
Histogram<Integer>.Bin errBin = err.get(qual);
|
||||
double e = (errBin == null) ? 1 : errBin.getValue();
|
||||
|
||||
this.calibratedQualities[cycle][qual] = computePhredScore(e, o);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the set of calibrated quality scores from the training data. The array is
|
||||
* indexed first by the cycle (1-based, index 0 is empty) and then by input quality
|
||||
* (again, the actualy quality, not shifted).
|
||||
*
|
||||
* @return an array of calibrated qualities for the read
|
||||
*/
|
||||
public int[][] getCalibratedQualities() {
|
||||
return calibratedQualities;
|
||||
}
|
||||
|
||||
/**
|
||||
* Accesses the calibrated quality for the given input cycle and quality. If the quality
|
||||
* is outside the range given in the training data then the upper or lower bound of
|
||||
* the calibrated qualities is used instead.
|
||||
*
|
||||
* @param cycle the input cycle (1-based)
|
||||
* @param quality the uncalibrated quality
|
||||
* @return the calibrated quality for the cycle and uncalibrated quality
|
||||
*/
|
||||
public final int getCalibratedQuality(int cycle, int quality) {
|
||||
final int[] quals = this.calibratedQualities[cycle];
|
||||
|
||||
// TODO: proper iterpolation where we don't have the right quality
|
||||
try {
|
||||
int retval = quals[quality];
|
||||
|
||||
// If we didn't calibrate this quality value, search up and down for non-zero
|
||||
for (int i=quality; i>0 && retval == 0; --i) {
|
||||
if (quals[i] != 0) retval = quals[i];
|
||||
}
|
||||
|
||||
for (int i=quality; i<quals.length && retval == 0; ++i) {
|
||||
if (quals[i] != 0) retval = quals[i];
|
||||
}
|
||||
|
||||
return retval;
|
||||
}
|
||||
catch (IndexOutOfBoundsException ioobe) {
|
||||
// If we try to fetch a quality out of the calibrted range use either
|
||||
// 1 or max quality based on which side we were out of range on
|
||||
if (quality < 1) return 1;
|
||||
else return quals[quals.length - 1];
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if no observations were made, otherwise false. */
|
||||
public boolean isEmpty() {
|
||||
return this.observations.isEmpty();
|
||||
}
|
||||
|
||||
/** Just does the simple phred scaling given the errors and observations. */
|
||||
private int computePhredScore(double errors, double observations) {
|
||||
return (int) Math.round(-10d * Math.log10(errors / observations));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -1,137 +0,0 @@
|
|||
package edu.mit.broad.picard.reference;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||
import edu.mit.broad.sam.SAMTextHeaderCodec;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.util.LineReader;
|
||||
import edu.mit.broad.sam.util.AsciiLineReader;
|
||||
|
||||
import java.io.*;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Implementation of ReferenceSequenceFile for reading from FASTA files.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
class FastaSequenceFile implements ReferenceSequenceFile {
|
||||
private static final Charset ASCII = Charset.forName("US-ASCII");
|
||||
private File file;
|
||||
private BufferedReader in;
|
||||
private List<SAMSequenceRecord> sequenceDictionary;
|
||||
private String cachedLine = null;
|
||||
private int index = -1;
|
||||
|
||||
/** Constructs a FastaSequenceFile that reads from the specified file. */
|
||||
FastaSequenceFile(File file) {
|
||||
this.file = file;
|
||||
this.in = new BufferedReader(new InputStreamReader(IoUtil.openFileForReading(file)));
|
||||
|
||||
// Try and locate the dictionary
|
||||
String dictionaryName = file.getAbsolutePath();
|
||||
dictionaryName = dictionaryName.substring(0, dictionaryName.lastIndexOf(".fasta"));
|
||||
dictionaryName += ".dict";
|
||||
File dictionary = new File(dictionaryName);
|
||||
if (dictionary.exists()) {
|
||||
IoUtil.assertFileIsReadable(dictionary);
|
||||
|
||||
try {
|
||||
SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
|
||||
SAMFileHeader header = codec.decode(new AsciiLineReader(new FileInputStream(dictionary)), dictionary);
|
||||
if (header.getSequences() != null && header.getSequences().size() > 0) {
|
||||
this.sequenceDictionary = header.getSequences();
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new PicardException("Could not open sequence dictionary file: " + dictionaryName, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the list of sequence records associated with the reference sequence if found
|
||||
* otherwise null.
|
||||
*/
|
||||
public List<SAMSequenceRecord> getSequenceDictionary() {
|
||||
return this.sequenceDictionary;
|
||||
}
|
||||
|
||||
public ReferenceSequence nextSequence() {
|
||||
String line = null;
|
||||
String name = null;
|
||||
|
||||
// Scan forward to a header line
|
||||
while ((line = readNextLine()) != null) {
|
||||
if (line.startsWith(">")) {
|
||||
name = line.substring(1).trim();
|
||||
this.index += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// No more!
|
||||
if (name == null) return null;
|
||||
|
||||
// Read the sequence
|
||||
int basesRead = 0;
|
||||
byte[] bases = new byte[250000000]; // big enough to hold human chr1!
|
||||
while ((line = readNextLine()) != null) {
|
||||
if (line.startsWith(">")) {
|
||||
pushBackLine(line);
|
||||
break;
|
||||
}
|
||||
else {
|
||||
final byte[] nextBases = line.getBytes(ASCII);
|
||||
final int lineLength = nextBases.length;
|
||||
|
||||
// If the array isn't big enough to hold the next chunk, resize it
|
||||
if (basesRead + lineLength > bases.length) {
|
||||
byte[] tmp = new byte[bases.length * 2];
|
||||
System.arraycopy(bases, 0, tmp, 0, basesRead);
|
||||
bases = tmp;
|
||||
}
|
||||
|
||||
// Now shunt the most recent bases onto the end of the array
|
||||
System.arraycopy(nextBases, 0, bases, basesRead, lineLength);
|
||||
basesRead += lineLength;
|
||||
}
|
||||
}
|
||||
|
||||
// And lastly resize the array down to the right size
|
||||
if (basesRead != bases.length) {
|
||||
byte[] tmp = new byte[basesRead];
|
||||
System.arraycopy(bases, 0, tmp, 0, basesRead);
|
||||
bases = tmp;
|
||||
}
|
||||
|
||||
return new ReferenceSequence(name, this.index, bases);
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the next line from the file, or if we've saved a line earlier, returns that
|
||||
* instead.
|
||||
*/
|
||||
private String readNextLine() {
|
||||
// If we have a cached line use it
|
||||
if (this.cachedLine != null) {
|
||||
String tmp = this.cachedLine;
|
||||
this.cachedLine = null;
|
||||
return tmp;
|
||||
}
|
||||
else {
|
||||
try { return this.in.readLine(); }
|
||||
catch (IOException ioe) {
|
||||
throw new PicardException("Error reading line from file: " + this.file.getAbsolutePath(), ioe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Pushed a line back so that the next call to readNextLine() will return it. */
|
||||
private void pushBackLine(String line) {
|
||||
this.cachedLine = line;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1,48 +0,0 @@
|
|||
package edu.mit.broad.picard.reference;
|
||||
|
||||
/**
 * Wrapper around a reference sequence that has been read from a reference file.
 *
 * @author Tim Fennell
 */
public class ReferenceSequence {
    private final String sequenceName;
    private final byte[] sequenceBases;
    private final int index;
    private final int basesLength;

    /**
     * Package level constructor that creates a fully formed ReferenceSequence
     *
     * @param name the name of the sequence from the source file
     * @param index the zero based index of this contig in the source file
     * @param bases the bases themselves stored as one-byte characters
     */
    ReferenceSequence(String name, int index, byte[] bases) {
        this.sequenceName = name;
        this.index = index;
        this.sequenceBases = bases;
        this.basesLength = bases.length;
    }

    /** Gets the set of names given to this sequence in the source file. */
    public String getName() {
        return sequenceName;
    }

    /**
     * Gets the array of bases that define this sequence. The bases can include any
     * letter and possibly include masking information in the form of lower case
     * letters. This array is mutable (obviously!) and is NOT a clone of the array
     * held internally. Do not modify it!!!
     */
    public byte[] getBases() {
        return sequenceBases;
    }

    /** Gets the 0-based index of this contig in the source file from which it came. */
    public int getContigIndex() {
        return index;
    }

    /** Gets the length of this reference sequence in bases. */
    public int length() {
        return basesLength;
    }

    /** Human-readable representation: the class name followed by the sequence name. */
    public String toString() {
        return "ReferenceSequence " + getName();
    }
}
|
||||
|
|
@ -1,29 +0,0 @@
|
|||
package edu.mit.broad.picard.reference;
|
||||
|
||||
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * An interface for working with files of reference sequences regardless of the file format
 * being used. Implementations expose the sequence dictionary (if available) and iterate
 * over the contained sequences one at a time via {@link #nextSequence()}.
 *
 * @author Tim Fennell
 */
public interface ReferenceSequenceFile {

    /**
     * Must return a sequence dictionary with at least the following fields completed
     * for each sequence: name, length. May return null when no dictionary is
     * available for the underlying file.
     *
     * @return a list of sequence records representing the sequences in this reference file
     */
    public List<SAMSequenceRecord> getSequenceDictionary();

    /**
     * Retrieves the next whole sequence from the file. Repeated calls walk the file
     * from beginning to end; there is no way to rewind or seek.
     *
     * @return a ReferenceSequence or null if at the end of the file
     */
    public ReferenceSequence nextSequence();

}
|
||||
|
|
@ -1,28 +0,0 @@
|
|||
package edu.mit.broad.picard.reference;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Factory class for creating ReferenceSequenceFile instances for reading reference
|
||||
* sequences store in various formats.
|
||||
*
|
||||
* @author Tim Fennell
|
||||
*/
|
||||
public class ReferenceSequenceFileFactory {
|
||||
|
||||
/**
|
||||
* Attempts to determine the type of the reference file and return an instance
|
||||
* of ReferenceSequenceFile that is appropriate to read it.
|
||||
*
|
||||
* @param file the reference sequence file on disk
|
||||
*/
|
||||
public static ReferenceSequenceFile getReferenceSequenceFile(File file) {
|
||||
String name = file.getName();
|
||||
if (name.endsWith(".fasta") || name.endsWith("fasta.gz") || name.endsWith(".txt") || name.endsWith(".txt.gz")) {
|
||||
return new FastaSequenceFile(file);
|
||||
}
|
||||
else {
|
||||
throw new IllegalArgumentException("File is not a supported reference file type: " + file.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,352 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
|
||||
package edu.mit.broad.picard.sam;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.metrics.AggregateMetricCollector;
|
||||
import edu.mit.broad.picard.metrics.MetricBase;
|
||||
import edu.mit.broad.picard.metrics.MetricCollector;
|
||||
import edu.mit.broad.picard.metrics.MetricsFile;
|
||||
import edu.mit.broad.picard.metrics.StringHeader;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
||||
import edu.mit.broad.picard.sam.CollectAlignmentSummaryMetrics.AlignmentSummaryMetrics.Type;
|
||||
import edu.mit.broad.picard.util.CoordMath;
|
||||
import edu.mit.broad.picard.util.Histogram;
|
||||
import edu.mit.broad.picard.util.SequenceUtil;
|
||||
import edu.mit.broad.sam.AlignmentBlock;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
|
||||
/**
|
||||
* A command line tool to read a BAM file and produce standard alignment metrics that would be applicable to any alignment.
|
||||
* Metrics to include, but not limited to:
|
||||
* <ul>
|
||||
* <li>Total number of reads (total, period, no exclusions)</li>
|
||||
* <li>Total number of PF reads (PF == does not fail vendor check flag)</li>
|
||||
* <li>Number of PF noise reads (does not fail vendor check and has noise attr set)</li>
|
||||
* <li>Total aligned PF reads (any PF read that has a sequence and position)</li>
|
||||
* <li>High quality aligned PF reads (high quality == mapping quality >= 20)</li>
|
||||
* <li>High quality aligned PF bases (actual aligned bases, calculate off alignment blocks)</li>
|
||||
* <li>High quality aligned PF Q20 bases (subset of above where base quality >= 20)</li>
|
||||
* <li>Median mismatches in HQ aligned PF reads (how many aligned bases != ref on average)</li>
|
||||
* <li>Reads aligned in pairs (vs. reads aligned with mate unaligned/not present)</li>
|
||||
* <li>Read length (how to handle mixed lengths?)</li>
|
||||
* <li>Bad Cycles - how many machine cycles yielded combined no-call and mismatch rates of >= 80%</li>
|
||||
* <li>Strand balance - reads mapped to positive strand / total mapped reads</li>
|
||||
* </ul>
|
||||
* Metrics are written for the first read of a pair, the second read, and combined for the pair.
|
||||
*
|
||||
* @author Doug Voet
|
||||
*/
|
||||
public class CollectAlignmentSummaryMetrics extends CommandLineProgram {
    // Minimum mapping quality for a read to count as a "high quality" (HQ) alignment.
    private static final int MAPPING_QUALITY_THRESHOLD = 20;
    // Minimum base quality for an aligned base to count toward PF_HQ_ALIGNED_Q20_BASES.
    private static final int BASE_QUALITY_THRESHOLD = 20;

    // Usage and parameters
    @Usage(programVersion="1.0")
    public String USAGE = "Reads a SAM or BAM file and writes a file containing summary metrics.\n";
    @Option(shortName="I", doc="SAM or BAM file") public File INPUT;
    @Option(shortName="O", doc="File to write insert size metrics to") public File OUTPUT;
    @Option(shortName="R", doc="Reference sequence file") public File REFERENCE;
    // NOTE(review): the option name misspells COORDINATE, but renaming it would change the
    // command-line interface, so it is documented rather than fixed here.
    @Option(doc="If true (default), \"unsorted\" SAM/BAM files will be considerd coordinate sorted")
    public Boolean ASSUME_COODINATE_SORTED = Boolean.TRUE;

    // Reference opened in doWork(); read strictly forward by getReference().
    private ReferenceSequenceFile ref;
    // Contig currently buffered from {@link #ref}; advanced lazily as records progress.
    private ReferenceSequence refSequence;
    // Header of the input SAM/BAM, captured in doWork() for getReferenceIndex lookups.
    private SAMFileHeader samFileHeader;

    /** Required main method implementation. */
    public static void main(String[] argv) {
        System.exit(new CollectAlignmentSummaryMetrics().instanceMain(argv));
    }

    /**
     * Validates the input/output/reference files, checks sort order, iterates the input
     * once collecting metrics, and writes the metrics file to OUTPUT.
     *
     * @return 0 on success (exceptions signal failure).
     */
    @Override
    protected int doWork() {
        IoUtil.assertFileIsReadable(INPUT);
        IoUtil.assertFileIsReadable(REFERENCE);
        IoUtil.assertFileIsWritable(OUTPUT);
        SAMFileReader in = new SAMFileReader(INPUT);
        assertCoordinateSortOrder(in);

        this.ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE);
        this.samFileHeader = in.getFileHeader();

        MetricsFile<AlignmentSummaryMetrics, Comparable<?>> file = collectMetrics(in.iterator());
        in.close();

        file.write(OUTPUT);

        return 0;
    }

    /**
     * Throws a PicardException unless the file is coordinate sorted, or is marked
     * "unsorted" and ASSUME_COODINATE_SORTED is set. Coordinate order is required
     * because getReference() only ever reads the reference forward.
     */
    private void assertCoordinateSortOrder(SAMFileReader in) {
        switch (in.getFileHeader().getSortOrder()) {
            case coordinate:
                break;
            case unsorted:
                // Deliberate fall-through to the error when ASSUME_COODINATE_SORTED is false.
                if (this.ASSUME_COODINATE_SORTED) {
                    break;
                }
            default:
                throw new PicardException("Cannot collect summary statistics in file " + INPUT.getAbsoluteFile() +
                        " because it is not sorted in coordinate order.");
        }
    }

    /**
     * Returns the reference sequence for the contig the record is aligned to, advancing
     * the forward-only reference reader as needed (hence the coordinate-sort requirement).
     *
     * @throws PicardException if the record's contig cannot be found in the reference.
     */
    private ReferenceSequence getReference(SAMRecord record) {
        // NOTE(review): if ref.nextSequence() returns null at end-of-file, refSequence stays
        // null and the loop condition remains true — possible infinite loop. Confirm the
        // behavior of ReferenceSequenceFile.nextSequence() at EOF.
        while (refSequence == null ||
                record.getReferenceIndex(samFileHeader) > refSequence.getContigIndex()) {

            refSequence = ref.nextSequence();
        }

        if (refSequence == null || record.getReferenceIndex() != refSequence.getContigIndex()) {
            throw new PicardException("Cannot find reference sequence [" +
                    record.getReferenceIndex() + "] in reference file");
        }

        return refSequence;
    }

    /**
     * Does all the work of iterating through the sam file and collecting summary alignment metrics.
     * Each record is routed to the unpaired collector, or to first/second-of-pair plus the
     * combined pair collector; metrics categories with zero reads are omitted from the output.
     */
    private MetricsFile<AlignmentSummaryMetrics, Comparable<?>> collectMetrics(
            CloseableIterator<SAMRecord> samIterator) {

        final MetricCollector<AlignmentSummaryMetrics> unpairedCollector =
                constructCollector(Type.UNPAIRED);
        final MetricCollector<AlignmentSummaryMetrics> firstOfPairCollector =
                constructCollector(Type.FIRST_OF_PAIR);
        final MetricCollector<AlignmentSummaryMetrics> secondOfPairCollector =
                constructCollector(Type.SECOND_OF_PAIR);
        final MetricCollector<AlignmentSummaryMetrics> pairCollector =
                constructCollector(Type.PAIR);

        while (samIterator.hasNext()) {
            SAMRecord record = samIterator.next();

            if (record.getReadPairedFlag()) {
                if (record.getFirstOfPairFlag()) {
                    firstOfPairCollector.addRecord(record);
                } else {
                    secondOfPairCollector.addRecord(record);
                }
                // Paired records also contribute to the combined PAIR category.
                pairCollector.addRecord(record);
            } else {
                unpairedCollector.addRecord(record);
            }
        }

        firstOfPairCollector.onComplete();
        secondOfPairCollector.onComplete();
        pairCollector.onComplete();
        unpairedCollector.onComplete();

        MetricsFile<AlignmentSummaryMetrics, Comparable<?>> file = getMetricsFile();
        file.addHeader(new StringHeader("Input file: " + INPUT.getAbsolutePath()));
        file.addHeader(new StringHeader("Output file: " + OUTPUT.getAbsolutePath()));
        file.addHeader(new StringHeader("Reference file: " + REFERENCE.getAbsolutePath()));

        if (firstOfPairCollector.getMetrics().TOTAL_READS > 0) {
            file.addMetric(firstOfPairCollector.getMetrics());
            // override how bad cycle is determined for paired reads, it should be
            // the sum of first and second reads
            pairCollector.getMetrics().BAD_CYCLES =
                firstOfPairCollector.getMetrics().BAD_CYCLES +
                secondOfPairCollector.getMetrics().BAD_CYCLES;
            file.addMetric(secondOfPairCollector.getMetrics());
            file.addMetric(pairCollector.getMetrics());
        }
        if (unpairedCollector.getMetrics().TOTAL_READS > 0) {
            file.addMetric(unpairedCollector.getMetrics());
        }

        return file;
    }

    /**
     * Builds a collector for one category (Type): a ReadCounter and a QualityMappingCounter
     * aggregated over a shared AlignmentSummaryMetrics instance tagged with the given type.
     */
    private MetricCollector<AlignmentSummaryMetrics> constructCollector(Type type) {
        MetricCollector<AlignmentSummaryMetrics> collector =
            new AggregateMetricCollector<AlignmentSummaryMetrics>(new ReadCounter(), new QualityMappingCounter());
        collector.setMetrics(new AlignmentSummaryMetrics());
        collector.getMetrics().TYPE = type;
        return collector;
    }

    /**
     * One row of the output metrics file; public fields are serialized by MetricsFile.
     */
    public static class AlignmentSummaryMetrics extends MetricBase {
        public enum Type { UNPAIRED, FIRST_OF_PAIR, SECOND_OF_PAIR, PAIR }
        /** Category this row describes. */
        public Type TYPE;
        /** All primary records seen, no exclusions. */
        public long TOTAL_READS;
        /** Reads that do not fail the vendor quality check. */
        public long PF_READS;
        /** PF reads flagged as noise (XN attribute == 1). */
        public long PF_NOISE_READS;
        /** PF reads that are mapped. */
        public long PF_READS_ALIGNED;
        /** PF mapped reads with mapping quality >= MAPPING_QUALITY_THRESHOLD. */
        public long PF_HQ_ALIGNED_READS;
        /** Aligned bases (summed over alignment blocks) of HQ reads. */
        public long PF_HQ_ALIGNED_BASES;
        /** Subset of PF_HQ_ALIGNED_BASES with base quality >= BASE_QUALITY_THRESHOLD. */
        public long PF_HQ_ALIGNED_Q20_BASES;
        /** Median per-read mismatch count among HQ mapped reads. */
        public double PF_HQ_MEDIAN_MISMATCHES;
        /** Mean read length over all primary records. */
        public double MEAN_READ_LENGTH;
        /** Mapped reads whose mate is also mapped. */
        public long READS_ALIGNED_IN_PAIRS;
        /** Cycles whose combined mismatch/no-call fraction is >= 0.8. */
        public long BAD_CYCLES;
        /** Fraction of primary records not on the negative strand. */
        public double STRAND_BALANCE;
    }

    /** counts reads that match various conditions */
    private class ReadCounter implements MetricCollector<AlignmentSummaryMetrics> {
        // Count of primary records whose negative-strand flag is unset.
        private long numPositiveStrand = 0;
        // Histogram of read lengths, used only for its mean in onComplete().
        private Histogram<Integer> readLengthHistogram = new Histogram<Integer>();
        private AlignmentSummaryMetrics metrics;

        @Override
        public void addRecord(SAMRecord record) {
            if (record.getNotPrimaryAlignmentFlag()) {
                // only want 1 count per read so skip non primary alignments
                return;
            }

            metrics.TOTAL_READS++;
            readLengthHistogram.increment(record.getReadBases().length);

            if (!record.getReadFailsVendorQualityCheckFlag()) {
                metrics.PF_READS++;

                if (isNoiseRead(record)) {
                    metrics.PF_NOISE_READS++;
                }
                if (!record.getReadUnmappedFlag()) {
                    metrics.PF_READS_ALIGNED++;
                }
            }

            if (!record.getReadUnmappedFlag() &&
                    record.getReadPairedFlag() &&
                    !record.getMateUnmappedFlag()) {
                metrics.READS_ALIGNED_IN_PAIRS++;
            }

            // NOTE(review): counted for every primary record, including unmapped reads
            // where the strand flag is not meaningful — confirm whether STRAND_BALANCE
            // should instead be restricted to mapped reads as the class javadoc implies.
            if (!record.getReadNegativeStrandFlag()) {
                numPositiveStrand++;
            }
        }

        @Override
        public void onComplete() {
            metrics.MEAN_READ_LENGTH = readLengthHistogram.getMean();
            metrics.STRAND_BALANCE = numPositiveStrand / (double) metrics.TOTAL_READS;
        }

        /** A read is "noise" when its XN attribute is present and equal to 1. */
        private boolean isNoiseRead(SAMRecord record) {
            // NOTE(review): ReservedTagConstants does not appear in this file's visible
            // import list — verify the import exists.
            final Object noiseAttribute = record.getAttribute(ReservedTagConstants.XN);
            return (noiseAttribute != null && noiseAttribute.equals(1));
        }

        @Override
        public void setMetrics(AlignmentSummaryMetrics metrics) {
            this.metrics = metrics;
        }

        @Override
        public AlignmentSummaryMetrics getMetrics() {
            return this.metrics;
        }
    }

    /** counts quality mappings &amp; base calls that match various conditions */
    private class QualityMappingCounter implements MetricCollector<AlignmentSummaryMetrics> {
        // Per-read mismatch counts of HQ mapped reads; median taken in onComplete().
        private Histogram<Long> mismatchHistogram = new Histogram<Long>();
        // Per-cycle counts of bad observations (mismatch or no-call).
        private Histogram<Integer> badCycleHistogram = new Histogram<Integer>();
        private AlignmentSummaryMetrics metrics;

        @Override
        public void addRecord(SAMRecord record) {
            if (record.getNotPrimaryAlignmentFlag()) {
                return;
            }
            if (record.getReadUnmappedFlag()) {
                // Unmapped reads contribute only their no-call positions to the bad-cycle histogram.
                final byte[] readBases = record.getReadBases();
                for (int i = 0; i < readBases.length; i++) {
                    if (SequenceUtil.isNoCall(readBases[i])) {
                        badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i));
                    }
                }
            } else {
                boolean highQualityMapping = isHighQualityMapping(record);
                if (highQualityMapping) metrics.PF_HQ_ALIGNED_READS++;

                final byte[] readBases = record.getReadBases();
                final byte[] refBases = getReference(record).getBases();
                final byte[] qualities = record.getBaseQualities();
                long mismatchCount = 0;

                for (AlignmentBlock alignmentBlock : record.getAlignmentBlocks()) {
                    // Alignment block coordinates are 1-based; convert to 0-based array indices.
                    final int readIndex = alignmentBlock.getReadStart() - 1;
                    final int refIndex = alignmentBlock.getReferenceStart() - 1;
                    final int length = alignmentBlock.getLength();
                    if (highQualityMapping) metrics.PF_HQ_ALIGNED_BASES += alignmentBlock.getLength();

                    for (int i=0; i<length; ++i) {
                        final int readBaseIndex = readIndex + i;
                        boolean mismatch = !SequenceUtil.basesEqual(readBases[readBaseIndex], refBases[refIndex+i]);
                        if (highQualityMapping) {
                            if (qualities[readBaseIndex] >= BASE_QUALITY_THRESHOLD) {
                                metrics.PF_HQ_ALIGNED_Q20_BASES++;
                            }
                            if (mismatch) {
                                mismatchCount++;
                            }
                        }
                        // NOTE(review): the cycle is computed from i (index within this
                        // alignment block), while the unmapped branch above uses the index
                        // within the whole read — for reads with more than one alignment
                        // block this looks wrong (readBaseIndex seems intended); confirm.
                        if (mismatch || SequenceUtil.isNoCall(readBases[readBaseIndex])) {
                            badCycleHistogram.increment(CoordMath.getCycle(record.getReadNegativeStrandFlag(), readBases.length, i));
                        }
                    }
                }
                mismatchHistogram.increment(mismatchCount);
            }
        }

        /** HQ = passes vendor check and mapping quality >= MAPPING_QUALITY_THRESHOLD. */
        private boolean isHighQualityMapping(SAMRecord record) {
            return !record.getReadFailsVendorQualityCheckFlag() &&
                record.getMappingQuality() >= MAPPING_QUALITY_THRESHOLD;
        }

        @Override
        public void onComplete() {
            metrics.PF_HQ_MEDIAN_MISMATCHES = mismatchHistogram.getMedian();
            metrics.BAD_CYCLES = 0;

            // A cycle is "bad" when >= 80% of reads had a mismatch or no-call at it.
            for (Histogram<Integer>.Bin cycleBin : badCycleHistogram.values()) {
                double badCyclePercentage = cycleBin.getValue() / metrics.TOTAL_READS;
                if (badCyclePercentage >= .8) {
                    metrics.BAD_CYCLES++;
                }
            }
        }

        @Override
        public void setMetrics(AlignmentSummaryMetrics metrics) {
            this.metrics = metrics;
        }

        @Override
        public AlignmentSummaryMetrics getMetrics() {
            return this.metrics;
        }
    }
}
|
||||
|
|
@ -1,154 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
|
||||
package edu.mit.broad.picard.sam;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
import edu.mit.broad.picard.io.IoUtil;
|
||||
import edu.mit.broad.picard.metrics.MetricsFile;
|
||||
import edu.mit.broad.picard.util.Histogram;
|
||||
import edu.mit.broad.picard.util.Log;
|
||||
import edu.mit.broad.picard.util.RExecutor;
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
import edu.mit.broad.sam.util.CloseableIterator;
|
||||
|
||||
/**
|
||||
* Command line program to read non-duplicate insert sizes, create a histogram
|
||||
* and report distribution statistics.
|
||||
*
|
||||
* @author Doug Voet
|
||||
*/
|
||||
public class CollectInsertSizeMetrics extends CommandLineProgram {
    private static Log log = Log.getInstance(CollectInsertSizeMetrics.class);
    // Classpath location of the R script that renders the histogram chart.
    private static final String HISTOGRAM_R_SCRIPT = "edu/mit/broad/picard/sam/insertSizeHistogram.R";
    // Usage and parameters
    @Usage(programVersion="1.0")
    public String USAGE = "Reads a SAM or BAM file and writes a file containing metrics about " +
            "the statistical distribution of insert size (excluding duplicates) " +
            "and generates a histogram plot.\n";
    @Option(shortName="I", doc="SAM or BAM file") public File INPUT;
    @Option(shortName="O", doc="File to write insert size metrics to") public File OUTPUT;
    @Option(shortName="H", doc="File to write insert size histogram chart to") public File HISTOGRAM_FILE;

    /** Required main method implementation. */
    public static void main(String[] argv) {
        System.exit(new CollectInsertSizeMetrics().instanceMain(argv));
    }

    /**
     * Validates files, collects insert-size metrics over the whole input, writes the
     * metrics file, then runs the R histogram script unless no pairs were found.
     *
     * @return 0 on success.
     * @throws PicardException if the R script exits with a non-zero status.
     */
    @Override
    protected int doWork() {
        IoUtil.assertFileIsReadable(INPUT);
        IoUtil.assertFileIsWritable(OUTPUT);
        IoUtil.assertFileIsWritable(HISTOGRAM_FILE);

        SAMFileReader in = new SAMFileReader(INPUT);
        MetricsFile<InsertSizeMetrics, Integer> file = collectMetrics(in.iterator());
        in.close();

        file.write(OUTPUT);

        if (file.getMetrics().get(0).READ_PAIRS == 0) {
            log.warn("Input file did not contain any records with insert size information.");
        } else {
            int rResult = RExecutor.executeFromClasspath(
                HISTOGRAM_R_SCRIPT,
                OUTPUT.getAbsolutePath(),
                HISTOGRAM_FILE.getAbsolutePath(),
                INPUT.getName());

            if (rResult != 0) {
                throw new PicardException("R script " + HISTOGRAM_R_SCRIPT + " failed with return code " + rResult);
            }
        }

        return 0;
    }

    /**
     * Does all the work of iterating through the sam file and collecting insert size metrics.
     * Builds a histogram of |inferred insert size| over the records accepted by skipRecord(),
     * then derives summary statistics plus WIDTH_OF_XX_PERCENT values: the width of the
     * smallest window centered on the median that contains at least XX% of the pairs.
     */
    MetricsFile<InsertSizeMetrics, Integer> collectMetrics(CloseableIterator<SAMRecord> samIterator) {
        Histogram<Integer> insertSizeHistogram = new Histogram<Integer>("insert_size", "count");
        while (samIterator.hasNext()) {
            SAMRecord record = samIterator.next();
            if (skipRecord(record)) {
                continue;
            }

            // Absolute value: inferred insert size is negative for one read of each pair.
            int insertSize = Math.abs(record.getInferredInsertSize());
            insertSizeHistogram.increment(insertSize);
        }

        MetricsFile<InsertSizeMetrics, Integer> file = new MetricsFile<InsertSizeMetrics, Integer>();
        file.setHistogram(insertSizeHistogram);
        InsertSizeMetrics metrics = new InsertSizeMetrics();
        metrics.READ_PAIRS = (long) insertSizeHistogram.getCount();
        metrics.MAX_INSERT_SIZE = (int) insertSizeHistogram.getMax();
        metrics.MIN_INSERT_SIZE = (int) insertSizeHistogram.getMin();
        metrics.MEAN_INSERT_SIZE = insertSizeHistogram.getMean();
        metrics.STANDARD_DEVIATION = insertSizeHistogram.getStandardDeviation();
        metrics.MEDIAN_INSERT_SIZE = insertSizeHistogram.getMedian();

        final double total = insertSizeHistogram.getCount();
        final double median = insertSizeHistogram.getMedian();
        double covered = 0;
        double low = median;
        double high = median;

        // Expand a window symmetrically from the median one bin at a time, accumulating
        // the fraction of pairs covered, and record the window width the first time each
        // coverage threshold is crossed.
        while (low >= insertSizeHistogram.getMin() || high <= insertSizeHistogram.getMax()) {
            Histogram<Integer>.Bin lowBin = insertSizeHistogram.get((int) low);
            if (lowBin != null) covered += lowBin.getValue();

            // Guard against double-counting the median bin on the first iteration.
            if (low != high) {
                Histogram<Integer>.Bin highBin = insertSizeHistogram.get((int) high);
                if (highBin != null) covered += highBin.getValue();
            }

            double percentCovered = covered / total;
            int distance = (int) (high - low) + 1;
            if (percentCovered >= 0.1 && metrics.WIDTH_OF_10_PERCENT == 0) metrics.WIDTH_OF_10_PERCENT = distance;
            if (percentCovered >= 0.2 && metrics.WIDTH_OF_20_PERCENT == 0) metrics.WIDTH_OF_20_PERCENT = distance;
            if (percentCovered >= 0.3 && metrics.WIDTH_OF_30_PERCENT == 0) metrics.WIDTH_OF_30_PERCENT = distance;
            if (percentCovered >= 0.4 && metrics.WIDTH_OF_40_PERCENT == 0) metrics.WIDTH_OF_40_PERCENT = distance;
            if (percentCovered >= 0.5 && metrics.WIDTH_OF_50_PERCENT == 0) metrics.WIDTH_OF_50_PERCENT = distance;
            if (percentCovered >= 0.6 && metrics.WIDTH_OF_60_PERCENT == 0) metrics.WIDTH_OF_60_PERCENT = distance;
            if (percentCovered >= 0.7 && metrics.WIDTH_OF_70_PERCENT == 0) metrics.WIDTH_OF_70_PERCENT = distance;
            if (percentCovered >= 0.8 && metrics.WIDTH_OF_80_PERCENT == 0) metrics.WIDTH_OF_80_PERCENT = distance;
            if (percentCovered >= 0.9 && metrics.WIDTH_OF_90_PERCENT == 0) metrics.WIDTH_OF_90_PERCENT = distance;
            if (percentCovered >= 0.99 && metrics.WIDTH_OF_99_PERCENT == 0) metrics.WIDTH_OF_99_PERCENT = distance;

            --low;
            ++high;
        }

        file.addMetric(metrics);

        return file;
    }

    /**
     * Figures out whether or not the record should be included in the counting of insert sizes.
     * Keeps exactly one record per pair (first-of-pair records are skipped), and excludes
     * records with an unmapped mate, non-primary alignments, duplicates, and records with
     * no inferred insert size.
     */
    private boolean skipRecord(SAMRecord record) {
        return !record.getReadPairedFlag() ||
            record.getMateUnmappedFlag() ||
            record.getFirstOfPairFlag() ||
            record.getNotPrimaryAlignmentFlag() ||
            record.getDuplicateReadFlag() ||
            record.getInferredInsertSize() == 0;
    }

}
|
||||
|
|
@ -1,64 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright Jan 22, 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.sam;
|
||||
|
||||
import edu.mit.broad.picard.util.PeekableIterator;
|
||||
import edu.mit.broad.sam.SAMFileReader;
|
||||
import edu.mit.broad.sam.SAMRecord;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* Iterator for SAM records that implements comparable to enable sorting of iterators.
|
||||
* The comparison is performed by comparing the next record in the iterator to the next
|
||||
* record in another iterator and returning the ordering between those SAM records.
|
||||
*/
|
||||
class ComparableSamRecordIterator extends PeekableIterator<SAMRecord> implements Comparable<ComparableSamRecordIterator> {
|
||||
private Comparator<SAMRecord> comparator;
|
||||
private SAMFileReader reader;
|
||||
|
||||
/**
|
||||
* Constructs an iterator for iteration over the supplied SAM file that will be
|
||||
* able to compare itself to other ComparableSAMRecordIterator instances using
|
||||
* the supplied comparator for ordering SAMRecords.
|
||||
*
|
||||
* @param sam the SAM file to read records from
|
||||
* @param comparator the Comparator to use to provide ordering fo SAMRecords
|
||||
*/
|
||||
public ComparableSamRecordIterator(SAMFileReader sam, Comparator<SAMRecord> comparator) {
|
||||
super(sam.iterator());
|
||||
this.reader = sam;
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
/** Returns the reader from which this iterator was constructed. */
|
||||
public SAMFileReader getReader() {
|
||||
return reader;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compares this iterator to another comparable iterator based on the next record
|
||||
* available in each iterator. If the two comparable iterators have different
|
||||
* comparator types internally an exception is thrown.
|
||||
*
|
||||
* @param that another iterator to compare to
|
||||
* @return a negative, 0 or positive number as described in the Comparator interface
|
||||
*/
|
||||
public int compareTo(ComparableSamRecordIterator that) {
|
||||
if (this.comparator.getClass() != that.comparator.getClass()) {
|
||||
throw new IllegalStateException("Attempt to compare two ComparableSAMRecordIterators that " +
|
||||
"have different orderings internally");
|
||||
}
|
||||
|
||||
SAMRecord record = this.peek();
|
||||
SAMRecord record2 = that.peek();
|
||||
return comparator.compare(record, record2);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,145 +0,0 @@
|
|||
/*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*/
|
||||
package edu.mit.broad.picard.sam;
|
||||
|
||||
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||
import edu.mit.broad.sam.SAMFileWriter;
|
||||
import edu.mit.broad.sam.SAMFileWriterFactory;
|
||||
import edu.mit.broad.sam.SAMFileHeader;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
import edu.mit.broad.picard.PicardException;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.io.File;
|
||||
import java.security.MessageDigest;
|
||||
import java.security.NoSuchAlgorithmException;
|
||||
import java.math.BigInteger;
|
||||
|
||||
/**
|
||||
* Create a SAM/BAM file from a fasta containing reference sequence. The output SAM file contains a header but no
|
||||
* SAMRecords, and the header contains only sequence records.
|
||||
*/
|
||||
public class CreateSequenceDictionary extends CommandLineProgram {
|
||||
|
||||
private static final String PROGRAM_VERSION = "1.0";
|
||||
|
||||
// The following attributes define the command-line arguments
|
||||
@Usage(programVersion=PROGRAM_VERSION)
|
||||
public String USAGE =
|
||||
"Usage: " + getClass().getName() + " [options]\n\n" +
|
||||
"Read fasta or fasta.gz containing reference sequences, and write as a SAM or BAM file with only sequence dictionary.\n";
|
||||
|
||||
@Option(doc = "Input reference fasta or fasta.gz")
|
||||
public File REFERENCE;
|
||||
|
||||
@Option(doc = "Output SAM or BAM file containing only the sequence dictionary")
|
||||
public File OUTPUT;
|
||||
|
||||
@Option(doc = "Put into AS field of sequence dictionary entry if supplied", optional = true)
|
||||
public String GENOME_ASSEMBLY;
|
||||
|
||||
@Option(doc = "Put into UIR field of sequence dictionary entry. If not supplied, input reference file is used",
|
||||
optional = true)
|
||||
public String URI;
|
||||
|
||||
@Option(doc = "Put into SP field of sequence dictionary entry", optional = true)
|
||||
public String SPECIES;
|
||||
|
||||
private final MessageDigest md5;
|
||||
|
||||
public CreateSequenceDictionary() {
|
||||
try {
|
||||
md5 = MessageDigest.getInstance("MD5");
|
||||
} catch (NoSuchAlgorithmException e) {
|
||||
throw new PicardException("MD5 algorithm not found", e);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(final String[] argv) {
|
||||
System.exit(new CreateSequenceDictionary().instanceMain(argv));
|
||||
}
|
||||
|
||||
/**
|
||||
* Use reference filename to create URI to go into header if URI was not passed on cmd line.
|
||||
*/
|
||||
protected boolean customCommandLineValidation() {
|
||||
if (URI == null) {
|
||||
URI = "file:" + REFERENCE.getAbsolutePath();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Do the work after command line has been parsed.
|
||||
* RuntimeException may be thrown by this method, and are reported appropriately.
|
||||
*
|
||||
* @return program exit status.
|
||||
*/
|
||||
protected int doWork() {
|
||||
final List<SAMSequenceRecord> sequences = makeSequenceDictionary(REFERENCE);
|
||||
final SAMFileHeader samHeader = new SAMFileHeader();
|
||||
samHeader.setSequences(sequences);
|
||||
final SAMFileWriter samWriter = new SAMFileWriterFactory().makeSAMOrBAMWriter(samHeader, false, OUTPUT);
|
||||
samWriter.close();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Read all the sequences from the given reference file, and convert into SAMSequenceRecords
|
||||
* @param referenceFile fasta or fasta.gz
|
||||
* @return SAMSequenceRecords containing info from the fasta, plus from cmd-line arguments.
|
||||
*/
|
||||
List<SAMSequenceRecord> makeSequenceDictionary(final File referenceFile) {
|
||||
final ReferenceSequenceFile refSeqFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile);
|
||||
ReferenceSequence refSeq;
|
||||
final List<SAMSequenceRecord> ret = new ArrayList<SAMSequenceRecord>();
|
||||
while ((refSeq = refSeqFile.nextSequence()) != null) {
|
||||
ret.add(makeSequenceRecord(refSeq));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create one SAMSequenceRecord from a single fasta sequence
|
||||
*/
|
||||
private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) {
|
||||
final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName());
|
||||
ret.setSequenceLength(refSeq.length());
|
||||
|
||||
// Compute MD5 of upcased bases
|
||||
final byte[] bases = refSeq.getBases();
|
||||
for (int i = 0; i < bases.length; ++i) {
|
||||
bases[i] = (byte) (Character.toUpperCase(bases[i]) & 0xff);
|
||||
}
|
||||
|
||||
ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases));
|
||||
if (GENOME_ASSEMBLY != null) {
|
||||
ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY);
|
||||
}
|
||||
ret.setAttribute(SAMSequenceRecord.URI_TAG, URI);
|
||||
if (SPECIES != null) {
|
||||
ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
private String md5Hash(final byte[] bytes) {
|
||||
md5.reset();
|
||||
md5.update(bytes);
|
||||
return new BigInteger(1, md5.digest()).toString(16);
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue