gatk-3.8/java/lib/edu/mit/broad/cnv/kmer/CountKMers.java

1495 lines
51 KiB
Java
Raw Normal View History

/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv.kmer;
import edu.mit.broad.cnv.util.SequenceIterator;
import java.io.*;
import java.util.*;
/**
* Tool for counting unique kmers.
*/
public class CountKMers
{
private static final int NONUNIQUE_MARKER = -1;
private String mAction = null;
private static int mK = 0;
private int mMinimumK = 0;
private int mMaximumK = 0;
private int mBatchSize = 0;
private List<File> mInputFiles = null;
private File mSearchFile = null;
private String mSequenceName = null;
private File mInputDirectory = null;
private File mOutputDirectory = null;
private boolean mRunDistributed = false;
private int mDistributedWorkerCount = 0;
private boolean mVerbose = false;
private boolean mDebug = false;
private List<String> mSequenceList = null;
private List<Integer> mSequenceOffsetList = null;
private List<File> mSpillFileList = null;
private double mSpillFactor = 0.9;
private long mKMerCount = 0;
private long mUniquePriorCount = 0;
private long mUniqueNewCount = 0;
private long mPriorMapUniqueCount = 0;
private InputStream mPriorMapStream = null;
private int mPriorMapPosition = -1;
private int mPriorMapValue = 0;
private int mInputFileIndex = 0;
private LineNumberReader mCurrentReader = null;
private String mNextSequence = null;
private char[] mKMerBuffer = null;
private int mKMerBufferedCount = 0;
private String mLineBuffer = null;
private int mLineBufferIndex = 0;
private int mBaseIndex = -1;
private byte[] mIOBuffer = null;
/* Design
Inputs:
- One or more fasta files to search (currently one).
- Output directory for the result files.
- Optionally an input k-1-mer file (output from previous pass).
Outputs:
- Unique kmer file: <kmer> <chr> <pos> (sorted by kmer)
This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not).
- Per chromosome bit map: pos (implicit) new-bit cum-bit
New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not.
Cum-bit is 1 if Kmer starting at pos is unique for some L <= K.
- Statistics
Plan:
- Reducing memory footprint is crucial.
- Sequential pass over the input sequences to generate kmers.
- BatchSize kmers are cached in memory, then sorted and uniqified.
- As batch array fills, batches are spilled to disk.
- Batches are reloaded from disk and merged (N-finger algorithm)
- and streamed to a merge file.
- Merge file is read from disk and processed as final results.
*/
public static void main(String[] args)
throws Exception {
new CountKMers().run(args);
}
private void usage() {
System.out.println("Usage: CountKMers ...");
System.out.println(" -action <action>");
System.out.println(" -genome <fasta-file>");
System.out.println(" -chromosome <name>");
System.out.println(" -k <k>");
System.out.println(" -minK <k>");
System.out.println(" -maxK <k>");
System.out.println(" -batchSize <n>");
System.out.println(" -inputDir <directory>");
System.out.println(" -outputDir <directory>");
System.out.println(" -distributed");
System.out.println(" -workers <n>");
System.out.println(" -verbose");
System.out.println(" -debug");
}
private boolean parseArguments(String[] args) {
int argpos = 0;
int argsleft = 0;
while (argpos < args.length) {
argsleft = args.length - argpos;
String arg = args[argpos];
if (arg.equals("-action") && argsleft > 1) {
argpos++;
mAction = args[argpos++];
} else if (arg.equals("-genome") && argsleft > 1) {
argpos++;
if (mInputFiles == null) {
mInputFiles = new ArrayList<File>();
}
mInputFiles.add(new File(args[argpos++]));
} else if (arg.equals("-chromosome") && argsleft > 1) {
argpos++;
mSequenceName = args[argpos++];
} else if (arg.equals("-k") && argsleft > 1) {
argpos++;
mK = Integer.parseInt(args[argpos++]);
} else if (arg.equals("-minK") && argsleft > 1) {
argpos++;
mMinimumK = Integer.parseInt(args[argpos++]);
} else if (arg.equals("-maxK") && argsleft > 1) {
argpos++;
mMaximumK = Integer.parseInt(args[argpos++]);
} else if (arg.equals("-batchSize") && argsleft > 1) {
argpos++;
mBatchSize = Integer.parseInt(args[argpos++]);
} else if (arg.equals("-inputDir") && argsleft > 1) {
argpos++;
mInputDirectory = new File(args[argpos++]);
} else if (arg.equals("-outputDir") && argsleft > 1) {
argpos++;
mOutputDirectory = new File(args[argpos++]);
} else if (arg.equals("-searchFile") && argsleft > 1) {
argpos++;
mSearchFile = new File(args[argpos++]);
} else if (arg.equals("-distributed")) {
argpos++;
mRunDistributed = true;
} else if (arg.equals("-workers") && argsleft > 1) {
argpos++;
mDistributedWorkerCount = Integer.parseInt(args[argpos++]);
} else if (arg.equals("-verbose")) {
argpos++;
mVerbose = true;
} else if (arg.equals("-debug")) {
argpos++;
mDebug = true;
} else if (arg.startsWith("-")) {
usage();
return false;
} else {
break;
}
}
argsleft = args.length - argpos;
if (argsleft != 0) {
usage();
return false;
}
return true;
}
private void run(String[] args)
throws Exception {
if (!parseArguments(args)) {
System.exit(1);
}
if (mAction == null || mAction.equals("mapKMers")) {
if (mRunDistributed) {
mapKMersDistributed();
} else {
mapKMers();
}
} else if (mAction.equals("mapGaps")) {
mapGaps();
} else if (mAction.equals("rollUp")) {
rollUp();
} else if (mAction.equals("search")) {
search();
}
}
private void search()
throws IOException {
char[][] searchStrings = loadSearchFile(mSearchFile);
while (true) {
String seqName = getNextSequence();
if (seqName == null) {
break;
}
int position = 0;
log("Scanning " + seqName + " ...");
while (true) {
char[] kmerChars = getNextKMer();
if (kmerChars == null) {
break;
}
position++;
for (int i = 0; i < searchStrings.length; i++) {
if (Arrays.equals(searchStrings[i], kmerChars)) {
String kmer = new String(searchStrings[i]);
String strand = ((i % 2) == 0) ? "F" : "R";
System.out.println(kmer + "\t" + seqName + "\t" + position + "\t" + strand);
}
}
}
}
}
private char[][] loadSearchFile(File file)
throws IOException {
List<char[]> list = new ArrayList<char[]>();
LineNumberReader reader = new LineNumberReader(new FileReader(file));
while (true) {
String line = reader.readLine();
if (line == null) {
reader.close();
break;
}
String text = line.trim();
if (text.length() == 0 || text.startsWith("#")) {
continue;
}
String[] fields = text.split("\\s+");
char[] kmer = fields[0].toUpperCase().toCharArray();
list.add(kmer);
list.add(reverseComplement(kmer));
}
return list.toArray(new char[0][0]);
}
// Can be used to scan genome for sequence names/lengths.
private void scanKMers()
throws IOException {
mSequenceList = new ArrayList<String>();
mSequenceOffsetList = new ArrayList<Integer>();
File priorMapFile =
new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin");
openPriorMap(priorMapFile);
while (true) {
String seqName = getNextSequence();
if (seqName == null) {
break;
}
mSequenceList.add(seqName);
mSequenceOffsetList.add(mBaseIndex+1);
log("Scanning " + seqName + " ...");
while (true) {
char[] kmerChars = getNextKMer();
if (kmerChars == null) {
break;
}
mKMerCount++;
if (isUniqueInPriorMap(mBaseIndex)) {
continue;
}
}
}
closePriorMap();
}
private void mapGaps()
throws IOException {
while (true) {
String seqName = getNextSequence();
if (seqName == null) {
break;
}
int pos = 0;
int gapStart = 0;
while (true) {
char base = getNextBase();
if (base == 0) {
break;
}
pos++;
if (base == 'N') {
if (gapStart == 0) {
gapStart = pos;
}
} else {
if (gapStart > 0) {
System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1));
gapStart = 0;
}
}
}
if (gapStart > 0) {
System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1));
gapStart = 0;
}
}
}
private void rollUp()
throws IOException {
// Roll up based on the middle of the reads.
File[] mapFiles = getAllMapFiles();
if (mapFiles.length > 127) {
throw new RuntimeException("K to large for byte sized counts");
}
SequenceIterator seqIterator = new SequenceIterator(mInputFiles);
while (true) {
String seqName = seqIterator.getNextSequence();
if (seqName == null) {
break;
}
if (mSequenceName != null && !mSequenceName.equals(seqName)) {
continue;
}
log("Rolling up sequence " + seqName + " ...");
int seqBaseIndex = seqIterator.getBaseIndex() + 1;
char[] seqChars = loadSequence(seqIterator);
int seqLength = seqChars.length;
int seqMapOffset = (seqBaseIndex >> 3) & 0x1FFFFFFF;
int seqMapModulus = (seqBaseIndex & 0x7);
int seqMapLength = (seqMapModulus + seqLength + 7)/8;
// log(" seqLength = " + seqLength);
// log(" baseIndex = " + Integer.toHexString(seqBaseIndex)
// + " (" + (((long)seqBaseIndex) & 0xFFFFFFFFL) + ")");
// log(" seqMapOffset = " + seqMapOffset);
// log(" seqMapLength = " + seqMapLength);
byte[] counts = new byte[seqLength];
for (int pos = 1; pos <= seqLength; pos++) {
if (seqChars[pos-1] == 'N') {
counts[pos-1] = -1;
}
}
for (int k = 1; k <= mapFiles.length; k++) {
if (mapFiles[k-1] == null) {
continue;
}
log("Processing map file " + mapFiles[k-1] + " ...");
byte[] kmerMap = readMapFileRegion(mapFiles[k-1], seqMapOffset, seqMapLength);
for (int pos = 1; pos <= seqLength; pos++) {
if (counts[pos-1] != 0) {
continue;
} else if (isNearContigBoundary(pos, seqChars, k)) {
counts[pos-1] = -1;
} else {
int baseOffset = pos - (k+1)/2;
int mapIndex = seqMapModulus + baseOffset;
if (isUniqueInMap(kmerMap, mapIndex)) {
counts[pos-1] = (byte) k;
}
}
}
}
File outputFile =
new File(mOutputDirectory, "rollup_" + seqName + ".bin");
writeRollUpFile(outputFile, counts);
}
}
private boolean isNearContigBoundary(int pos, char[] seqChars, int k) {
int windowStart = pos - (k-1)/2;
int windowEnd = pos + k/2;
if (windowStart < 1 || windowEnd > seqChars.length) {
return true;
}
for (int i = windowStart-1; i < windowEnd; i++) {
if (seqChars[i] == 'N') {
return true;
}
}
return false;
}
private void writeRollUpFile(File file, byte[] counts)
throws IOException {
FileOutputStream stream = new FileOutputStream(file);
stream.write(counts);
stream.flush();
stream.close();
if (mDebug) {
PrintWriter writer = new PrintWriter(file + ".dbg");
for (int i = 0; i < counts.length; i++) {
writer.println(counts[i]);
}
writer.flush();
writer.close();
}
}
/**
* Returns an array of files, indexed by K,
* where the array index = K-1 (i.e. K=1 is the first file).
* If there is no file for index K, then the array element is null.
*/
private File[] getAllMapFiles() {
int maxK = mMaximumK;
if (maxK == 0) {
// Safe upper bound
maxK = 1000;
}
List<File> fileList = new ArrayList<File>();
for (int k = 1; k <= maxK; k++) {
if (mMinimumK > 0 && k < mMinimumK) {
continue;
}
File mapFile =
new File(mInputDirectory, "unique_" + k + "_mers_map.bin");
if (mapFile.exists()) {
while (fileList.size() < k-1) {
fileList.add(null);
}
fileList.add(mapFile);
} else {
if (mMaximumK == 0 && !fileList.isEmpty()) {
break;
}
}
}
File[] result = new File[fileList.size()];
result = fileList.toArray(result);
if (mDebug) {
for (int i = 0; i < result.length; i++) {
debug("mapFiles[k=" + (i+1) + "] = " + result[i]);
}
}
return result;
}
private char[] loadSequence(SequenceIterator seqIterator)
throws IOException {
StringBuilder builder = new StringBuilder();
while (true) {
char ch = seqIterator.getNextBase();
if (ch == 0) {
break;
}
builder.append(ch);
}
char[] result = new char[builder.length()];
builder.getChars(0, builder.length(), result, 0);
return result;
}
private void mapKMersDistributed()
throws Exception {
DistributedKMerCounter algorithm = new DistributedKMerCounter();
algorithm.setDebug(mDebug);
algorithm.setVerbose(mVerbose);
algorithm.setInputFiles(mInputFiles);
algorithm.setK(mK);
algorithm.setMaximumWorkerCount(mDistributedWorkerCount);
// algorithm.setLsfQueue(mLsfQueue);
// algorithm.setLsfLogDirectory(mLsfLogDirectory);
// algorithm.setEnableGcLogging(mEnableGcLogging);
algorithm.run();
}
private void mapKMers()
throws IOException {
File textKMerFile =
new File(mOutputDirectory, "unique_" + mK + "_mers.txt");
File binaryKMerFile =
new File(mOutputDirectory, "unique_" + mK + "_mers.bin");
File exceptionFile =
new File(mOutputDirectory, "unique_" + mK + "_mers.extra");
File mapFile =
new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin");
File priorMapFile =
new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin");
File statsFile =
new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt");
if (mBatchSize == 0) {
throw new RuntimeException("Batch size not specified");
}
int kmerCount = 0;
int batchSize = mBatchSize;
KMerPosition[] kmerArray = new KMerPosition[batchSize];
List<StringKMerPosition> exceptionList = new ArrayList<StringKMerPosition>();
mSequenceList = new ArrayList<String>();
mSequenceOffsetList = new ArrayList<Integer>();
mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))];
openPriorMap(priorMapFile);
while (true) {
String seqName = getNextSequence();
if (seqName == null) {
break;
}
mSequenceList.add(seqName);
mSequenceOffsetList.add(mBaseIndex+1);
log("Processing " + seqName + " ...");
while (true) {
char[] kmerChars = getNextKMer();
if (kmerChars == null) {
break;
}
mKMerCount++;
int baseIndex = mBaseIndex;
if (isUniqueInPriorMap(baseIndex)) {
mUniquePriorCount++;
continue;
}
KMerPosition kmp = encodeKMer(kmerChars, baseIndex);
if (kmp == null) {
// Note: We currently do not handle the reverse
// complement of exception characters correctly.
// For hg18, however, this doesn't matter as
// none of the kmers containing non-ACGT characters
// are present on the reverse strand.
String kmer = new String(kmerChars);
exceptionList.add(new StringKMerPosition(kmer, baseIndex));
continue;
}
kmerArray[kmerCount++] = kmp;
if (kmerCount == batchSize) {
kmerCount = compactKMers(kmerArray, kmerCount);
if (kmerCount > mSpillFactor * batchSize) {
spillKMers(kmerArray, kmerCount);
kmerCount = 0;
}
}
}
}
if (kmerCount > 0) {
kmerCount = compactKMers(kmerArray, kmerCount);
if (mSpillFileList != null) {
spillKMers(kmerArray, kmerCount);
kmerCount = 0;
}
}
closePriorMap();
// Write out the exception kmers (text file).
compactKMers(exceptionList);
writeExceptionFile(exceptionList, exceptionFile);
// Write out the binary file of unique encoded kmers.
if (mSpillFileList == null) {
kmerCount = removeNonUnique(kmerArray, kmerCount);
writeKMerBinaryFile(kmerArray, kmerCount, binaryKMerFile);
mUniqueNewCount = kmerCount;
} else {
mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile);
}
mUniqueNewCount += countUniqueKMers(exceptionList);
// Write out the text file of (all) unique kmers.
writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile);
// Create map file from prior map plus the new unique kmers.
long mapSize = (mBaseIndex + 1) & 0xFFFFFFFFL;
createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile);
// Write summary statistics file.
writeSummaryStatistics(statsFile);
}
private int compactKMers(KMerPosition[] kmerArray, int kmerCount) {
if (kmerCount == 0) {
return 0;
}
log("Compacting " + kmerCount + " kmers at index " +
Integer.toHexString(mBaseIndex) + " ...");
Arrays.sort(kmerArray, 0, kmerCount);
int newCount = 1;
KMerPosition current = kmerArray[0];
for (int i = 1; i < kmerCount; i++) {
KMerPosition kmp = kmerArray[i];
if (current.compareTo(kmp) == 0) {
current.setBaseIndex(NONUNIQUE_MARKER);
} else {
kmerArray[newCount++] = kmp;
current = kmp;
}
}
log("Compaction finished, new count is " + newCount);
return newCount;
}
private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) {
if (kmerCount == 0) {
return 0;
}
log("Compacting " + kmerCount + " string kmers ...");
Arrays.sort(kmerArray, 0, kmerCount);
int newCount = 1;
String kmerString = kmerArray[0].getKMer();
for (int i = 1; i < kmerCount; i++) {
StringKMerPosition kmp = kmerArray[i];
String ks = kmp.getKMer();
if (ks.equals(kmerString)) {
kmerArray[newCount-1].setBaseIndex(NONUNIQUE_MARKER);
} else {
kmerArray[newCount++] = kmp;
kmerString = ks;
}
}
log("Compaction finished, new count is " + newCount);
return newCount;
}
private void compactKMers(List<StringKMerPosition> kmerList) {
int kmerCount = kmerList.size();
if (kmerCount <= 1) {
return;
}
StringKMerPosition[] kmerArray =
kmerList.toArray(new StringKMerPosition[kmerCount]);
kmerCount = compactKMers(kmerArray, kmerCount);
kmerList.clear();
for (int i = 0; i < kmerCount; i++) {
kmerList.add(kmerArray[i]);
}
}
private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) {
int uniqueCount = 0;
for (int i = 0; i < kmerCount; i++) {
KMerPosition kmp = kmerArray[i];
if (kmp.getBaseIndex() != NONUNIQUE_MARKER) {
kmerArray[uniqueCount++] = kmp;
}
}
return uniqueCount;
}
private int countUniqueKMers(List<StringKMerPosition> kmerList) {
int uniqueCount = 0;
for (StringKMerPosition kmp : kmerList) {
if (kmp.getBaseIndex() != NONUNIQUE_MARKER) {
uniqueCount++;
}
}
return uniqueCount;
}
private void spillKMers(KMerPosition[] kmerArray, int kmerCount)
throws IOException {
if (mSpillFileList == null) {
mSpillFileList = new ArrayList<File>();
}
int fileNumber = mSpillFileList.size() + 1;
log("Spilling " + kmerCount + " kmers to file " + fileNumber + " ...");
File spillFile = new File(mOutputDirectory,
"spill_" + mK + "_" + fileNumber + ".tmp");
mSpillFileList.add(spillFile);
writeKMerBinaryFile(kmerArray, kmerCount, spillFile);
log("Spill file written");
}
private void writeKMerBinaryFile(KMerPosition[] kmerArray,
int kmerCount,
File outputFile)
throws IOException {
OutputStream outputStream =
new BufferedOutputStream(new FileOutputStream(outputFile));
for (int i = 0; i < kmerCount; i++) {
KMerPosition kmp = kmerArray[i];
writeKMerPosition(outputStream, kmerArray[i]);
}
outputStream.flush();
outputStream.close();
}
private void writeExceptionFile(List<StringKMerPosition> kmerList,
File outputFile)
throws IOException {
PrintWriter writer =
new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
for (StringKMerPosition kmer : kmerList) {
writeUniqueKMer(kmer, writer);
}
writer.flush();
writer.close();
}
private KMerPosition readKMerPosition(InputStream stream)
throws IOException {
byte[] buffer = mIOBuffer;
int encodingLength = (mK + 7)/8;
int fileLength = 4 + 2*encodingLength;
int count = readFully(stream, buffer, 0, fileLength);
if (count <= 0) {
return null;
} else if (count != fileLength) {
throw new RuntimeException("Unexpected end of file");
}
char[] encoding = new char[encodingLength];
int baseIndex = ((buffer[0] & 0xFF) |
(buffer[1] & 0xFF) << 8 |
(buffer[2] & 0xFF) << 16 |
(buffer[3] & 0xFF) << 24);
for (int i = 0; i < encodingLength; i++) {
encoding[i] = (char) ((buffer[2*i+4] & 0xFF) |
((buffer[2*i+5] & 0xFF) << 8));
}
return new KMerPosition(encoding, baseIndex);
}
private int readFully(InputStream stream, byte[] buffer, int offset, int count)
throws IOException {
int readCount = 0;
while (readCount < count) {
int read = stream.read(buffer, offset, count-readCount);
if (read <= 0) {
break;
}
offset += read;
readCount += read;
}
return readCount;
}
private void skipBytes(InputStream stream, int count)
throws IOException {
long longCount = count;
long skipCount = 0;
while (skipCount < longCount) {
long skipped = stream.skip(longCount - skipCount);
if (skipped <= 0) {
throw new RuntimeException("Skip failed");
}
skipCount += skipped;
}
}
private void writeKMerPosition(OutputStream stream, KMerPosition kmer)
throws IOException {
byte[] buffer = mIOBuffer;
int baseIndex = kmer.getBaseIndex();
char[] encoding = kmer.getKMerEncoding();
int offset = 0;
buffer[offset++] = (byte) ((baseIndex) & 0xFF);
buffer[offset++] = (byte) ((baseIndex >> 8) & 0xFF);
buffer[offset++] = (byte) ((baseIndex >> 16) & 0xFF);
buffer[offset++] = (byte) ((baseIndex >> 24) & 0xFF);
for (int i = 0; i < encoding.length; i++) {
buffer[offset++] = (byte) ((encoding[i]) & 0xFF);
buffer[offset++] = (byte) ((encoding[i] >> 8) & 0xFF);
}
stream.write(buffer, 0, offset);
}
private long mergeSpillFiles(List<File> spillFiles, File outputFile)
throws IOException {
if (spillFiles == null) {
return 0;
}
log("Merging spill files ...");
OutputStream outputStream =
new BufferedOutputStream(new FileOutputStream(outputFile));
long uniqueCount = 0;
int fileCount = spillFiles.size();
InputStream[] inputStreams = new InputStream[fileCount];
KMerPosition[] kmers = new KMerPosition[fileCount];
for (int i = 0; i < fileCount; i++) {
inputStreams[i] =
new BufferedInputStream(new FileInputStream(spillFiles.get(i)));
}
while (true) {
for (int i = 0; i < fileCount; i++) {
if (kmers[i] == null && inputStreams[i] != null) {
kmers[i] = readKMerPosition(inputStreams[i]);
if (kmers[i] == null) {
inputStreams[i].close();
inputStreams[i] = null;
}
}
}
int count = 0;
KMerPosition kmer = null;
for (int i = 0; i < fileCount; i++) {
KMerPosition kmp = kmers[i];
if (kmp == null) {
continue;
} else if (kmer == null) {
kmer = kmp;
count = 1;
} else {
int cmp = kmp.compareTo(kmer);
if (cmp == 0) {
count++;
} else if (cmp < 0) {
kmer = kmp;
count = 1;
}
}
}
if (kmer == null) {
break;
}
for (int i = 0; i < fileCount; i++) {
if (kmers[i] != null && kmer.compareTo(kmers[i]) == 0) {
kmers[i] = null;
}
}
if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) {
uniqueCount++;
writeKMerPosition(outputStream, kmer);
}
}
outputStream.flush();
outputStream.close();
for (int i = 0; i < fileCount; i++) {
// spillFiles.get(i).delete();
}
log("Spill files merged, unique count is " + uniqueCount);
return uniqueCount;
}
private void writeKMerTextFile(File inputFile,
List<StringKMerPosition> exceptionList,
File outputFile)
throws IOException {
log("Writing kmer file " + outputFile + " ...");
int exceptionIndex = 0;
StringKMerPosition excKMer = null;
Iterator<StringKMerPosition> excIter = null;
if (!exceptionList.isEmpty()) {
excIter = exceptionList.iterator();
excKMer = excIter.next();
}
InputStream inputStream =
new BufferedInputStream(new FileInputStream(inputFile));
PrintWriter writer =
new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
KMerPosition kmer = readKMerPosition(inputStream);
while (kmer != null || excKMer != null) {
if (excKMer == null) {
writeUniqueKMer(kmer, writer);
kmer = readKMerPosition(inputStream);
} else if (kmer == null) {
writeUniqueKMer(excKMer, writer);
excKMer = excIter.hasNext() ? excIter.next() : null;
} else if (kmer.getKMer().compareTo(excKMer.getKMer()) < 0) {
writeUniqueKMer(kmer, writer);
kmer = readKMerPosition(inputStream);
} else {
writeUniqueKMer(excKMer, writer);
excKMer = excIter.hasNext() ? excIter.next() : null;
}
}
inputStream.close();
writer.flush();
writer.close();
log("Wrote kmer file: " + outputFile);
}
private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) {
if (kmer.getBaseIndex() != NONUNIQUE_MARKER) {
writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer);
}
}
private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) {
if (kmer.getBaseIndex() != NONUNIQUE_MARKER) {
writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer);
}
}
private void writeKMer(String kmer, int baseIndex, PrintWriter writer) {
String chr = getBaseIndexSequenceName(baseIndex);
int pos = getBaseIndexCoordinate(baseIndex);
writer.println(kmer + "\t" + chr + "\t" + pos);
}
private void createMapFile(long mapSize,
File kmerFile,
List<StringKMerPosition> exceptionList,
File priorMapFile,
File mapFile)
throws IOException {
byte[] map = null;
long uniquePriorCount = 0;
long byteSize = (mapSize + 7)/8;
int mapByteSize = (int) byteSize;
if (mapByteSize != byteSize) {
throw new RuntimeException("Map too large: " + mapSize);
}
if (priorMapFile.exists()) {
map = readMapFile(priorMapFile);
if (map.length != mapByteSize) {
throw new RuntimeException("Prior map is wrong size");
}
// Count the prior unique positions
for (int i = 0; i < mapByteSize; i++) {
uniquePriorCount += Integer.bitCount(map[i] & 0xFF);
}
} else {
map = new byte[mapByteSize];
}
for (StringKMerPosition kmp : exceptionList) {
addToMap(kmp, map);
}
mPriorMapUniqueCount = uniquePriorCount;
InputStream inputStream =
new BufferedInputStream(new FileInputStream(kmerFile));
while (true) {
KMerPosition kmp = readKMerPosition(inputStream);
if (kmp == null) {
inputStream.close();
break;
}
addToMap(kmp, map);
}
writeMapFile(map, mapFile);
}
private void addToMap(KMerPosition kmp, byte[] map) {
int baseIndex = kmp.getBaseIndex();
if (baseIndex != NONUNIQUE_MARKER) {
addToMap(baseIndex, map);
}
}
private void addToMap(StringKMerPosition kmp, byte[] map) {
int baseIndex = kmp.getBaseIndex();
if (baseIndex != NONUNIQUE_MARKER) {
addToMap(baseIndex, map);
}
}
private void addToMap(int baseIndex, byte[] map) {
int mod = baseIndex & 0x7;
int offset = (baseIndex >> 3) & 0x1FFFFFFF;
if ((map[offset] & (1 << mod)) != 0) {
throw new RuntimeException("Map entry already set: " + baseIndex);
}
map[offset] |= (1 << mod);
}
private boolean isUniqueInMap(byte[] map, int baseIndex) {
int mod = baseIndex & 0x7;
int offset = (baseIndex >> 3) & 0x1FFFFFFF;
return ((map[offset] & (1 << mod)) != 0);
}
private void writeSummaryStatistics(File outputFile)
throws IOException {
PrintWriter writer =
new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
long baseCount = (mBaseIndex + 1) & 0xFFFFFFFFL;
long uniqueCount = mUniquePriorCount + mUniqueNewCount;
long nonUniqueCount = mKMerCount - uniqueCount;
writer.println("K: " + mK);
writer.println("Sequences: " + mSequenceList.size());
writer.println("Bases: " + baseCount);
writer.println("KMers: " + mKMerCount);
writer.println("Prior map count: " + mPriorMapUniqueCount);
writer.println("Unique prior: " + mUniquePriorCount +
" (" + formatPercent(mUniquePriorCount, mKMerCount) + ")");
writer.println("Unique new: " + mUniqueNewCount +
" (" + formatPercent(mUniqueNewCount, mKMerCount) + ")");
writer.println("Unique cumulative: " + uniqueCount +
" (" + formatPercent(uniqueCount, mKMerCount) + ")");
writer.println("Nonunique: " + nonUniqueCount +
" (" + formatPercent(nonUniqueCount, mKMerCount) + ")");
writer.flush();
writer.close();
}
private String formatPercent(long numerator, long denominator) {
double fraction = 0.0;
if (denominator != 0) {
fraction = numerator / (double) denominator;
}
return String.format("%1.1f%%", fraction * 100.0);
}
private void openPriorMap(File mapFile)
throws IOException {
if (mapFile.exists()) {
mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile));
mPriorMapPosition = -1;
mPriorMapValue = 0;
}
}
private void closePriorMap()
throws IOException {
if (mPriorMapStream != null) {
mPriorMapStream.close();
}
mPriorMapStream = null;
mPriorMapPosition = -1;
mPriorMapValue = 0;
}
private byte[] readMapFile(File file)
throws IOException {
long fileLength = file.length();
if (fileLength > 1000000000) {
throw new RuntimeException("Prior map too large: " + file);
}
int length = (int) fileLength;
byte[] map = new byte[length];
FileInputStream stream = new FileInputStream(file);
int count = readFully(stream, map, 0, length);
if (count != length) {
throw new RuntimeException("Failed to read map: " + file);
}
stream.close();
return map;
}
/**
* Read just a subset of a map file.
*/
private byte[] readMapFileRegion(File file, int offset, int length)
throws IOException {
byte[] map = new byte[length];
FileInputStream stream = new FileInputStream(file);
skipBytes(stream, offset);
int count = readFully(stream, map, 0, length);
if (count != length) {
throw new RuntimeException("Failed to read map: " + file);
}
stream.close();
return map;
}
private void writeMapFile(byte[] map, File file)
throws IOException {
FileOutputStream stream = new FileOutputStream(file);
stream.write(map);
stream.flush();
stream.close();
}
private boolean isUniqueInPriorMap(int baseIndex)
throws IOException {
if (mPriorMapStream == null) {
return false;
}
int byteOffset = (baseIndex >> 3) & 0x1FFFFFFF;
if (byteOffset != mPriorMapPosition) {
int delta = byteOffset - mPriorMapPosition;
if (delta < 0) {
throw new RuntimeException("Attempt to seek backwards in prior map");
}
if (delta > 1) {
skipFully(mPriorMapStream, delta-1);
}
mPriorMapValue = mPriorMapStream.read();
if (mPriorMapValue < 0) {
throw new RuntimeException("Unexpected end of file in prior map");
}
mPriorMapPosition += delta;
}
int mod = baseIndex & 0x7;
return (((1 << mod) & mPriorMapValue) != 0);
}
private void skipFully(InputStream stream, long amount)
throws IOException {
while (amount > 0) {
long skip = stream.skip(amount);
if (skip <= 0 || skip > amount) {
throw new RuntimeException("Skip failed");
}
amount -= skip;
}
}
private String getBaseIndexSequenceName(int baseIndex) {
int sequenceCount = mSequenceList.size();
for (int i = 0; i < sequenceCount-1; i++) {
int nextOffset = mSequenceOffsetList.get(i+1);
if (compareBaseIndex(nextOffset, baseIndex) > 0) {
return mSequenceList.get(i);
}
}
return mSequenceList.get(sequenceCount-1);
}
private int getBaseIndexCoordinate(int baseIndex) {
Integer sequenceOffset = null;
for (Integer offset : mSequenceOffsetList) {
if (compareBaseIndex(offset, baseIndex) > 0) {
break;
}
sequenceOffset = offset;
}
if (sequenceOffset == null) {
return 0;
}
int coordinate = baseIndex - sequenceOffset + 1;
if (coordinate <= 0) {
dumpSequenceList();
System.out.println("coordinate: " + coordinate);
System.out.println("sequenceOffset: " + Integer.toHexString(sequenceOffset));
System.out.println("baseIndex: " + Integer.toHexString(baseIndex));
throw new RuntimeException("Internal error: illegal coordinate " +
coordinate + " for base index " + baseIndex);
}
return coordinate;
}
private void dumpSequenceList() {
System.out.println("# Sequences:");
int count = mSequenceList.size();
for (int i = 0; i < count; i++) {
String seqName = mSequenceList.get(i);
int offset = mSequenceOffsetList.get(i);
System.out.println("# " + seqName +
"\t" + offset +
"\t" + Integer.toHexString(offset));
}
}
private int compareBaseIndex(int baseIndex1, int baseIndex2) {
// Implements unsigned comparison, a la compareTo
if (baseIndex1 < 0 ^ baseIndex2 < 0) {
return ((baseIndex1 < 0) ? 1 : -1);
} else {
return (baseIndex1 - baseIndex2);
}
}
private String getNextSequence()
throws IOException {
while (mNextSequence == null) {
if (mCurrentReader == null) {
mCurrentReader = getNextReader();
if (mCurrentReader == null) {
return null;
}
}
String line = mCurrentReader.readLine();
if (line == null) {
mCurrentReader.close();
mCurrentReader = null;
continue;
}
if (line.startsWith(">")) {
String[] tokens = line.substring(1).trim().split("\\s+");
mNextSequence = tokens[0];
}
}
String result = mNextSequence;
mNextSequence = null;
return result;
}
private LineNumberReader getNextReader()
throws IOException {
if (mInputFileIndex >= mInputFiles.size()) {
return null;
}
File file = mInputFiles.get(mInputFileIndex++);
return new LineNumberReader(new FileReader(file));
}
private char[] getNextKMer()
throws IOException {
if (mKMerBuffer == null) {
mKMerBuffer = new char[mK];
}
System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1);
if (mKMerBufferedCount > 0) {
mKMerBufferedCount--;
}
while (mKMerBufferedCount < mK) {
char base = getNextBase();
if (base == 0) {
incrementBaseIndex(mKMerBufferedCount);
mKMerBufferedCount = 0;
return null;
} else if (base == 'N') {
incrementBaseIndex(mKMerBufferedCount+1);
mKMerBufferedCount = 0;
} else {
mKMerBuffer[mKMerBufferedCount++] = base;
}
}
incrementBaseIndex(1);
return mKMerBuffer;
}
private char getNextBase()
throws IOException {
if (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) {
if (mCurrentReader == null) {
return 0;
}
String line = mCurrentReader.readLine();
if (line == null) {
mLineBuffer = null;
mLineBufferIndex = 0;
mCurrentReader.close();
mCurrentReader = null;
return 0;
}
if (line.startsWith(">")) {
String[] tokens = line.substring(1).trim().split("\\s+");
mNextSequence = tokens[0];
mLineBuffer = null;
mLineBufferIndex = 0;
return 0;
}
mLineBuffer = line.toUpperCase();
mLineBufferIndex = 0;
}
return mLineBuffer.charAt(mLineBufferIndex++);
}
private void incrementBaseIndex(int amount) {
if (mBaseIndex < -1 && (mBaseIndex + amount) >= -1) {
throw new RuntimeException("Base index: 32-bit overflow");
}
mBaseIndex += amount;
}
private void log(String text) {
if (mVerbose) {
System.out.println("# " + new Date() + " " + text);
}
}
private void debug(String text) {
if (mDebug) {
System.out.println("# " + new Date() + " " + text);
}
}
private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) {
char[] encoding = encodeKMerChars(kmerChars);
if (encoding == null) {
return null;
}
char[] reverseEncoding = encodeKMerChars(reverseComplement(kmerChars));
if (compareEncodings(encoding, reverseEncoding) <= 0) {
return new KMerPosition(encoding, baseIndex);
} else {
KMerPosition kmp = new KMerPosition(reverseEncoding, baseIndex);
kmp.setIsReversed(true);
return kmp;
}
}
private static char[] encodeKMerChars(char[] kmerChars) {
if (kmerChars == null) {
return null;
}
int kmerLength = kmerChars.length;
int encodingLength = (kmerLength + 7) / 8;
char[] encoding = new char[encodingLength];
int offset = kmerLength % 8;
offset = (offset == 0) ? 8 : offset;
int bits = encodeKMerBits(kmerChars, 0, offset);
if (bits < 0) {
return null;
}
encoding[0] = (char) bits;
for (int i = 1; i < encodingLength; i++) {
bits = encodeKMerBits(kmerChars, offset, 8);
if (bits < 0) {
return null;
}
encoding[i] = (char) bits;
offset += 8;
}
return encoding;
}
private static int compareEncodings(char[] encoding1, char[] encoding2) {
int length = Math.max(encoding1.length, encoding2.length);
for (int i = 0; i < length; i++) {
int result = encoding1[i] - encoding2[i];
if (result != 0) {
return result;
}
}
return 0;
}
private static int encodeKMerBits(char[] kmerChars, int offset, int length) {
int bits = 0;
for (int i = 0; i < length; i++) {
char base = kmerChars[offset + i];
int baseBits = "ACGT".indexOf(base);
if (baseBits < 0) {
return -1;
}
bits |= baseBits << (2*(length-i-1));
}
return bits;
}
private static String decodeKMer(char[] encoding, boolean reverse) {
int length = mK;
char[] buffer = new char[length];
int offset = length % 8;
offset = (offset == 0) ? 8 : offset;
decodeKMerBits(encoding[0], buffer, 0, offset);
for (int i = 1; i < encoding.length; i++) {
decodeKMerBits(encoding[i], buffer, offset, 8);
offset += 8;
}
if (reverse) {
reverseComplementInPlace(buffer);
}
return new String(buffer);
}
private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) {
for (int i = 0; i < length; i++) {
int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3);
buffer[offset + i] = "ACGT".charAt(baseBits);
}
}
private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) {
for (int i = 0; i < length; i++) {
int baseBits = (int) ((bits >> (2*(length-i-1))) & 0x3);
buffer[offset + i] = "ACGT".charAt(baseBits);
}
}
private static char[] reverseComplement(char[] buffer) {
int length = buffer.length;
char[] result = new char[length];
System.arraycopy(buffer, 0, result, 0, length);
reverseComplementInPlace(result);
return result;
}
private static void reverseComplementInPlace(char[] buffer) {
int length = buffer.length;
int limit = (length + 1)/2;
for (int i = 0; i < limit; i++) {
char ch1 = reverseComplement(buffer[i]);
char ch2 = reverseComplement(buffer[length-i-1]);
buffer[i] = ch2;
buffer[length-i-1] = ch1;
}
}
private static char reverseComplement(char base) {
switch (base) {
case 'A':
return 'T';
case 'C':
return 'G';
case 'G':
return 'C';
case 'T':
return 'A';
}
return base;
}
private static String formatEncoding(char[] encoding) {
if (encoding == null) {
return null;
}
StringBuilder builder = new StringBuilder();
builder.append('[');
for (int i = 0; i < encoding.length; i++) {
String hex = Integer.toHexString(encoding[i]);
int length = hex.length();
while (length < 4) {
builder.append('0');
length++;
}
builder.append(hex);
}
builder.append(']');
return builder.toString();
}
static class KMerPosition
implements Comparable<KMerPosition> {
private int mBaseIndex;
private boolean mReversed;
private char[] mKMerEncoding;
KMerPosition(char[] encoding, int baseIndex) {
mBaseIndex = baseIndex;
mReversed = false;
mKMerEncoding = encoding;
}
public final String getKMer() {
return decodeKMer(mKMerEncoding, mReversed);
}
public final boolean getIsReversed() {
return mReversed;
}
public final void setIsReversed(boolean value) {
mReversed = value;
}
public final int getBaseIndex() {
return mBaseIndex;
}
public final void setBaseIndex(int baseIndex) {
mBaseIndex = baseIndex;
}
public final char[] getKMerEncoding() {
return mKMerEncoding;
}
public int compareTo(KMerPosition kmp) {
return compareEncodings(getKMerEncoding(), kmp.getKMerEncoding());
}
public boolean equals(Object object) {
if (!(object instanceof KMerPosition)) {
return false;
}
KMerPosition kmp = (KMerPosition) object;
return (getBaseIndex() == kmp.getBaseIndex() &&
this.compareTo(kmp) == 0);
}
public String format() {
return(getKMer() +
" " + formatEncoding(getKMerEncoding()) +
" " + (mReversed ? 'R' : 'F') +
" " + Integer.toHexString(mBaseIndex));
}
}
static class StringKMerPosition
implements Comparable<StringKMerPosition> {
private String mKMerString = null;
private int mBaseIndex;
StringKMerPosition(String kmer, int baseIndex) {
mKMerString = kmer;
mBaseIndex = baseIndex;
}
public final String getKMer() {
return mKMerString;
}
public final int getBaseIndex() {
return mBaseIndex;
}
public final void setBaseIndex(int baseIndex) {
mBaseIndex = baseIndex;
}
public int compareTo(StringKMerPosition kmp) {
return mKMerString.compareTo(kmp.mKMerString);
}
public boolean equals(Object object) {
if (!(object instanceof StringKMerPosition)) {
return false;
}
StringKMerPosition kmp = (StringKMerPosition) object;
return (mBaseIndex == kmp.mBaseIndex &&
mKMerString.equals(kmp.mKMerString));
}
}
}