gatk-3.8/java/lib/edu/mit/broad/cnv/CountKMers.java

1302 lines
44 KiB
Java

/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv;
import java.io.*;
import java.util.*;
/**
* Tool for counting unique kmers.
*/
public class CountKMers
{
// Base-index value stored in a kmer record to flag that the kmer occurs more than once.
private static final int NONUNIQUE_MARKER = -1;
// True when -oldFormat was given: binary records use the one/two-long encoding.
private static boolean mUseOldFormat = false;
// Value of -action; null is handled the same as "mapKMers" by run().
private String mAction = null;
// Kmer length (-k). Static because the static decode helpers read it.
private static int mK = 0;
// Capacity of the in-memory kmer batch (-batchSize); mapKMers() rejects 0.
private int mBatchSize = 0;
// Fasta files given with -genome; stays null until the first one is parsed.
private List<File> mInputFiles = null;
// Value of -inputDir; only assigned, never read, in this file.
private File mInputDirectory = null;
// Directory holding the result, map, statistics and spill files (-outputDir).
private File mOutputDirectory = null;
// Enables the timestamped messages printed by log().
private boolean mVerbose = false;
// Set by -debug; only assigned, never read, in this file.
private boolean mDebug = false;
// Sequence names in the order they were encountered.
private List<String> mSequenceList = null;
// Parallel to mSequenceList: mBaseIndex+1 at the moment each sequence was started.
private List<Integer> mSequenceOffsetList = null;
// Spill files written so far; null until the first spill.
private List<File> mSpillFileList = null;
// A compacted batch is spilled when it still exceeds this fraction of the batch size.
private double mSpillFactor = 0.9;
// Number of kmers produced by the scan.
private long mKMerCount = 0;
// Kmers skipped because their position is already flagged in the prior map.
private long mUniquePriorCount = 0;
// Unique kmers found by this pass (encoded kmers plus exception kmers).
private long mUniqueNewCount = 0;
// Count of flagged positions found in the prior map file (set by createMapFile).
private long mPriorMapUniqueCount = 0;
// Sequentially read prior map; null when no prior map file exists.
private InputStream mPriorMapStream = null;
// Byte offset of the prior-map byte held in mPriorMapValue; -1 before the first read.
private int mPriorMapPosition = -1;
// Most recently read prior-map byte.
private int mPriorMapValue = 0;
// Index into mInputFiles of the next file to open.
private int mInputFileIndex = 0;
// Reader over the input file currently being consumed; null between files.
private LineNumberReader mCurrentReader = null;
// Name from a header line that has been read but not yet returned by getNextSequence().
private String mNextSequence = null;
// Sliding window of mK bases, reused for every kmer returned by getNextKMer().
private char[] mKMerBuffer = null;
// Number of valid bases currently held in mKMerBuffer.
private int mKMerBufferedCount = 0;
// Current input line, upper-cased; null when no line is buffered.
private String mLineBuffer = null;
// Index in mLineBuffer of the next base to hand out.
private int mLineBufferIndex = 0;
// Running base counter across all sequences; -1 before any base is counted.
// Compared and masked as an unsigned 32-bit value elsewhere in this class.
private int mBaseIndex = -1;
// Scratch buffer for reading and writing one binary kmer record.
private byte[] mIOBuffer = null;
/* Design
Inputs:
- One or more fasta files to search (currently one).
- Output directory for the result files.
- Optionally an input k-1-mer file (output from previous pass).
Outputs:
- Unique kmer file: <kmer> <chr> <pos> (sorted by kmer)
This is unique globally or unique wrt unique (K-1) mers (i.e. K unique, K-1 not).
- Per chromosome bit map: pos (implicit) new-bit cum-bit
New-bit is 1 if Kmer starting at pos is unique but (K-1)-mer is not.
Cum-bit is 1 if Kmer starting at pos is unique for some L <= K.
- Statistics
Plan:
- Reducing memory footprint is crucial.
- Sequential pass over the input sequences to generate kmers.
- BatchSize kmers are cached in memory, then sorted and uniqified.
- As batch array fills, batches are spilled to disk.
- Batches are reloaded from disk and merged (N-finger algorithm)
- and streamed to a merge file.
- Merge file is read from disk and processed as final results.
*/
/**
 * Command-line entry point: creates a tool instance and runs it with the
 * supplied arguments.
 *
 * @param args command-line arguments (see usage())
 * @throws Exception any failure raised while running the selected action
 */
public static void main(String[] args)
    throws Exception {
    CountKMers tool = new CountKMers();
    tool.run(args);
}
/**
 * Prints the command-line synopsis to standard output.
 * The list includes -oldFormat, which the argument parser accepts.
 */
private void usage() {
    System.out.println("Usage: CountKMers ...");
    System.out.println(" -action <action>");
    System.out.println(" -genome <fasta-file>");
    System.out.println(" -k <k>");
    System.out.println(" -batchSize <n>");
    System.out.println(" -inputDir <directory>");
    System.out.println(" -outputDir <directory>");
    System.out.println(" -oldFormat");
    System.out.println(" -verbose");
    System.out.println(" -debug");
}
/**
 * Parses the command line into the option fields of this tool.
 * Options that take a value are only recognised when a value follows them.
 * Any unrecognised option, option missing its value, or positional argument
 * prints the usage text and fails the parse.
 *
 * @param args command-line arguments
 * @return true when every argument was consumed as a known option
 */
private boolean parseArguments(String[] args) {
    int index = 0;
    while (index < args.length) {
        final String option = args[index];
        final boolean hasValue = (args.length - index) > 1;
        if (option.equals("-action") && hasValue) {
            mAction = args[index + 1];
            index += 2;
        } else if (option.equals("-genome") && hasValue) {
            if (mInputFiles == null) {
                mInputFiles = new ArrayList<File>();
            }
            mInputFiles.add(new File(args[index + 1]));
            index += 2;
        } else if (option.equals("-k") && hasValue) {
            mK = Integer.parseInt(args[index + 1]);
            index += 2;
        } else if (option.equals("-batchSize") && hasValue) {
            mBatchSize = Integer.parseInt(args[index + 1]);
            index += 2;
        } else if (option.equals("-inputDir") && hasValue) {
            mInputDirectory = new File(args[index + 1]);
            index += 2;
        } else if (option.equals("-outputDir") && hasValue) {
            mOutputDirectory = new File(args[index + 1]);
            index += 2;
        } else if (option.equals("-oldFormat")) {
            mUseOldFormat = true;
            index++;
        } else if (option.equals("-verbose")) {
            mVerbose = true;
            index++;
        } else if (option.equals("-debug")) {
            mDebug = true;
            index++;
        } else {
            // Unknown option, option without its value, or a stray
            // positional argument: all of these reject the command line.
            usage();
            return false;
        }
    }
    return true;
}
/**
 * Parses the arguments and dispatches to the requested action.
 * A missing action defaults to "mapKMers". An unrecognised action is
 * reported together with the usage text and ends the process with status 1
 * instead of silently doing nothing.
 *
 * @param args command-line arguments
 * @throws Exception any failure raised by the selected action
 */
private void run(String[] args)
    throws Exception {
    if (!parseArguments(args)) {
        System.exit(1);
    }
    if (mAction == null || mAction.equals("mapKMers")) {
        mapKMers();
    } else if (mAction.equals("mapGaps")) {
        mapGaps();
    } else {
        System.out.println("Unknown action: " + mAction);
        usage();
        System.exit(1);
    }
}
// Can be used to scan genome for sequence names/lengths.
/**
 * Walks every kmer of every input sequence, recording sequence names and
 * start offsets and counting kmers, without collecting the kmers themselves.
 *
 * @throws IOException if an input file or the prior map cannot be read
 */
private void scanKMers()
    throws IOException {
    mSequenceList = new ArrayList<String>();
    mSequenceOffsetList = new ArrayList<Integer>();
    String priorMapName = "unique_" + (mK-1) + "_mers_map.bin";
    openPriorMap(new File(mOutputDirectory, priorMapName));
    for (String name = getNextSequence(); name != null; name = getNextSequence()) {
        mSequenceList.add(name);
        mSequenceOffsetList.add(mBaseIndex+1);
        log("Scanning " + name + " ...");
        while (getNextKMer() != null) {
            mKMerCount++;
            // The answer is not used here, but the lookup is still made for
            // every kmer so the prior map stream is read in step with the scan.
            isUniqueInPriorMap(mBaseIndex);
        }
    }
    closePriorMap();
}
/**
 * Prints every run of 'N' bases in the input as a tab-separated line:
 * sequence name, first position, last position (1-based, inclusive).
 *
 * @throws IOException if an input file cannot be read
 */
private void mapGaps()
    throws IOException {
    while (true) {
        String seqName = getNextSequence();
        if (seqName == null) {
            break;
        }
        int pos = 0;        // 1-based position of the base just read
        int gapStart = 0;   // start of the open gap, 0 when no gap is open
        while (true) {
            char base = getNextBase();
            if (base == 0) {
                break;
            }
            pos++;
            if (base == 'N') {
                if (gapStart == 0) {
                    gapStart = pos;
                }
            } else if (gapStart > 0) {
                // The base at pos is not part of the gap, so it ended at pos-1.
                System.out.println(seqName + "\t" + gapStart + "\t" + (pos-1));
                gapStart = 0;
            }
        }
        if (gapStart > 0) {
            // The sequence ended inside a gap: the last base read (at pos)
            // was itself an 'N', so the gap ends at pos, not pos-1.
            System.out.println(seqName + "\t" + gapStart + "\t" + pos);
        }
    }
}
/**
 * Main pass: enumerates all kmers of the input, skips positions already
 * flagged in the (K-1) map, collects the rest in sorted batches (spilling to
 * disk when a batch stays nearly full), and then writes the binary kmer file,
 * the exception file, the text kmer file, the new map and the statistics.
 *
 * @throws IOException if any input or output file operation fails
 */
private void mapKMers()
    throws IOException {
    File textKMerFile =
        new File(mOutputDirectory, "unique_" + mK + "_mers.txt");
    File binaryKMerFile =
        new File(mOutputDirectory, "unique_" + mK + "_mers.bin");
    File exceptionFile =
        new File(mOutputDirectory, "unique_" + mK + "_mers.extra");
    File mapFile =
        new File(mOutputDirectory, "unique_" + mK + "_mers_map.bin");
    File priorMapFile =
        new File(mOutputDirectory, "unique_" + (mK-1) + "_mers_map.bin");
    File statsFile =
        new File(mOutputDirectory, "unique_" + mK + "_mers_stats.txt");
    if (mBatchSize == 0) {
        throw new RuntimeException("Batch size not specified");
    }

    final int capacity = mBatchSize;
    int filled = 0;
    KMerPosition[] batch = new KMerPosition[capacity];
    // Kmers that cannot be bit-encoded are kept as strings.
    List<StringKMerPosition> exceptionList = new ArrayList<StringKMerPosition>();
    mSequenceList = new ArrayList<String>();
    mSequenceOffsetList = new ArrayList<Integer>();
    mIOBuffer = new byte[Math.max(20,4 + 2*((mK + 7)/8))];

    openPriorMap(priorMapFile);
    for (String seqName = getNextSequence(); seqName != null; seqName = getNextSequence()) {
        mSequenceList.add(seqName);
        mSequenceOffsetList.add(mBaseIndex+1);
        log("Processing " + seqName + " ...");
        for (char[] kmerChars = getNextKMer(); kmerChars != null; kmerChars = getNextKMer()) {
            mKMerCount++;
            int baseIndex = mBaseIndex;
            if (isUniqueInPriorMap(baseIndex)) {
                mUniquePriorCount++;
                continue;
            }
            KMerPosition encoded = encodeKMer(kmerChars, baseIndex);
            if (encoded == null) {
                exceptionList.add(new StringKMerPosition(new String(kmerChars), baseIndex));
                continue;
            }
            batch[filled++] = encoded;
            if (filled < capacity) {
                continue;
            }
            // Batch is full: squeeze out duplicates, and spill only if that
            // did not free enough room.
            filled = compactKMers(batch, filled);
            if (filled > mSpillFactor * capacity) {
                spillKMers(batch, filled);
                filled = 0;
            }
        }
    }
    if (filled > 0) {
        filled = compactKMers(batch, filled);
        if (mSpillFileList != null) {
            // Earlier batches are on disk, so the tail must join them.
            spillKMers(batch, filled);
            filled = 0;
        }
    }
    closePriorMap();

    // Write out the exception kmers (text file).
    compactKMers(exceptionList);
    writeExceptionFile(exceptionList, exceptionFile);

    // Write out the binary file of unique encoded kmers.
    if (mSpillFileList != null) {
        mUniqueNewCount = mergeSpillFiles(mSpillFileList, binaryKMerFile);
    } else {
        filled = removeNonUnique(batch, filled);
        writeKMerBinaryFile(batch, filled, binaryKMerFile);
        mUniqueNewCount = filled;
    }
    mUniqueNewCount += countUniqueKMers(exceptionList);

    // Write out the text file of (all) unique kmers.
    writeKMerTextFile(binaryKMerFile, exceptionList, textKMerFile);

    // Create map file from prior map plus the new unique kmers.
    // One map byte covers four base positions.
    int mapSize = (mBaseIndex >>> 2) + 1;
    createMapFile(mapSize, binaryKMerFile, exceptionList, priorMapFile, mapFile);

    // Write summary statistics file.
    writeSummaryStatistics(statsFile);
}
/**
 * Sorts the first kmerCount entries and collapses runs of equal kmers to a
 * single entry whose base index is set to NONUNIQUE_MARKER.
 *
 * @param kmerArray batch to compact in place
 * @param kmerCount number of valid entries at the front of the array
 * @return number of entries remaining at the front of the array
 */
private int compactKMers(KMerPosition[] kmerArray, int kmerCount) {
    if (kmerCount == 0) {
        return 0;
    }
    log("Compacting " + kmerCount + " kmers at index " +
        Integer.toHexString(mBaseIndex) + " ...");
    Arrays.sort(kmerArray, 0, kmerCount);
    // kept is the index of the last entry retained so far.
    int kept = 0;
    for (int i = 1; i < kmerCount; i++) {
        KMerPosition candidate = kmerArray[i];
        KMerPosition last = kmerArray[kept];
        if (last.compareTo(candidate) != 0) {
            kmerArray[++kept] = candidate;
        } else {
            last.setBaseIndex(NONUNIQUE_MARKER);
        }
    }
    int newCount = kept + 1;
    log("Compaction finished, new count is " + newCount);
    return newCount;
}
/**
 * String-kmer variant of compaction: sorts the first kmerCount entries and
 * collapses runs with the same kmer text to one entry flagged with
 * NONUNIQUE_MARKER.
 *
 * @param kmerArray entries to compact in place
 * @param kmerCount number of valid entries at the front of the array
 * @return number of entries remaining at the front of the array
 */
private int compactKMers(StringKMerPosition[] kmerArray, int kmerCount) {
    if (kmerCount == 0) {
        return 0;
    }
    log("Compacting " + kmerCount + " string kmers ...");
    Arrays.sort(kmerArray, 0, kmerCount);
    // kept is the index of the last entry retained so far.
    int kept = 0;
    for (int i = 1; i < kmerCount; i++) {
        StringKMerPosition candidate = kmerArray[i];
        String candidateText = candidate.getKMer();
        if (candidateText.equals(kmerArray[kept].getKMer())) {
            kmerArray[kept].setBaseIndex(NONUNIQUE_MARKER);
        } else {
            kmerArray[++kept] = candidate;
        }
    }
    int newCount = kept + 1;
    log("Compaction finished, new count is " + newCount);
    return newCount;
}
/**
 * Compacts a list of string kmers in place by delegating to the array
 * variant. Lists with fewer than two entries are left untouched.
 *
 * @param kmerList list to sort and de-duplicate
 */
private void compactKMers(List<StringKMerPosition> kmerList) {
    if (kmerList.size() <= 1) {
        return;
    }
    StringKMerPosition[] scratch =
        kmerList.toArray(new StringKMerPosition[kmerList.size()]);
    int survivors = compactKMers(scratch, scratch.length);
    kmerList.clear();
    kmerList.addAll(Arrays.asList(scratch).subList(0, survivors));
}
/**
 * Moves every entry not flagged with NONUNIQUE_MARKER to the front of the
 * array, preserving order.
 *
 * @param kmerArray batch to filter in place
 * @param kmerCount number of valid entries at the front of the array
 * @return number of unflagged entries now at the front of the array
 */
private int removeNonUnique(KMerPosition[] kmerArray, int kmerCount) {
    int kept = 0;
    for (int i = 0; i < kmerCount; i++) {
        KMerPosition entry = kmerArray[i];
        if (entry.getBaseIndex() == NONUNIQUE_MARKER) {
            continue;
        }
        kmerArray[kept] = entry;
        kept++;
    }
    return kept;
}
/**
 * Counts the entries of the list that are not flagged with NONUNIQUE_MARKER.
 *
 * @param kmerList string kmers to inspect
 * @return number of unflagged entries
 */
private int countUniqueKMers(List<StringKMerPosition> kmerList) {
    int total = 0;
    Iterator<StringKMerPosition> entries = kmerList.iterator();
    while (entries.hasNext()) {
        if (entries.next().getBaseIndex() != NONUNIQUE_MARKER) {
            total++;
        }
    }
    return total;
}
/**
 * Writes the current batch to a new numbered spill file in the output
 * directory and records that file in the spill list.
 *
 * @param kmerArray batch to write
 * @param kmerCount number of valid entries at the front of the array
 * @throws IOException if the spill file cannot be written
 */
private void spillKMers(KMerPosition[] kmerArray, int kmerCount)
    throws IOException {
    if (mSpillFileList == null) {
        mSpillFileList = new ArrayList<File>();
    }
    // Spill files are numbered from 1 in creation order.
    final int sequenceNumber = mSpillFileList.size() + 1;
    log("Spilling " + kmerCount + " kmers to file " + sequenceNumber + " ...");
    String fileName = "spill_" + mK + "_" + sequenceNumber + ".tmp";
    File target = new File(mOutputDirectory, fileName);
    mSpillFileList.add(target);
    writeKMerBinaryFile(kmerArray, kmerCount, target);
    log("Spill file written");
}
/**
 * Writes the first kmerCount entries of the array to a binary kmer file.
 * The file is closed even when writing a record fails.
 *
 * @param kmerArray entries to write
 * @param kmerCount number of valid entries at the front of the array
 * @param outputFile file to create or overwrite
 * @throws IOException if the file cannot be opened or written
 */
private void writeKMerBinaryFile(KMerPosition[] kmerArray,
                                 int kmerCount,
                                 File outputFile)
    throws IOException {
    OutputStream outputStream =
        new BufferedOutputStream(new FileOutputStream(outputFile));
    try {
        for (int i = 0; i < kmerCount; i++) {
            writeKMerPosition(outputStream, kmerArray[i]);
        }
        outputStream.flush();
    } finally {
        outputStream.close();
    }
}
/**
 * Writes the unflagged string kmers of the list to a text file, one per line.
 * The writer is closed even on failure, and a write error recorded by the
 * PrintWriter is reported as an IOException instead of being dropped.
 *
 * @param kmerList string kmers to write
 * @param outputFile file to create or overwrite
 * @throws IOException if the file cannot be opened or written
 */
private void writeExceptionFile(List<StringKMerPosition> kmerList,
                                File outputFile)
    throws IOException {
    PrintWriter writer =
        new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
    try {
        for (StringKMerPosition kmer : kmerList) {
            writeUniqueKMer(kmer, writer);
        }
        // PrintWriter never throws on write; checkError() flushes and
        // reports whether any write failed.
        if (writer.checkError()) {
            throw new IOException("Error writing file: " + outputFile);
        }
    } finally {
        writer.close();
    }
}
/**
 * Reads one kmer record from a binary kmer stream. In the current format a
 * record is a 4-byte little-endian base index followed by (mK+7)/8
 * little-endian 16-bit encoding words.
 *
 * @param stream stream positioned at a record boundary
 * @return the decoded record, or null at end of stream
 * @throws IOException if the stream cannot be read
 * @throws RuntimeException if the stream ends in the middle of a record
 */
private KMerPosition readKMerPosition(InputStream stream)
    throws IOException {
    if (mUseOldFormat) {
        return readKMerPositionOldFormat(stream);
    }
    final byte[] record = mIOBuffer;
    final int wordCount = (mK + 7)/8;
    final int recordLength = 4 + 2*wordCount;
    int got = readFully(stream, record, 0, recordLength);
    if (got <= 0) {
        return null;
    }
    if (got != recordLength) {
        throw new RuntimeException("Unexpected end of file");
    }
    // Assemble the little-endian base index from its most significant byte down.
    int baseIndex = 0;
    for (int i = 3; i >= 0; i--) {
        baseIndex = (baseIndex << 8) | (record[i] & 0xFF);
    }
    char[] encoding = new char[wordCount];
    for (int w = 0, at = 4; w < wordCount; w++, at += 2) {
        int low = record[at] & 0xFF;
        int high = record[at + 1] & 0xFF;
        encoding[w] = (char) (low | (high << 8));
    }
    return new KMerPositionN(encoding, baseIndex);
}
/**
 * Reads one record in the old format: one little-endian long (two when
 * mK >= 32) followed by a 4-byte little-endian base index.
 *
 * @param stream stream positioned at a record boundary
 * @return the decoded record, or null at end of stream
 * @throws IOException if the stream cannot be read
 * @throws RuntimeException if the stream ends in the middle of a record
 */
private KMerPosition readKMerPositionOldFormat(InputStream stream)
    throws IOException {
    final byte[] record = mIOBuffer;
    final int recordLength = (mK >= 32) ? 20 : 12;
    int got = readFully(stream, record, 0, recordLength);
    if (got <= 0) {
        return null;
    }
    if (got != recordLength) {
        throw new RuntimeException("Unexpected end of file");
    }
    // Each field is little-endian, so fold bytes from the highest index down.
    long encoding = 0;
    for (int i = 7; i >= 0; i--) {
        encoding = (encoding << 8) | (record[i] & 0xFFL);
    }
    int baseIndex = 0;
    for (int i = recordLength - 1; i >= recordLength - 4; i--) {
        baseIndex = (baseIndex << 8) | (record[i] & 0xFF);
    }
    if (recordLength == 12) {
        return new KMerPosition1(encoding, baseIndex);
    }
    long encoding2 = 0;
    for (int i = 15; i >= 8; i--) {
        encoding2 = (encoding2 << 8) | (record[i] & 0xFFL);
    }
    return new KMerPosition2(encoding, encoding2, baseIndex);
}
/**
 * Reads up to count bytes into the buffer, repeating short reads until the
 * request is satisfied or the stream stops delivering data.
 *
 * @param stream source stream
 * @param buffer destination buffer
 * @param offset index in buffer of the first byte to store
 * @param count number of bytes wanted
 * @return number of bytes actually read, possibly fewer than count
 * @throws IOException if the stream cannot be read
 */
private int readFully(InputStream stream, byte[] buffer, int offset, int count)
    throws IOException {
    int total = 0;
    int position = offset;
    int chunk;
    while (total < count
           && (chunk = stream.read(buffer, position, count - total)) > 0) {
        position += chunk;
        total += chunk;
    }
    return total;
}
/**
 * Writes one kmer record: a 4-byte little-endian base index followed by each
 * 16-bit encoding word in little-endian order. Delegates to the old-format
 * writer when that format is selected.
 *
 * @param stream destination stream
 * @param kmer record to write
 * @throws IOException if the stream cannot be written
 */
private void writeKMerPosition(OutputStream stream, KMerPosition kmer)
    throws IOException {
    if (mUseOldFormat) {
        writeKMerPositionOldFormat(stream, kmer);
        return;
    }
    final byte[] record = mIOBuffer;
    final int baseIndex = kmer.getBaseIndex();
    final char[] encoding = kmer.getKMerEncoding();
    int at = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        // The narrowing cast keeps only the low eight bits.
        record[at++] = (byte) (baseIndex >> shift);
    }
    for (char word : encoding) {
        record[at++] = (byte) word;
        record[at++] = (byte) (word >> 8);
    }
    stream.write(record, 0, at);
}
/**
 * Writes one record in the old format: the first encoding long, the second
 * encoding long when mK >= 32, then the base index, all little-endian.
 *
 * @param stream destination stream
 * @param kmer record to write
 * @throws IOException if the stream cannot be written
 */
private void writeKMerPositionOldFormat(OutputStream stream, KMerPosition kmer)
    throws IOException {
    final byte[] record = mIOBuffer;
    final long first = kmer.getKMerEncoding1();
    final long second = kmer.getKMerEncoding2();
    final int baseIndex = kmer.getBaseIndex();
    int at = 0;
    // The narrowing casts keep only the low eight bits of each shifted value.
    for (int shift = 0; shift < 64; shift += 8) {
        record[at++] = (byte) (first >> shift);
    }
    if (mK >= 32) {
        for (int shift = 0; shift < 64; shift += 8) {
            record[at++] = (byte) (second >> shift);
        }
    }
    for (int shift = 0; shift < 32; shift += 8) {
        record[at++] = (byte) (baseIndex >> shift);
    }
    stream.write(record, 0, at);
}
/**
 * Merges the sorted spill files into one sorted binary kmer file containing
 * only kmers that are unique across all spill files.
 *
 * Each round looks at the head record of every file, picks the smallest
 * kmer, and counts how many heads carry it. All heads equal to that kmer are
 * consumed together, so a kmer present in several spill files is dropped
 * rather than surfacing again in a later round as if it were unique.
 * The kmer is written only when exactly one head carried it and that record
 * is not flagged with NONUNIQUE_MARKER.
 *
 * The spill files are left on disk.
 *
 * @param spillFiles sorted spill files to merge; null yields 0
 * @param outputFile merged file to create or overwrite
 * @return number of kmers written
 * @throws IOException if a file cannot be opened, read or written
 */
private long mergeSpillFiles(List<File> spillFiles, File outputFile)
    throws IOException {
    if (spillFiles == null) {
        return 0;
    }
    log("Merging spill files ...");
    long uniqueCount = 0;
    int fileCount = spillFiles.size();
    InputStream[] inputStreams = new InputStream[fileCount];
    KMerPosition[] kmers = new KMerPosition[fileCount];
    OutputStream outputStream =
        new BufferedOutputStream(new FileOutputStream(outputFile));
    try {
        for (int i = 0; i < fileCount; i++) {
            inputStreams[i] =
                new BufferedInputStream(new FileInputStream(spillFiles.get(i)));
        }
        while (true) {
            // Refill the head slot of every file that was consumed last round.
            for (int i = 0; i < fileCount; i++) {
                if (kmers[i] == null && inputStreams[i] != null) {
                    kmers[i] = readKMerPosition(inputStreams[i]);
                    if (kmers[i] == null) {
                        inputStreams[i].close();
                        inputStreams[i] = null;
                    }
                }
            }
            // Find the smallest head and how many heads are equal to it.
            int count = 0;
            KMerPosition kmer = null;
            for (int i = 0; i < fileCount; i++) {
                KMerPosition kmp = kmers[i];
                if (kmp == null) {
                    continue;
                } else if (kmer == null) {
                    kmer = kmp;
                    count = 1;
                } else {
                    int cmp = kmp.compareTo(kmer);
                    if (cmp == 0) {
                        count++;
                    } else if (cmp < 0) {
                        kmer = kmp;
                        count = 1;
                    }
                }
            }
            if (kmer == null) {
                break;
            }
            // Consume every head that holds this kmer, not just the one
            // object selected above.
            for (int i = 0; i < fileCount; i++) {
                if (kmers[i] != null && kmers[i].compareTo(kmer) == 0) {
                    kmers[i] = null;
                }
            }
            if (count == 1 && kmer.getBaseIndex() != NONUNIQUE_MARKER) {
                uniqueCount++;
                writeKMerPosition(outputStream, kmer);
            }
        }
        outputStream.flush();
    } finally {
        for (int i = 0; i < fileCount; i++) {
            if (inputStreams[i] != null) {
                try {
                    inputStreams[i].close();
                } catch (IOException ignored) {
                    // Only reached while unwinding from another failure;
                    // a failed close of a read-only stream loses no data.
                }
            }
        }
        outputStream.close();
    }
    log("Spill files merged, unique count is " + uniqueCount);
    return uniqueCount;
}
/**
 * Writes the text kmer file by merging the binary kmer file with the
 * exception list, both taken in their existing order. At each step the
 * record with the smaller kmer text is written; on a tie the exception
 * entry goes first. Entries flagged with NONUNIQUE_MARKER are skipped by
 * the per-record writers.
 *
 * Both files are closed even on failure, and a write error recorded by the
 * PrintWriter is reported as an IOException.
 *
 * @param inputFile binary kmer file to read
 * @param exceptionList string kmers to merge in
 * @param outputFile text file to create or overwrite
 * @throws IOException if a file cannot be opened, read or written
 */
private void writeKMerTextFile(File inputFile,
                               List<StringKMerPosition> exceptionList,
                               File outputFile)
    throws IOException {
    log("Writing kmer file " + outputFile + " ...");
    StringKMerPosition excKMer = null;
    Iterator<StringKMerPosition> excIter = null;
    if (!exceptionList.isEmpty()) {
        excIter = exceptionList.iterator();
        excKMer = excIter.next();
    }
    InputStream inputStream =
        new BufferedInputStream(new FileInputStream(inputFile));
    try {
        PrintWriter writer =
            new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
        try {
            KMerPosition kmer = readKMerPosition(inputStream);
            while (kmer != null || excKMer != null) {
                boolean takeEncoded =
                    (excKMer == null) ||
                    (kmer != null &&
                     kmer.getKMer().compareTo(excKMer.getKMer()) < 0);
                if (takeEncoded) {
                    writeUniqueKMer(kmer, writer);
                    kmer = readKMerPosition(inputStream);
                } else {
                    writeUniqueKMer(excKMer, writer);
                    excKMer = excIter.hasNext() ? excIter.next() : null;
                }
            }
            // PrintWriter never throws on write; checkError() flushes and
            // reports whether any write failed.
            if (writer.checkError()) {
                throw new IOException("Error writing file: " + outputFile);
            }
        } finally {
            writer.close();
        }
    } finally {
        inputStream.close();
    }
    log("Wrote kmer file: " + outputFile);
}
/**
 * Writes an encoded kmer as a text line unless it is flagged with
 * NONUNIQUE_MARKER.
 *
 * @param kmer record to write
 * @param writer destination
 */
private void writeUniqueKMer(KMerPosition kmer, PrintWriter writer) {
    if (kmer.getBaseIndex() == NONUNIQUE_MARKER) {
        return;
    }
    writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer);
}
/**
 * Writes a string kmer as a text line unless it is flagged with
 * NONUNIQUE_MARKER.
 *
 * @param kmer record to write
 * @param writer destination
 */
private void writeUniqueKMer(StringKMerPosition kmer, PrintWriter writer) {
    if (kmer.getBaseIndex() == NONUNIQUE_MARKER) {
        return;
    }
    writeKMer(kmer.getKMer(), kmer.getBaseIndex(), writer);
}
/**
 * Writes one tab-separated line: kmer text, sequence name, coordinate within
 * that sequence.
 *
 * @param kmer kmer text
 * @param baseIndex global base index of the kmer
 * @param writer destination
 */
private void writeKMer(String kmer, int baseIndex, PrintWriter writer) {
    writer.println(kmer +
                   "\t" + getBaseIndexSequenceName(baseIndex) +
                   "\t" + getBaseIndexCoordinate(baseIndex));
}
/**
 * Builds and writes the position map for this K. Each map byte holds four
 * 2-bit entries. Starting from the prior map (keeping only the low bit of
 * each entry, mask 0x55) or from an empty map, both bits are set for the
 * position of every unflagged kmer in the exception list and in the binary
 * kmer file.
 *
 * Also stores the number of set bits kept from the prior map in
 * mPriorMapUniqueCount. The kmer file is closed even on failure.
 *
 * @param mapSize expected map length in bytes
 * @param kmerFile binary file of unique kmers from this pass
 * @param exceptionList string kmers from this pass
 * @param priorMapFile map written by the (K-1) pass; may not exist
 * @param mapFile map file to create or overwrite
 * @throws IOException if a file cannot be read or written
 * @throws RuntimeException if the prior map has a different length, or a
 *         position is already set when it is added
 */
private void createMapFile(int mapSize,
                           File kmerFile,
                           List<StringKMerPosition> exceptionList,
                           File priorMapFile,
                           File mapFile)
    throws IOException {
    byte[] map = null;
    long uniquePriorCount = 0;
    if (priorMapFile.exists()) {
        map = readMapFile(priorMapFile);
        if (map.length != mapSize) {
            throw new RuntimeException("Prior map is wrong size");
        }
        // Clear the new bits from prior map.
        // Also count the prior unique positions while we are at it.
        // Note that this is a count of positions, not kmers.
        for (int i = 0; i < mapSize; i++) {
            int cumBits = map[i] & 0x55;
            uniquePriorCount += Integer.bitCount(cumBits);
            map[i] = (byte) cumBits;
        }
    } else {
        map = new byte[mapSize];
    }
    for (StringKMerPosition kmp : exceptionList) {
        addToMap(kmp, map);
    }
    mPriorMapUniqueCount = uniquePriorCount;
    InputStream inputStream =
        new BufferedInputStream(new FileInputStream(kmerFile));
    try {
        for (KMerPosition kmp = readKMerPosition(inputStream);
             kmp != null;
             kmp = readKMerPosition(inputStream)) {
            addToMap(kmp, map);
        }
    } finally {
        inputStream.close();
    }
    writeMapFile(map, mapFile);
}
/**
 * Marks the position of an encoded kmer in the map unless the kmer is
 * flagged with NONUNIQUE_MARKER.
 *
 * @param kmp kmer record
 * @param map map to update
 */
private void addToMap(KMerPosition kmp, byte[] map) {
    final int position = kmp.getBaseIndex();
    if (position == NONUNIQUE_MARKER) {
        return;
    }
    addToMap(position, map);
}
/**
 * Marks the position of a string kmer in the map unless the kmer is flagged
 * with NONUNIQUE_MARKER.
 *
 * @param kmp kmer record
 * @param map map to update
 */
private void addToMap(StringKMerPosition kmp, byte[] map) {
    final int position = kmp.getBaseIndex();
    if (position == NONUNIQUE_MARKER) {
        return;
    }
    addToMap(position, map);
}
/**
 * Sets both bits of the 2-bit map entry for a base index. Four entries share
 * one byte; the entry for index i lives in byte i/4 at bit offset 2*(i%4),
 * with the index taken as unsigned.
 *
 * @param baseIndex global base index to mark
 * @param map map to update
 * @throws RuntimeException if either bit of the entry is already set
 */
private void addToMap(int baseIndex, byte[] map) {
    final int slot = baseIndex >>> 2;
    final int mask = 0x3 << (2 * (baseIndex & 0x3));
    if ((map[slot] & mask) != 0) {
        throw new RuntimeException("Map entry already set: " + baseIndex);
    }
    map[slot] |= mask;
}
/**
 * Writes the run's counters to a text file, one "label: value" line each,
 * with percentages expressed relative to the total kmer count.
 *
 * @param outputFile statistics file to create or overwrite
 * @throws IOException if the file cannot be opened
 */
private void writeSummaryStatistics(File outputFile)
    throws IOException {
    PrintWriter out =
        new PrintWriter(new BufferedWriter(new FileWriter(outputFile)));
    final long total = mKMerCount;
    // mBaseIndex is an unsigned 32-bit counter; widen before reporting.
    final long bases = (mBaseIndex + 1) & 0xFFFFFFFFL;
    final long unique = mUniquePriorCount + mUniqueNewCount;
    final long nonUnique = total - unique;
    out.println("K: " + mK);
    out.println("Sequences: " + mSequenceList.size());
    out.println("Bases: " + bases);
    out.println("KMers: " + total);
    out.println("Prior map count: " + mPriorMapUniqueCount);
    out.println("Unique prior: " + mUniquePriorCount +
                " (" + formatPercent(mUniquePriorCount, total) + ")");
    out.println("Unique new: " + mUniqueNewCount +
                " (" + formatPercent(mUniqueNewCount, total) + ")");
    out.println("Unique cumulative: " + unique +
                " (" + formatPercent(unique, total) + ")");
    out.println("Nonunique: " + nonUnique +
                " (" + formatPercent(nonUnique, total) + ")");
    out.flush();
    out.close();
}
/**
 * Formats a ratio as a percentage with one decimal place. A zero
 * denominator yields zero percent.
 *
 * @param numerator part
 * @param denominator whole
 * @return the percentage followed by a percent sign
 */
private String formatPercent(long numerator, long denominator) {
    double percent = (denominator == 0)
        ? 0.0
        : (numerator / (double) denominator) * 100.0;
    return String.format("%1.1f%%", percent);
}
/**
 * Opens the prior map for sequential reading when the file exists; otherwise
 * leaves the prior-map state untouched.
 *
 * @param mapFile prior map file, which may be absent
 * @throws IOException if an existing file cannot be opened
 */
private void openPriorMap(File mapFile)
    throws IOException {
    if (!mapFile.exists()) {
        return;
    }
    mPriorMapStream = new BufferedInputStream(new FileInputStream(mapFile));
    mPriorMapPosition = -1;
    mPriorMapValue = 0;
}
/**
 * Closes the prior map stream if one is open and resets the prior-map
 * read state.
 *
 * @throws IOException if closing the stream fails
 */
private void closePriorMap()
    throws IOException {
    InputStream open = mPriorMapStream;
    if (open != null) {
        open.close();
    }
    mPriorMapStream = null;
    mPriorMapPosition = -1;
    mPriorMapValue = 0;
}
/**
 * Loads an entire map file into memory. The stream is closed on every path,
 * including a failed or short read.
 *
 * @param file map file to read
 * @return the file's bytes
 * @throws IOException if the file cannot be opened or read
 * @throws RuntimeException if the file is larger than 1,000,000,000 bytes or
 *         fewer bytes than its length could be read
 */
private byte[] readMapFile(File file)
    throws IOException {
    long fileLength = file.length();
    if (fileLength > 1000000000) {
        throw new RuntimeException("Prior map too large: " + file);
    }
    int length = (int) fileLength;
    byte[] map = new byte[length];
    FileInputStream stream = new FileInputStream(file);
    try {
        int count = readFully(stream, map, 0, length);
        if (count != length) {
            throw new RuntimeException("Failed to read map: " + file);
        }
    } finally {
        stream.close();
    }
    return map;
}
/**
 * Writes the map bytes to a file. The stream is closed even when the write
 * fails.
 *
 * @param map bytes to write
 * @param file file to create or overwrite
 * @throws IOException if the file cannot be opened or written
 */
private void writeMapFile(byte[] map, File file)
    throws IOException {
    FileOutputStream stream = new FileOutputStream(file);
    try {
        stream.write(map);
        stream.flush();
    } finally {
        stream.close();
    }
}
/**
 * Tests the low bit of the prior-map entry for a base index. The prior map
 * is read forward only: the stream is advanced to the byte that holds the
 * entry, and that byte is cached for the other entries it contains.
 *
 * @param baseIndex global base index to look up
 * @return true when the entry's low bit is set; false when no prior map is open
 * @throws IOException if the prior map cannot be read
 * @throws RuntimeException if the lookup would move backwards or past the
 *         end of the prior map
 */
private boolean isUniqueInPriorMap(int baseIndex)
    throws IOException {
    if (mPriorMapStream == null) {
        return false;
    }
    final int wanted = baseIndex >>> 2;
    final int delta = wanted - mPriorMapPosition;
    if (delta != 0) {
        if (delta < 0) {
            throw new RuntimeException("Attempt to seek backwards in prior map");
        }
        if (delta > 1) {
            // Skip the bytes in between; the read below consumes the last one.
            skipFully(mPriorMapStream, delta-1);
        }
        mPriorMapValue = mPriorMapStream.read();
        if (mPriorMapValue < 0) {
            throw new RuntimeException("Unexpected end of file in prior map");
        }
        mPriorMapPosition += delta;
    }
    final int shift = 2 * (baseIndex & 0x3);
    return ((mPriorMapValue >> shift) & 1) != 0;
}
/**
 * Skips exactly the requested number of bytes, repeating partial skips.
 *
 * @param stream stream to advance
 * @param amount number of bytes to skip
 * @throws IOException if the stream fails while skipping
 * @throws RuntimeException if a skip makes no progress or overshoots
 */
private void skipFully(InputStream stream, long amount)
    throws IOException {
    for (long remaining = amount; remaining > 0; ) {
        long skipped = stream.skip(remaining);
        if (skipped <= 0 || skipped > remaining) {
            throw new RuntimeException("Skip failed");
        }
        remaining -= skipped;
    }
}
/**
 * Finds the name of the sequence containing a base index: the first
 * sequence whose successor starts after the index, or the last sequence.
 *
 * @param baseIndex global base index
 * @return name of the containing sequence
 */
private String getBaseIndexSequenceName(int baseIndex) {
    final int last = mSequenceList.size() - 1;
    for (int i = 0; i < last; i++) {
        int followingStart = mSequenceOffsetList.get(i + 1);
        if (compareBaseIndex(followingStart, baseIndex) > 0) {
            return mSequenceList.get(i);
        }
    }
    return mSequenceList.get(last);
}
/**
 * Converts a global base index to a 1-based coordinate within its sequence,
 * using the last sequence offset that is not after the index.
 *
 * @param baseIndex global base index
 * @return 1-based coordinate, or 0 when no sequence starts at or before the index
 * @throws RuntimeException if the computed coordinate is not positive
 */
private int getBaseIndexCoordinate(int baseIndex) {
    Integer start = null;
    final int offsetCount = mSequenceOffsetList.size();
    for (int i = 0; i < offsetCount; i++) {
        Integer candidate = mSequenceOffsetList.get(i);
        if (compareBaseIndex(candidate, baseIndex) > 0) {
            break;
        }
        start = candidate;
    }
    if (start == null) {
        return 0;
    }
    final int coordinate = baseIndex - start + 1;
    if (coordinate > 0) {
        return coordinate;
    }
    // Should not happen; dump the state needed to diagnose it.
    dumpSequenceList();
    System.out.println("coordinate: " + coordinate);
    System.out.println("sequenceOffset: " + Integer.toHexString(start));
    System.out.println("baseIndex: " + Integer.toHexString(baseIndex));
    throw new RuntimeException("Internal error: illegal coordinate " +
                               coordinate + " for base index " + baseIndex);
}
/**
 * Prints every known sequence with its start offset in decimal and
 * hexadecimal, for diagnostics.
 */
private void dumpSequenceList() {
    System.out.println("# Sequences:");
    for (int i = 0, n = mSequenceList.size(); i < n; i++) {
        String name = mSequenceList.get(i);
        int start = mSequenceOffsetList.get(i);
        String hex = Integer.toHexString(start);
        System.out.println("# " + name + "\t" + start + "\t" + hex);
    }
}
/**
 * Compares two base indexes as unsigned 32-bit values, in the manner of
 * compareTo.
 *
 * @param baseIndex1 first index
 * @param baseIndex2 second index
 * @return negative, zero or positive as the first is below, equal to or
 *         above the second
 */
private int compareBaseIndex(int baseIndex1, int baseIndex2) {
    final boolean firstInUpperHalf = baseIndex1 < 0;
    final boolean secondInUpperHalf = baseIndex2 < 0;
    if (firstInUpperHalf == secondInUpperHalf) {
        // Same sign: the plain difference cannot overflow.
        return baseIndex1 - baseIndex2;
    }
    // Different signs: the negative one is the larger unsigned value.
    return firstInUpperHalf ? 1 : -1;
}
/**
 * Returns the name of the next sequence in the input. If a header was
 * already seen while bases were being read, that pending name is returned;
 * otherwise lines are read (moving on to the next input file as needed)
 * until a header line starting with '>' is found. The name is the first
 * whitespace-delimited token after the '>'.
 *
 * @return the sequence name, or null when all input is exhausted
 * @throws IOException if an input file cannot be opened or read
 */
private String getNextSequence()
    throws IOException {
    while (mNextSequence == null) {
        if (mCurrentReader == null
            && (mCurrentReader = getNextReader()) == null) {
            return null;
        }
        String line = mCurrentReader.readLine();
        if (line == null) {
            // End of this file; the next pass opens the following one.
            mCurrentReader.close();
            mCurrentReader = null;
        } else if (line.startsWith(">")) {
            mNextSequence = line.substring(1).trim().split("\\s+")[0];
        }
    }
    String pending = mNextSequence;
    mNextSequence = null;
    return pending;
}
/**
 * Opens a reader on the next input file.
 *
 * @return a reader on the next file, or null when every file has been used
 * @throws IOException if the file cannot be opened
 * @throws RuntimeException if no input file was specified at all, which
 *         would otherwise surface as a bare NullPointerException
 */
private LineNumberReader getNextReader()
    throws IOException {
    if (mInputFiles == null) {
        throw new RuntimeException("No genome files specified");
    }
    if (mInputFileIndex >= mInputFiles.size()) {
        return null;
    }
    File file = mInputFiles.get(mInputFileIndex++);
    return new LineNumberReader(new FileReader(file));
}
/**
 * Returns the next kmer of the current sequence, or null when the sequence
 * has no more bases.
 *
 * The returned array is the shared window buffer, which is overwritten by
 * the next call. After a kmer is returned, mBaseIndex has been advanced by
 * one for that kmer; runs containing 'N' and the tail of a sequence advance
 * it by the number of bases they discard.
 */
private char[] getNextKMer()
throws IOException {
if (mKMerBuffer == null) {
mKMerBuffer = new char[mK];
}
// Slide the window left by one base, dropping the oldest one.
System.arraycopy(mKMerBuffer, 1, mKMerBuffer, 0, mKMerBuffer.length - 1);
if (mKMerBufferedCount > 0) {
mKMerBufferedCount--;
}
// Refill the window until it holds mK bases.
while (mKMerBufferedCount < mK) {
char base = getNextBase();
if (base == 0) {
// No more bases: count the bases still in the window and reset it.
incrementBaseIndex(mKMerBufferedCount);
mKMerBufferedCount = 0;
return null;
} else if (base == 'N') {
// An 'N' discards the window: count its bases plus the 'N' itself.
incrementBaseIndex(mKMerBufferedCount+1);
mKMerBufferedCount = 0;
} else {
mKMerBuffer[mKMerBufferedCount++] = base;
}
}
incrementBaseIndex(1);
return mKMerBuffer;
}
/**
 * Returns the next base of the current sequence, upper-cased.
 *
 * Lines are read on demand. Empty lines are skipped; previously an empty
 * line was buffered and then indexed at position 0, which threw
 * StringIndexOutOfBoundsException. A header line ends the current sequence
 * and its name is kept for the next getNextSequence() call.
 *
 * @return the next base, or 0 at the end of the sequence or of the input file
 * @throws IOException if the input file cannot be read
 */
private char getNextBase()
    throws IOException {
    // Loop rather than test once, so lines with no characters are skipped.
    while (mLineBuffer == null || mLineBufferIndex >= mLineBuffer.length()) {
        if (mCurrentReader == null) {
            return 0;
        }
        String line = mCurrentReader.readLine();
        if (line == null) {
            mLineBuffer = null;
            mLineBufferIndex = 0;
            mCurrentReader.close();
            mCurrentReader = null;
            return 0;
        }
        if (line.startsWith(">")) {
            String[] tokens = line.substring(1).trim().split("\\s+");
            mNextSequence = tokens[0];
            mLineBuffer = null;
            mLineBufferIndex = 0;
            return 0;
        }
        mLineBuffer = line.toUpperCase();
        mLineBufferIndex = 0;
    }
    return mLineBuffer.charAt(mLineBufferIndex++);
}
/**
 * Advances the global base counter, which is treated as an unsigned 32-bit
 * value with -1 as its initial state.
 *
 * @param amount number of bases to advance by
 * @throws RuntimeException if the advance would wrap the counter around
 */
private void incrementBaseIndex(int amount) {
    final int advanced = mBaseIndex + amount;
    // Starting below -1 and reaching -1 or beyond means the unsigned
    // counter has wrapped.
    if (mBaseIndex < -1 && advanced >= -1) {
        throw new RuntimeException("Base index: 32-bit overflow");
    }
    mBaseIndex = advanced;
}
/**
 * Prints a timestamped progress message to standard output when verbose
 * mode is on.
 *
 * @param text message to print
 */
private void log(String text) {
    if (!mVerbose) {
        return;
    }
    System.out.println("# " + new Date() + " " + text);
}
/**
 * Encodes a kmer at two bits per base into 16-bit words of up to eight bases
 * each. When the length is not a multiple of eight, the first word holds the
 * shorter leading group. Delegates to the old-format encoder when that
 * format is selected.
 *
 * @param kmerChars bases of the kmer
 * @param baseIndex global base index of the kmer
 * @return the encoded kmer, or null when the input is null or contains a
 *         character other than A, C, G or T
 */
private static KMerPosition encodeKMer(char[] kmerChars, int baseIndex) {
    if (mUseOldFormat) {
        return encodeKMerOldFormat(kmerChars, baseIndex);
    }
    if (kmerChars == null) {
        return null;
    }
    final int kmerLength = kmerChars.length;
    final int wordCount = (kmerLength + 7) / 8;
    char[] encoding = new char[wordCount];
    // Length of the leading group: the remainder, or a full eight.
    int cursor = kmerLength % 8;
    if (cursor == 0) {
        cursor = 8;
    }
    int word = encodeKMerBits(kmerChars, 0, cursor);
    if (word < 0) {
        return null;
    }
    encoding[0] = (char) word;
    for (int i = 1; i < wordCount; i++, cursor += 8) {
        word = encodeKMerBits(kmerChars, cursor, 8);
        if (word < 0) {
            return null;
        }
        encoding[i] = (char) word;
    }
    return new KMerPositionN(encoding, baseIndex);
}
/**
 * Encodes a kmer in the old format: one long for up to 31 bases, two longs
 * (31 bases in the first) for up to 62 bases.
 *
 * @param kmerChars bases of the kmer
 * @param baseIndex global base index of the kmer
 * @return the encoded kmer, or null when the input is null, longer than 62
 *         bases, or contains a character other than A, C, G or T
 */
private static KMerPosition encodeKMerOldFormat(char[] kmerChars, int baseIndex) {
    if (kmerChars == null) {
        return null;
    }
    final int length = kmerChars.length;
    if (length > 62) {
        return null;
    }
    if (length <= 31) {
        long bits = encodeKMerBitsLong(kmerChars, 0, length);
        return (bits == -1) ? null : new KMerPosition1(bits, baseIndex);
    }
    long head = encodeKMerBitsLong(kmerChars, 0, 31);
    long tail = encodeKMerBitsLong(kmerChars, 31, length - 31);
    if (head == -1 || tail == -1) {
        return null;
    }
    return new KMerPosition2(head, tail, baseIndex);
}
/**
 * Packs a run of bases into an int at two bits per base (A=0, C=1, G=2,
 * T=3), first base in the most significant position.
 *
 * @param kmerChars source bases
 * @param offset index of the first base to encode
 * @param length number of bases to encode
 * @return the packed bits, or -1 when a base is not A, C, G or T
 */
private static int encodeKMerBits(char[] kmerChars, int offset, int length) {
    int packed = 0;
    int shift = 2 * (length - 1);
    for (int i = 0; i < length; i++, shift -= 2) {
        int code = "ACGT".indexOf(kmerChars[offset + i]);
        if (code < 0) {
            return -1;
        }
        packed |= code << shift;
    }
    return packed;
}
/**
 * Packs a run of bases into a long at two bits per base (A=0, C=1, G=2,
 * T=3), first base in the most significant position.
 *
 * @param kmerChars source bases
 * @param offset index of the first base to encode
 * @param length number of bases to encode
 * @return the packed bits, or -1 when a base is not A, C, G or T
 */
private static long encodeKMerBitsLong(char[] kmerChars, int offset, int length) {
    long packed = 0;
    int shift = 2 * (length - 1);
    for (int i = 0; i < length; i++, shift -= 2) {
        int code = "ACGT".indexOf(kmerChars[offset + i]);
        if (code < 0) {
            return -1;
        }
        packed |= ((long) code) << shift;
    }
    return packed;
}
/**
 * Decodes a single-long encoding into its mK-base text.
 *
 * @param bits packed bases
 * @return the kmer text
 */
private static String decodeKMer1(long bits) {
    char[] bases = new char[mK];
    decodeKMerBits(bits, bases, 0, bases.length);
    return new String(bases);
}
/**
 * Decodes a two-long encoding into its mK-base text: the first long holds
 * 31 bases and the second holds the rest.
 *
 * @param bits1 packed first 31 bases
 * @param bits2 packed remaining bases
 * @return the kmer text
 */
private static String decodeKMer2(long bits1, long bits2) {
    char[] bases = new char[mK];
    decodeKMerBits(bits1, bases, 0, 31);
    decodeKMerBits(bits2, bases, 31, bases.length - 31);
    return new String(bases);
}
/**
 * Decodes a word-array encoding into its mK-base text. The first word holds
 * the leading group (mK mod 8 bases, or eight when that is zero) and every
 * following word holds eight bases.
 *
 * @param encoding packed 16-bit words
 * @return the kmer text
 */
private static String decodeKMerN(char[] encoding) {
    char[] bases = new char[mK];
    int cursor = bases.length % 8;
    if (cursor == 0) {
        cursor = 8;
    }
    decodeKMerBits(encoding[0], bases, 0, cursor);
    for (int i = 1; i < encoding.length; i++, cursor += 8) {
        decodeKMerBits(encoding[i], bases, cursor, 8);
    }
    return new String(bases);
}
/**
 * Unpacks bases from a 16-bit word at two bits per base, most significant
 * pair first, into the buffer.
 *
 * @param bits packed bases
 * @param buffer destination
 * @param offset index in buffer of the first base to store
 * @param length number of bases to unpack
 */
private static void decodeKMerBits(char bits, char[] buffer, int offset, int length) {
    int shift = 2 * (length - 1);
    for (int i = 0; i < length; i++, shift -= 2) {
        int code = (bits >> shift) & 0x3;
        buffer[offset + i] = "ACGT".charAt(code);
    }
}
/**
 * Unpacks bases from a long at two bits per base, most significant pair
 * first, into the buffer.
 *
 * @param bits packed bases
 * @param buffer destination
 * @param offset index in buffer of the first base to store
 * @param length number of bases to unpack
 */
private static void decodeKMerBits(long bits, char[] buffer, int offset, int length) {
    int shift = 2 * (length - 1);
    for (int i = 0; i < length; i++, shift -= 2) {
        int code = (int) ((bits >> shift) & 0x3);
        buffer[offset + i] = "ACGT".charAt(code);
    }
}
/**
 * Base class for an encoded kmer together with the base index where it
 * occurs. Subclasses supply the actual encoding; the accessors here return
 * placeholder values (null or -1).
 */
static class KMerPosition
    implements Comparable<KMerPosition> {
    private int mBaseIndex;

    KMerPosition(int baseIndex) {
        mBaseIndex = baseIndex;
    }

    /** Returns the kmer text; null in this base class. */
    public String getKMer() {
        return null;
    }

    /** Returns the first old-format encoding word; -1 in this base class. */
    public long getKMerEncoding1() {
        return -1;
    }

    /** Returns the second old-format encoding word; -1 in this base class. */
    public long getKMerEncoding2() {
        return -1;
    }

    /** Returns the base index, or the non-unique marker once flagged. */
    public final int getBaseIndex() {
        return mBaseIndex;
    }

    public final void setBaseIndex(int baseIndex) {
        mBaseIndex = baseIndex;
    }

    /** Returns the word-array encoding; null in this base class. */
    public char[] getKMerEncoding() {
        return null;
    }

    /**
     * Orders kmers by their word-array encodings, word by word. When one
     * encoding is a prefix of the other, the shorter one sorts first;
     * comparing only the shared words avoids indexing past the end of the
     * shorter array.
     *
     * @param kmp kmer to compare against
     * @return negative, zero or positive in the manner of compareTo
     */
    public int compareTo(KMerPosition kmp) {
        char[] encoding1 = getKMerEncoding();
        char[] encoding2 = kmp.getKMerEncoding();
        int shared = Math.min(encoding1.length, encoding2.length);
        for (int i = 0; i < shared; i++) {
            // char values are 0..65535, so the difference cannot overflow.
            int result = encoding1[i] - encoding2[i];
            if (result != 0) {
                return result;
            }
        }
        return encoding1.length - encoding2.length;
    }
}
/**
 * Old-format kmer whose bases fit in a single long.
 */
static class KMerPosition1
    extends KMerPosition {
    private long mKMerEncoding1;

    KMerPosition1(long kmer, int baseIndex) {
        super(baseIndex);
        this.mKMerEncoding1 = kmer;
    }

    /** Decodes the single encoding word into kmer text. */
    public String getKMer() {
        return decodeKMer1(mKMerEncoding1);
    }

    public final long getKMerEncoding1() {
        return mKMerEncoding1;
    }

    /**
     * Orders by the first encoding word and then by the second, using the
     * sign of their differences.
     */
    public int compareTo(KMerPosition kmp) {
        int byFirst = Long.signum(getKMerEncoding1() - kmp.getKMerEncoding1());
        if (byFirst != 0) {
            return byFirst;
        }
        return Long.signum(getKMerEncoding2() - kmp.getKMerEncoding2());
    }
}
/**
 * Old-format kmer whose bases are split across two longs.
 */
static class KMerPosition2
    extends KMerPosition1 {
    private long mKMerEncoding2;

    KMerPosition2(long encoding1, long encoding2, int baseIndex) {
        super(encoding1, baseIndex);
        this.mKMerEncoding2 = encoding2;
    }

    /** Decodes both encoding words into kmer text. */
    public String getKMer() {
        long first = getKMerEncoding1();
        long second = getKMerEncoding2();
        return decodeKMer2(first, second);
    }

    public final long getKMerEncoding2() {
        return mKMerEncoding2;
    }
}
/**
 * Kmer stored as an array of 16-bit encoding words.
 */
static class KMerPositionN
    extends KMerPosition {
    private char[] mKMerEncoding;

    KMerPositionN(char[] encoding, int baseIndex) {
        super(baseIndex);
        this.mKMerEncoding = encoding;
    }

    /** Decodes the word array into kmer text. */
    public String getKMer() {
        return decodeKMerN(this.mKMerEncoding);
    }

    /** Returns the encoding array itself, not a copy. */
    public final char[] getKMerEncoding() {
        return this.mKMerEncoding;
    }
}
/**
 * A kmer kept as plain text together with the base index where it occurs.
 * Ordering is by kmer text only.
 */
static class StringKMerPosition
    implements Comparable<StringKMerPosition> {
    private String mKMerString;
    private int mBaseIndex;

    StringKMerPosition(String kmer, int baseIndex) {
        this.mKMerString = kmer;
        this.mBaseIndex = baseIndex;
    }

    public final String getKMer() {
        return this.mKMerString;
    }

    /** Returns the base index, or the non-unique marker once flagged. */
    public final int getBaseIndex() {
        return this.mBaseIndex;
    }

    public final void setBaseIndex(int baseIndex) {
        this.mBaseIndex = baseIndex;
    }

    /** Compares by kmer text; the base index plays no part. */
    public int compareTo(StringKMerPosition kmp) {
        String mine = this.mKMerString;
        String theirs = kmp.mKMerString;
        return mine.compareTo(theirs);
    }
}
}