gatk-3.8/java/lib/edu/mit/broad/cnv/GatherAlignments.java

400 lines
13 KiB
Java

/*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2008 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever.
* Neither the Broad Institute nor MIT can be responsible for its use, misuse,
* or functionality.
*/
package edu.mit.broad.cnv;
import edu.mit.broad.arachne.Alignment;
import edu.mit.broad.arachne.LookAlignReader;
import java.io.*;
import java.util.*;
/**
 * Utility program to gather CNV alignments from LookAlign files in an I/O efficient manner.
 *
 * <p>The program reads a list of CNP regions and a list of input alignment files, scans each
 * input file once, and appends every alignment overlapping a (optionally padded) CNP region
 * to a file named after the input file inside a directory named
 * {@code <cnp-id>[_<sample-id>]} below the output directory.  Matching alignments are
 * buffered in memory and written in batches.
 */
public class GatherAlignments {

    /**
     * Command line entry point.
     *
     * @param args command line arguments, see {@link #usage()}
     * @throws Exception if any input cannot be read or any output cannot be written
     */
    public static void main(String[] args)
        throws Exception {
        new GatherAlignments().run(args);
    }

    /** Prints the command line synopsis to standard output. */
    private void usage() {
        System.out.println("Usage: GatherAlignments ...");
        System.out.println(" -cnpList <cnp-file>");
        System.out.println(" -sampleId <sample-id>");
        System.out.println(" -inputFileList <fofn>");
        System.out.println(" -outputDirectory <dir>");
        System.out.println(" -padding <n-bases>");
        System.out.println(" -bestAlignments");
        System.out.println(" -verbose");
        System.out.println(" -debug");
    }

    /**
     * Parses the command line into the option fields of this instance.
     *
     * <p>Any unknown option, option missing its value, positional argument, non-numeric
     * padding, or absent {@code -cnpList} / {@code -inputFileList} is treated as a usage
     * error.
     *
     * @param args the raw command line arguments
     * @return true if the arguments were valid; false if usage was printed
     */
    private boolean parseArguments(String[] args) {
        int argpos = 0;
        while (argpos < args.length) {
            String arg = args[argpos];
            boolean hasValue = (args.length - argpos) > 1;
            if (arg.equals("-cnpList") && hasValue) {
                mCnpListPath = args[argpos + 1];
                argpos += 2;
            } else if (arg.equals("-sampleId") && hasValue) {
                mSampleId = args[argpos + 1];
                argpos += 2;
            } else if (arg.equals("-inputFileList") && hasValue) {
                mInputFileListPath = args[argpos + 1];
                argpos += 2;
            } else if (arg.equals("-outputDirectory") && hasValue) {
                mOutputDirectory = args[argpos + 1];
                argpos += 2;
            } else if (arg.equals("-padding") && hasValue) {
                try {
                    mCnpRegionPadding = Integer.parseInt(args[argpos + 1]);
                } catch (NumberFormatException e) {
                    System.out.println("Invalid -padding value: " + args[argpos + 1]);
                    usage();
                    return false;
                }
                argpos += 2;
            } else if (arg.equals("-bestAlignments")) {
                mReturnBestHits = true;
                argpos++;
            } else if (arg.equals("-verbose")) {
                mVerbose = true;
                argpos++;
            } else if (arg.equals("-debug")) {
                mDebug = true;
                argpos++;
            } else {
                // Unknown option, option without its value, or a stray positional argument.
                usage();
                return false;
            }
        }
        // Both paths are opened unconditionally by run(); fail here with a usage message
        // rather than later with a NullPointerException.
        if (mCnpListPath == null || mInputFileListPath == null) {
            System.out.println("Missing required argument: -cnpList and -inputFileList must be specified");
            usage();
            return false;
        }
        return true;
    }

    /**
     * Runs the program: parses arguments, loads the input file list and the CNP regions,
     * then scans each input file in list order.  Exits the JVM with status 1 on a usage error.
     *
     * @param args the raw command line arguments
     * @throws Exception if reading inputs or writing outputs fails
     */
    private void run(String[] args)
        throws Exception {
        if (!parseArguments(args)) {
            System.exit(1);
        }
        List<File> inputFiles = parseInputFiles(mInputFileListPath);
        Map<Integer, List<CnpRegion>> cnpMap = parseCnpFile(mCnpListPath);
        for (File inputFile : inputFiles) {
            scanInputFile(inputFile, cnpMap);
        }
    }

    /**
     * Reads a file-of-file-names.  Blank lines and lines starting with {@code #} are skipped;
     * for every other line the first whitespace-delimited field is taken as a file path.
     *
     * @param path path of the list file
     * @return the listed files, in file order
     * @throws IOException if the list file cannot be read
     */
    private List<File> parseInputFiles(String path)
        throws IOException {
        List<File> fileList = new ArrayList<File>();
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0 || line.startsWith("#")) {
                    continue;
                }
                String[] fields = line.split("\\s+");
                fileList.add(new File(fields[0]));
            }
        } finally {
            // Close on every path, including read failures.
            reader.close();
        }
        return fileList;
    }

    /**
     * Reads the CNP region file.  Blank lines and {@code #} comment lines are skipped.
     * Every remaining line must have exactly four whitespace-delimited fields
     * (id, chromosome, start, end); a line whose first field is {@code CNPID}
     * (case-insensitive) is treated as a header and skipped.  Commas in the coordinates
     * are ignored.  If a positive padding was requested, each region is widened by that
     * amount on both sides, with the start clamped to 1.
     *
     * @param path path of the CNP region file
     * @return regions grouped by the sequence id of their chromosome
     * @throws IOException if the file cannot be read
     * @throws RuntimeException if a line is malformed or names an unrecognized chromosome
     */
    private Map<Integer, List<CnpRegion>> parseCnpFile(String path)
        throws IOException {
        Map<Integer, List<CnpRegion>> cnpMap = new HashMap<Integer, List<CnpRegion>>();
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0 || line.startsWith("#")) {
                    continue;
                }
                String[] fields = line.split("\\s+");
                if (fields.length != 4) {
                    throw new RuntimeException("Invalid CNP line: " + line);
                }
                if (fields[0].equalsIgnoreCase("CNPID")) {
                    continue;
                }
                String cnpId = fields[0];
                String chromosome = fields[1];
                int start;
                int end;
                try {
                    start = Integer.parseInt(fields[2].replaceAll(",", ""));
                    end = Integer.parseInt(fields[3].replaceAll(",", ""));
                } catch (NumberFormatException e) {
                    // Report the offending line instead of a bare number format error.
                    throw new RuntimeException("Invalid CNP line: " + line, e);
                }
                int sequenceId = chromosomeToSequenceId(chromosome);
                if (sequenceId < 0) {
                    throw new RuntimeException("Unrecognized chromosome: " + chromosome);
                }
                if (mCnpRegionPadding > 0) {
                    start = Math.max(1, start - mCnpRegionPadding);
                    end = end + mCnpRegionPadding;
                }
                List<CnpRegion> cnpList = cnpMap.get(sequenceId);
                if (cnpList == null) {
                    cnpList = new ArrayList<CnpRegion>();
                    cnpMap.put(sequenceId, cnpList);
                }
                cnpList.add(new CnpRegion(cnpId, sequenceId, start, end));
            }
        } finally {
            // Close on every path, including parse failures.
            reader.close();
        }
        return cnpMap;
    }

    /**
     * Maps a chromosome name to a numeric sequence id.
     *
     * <p>An all-digit name is returned as its integer value.  Otherwise an optional
     * {@code chr} prefix is removed; a remaining all-digit name without a leading zero is
     * returned as its integer value, and {@code M}, {@code X}, {@code Y} map to 0, 23 and 24.
     *
     * @param text the chromosome name, may be null
     * @return the sequence id, or -1 if the name is not recognized
     */
    private int chromosomeToSequenceId(String text) {
        if (text == null || text.length() == 0) {
            return -1;
        }
        try {
            if (text.matches("\\d+")) {
                return Integer.parseInt(text);
            }
            if (text.startsWith("chr") && text.length() > 3) {
                text = text.substring(3);
            }
            if (text.matches("\\d+") && !text.startsWith("0")) {
                return Integer.parseInt(text);
            }
        } catch (NumberFormatException e) {
            // Digit string too large for an int: not a chromosome we know.
            return -1;
        }
        if (text.equals("M")) {
            return 0;
        } else if (text.equals("X")) {
            return 23;
        } else if (text.equals("Y")) {
            return 24;
        } else {
            return -1;
        }
    }

    /**
     * Scans one input file, buffering every alignment that overlaps a CNP region on the
     * alignment's B sequence, and flushes the buffer once the file has been read.
     *
     * @param inputFile the alignment file to scan
     * @param cnpMap CNP regions keyed by sequence id
     * @throws IOException if reading the input or writing the output fails
     */
    private void scanInputFile(File inputFile,
                               Map<Integer, List<CnpRegion>> cnpMap)
        throws IOException {
        // Look-ahead state belongs to a single reader; never carry it across files.
        mPendingAlignment = null;
        LookAlignReader reader = new LookAlignReader(inputFile);
        try {
            Alignment alignment;
            while ((alignment = getNextAlignment(reader)) != null) {
                List<CnpRegion> cnpList = cnpMap.get(alignment.getBSequenceId());
                if (cnpList == null) {
                    continue;
                }
                for (CnpRegion cnp : cnpList) {
                    if (overlaps(cnp, alignment)) {
                        saveCnpAlignment(cnp, alignment, inputFile);
                    }
                }
            }
        } finally {
            // Close on every path, including read and write failures.
            reader.close();
        }
        flushCnpAlignments(inputFile);
    }

    /**
     * Returns the next alignment to consider, or null at end of input.
     *
     * <p>Without {@code -bestAlignments} this is simply the next alignment from the reader.
     * With it, consecutive alignments sharing an A sequence id are grouped; a group of one is
     * returned as is, a larger group yields its unique best alignment, and groups without a
     * unique best are skipped.  The first alignment of the following group is held in
     * {@code mPendingAlignment} between calls.
     *
     * @param reader the reader to pull alignments from
     * @return the next alignment, or null if the input is exhausted
     * @throws IOException if reading fails
     * @throws RuntimeException if an A sequence id lower than the current group's is seen
     */
    private Alignment getNextAlignment(LookAlignReader reader)
        throws IOException {
        if (!mReturnBestHits) {
            if (reader.hasNext()) {
                return reader.next();
            }
            return null;
        }
        while (true) {
            Alignment seed = mPendingAlignment;
            mPendingAlignment = null;
            if (seed == null && reader.hasNext()) {
                seed = reader.next();
            }
            if (seed == null) {
                return null;
            }
            List<Alignment> group = null;
            while (reader.hasNext()) {
                Alignment alignment = reader.next();
                if (alignment.getASequenceId() != seed.getASequenceId()) {
                    if (alignment.getASequenceId() < seed.getASequenceId()) {
                        throw new RuntimeException("Alignments not sorted by A sequence: " + alignment.format());
                    }
                    // First alignment of the next group; keep it for the next call.
                    mPendingAlignment = alignment;
                    break;
                }
                if (group == null) {
                    group = new ArrayList<Alignment>();
                }
                group.add(alignment);
            }
            if (group == null) {
                return seed;
            }
            group.add(seed);
            Alignment result = getUniqueBestAlignment(group);
            if (result != null) {
                return result;
            }
            // No unique best in this group: move on to the next one.
        }
    }

    /**
     * Selects the alignment with the strictly lowest mismatch count.
     *
     * @param alignments candidate alignments
     * @return the single alignment with the fewest mismatches, or null if the list is empty
     *         or the lowest count is shared by more than one alignment
     */
    private Alignment getUniqueBestAlignment(List<Alignment> alignments) {
        Alignment best = null;
        int bestMismatches = 0;
        boolean unique = false;
        for (Alignment a : alignments) {
            int mismatches = getAlignmentMismatches(a);
            if (best == null || mismatches < bestMismatches) {
                best = a;
                bestMismatches = mismatches;
                unique = true;
            } else if (mismatches == bestMismatches) {
                // Tie with the current best: no unique winner unless a better one follows.
                unique = false;
            }
        }
        return unique ? best : null;
    }

    /**
     * Computes the mismatch score of an alignment from its block array, read as consecutive
     * (gap, duration, mismatches) triples: the sum over all triples of the absolute gap
     * plus the mismatch count.
     *
     * @param alignment the alignment to score
     * @return the mismatch score
     * @throws RuntimeException if the block array is not made of whole triples or a triple
     *         reports more mismatches than its duration
     */
    private int getAlignmentMismatches(Alignment alignment) {
        int[] blocks = alignment.getAlignmentBlocks();
        if (blocks.length % 3 != 0) {
            throw new RuntimeException("Invalid alignment? : " + alignment.format());
        }
        int mismatches = 0;
        for (int i = 0; i < blocks.length; i += 3) {
            int gap = blocks[i];
            int duration = blocks[i + 1];
            int mm = blocks[i + 2];
            if (mm > duration) {
                throw new RuntimeException("Invalid alignment? : " + alignment.format());
            }
            mismatches += Math.abs(gap) + mm;
        }
        return mismatches;
    }

    /**
     * Tests whether an alignment's B interval intersects a CNP region on the same sequence.
     * Both interval ends are treated as inclusive.
     *
     * @param cnp the region
     * @param alignment the alignment
     * @return true if they are on the same sequence and their intervals intersect
     */
    private boolean overlaps(CnpRegion cnp, Alignment alignment) {
        return (cnp.getSequenceId() == alignment.getBSequenceId() &&
                cnp.getStart() <= alignment.getBEnd() &&
                cnp.getEnd() >= alignment.getBStart());
    }

    /**
     * Buffers an alignment under its CNP id, flushing the buffer to disk first if it has
     * reached its size limit.
     *
     * @param cnp the region the alignment overlaps
     * @param alignment the alignment to buffer
     * @param inputFile the input file being scanned (names the output file on flush)
     * @throws IOException if a flush fails
     */
    private void saveCnpAlignment(CnpRegion cnp, Alignment alignment, File inputFile)
        throws IOException {
        if (mCnpAlignmentCount >= mCnpAlignmentLimit) {
            flushCnpAlignments(inputFile);
        }
        String cnpId = cnp.getCnpId();
        List<Alignment> alignmentList = mCnpAlignmentMap.get(cnpId);
        if (alignmentList == null) {
            alignmentList = new ArrayList<Alignment>();
            mCnpAlignmentMap.put(cnpId, alignmentList);
        }
        alignmentList.add(alignment);
        mCnpAlignmentCount++;
    }

    /**
     * Writes all buffered alignments to their per-CNP output files and empties the buffer.
     * Each entry is removed only after it has been written.
     *
     * @param inputFile the input file being scanned (names the output files)
     * @throws IOException if writing fails
     * @throws RuntimeException if the running count disagrees with the buffer contents
     */
    private void flushCnpAlignments(File inputFile)
        throws IOException {
        Iterator<Map.Entry<String, List<Alignment>>> entries = mCnpAlignmentMap.entrySet().iterator();
        while (entries.hasNext()) {
            Map.Entry<String, List<Alignment>> entry = entries.next();
            List<Alignment> alignmentList = entry.getValue();
            writeAlignments(entry.getKey(), mSampleId, alignmentList, inputFile);
            entries.remove();
            mCnpAlignmentCount -= alignmentList.size();
        }
        if (mCnpAlignmentCount != 0) {
            throw new RuntimeException("Unsynchronized alignment count");
        }
    }

    /**
     * Appends alignments to {@code <output-dir>/<cnp-id>[_<sample-id>]/<input-file-name>},
     * creating the directory (and any missing parents) if needed.  The output directory
     * defaults to the current directory.  The file is opened in append mode, so repeated
     * flushes for the same input file accumulate in one output file.
     *
     * @param cnpId id of the CNP region
     * @param sampleId sample id appended to the directory name, or null for none
     * @param alignmentList alignments to write, one per line
     * @param inputFile the input file whose name is used for the output file
     * @throws IOException if the file cannot be opened or written
     * @throws RuntimeException if the directory cannot be created
     */
    private void writeAlignments(String cnpId, String sampleId, List<Alignment> alignmentList, File inputFile)
        throws IOException {
        File outputDir = (mOutputDirectory != null) ? new File(mOutputDirectory) : new File(".");
        String cnpSample = (sampleId != null) ? cnpId + "_" + sampleId : cnpId;
        File cnpSampleDir = new File(outputDir, cnpSample);
        // Re-check after mkdirs() so a directory created concurrently is not an error.
        if (!cnpSampleDir.isDirectory() && !cnpSampleDir.mkdirs() && !cnpSampleDir.isDirectory()) {
            throw new RuntimeException("Failed to create directory " + cnpSampleDir);
        }
        File alignmentFile = new File(cnpSampleDir, inputFile.getName());
        PrintWriter writer = new PrintWriter(new FileWriter(alignmentFile, true));
        try {
            for (Alignment alignment : alignmentList) {
                writer.println(alignment.arachneFormat());
            }
            // PrintWriter never throws on write failure; checkError() flushes and reports it.
            if (writer.checkError()) {
                throw new IOException("Error writing alignments to " + alignmentFile);
            }
        } finally {
            writer.close();
        }
    }

    /** Instances are created only by {@link #main(String[])}. */
    private GatherAlignments() {
    }

    /** Immutable description of one CNP region: id, sequence id, and start/end coordinates. */
    private static class CnpRegion {

        private CnpRegion(String cnpId, int sequenceId, int start, int end) {
            mCnpId = cnpId;
            mSequenceId = sequenceId;
            mStart = start;
            mEnd = end;
        }

        public String getCnpId() { return mCnpId; }
        public int getSequenceId() { return mSequenceId; }
        public int getStart() { return mStart; }
        public int getEnd() { return mEnd; }

        private final String mCnpId;
        private final int mSequenceId;
        private final int mStart;
        private final int mEnd;
    }

    // Command line options.  mDebug and mVerbose are set by parseArguments but are not
    // read anywhere else in this class.
    private boolean mDebug = false;
    private boolean mVerbose = false;
    private boolean mReturnBestHits = false;
    private String mCnpListPath = null;
    private String mSampleId = null;
    private String mInputFileListPath = null;
    private String mOutputDirectory = null;
    private int mCnpRegionPadding = 0;

    // Look-ahead alignment held by getNextAlignment between calls.
    private Alignment mPendingAlignment = null;

    // Buffered alignments keyed by CNP id, their total count, and the count that triggers a flush.
    private int mCnpAlignmentCount = 0;
    private final int mCnpAlignmentLimit = 1000000;
    private final Map<String, List<Alignment>> mCnpAlignmentMap = new LinkedHashMap<String, List<Alignment>>();
}