Merged bug fix from Stable into Unstable
This commit is contained in:
commit
d0980e236a
|
|
@ -1,305 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package net.sf.picard.reference;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import static net.sf.picard.reference.FastaSequenceIndexBuilder.Status.*;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
* Builds FastaSequenceIndex from fasta file.
|
||||
* Produces fai file with same output as samtools faidx
|
||||
*/
|
||||
public class FastaSequenceIndexBuilder {
|
||||
final public File fastaFile;
|
||||
final boolean printProgress;
|
||||
|
||||
// keep track of location in file
|
||||
long bytesRead, endOfLastLine, lastTimestamp, fileLength; // initialized to -1 to keep 0-indexed position in file;
|
||||
|
||||
// vars that store information about the contig that is currently being read
|
||||
String contig;
|
||||
long location, size, bytesPerLine, basesPerLine, basesThisLine;
|
||||
int thisSequenceIndex = 0;
|
||||
|
||||
// vars that keep loop state
|
||||
byte lastByte = 0, currentByte = 0, nextByte = 0;
|
||||
public enum Status { NONE, CONTIG, FIRST_SEQ_LINE, SEQ_LINE, COMMENT }
|
||||
Status status = Status.NONE; // keeps state of what is currently being read. better to use int instead of enum?
|
||||
|
||||
public FastaSequenceIndexBuilder(File fastaFile, boolean printProgress) {
|
||||
this.fastaFile = fastaFile;
|
||||
fileLength = fastaFile.length();
|
||||
this.printProgress = printProgress;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates fasta sequence index from fasta file
|
||||
* @return FastaSequenceIndex that is read from file
|
||||
*/
|
||||
public FastaSequenceIndex createIndex() { // should this be static?
|
||||
bytesRead = -1;
|
||||
endOfLastLine = -1;
|
||||
contig = "";
|
||||
location = 0;
|
||||
size = 0;
|
||||
bytesPerLine = 0;
|
||||
basesPerLine = 0;
|
||||
basesThisLine = 0;
|
||||
lastTimestamp = System.currentTimeMillis();
|
||||
FastaSequenceIndex sequenceIndex = new FastaSequenceIndex();
|
||||
|
||||
// initialize input stream
|
||||
DataInputStream in;
|
||||
try {
|
||||
in = new DataInputStream(new BufferedInputStream(new FileInputStream(fastaFile)));
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read fasta file", e);
|
||||
}
|
||||
|
||||
/*
|
||||
* iterate through each character in file one at a time, but must account for variance in line terminators
|
||||
* strategy is to check if current character is a line terminator (\n, \r), then check next character
|
||||
* only treat as end of line if next character is NOT a line terminator
|
||||
*/
|
||||
try {
|
||||
// intialize iterators
|
||||
nextByte = in.readByte();
|
||||
currentByte = '\n';
|
||||
while(currentByte != -1) {
|
||||
|
||||
bytesRead ++; // update position in file
|
||||
lastByte = currentByte;
|
||||
currentByte = nextByte;
|
||||
try {
|
||||
nextByte = in.readByte();
|
||||
}
|
||||
catch (EOFException e) {
|
||||
nextByte = -1;
|
||||
}
|
||||
|
||||
switch(status) {
|
||||
|
||||
// if not currently reading anything
|
||||
// only thing that can trigger action is '>' (start of contig) or ';' (comment)
|
||||
case NONE:
|
||||
if (currentByte == '>')
|
||||
status = CONTIG;
|
||||
else if (currentByte == ';')
|
||||
status = COMMENT;
|
||||
break;
|
||||
|
||||
// if reading a comment, just ignore everything until end of line
|
||||
case COMMENT:
|
||||
if (isEol(currentByte)) {
|
||||
if (!isEol(nextByte))
|
||||
status = Status.NONE;
|
||||
}
|
||||
break;
|
||||
|
||||
// if reading a contig, add char to contig string
|
||||
// contig string can be anything, including spaces
|
||||
case CONTIG:
|
||||
if (isEol(currentByte)) {
|
||||
if (!isEol(nextByte)) {
|
||||
status = Status.FIRST_SEQ_LINE;
|
||||
location = bytesRead + 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
contig += (char) currentByte;
|
||||
break;
|
||||
|
||||
// record bases and bytes of first sequence line, to validate against later lines
|
||||
case FIRST_SEQ_LINE:
|
||||
|
||||
if (isEol(currentByte)) {
|
||||
|
||||
// record bases per line if last character was a base
|
||||
if (!isEol(lastByte)) {
|
||||
basesPerLine = bytesRead - location;
|
||||
basesThisLine = basesPerLine;
|
||||
size += basesPerLine;
|
||||
}
|
||||
|
||||
// next character is start of next line, now know bytes per line
|
||||
if (!isEol(nextByte)) { // figure out what to do if there is only one data line
|
||||
bytesPerLine = bytesRead - location + 1;
|
||||
status = Status.SEQ_LINE;
|
||||
endOfLastLine = bytesRead;
|
||||
|
||||
// if next char is ';' or '>', then there is only one contig =>
|
||||
if (nextByte == ';' || nextByte == '>')
|
||||
finishReadingContig(sequenceIndex);
|
||||
}
|
||||
}
|
||||
|
||||
// validate base character
|
||||
else {
|
||||
if (!isValidBase(currentByte))
|
||||
throw new UserException.MalformedFile(fastaFile, String.format("An invalid base was found in the contig: %s", contig));
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
case SEQ_LINE:
|
||||
|
||||
if (isEol(currentByte)) {
|
||||
|
||||
// record bases per line if last character was a base
|
||||
if (!isEol(lastByte)) {
|
||||
basesThisLine = bytesRead - endOfLastLine - 1;
|
||||
size += basesThisLine;
|
||||
}
|
||||
|
||||
// reached end of line - check if end of contig
|
||||
if (!isEol(nextByte)) {
|
||||
|
||||
// if comment or new contig, definitely end of sequence
|
||||
if (nextByte == ';' || nextByte == '>')
|
||||
finishReadingContig(sequenceIndex);
|
||||
|
||||
// if this line has different # of bases OR same # of bases and different # of bytes:
|
||||
// error if next char is a valid base, end of contig otherwise
|
||||
else if (basesThisLine != basesPerLine || bytesPerLine != bytesRead - endOfLastLine) {
|
||||
if (isValidBase(nextByte) && nextByte != -1) {
|
||||
throw new UserException.MalformedFile(fastaFile, String.format("An invalid line was found in the contig: %s", contig));
|
||||
}
|
||||
else
|
||||
finishReadingContig(sequenceIndex);
|
||||
}
|
||||
endOfLastLine = bytesRead;
|
||||
}
|
||||
}
|
||||
|
||||
// validate base character
|
||||
else {
|
||||
if (!isValidBase(currentByte))
|
||||
throw new UserException.MalformedFile(fastaFile, String.format("An invalid base was found in the contig: %s", contig));
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
in.close();
|
||||
return sequenceIndex;
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read fasta file", e);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new ReviewedStingException(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if character is an end of line character, \n or \r
|
||||
* @param currentByte Character to check, as a byte
|
||||
* @return True if current character is \n or \r' false otherwise
|
||||
*/
|
||||
private boolean isEol(byte currentByte) {
|
||||
return (currentByte == '\n' || currentByte == '\r');
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* checks that character is valid base
|
||||
* only checks that the base isn't whitespace, like picard does
|
||||
* could add check that character is A/C/G/T/U if wanted
|
||||
* @param currentByte Character to check, as a byte
|
||||
* @return True if character is not whitespace, false otherwise
|
||||
*/
|
||||
private boolean isValidBase(byte currentByte) {
|
||||
return (!Character.isWhitespace(currentByte) && currentByte != ';' && currentByte != '>');
|
||||
}
|
||||
|
||||
/*
|
||||
* When reader reaches the end of a contig
|
||||
* Reset iterators and add contig to sequence index
|
||||
*/
|
||||
private void finishReadingContig(FastaSequenceIndex sequenceIndex) {
|
||||
sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
|
||||
status = Status.NONE;
|
||||
contig = "";
|
||||
size = 0;
|
||||
|
||||
if (System.currentTimeMillis() - lastTimestamp > 10000) {
|
||||
int percentProgress = (int) (100*bytesRead/fileLength);
|
||||
if (printProgress)
|
||||
System.out.println(String.format("PROGRESS UPDATE: file is %d percent complete", percentProgress));
|
||||
lastTimestamp = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Trims the contig name to the expected value by removing any characters after the first whitespace
|
||||
*/
|
||||
private static String trimContigName(final String contigName) {
|
||||
int whitespaceIndex = contigName.indexOf(' ');
|
||||
return ( whitespaceIndex == -1 ) ? contigName : contigName.substring(0, whitespaceIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores FastaSequenceIndex as a .fasta.fai file on local machine
|
||||
* Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder
|
||||
* @param sequenceIndex sequenceIndex to be saved
|
||||
* @param faiFile file where we should store index
|
||||
*/
|
||||
public static void saveAsFaiFile(FastaSequenceIndex sequenceIndex, File faiFile) {
|
||||
|
||||
BufferedWriter out;
|
||||
try {
|
||||
out = new BufferedWriter(new FileWriter(faiFile));
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(faiFile, e);
|
||||
}
|
||||
|
||||
try {
|
||||
for(FastaSequenceIndexEntry entry: sequenceIndex) {
|
||||
out.write(toIndexFileLine(entry));
|
||||
out.newLine();
|
||||
}
|
||||
out.close();
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(faiFile, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Print string in format of fai file line
|
||||
* @return Contig as one line in a fai file
|
||||
*/
|
||||
private static String toIndexFileLine(FastaSequenceIndexEntry entry) {
|
||||
return String.format("%s\t%d\t%d\t%d\t%d", entry.getContig(), entry.getSize(), entry.getLocation(), entry.getBasesPerLine(), entry.getBytesPerLine());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,117 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package net.sf.picard.reference;
|
||||
|
||||
import org.testng.Assert;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.annotations.BeforeMethod;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
|
||||
/**
|
||||
* Test the fasta sequence index reader.
|
||||
*/
|
||||
public class FastaSequenceIndexBuilderUnitTest extends BaseTest {
|
||||
|
||||
private FastaSequenceIndexBuilder builder;
|
||||
private File fastaFile;
|
||||
private FastaSequenceIndex controlIndex;
|
||||
|
||||
@BeforeMethod
|
||||
public void doForEachTest() throws FileNotFoundException {
|
||||
controlIndex = new FastaSequenceIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests basic unix file with one contig
|
||||
* File is the exampleFASTA.fasta shipped with GATK
|
||||
*/
|
||||
@Test
|
||||
public void unixFileTest() {
|
||||
logger.warn("Executing unixFileTest");
|
||||
|
||||
fastaFile = new File(publicTestDir + "exampleFASTA.fasta");
|
||||
builder = new FastaSequenceIndexBuilder(fastaFile, false);
|
||||
FastaSequenceIndex index = builder.createIndex();
|
||||
controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 100000, 60, 61,0));
|
||||
|
||||
Assert.assertTrue(index.equals(controlIndex));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Tests basic windows file with one contig
|
||||
* File is a simple fasta file
|
||||
*/
|
||||
@Test
|
||||
public void windowsFileTest() {
|
||||
logger.warn("Executing windowsFileTest");
|
||||
|
||||
fastaFile = new File(publicTestDir + "exampleFASTA-windows.fasta");
|
||||
builder = new FastaSequenceIndexBuilder(fastaFile, false);
|
||||
FastaSequenceIndex index = builder.createIndex();
|
||||
controlIndex.add(new FastaSequenceIndexEntry("chr2", 7, 29, 7, 9,0));
|
||||
|
||||
Assert.assertTrue(index.equals(controlIndex));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests fasta with the two contigs from above combined
|
||||
* File is the exampleFASTA.fasta shipped with GATK
|
||||
*/
|
||||
@Test
|
||||
public void combinedWindowsUnix() {
|
||||
logger.warn("Executing combinedWindowsUnix");
|
||||
|
||||
fastaFile = new File(publicTestDir + "exampleFASTA-combined.fasta");
|
||||
builder = new FastaSequenceIndexBuilder(fastaFile, false);
|
||||
FastaSequenceIndex index = builder.createIndex();
|
||||
controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 100000, 60, 61,0));
|
||||
controlIndex.add(new FastaSequenceIndexEntry("chr2", 101680, 29, 7, 9,1));
|
||||
|
||||
Assert.assertTrue(index.equals(controlIndex));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests fasta with the two contigs from above combined
|
||||
* File is the exampleFASTA.fasta shipped with GATK
|
||||
*/
|
||||
@Test
|
||||
public void threeVariableLengthContigs() {
|
||||
logger.warn("Executing threeVariableLengthContigs");
|
||||
|
||||
fastaFile = new File(publicTestDir + "exampleFASTA-3contigs.fasta");
|
||||
builder = new FastaSequenceIndexBuilder(fastaFile, false);
|
||||
FastaSequenceIndex index = builder.createIndex();
|
||||
controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 17, 5, 6,0));
|
||||
controlIndex.add(new FastaSequenceIndexEntry("chr2", 35, 21, 7, 8,1));
|
||||
controlIndex.add(new FastaSequenceIndexEntry("chr3", 66, 100, 10, 11,2));
|
||||
|
||||
Assert.assertTrue(index.equals(controlIndex));
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue