diff --git a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java deleted file mode 100644 index f225ffcda..000000000 --- a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java +++ /dev/null @@ -1,305 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package net.sf.picard.reference; - -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import static net.sf.picard.reference.FastaSequenceIndexBuilder.Status.*; - -import java.io.*; - -import org.broadinstitute.sting.utils.exceptions.UserException; - -/** - * Builds FastaSequenceIndex from fasta file. - * Produces fai file with same output as samtools faidx - */ -public class FastaSequenceIndexBuilder { - final public File fastaFile; - final boolean printProgress; - - // keep track of location in file - long bytesRead, endOfLastLine, lastTimestamp, fileLength; // initialized to -1 to keep 0-indexed position in file; - - // vars that store information about the contig that is currently being read - String contig; - long location, size, bytesPerLine, basesPerLine, basesThisLine; - int thisSequenceIndex = 0; - - // vars that keep loop state - byte lastByte = 0, currentByte = 0, nextByte = 0; - public enum Status { NONE, CONTIG, FIRST_SEQ_LINE, SEQ_LINE, COMMENT } - Status status = Status.NONE; // keeps state of what is currently being read. better to use int instead of enum? - - public FastaSequenceIndexBuilder(File fastaFile, boolean printProgress) { - this.fastaFile = fastaFile; - fileLength = fastaFile.length(); - this.printProgress = printProgress; - } - - /** - * Creates fasta sequence index from fasta file - * @return FastaSequenceIndex that is read from file - */ - public FastaSequenceIndex createIndex() { // should this be static? - bytesRead = -1; - endOfLastLine = -1; - contig = ""; - location = 0; - size = 0; - bytesPerLine = 0; - basesPerLine = 0; - basesThisLine = 0; - lastTimestamp = System.currentTimeMillis(); - FastaSequenceIndex sequenceIndex = new FastaSequenceIndex(); - - // initialize input stream - DataInputStream in; - try { - in = new DataInputStream(new BufferedInputStream(new FileInputStream(fastaFile))); - } - catch (Exception e) { - throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read fasta file", e); - } - - /* - * iterate through each character in file one at a time, but must account for variance in line terminators - * strategy is to check if current character is a line terminator (\n, \r), then check next character - * only treat as end of line if next character is NOT a line terminator - */ - try { - // intialize iterators - nextByte = in.readByte(); - currentByte = '\n'; - while(currentByte != -1) { - - bytesRead ++; // update position in file - lastByte = currentByte; - currentByte = nextByte; - try { - nextByte = in.readByte(); - } - catch (EOFException e) { - nextByte = -1; - } - - switch(status) { - - // if not currently reading anything - // only thing that can trigger action is '>' (start of contig) or ';' (comment) - case NONE: - if (currentByte == '>') - status = CONTIG; - else if (currentByte == ';') - status = COMMENT; - break; - - // if reading a comment, just ignore everything until end of line - case COMMENT: - if (isEol(currentByte)) { - if (!isEol(nextByte)) - status = Status.NONE; - } - break; - - // if reading a contig, add char to contig string - // contig string can be anything, including spaces - case CONTIG: - if (isEol(currentByte)) { - if (!isEol(nextByte)) { - status = Status.FIRST_SEQ_LINE; - location = bytesRead + 1; - } - } - else - contig += (char) currentByte; - break; - - // record bases and bytes of first sequence line, to validate against later lines - case FIRST_SEQ_LINE: - - if (isEol(currentByte)) { - - // record bases per line if last character was a base - if (!isEol(lastByte)) { - basesPerLine = bytesRead - location; - basesThisLine = basesPerLine; - size += basesPerLine; - } - - // next character is start of next line, now know bytes per line - if (!isEol(nextByte)) { // figure out what to do if there is only one data line - bytesPerLine = bytesRead - location + 1; - status = Status.SEQ_LINE; - endOfLastLine = bytesRead; - - // if next char is ';' or '>', then there is only one contig => - if (nextByte == ';' || nextByte == '>') - finishReadingContig(sequenceIndex); - } - } - - // validate base character - else { - if (!isValidBase(currentByte)) - throw new UserException.MalformedFile(fastaFile, String.format("An invalid base was found in the contig: %s", contig)); - } - break; - - - case SEQ_LINE: - - if (isEol(currentByte)) { - - // record bases per line if last character was a base - if (!isEol(lastByte)) { - basesThisLine = bytesRead - endOfLastLine - 1; - size += basesThisLine; - } - - // reached end of line - check if end of contig - if (!isEol(nextByte)) { - - // if comment or new contig, definitely end of sequence - if (nextByte == ';' || nextByte == '>') - finishReadingContig(sequenceIndex); - - // if this line has different # of bases OR same # of bases and different # of bytes: - // error if next char is a valid base, end of contig otherwise - else if (basesThisLine != basesPerLine || bytesPerLine != bytesRead - endOfLastLine) { - if (isValidBase(nextByte) && nextByte != -1) { - throw new UserException.MalformedFile(fastaFile, String.format("An invalid line was found in the contig: %s", contig)); - } - else - finishReadingContig(sequenceIndex); - } - endOfLastLine = bytesRead; - } - } - - // validate base character - else { - if (!isValidBase(currentByte)) - throw new UserException.MalformedFile(fastaFile, String.format("An invalid base was found in the contig: %s", contig)); - } - break; - } - } - in.close(); - return sequenceIndex; - } - catch (IOException e) { - throw new UserException.CouldNotReadInputFile(fastaFile, "Could not read fasta file", e); - } - catch (Exception e) { - throw new ReviewedStingException(e.getMessage(), e); - } - } - - /** - * Checks if character is an end of line character, \n or \r - * @param currentByte Character to check, as a byte - * @return True if current character is \n or \r' false otherwise - */ - private boolean isEol(byte currentByte) { - return (currentByte == '\n' || currentByte == '\r'); - } - - - /** - * checks that character is valid base - * only checks that the base isn't whitespace, like picard does - * could add check that character is A/C/G/T/U if wanted - * @param currentByte Character to check, as a byte - * @return True if character is not whitespace, false otherwise - */ - private boolean isValidBase(byte currentByte) { - return (!Character.isWhitespace(currentByte) && currentByte != ';' && currentByte != '>'); - } - - /* - * When reader reaches the end of a contig - * Reset iterators and add contig to sequence index - */ - private void finishReadingContig(FastaSequenceIndex sequenceIndex) { - sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++)); - status = Status.NONE; - contig = ""; - size = 0; - - if (System.currentTimeMillis() - lastTimestamp > 10000) { - int percentProgress = (int) (100*bytesRead/fileLength); - if (printProgress) - System.out.println(String.format("PROGRESS UPDATE: file is %d percent complete", percentProgress)); - lastTimestamp = System.currentTimeMillis(); - } - } - - /* - * Trims the contig name to the expected value by removing any characters after the first whitespace - */ - private static String trimContigName(final String contigName) { - int whitespaceIndex = contigName.indexOf(' '); - return ( whitespaceIndex == -1 ) ? contigName : contigName.substring(0, whitespaceIndex); - } - - /** - * Stores FastaSequenceIndex as a .fasta.fai file on local machine - * Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder - * @param sequenceIndex sequenceIndex to be saved - * @param faiFile file where we should store index - */ - public static void saveAsFaiFile(FastaSequenceIndex sequenceIndex, File faiFile) { - - BufferedWriter out; - try { - out = new BufferedWriter(new FileWriter(faiFile)); - } - catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile(faiFile, e); - } - - try { - for(FastaSequenceIndexEntry entry: sequenceIndex) { - out.write(toIndexFileLine(entry)); - out.newLine(); - } - out.close(); - } - catch (Exception e) { - throw new UserException.CouldNotCreateOutputFile(faiFile, e); - } - } - - /** - * Print string in format of fai file line - * @return Contig as one line in a fai file - */ - private static String toIndexFileLine(FastaSequenceIndexEntry entry) { - return String.format("%s\t%d\t%d\t%d\t%d", entry.getContig(), entry.getSize(), entry.getLocation(), entry.getBasesPerLine(), entry.getBytesPerLine()); - } - -} diff --git a/public/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java b/public/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java deleted file mode 100644 index 9ac3bedf4..000000000 --- a/public/java/test/net/sf/picard/reference/FastaSequenceIndexBuilderUnitTest.java +++ /dev/null @@ -1,117 +0,0 @@ -/* -* Copyright (c) 2012 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR -* THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -package net.sf.picard.reference; - -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import org.testng.annotations.BeforeMethod; -import org.testng.annotations.Test; - -import java.io.File; -import java.io.FileNotFoundException; - -/** - * Test the fasta sequence index reader. - */ -public class FastaSequenceIndexBuilderUnitTest extends BaseTest { - - private FastaSequenceIndexBuilder builder; - private File fastaFile; - private FastaSequenceIndex controlIndex; - - @BeforeMethod - public void doForEachTest() throws FileNotFoundException { - controlIndex = new FastaSequenceIndex(); - } - - /** - * Tests basic unix file with one contig - * File is the exampleFASTA.fasta shipped with GATK - */ - @Test - public void unixFileTest() { - logger.warn("Executing unixFileTest"); - - fastaFile = new File(publicTestDir + "exampleFASTA.fasta"); - builder = new FastaSequenceIndexBuilder(fastaFile, false); - FastaSequenceIndex index = builder.createIndex(); - controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 100000, 60, 61,0)); - - Assert.assertTrue(index.equals(controlIndex)); - } - - - /** - * Tests basic windows file with one contig - * File is a simple fasta file - */ - @Test - public void windowsFileTest() { - logger.warn("Executing windowsFileTest"); - - fastaFile = new File(publicTestDir + "exampleFASTA-windows.fasta"); - builder = new FastaSequenceIndexBuilder(fastaFile, false); - FastaSequenceIndex index = builder.createIndex(); - controlIndex.add(new FastaSequenceIndexEntry("chr2", 7, 29, 7, 9,0)); - - Assert.assertTrue(index.equals(controlIndex)); - } - - /** - * Tests fasta with the two contigs from above combined - * File is the exampleFASTA.fasta shipped with GATK - */ - @Test - public void combinedWindowsUnix() { - logger.warn("Executing combinedWindowsUnix"); - - fastaFile = new File(publicTestDir + "exampleFASTA-combined.fasta"); - builder = new FastaSequenceIndexBuilder(fastaFile, false); - FastaSequenceIndex index = builder.createIndex(); - controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 100000, 60, 61,0)); - controlIndex.add(new FastaSequenceIndexEntry("chr2", 101680, 29, 7, 9,1)); - - Assert.assertTrue(index.equals(controlIndex)); - } - - /** - * Tests fasta with the two contigs from above combined - * File is the exampleFASTA.fasta shipped with GATK - */ - @Test - public void threeVariableLengthContigs() { - logger.warn("Executing threeVariableLengthContigs"); - - fastaFile = new File(publicTestDir + "exampleFASTA-3contigs.fasta"); - builder = new FastaSequenceIndexBuilder(fastaFile, false); - FastaSequenceIndex index = builder.createIndex(); - controlIndex.add(new FastaSequenceIndexEntry("chr1", 6, 17, 5, 6,0)); - controlIndex.add(new FastaSequenceIndexEntry("chr2", 35, 21, 7, 8,1)); - controlIndex.add(new FastaSequenceIndexEntry("chr3", 66, 100, 10, 11,2)); - - Assert.assertTrue(index.equals(controlIndex)); - } -}