diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 883dde665..a13f6bc9f 100755 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -52,10 +52,8 @@ import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.commandline.ArgumentException; import org.broadinstitute.sting.commandline.ArgumentSource; -import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; import java.io.File; -import java.io.FileNotFoundException; import java.util.*; public class GenomeAnalysisEngine { @@ -72,7 +70,7 @@ public class GenomeAnalysisEngine { /** * Accessor for sharded reference data. */ - private IndexedFastaSequenceFile referenceDataSource = null; + private ReferenceDataSource referenceDataSource = null; /** * Accessor for sharded reference-ordered data. @@ -192,7 +190,7 @@ public class GenomeAnalysisEngine { else { // if include argument isn't given, create new set of all possible intervals GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null && argCollection.RODToInterval == null ? - GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getSequenceDictionary()) : + GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) : loadIntervals(argCollection.intervals, argCollection.intervalMerging, GenomeLocParser.mergeIntervalLocations(checkRODToIntervalArgument(),argCollection.intervalMerging))); @@ -352,7 +350,7 @@ public class GenomeAnalysisEngine { validateSuppliedReferenceOrderedDataAgainstWalker(my_walker, tracks); // validate all the sequence dictionaries against the reference - validateSourcesAgainstReference(readsDataSource, referenceDataSource, tracks); + validateSourcesAgainstReference(readsDataSource, referenceDataSource.getReference(), tracks); rodDataSources = getReferenceOrderedDataSources(my_walker, tracks); } @@ -373,7 +371,7 @@ public class GenomeAnalysisEngine { Utils.scareUser(String.format("Read-based traversals require a reference file but none was given")); } - return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource,rodDataSources,argCollection.numberOfThreads); + return MicroScheduler.create(this,my_walker,readsDataSource,referenceDataSource.getReference(),rodDataSources,argCollection.numberOfThreads); } /** @@ -762,14 +760,14 @@ public class GenomeAnalysisEngine { ShardStrategyFactory.SHATTER_STRATEGY.INTERVAL : ShardStrategyFactory.SHATTER_STRATEGY.LINEAR; shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource, + referenceDataSource.getReference(), !argCollection.disableExperimentalSharding ? ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL : shardType, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, intervals, maxIterations); } else shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource, + referenceDataSource.getReference(), !argCollection.disableExperimentalSharding ? ShardStrategyFactory.SHATTER_STRATEGY.LOCUS_EXPERIMENTAL : ShardStrategyFactory.SHATTER_STRATEGY.LINEAR, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, maxIterations); @@ -782,14 +780,14 @@ public class GenomeAnalysisEngine { if (intervals != null && !intervals.isEmpty()) { shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource, + referenceDataSource.getReference(), shardType, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, intervals, maxIterations); } else { shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource, + referenceDataSource.getReference(), shardType, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, maxIterations); @@ -803,7 +801,7 @@ public class GenomeAnalysisEngine { Utils.scareUser("Pairs traversal cannot be used in conjunction with intervals."); shardStrategy = ShardStrategyFactory.shatter(readsDataSource, - referenceDataSource, + referenceDataSource.getReference(), ShardStrategyFactory.SHATTER_STRATEGY.READS_EXPERIMENTAL, drivingDataSource.getSequenceDictionary(), SHARD_SIZE, maxIterations); @@ -839,15 +837,9 @@ public class GenomeAnalysisEngine { * @param refFile Handle to a reference sequence file. Non-null. * @return A thread-safe file wrapper. */ - private IndexedFastaSequenceFile openReferenceSequenceFile(File refFile) { - IndexedFastaSequenceFile ref = null; - try { - ref = new IndexedFastaSequenceFile(refFile); - } - catch (FileNotFoundException ex) { - throw new StingException("I/O error while opening fasta file: " + ex.getMessage(), ex); - } - GenomeLocParser.setupRefContigOrdering(ref); + private ReferenceDataSource openReferenceSequenceFile(File refFile) { + ReferenceDataSource ref = new ReferenceDataSource(refFile); + GenomeLocParser.setupRefContigOrdering(ref.getReference()); return ref; } diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java new file mode 100644 index 000000000..5764ea8f3 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSource.java @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.simpleDataSources; + +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.StingException; +import org.broadinstitute.sting.utils.fasta.FastaSequenceIndex; +import org.broadinstitute.sting.utils.fasta.FastaSequenceIndexBuilder; +import org.broadinstitute.sting.utils.fasta.IndexedFastaSequenceFile; +import net.sf.picard.sam.CreateSequenceDictionary; + +import java.io.File; +import java.io.IOException; +/** + * Loads reference data from fasta file + * Looks for fai and dict files, and tries to create them if they don't exist + */ +public class ReferenceDataSource implements ReferenceDataSourceProgressListener { + private IndexedFastaSequenceFile index; + + /** our log, which we want to capture anything from this class */ + protected static org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(ReferenceDataSource.class); + + /** + * Create reference data source from fasta file + * @param fastaFile Fasta file to be used as reference + * + * Please note: This should be re-done when PIC-370 is fixed. (See below.) At that time, we may consider loading + * sequenceIndex and sequenceDict here, instead of in IndexedFastaSequenceFile. + */ + public ReferenceDataSource(File fastaFile) { + File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); + File dictFile = new File(fastaFile.getAbsolutePath().replace(".fasta", ".dict")); + FastaSequenceIndex sequenceIndex; // stores FastaSequenceIndex if file doesn't exist + + /* + Note: this code is temporary. See commented code below, which will be used when thread safety is resolved. + */ + if (!indexFile.exists()) { + FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, this); + sequenceIndex = faiBuilder.sequenceIndex; + try { + index = new IndexedFastaSequenceFile(fastaFile, sequenceIndex); + } + catch (Exception e) { + throw new StingException("Could not load fasta file. Stack trace below. ", e); + } + } + else { + try { + index = new IndexedFastaSequenceFile(fastaFile); + } + catch (Exception e) { + throw new StingException("Could not load fasta file. Stack trace below.", e); + } + } + + /* + * If dict file doesn't exist, try to create it using Picard's CreateSequenceDictionary + * Currently, dictionary cannot be created without running CreateSequenceDictionary's main routine, hence the + * argument string + * This has been filed in trac as (PIC-370) Want programmatic interface to CreateSequenceDictionary + * + * Todo: Currently working on making the creation of dictionary and index thread-safe + * + */ + /*if (!dictFile.exists()) { + logger.info(String.format("Fasta dictionary file %s does not exist. Trying to create it now.", + dictFile.getAbsolutePath())); + String args[] = {String.format("r=%s", fastaFile.getAbsolutePath()), + String.format("o=%s", dictFile.getAbsolutePath())}; + new CreateSequenceDictionary().instanceMain(args); + logger.info(String.format("Dictionary file created successfully!")); + }*/ + + /** + * Create fai file if it doesn't exist and throw exception if unsuccessful + * Note that this implies walkers cannot be run if a fai file is not provided and GATK cannot write to disk + * + * Todo: currently working on making this thread safe. + * Rather than creating a new fai file, structure is created in memory. This whole block will be fixed + * in a couple days when we figure out the thread stuff. + * + */ + /*if (!indexFile.exists()) { + logger.info(String.format("Fasta index file %s does not exist. Trying to create it now.", + indexFile.getAbsolutePath())); + FastaSequenceIndexBuilder faiBuilder = new FastaSequenceIndexBuilder(fastaFile, this); + + try { + faiBuilder.saveAsFaiFile(); + logger.info(String.format("Index file created successfully!")); + } + catch (Exception e) { + throw new StingException("Index file could not be saved", e); + } + } + + // now create IndexedFastaSequenceFile + try { + index = indexFile.exists() ? new IndexedFastaSequenceFile(fastaFile) : + new IndexedFastaSequenceFile(fastaFile, sequenceIndex); + } + catch (IOException e) { + throw new StingException("An error occurred while loading the fasta file and its .fasta.fai and " + + ".dict counterparts.", e); + } + catch (Exception e) { + throw new StingException("An error occurred while processing the fasta file and its .fasta.fai and " + + ".dict counterparts. If the error could have been caused by the .fasta.fai or .dict files, " + + "you can re-create them by removing them from the folder that the fasta file is in and " + + "running GATK again.", e); + }*/ + } + + /** + * Get indexed fasta file + * @return IndexedFastaSequenceFile that was created from file + */ + public IndexedFastaSequenceFile getReference() { + return this.index; + } + + /** + * Notify user of progress in creating fai file + * @param percent Percent of fasta file read as a percent + */ + public void percentProgress(int percent) { + System.out.println(String.format("PROGRESS UPDATE: file is %d percent complete", percent)); + } + +} diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSourceProgressListener.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSourceProgressListener.java new file mode 100644 index 000000000..b22f139a6 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ReferenceDataSourceProgressListener.java @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.simpleDataSources; + +public interface ReferenceDataSourceProgressListener { + public void percentProgress(int percent); +} diff --git a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java index a65dfdabf..53b23e1a8 100755 --- a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndex.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.fasta; import net.sf.picard.PicardException; @@ -18,19 +43,26 @@ public class FastaSequenceIndex implements Iterable { /** * Store the entries. Use a LinkedHashMap for consistent iteration in insertion order. */ - private Map sequenceEntries = new LinkedHashMap(); + private final Map sequenceEntries = new LinkedHashMap(); /** * Build a sequence index from the specified file. * @param indexFile File to open. * @throws FileNotFoundException if the index file cannot be found. */ - public FastaSequenceIndex( File indexFile ) throws FileNotFoundException { + protected FastaSequenceIndex( File indexFile ) throws FileNotFoundException { if(!indexFile.exists()) throw new FileNotFoundException(String.format("Fasta index file is missing: %s",indexFile.getAbsolutePath())); IoUtil.assertFileIsReadable(indexFile); parseIndexFile(indexFile); + } + + /** + * Build an empty sequence index. Entries can be added later. + */ + protected FastaSequenceIndex() { + } /** @@ -102,17 +134,46 @@ public class FastaSequenceIndex implements Iterable { public int size() { return sequenceEntries.size(); } + + /** + * Adds entry to index. Used by Fai file generator to create index entry on the fly. + * @param contig The name of the contig + * @param location Byte-referenced location of contig in file + * @param size Number of bases in contig + * @param basesPerLine Number of bases in each line. Must be uniform. + * @param bytesPerLine Number of bytes in each line. Must be uniform. + */ + public void addIndexEntry(String contig, long location, long size, int basesPerLine, int bytesPerLine) { + sequenceEntries.put( contig,new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine) ); + } + + /** + * Compare two FastaSequenceIndex objects. Built for use in testing. No hash function has been created. + * @param other Another FastaSequenceIndex to compare + * @return True if index has the same entries as other instance, in the same order + */ + public boolean equals(FastaSequenceIndex other) { + Iterator iter = this.iterator(); + Iterator otherIter = other.iterator(); + while (iter.hasNext()) { + if (!otherIter.hasNext()) + return false; + if (!iter.next().equals(otherIter.next())) + return false; + } + return true; + } } /** * Hold an individual entry in a fasta sequence index file. */ class FastaSequenceIndexEntry { - private String contig; - private long location; - private long size; - private int basesPerLine; - private int bytesPerLine; + private final String contig; + private final long location; + private final long size; + private final int basesPerLine; + private final int bytesPerLine; /** * Create a new entry with the given parameters. @@ -186,4 +247,22 @@ class FastaSequenceIndexEntry { basesPerLine, bytesPerLine ); } + + /** + * Print string in format of fai file line + * @return Contig as one line in a fai file + */ + public String toIndexFileLine() { + return String.format("%s\t%d\t%d\t%d\t%d", contig, size, location, basesPerLine, bytesPerLine); + } + + /** + * Compare entry to another instance + * @param other another FastaSequenceIndexEntry + * @return True if each has the same name, location, size, basesPerLine and bytesPerLine + */ + public boolean equals(FastaSequenceIndexEntry other) { + return (contig.equals(other.contig) && size == other.size && location == other.location + && basesPerLine == other.basesPerLine && bytesPerLine == other.bytesPerLine); + } } diff --git a/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilder.java b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilder.java new file mode 100644 index 000000000..7a96ccafa --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilder.java @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.fasta; + +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSourceProgressListener; +import org.broadinstitute.sting.utils.StingException; +import static org.broadinstitute.sting.utils.fasta.FastaSequenceIndexBuilder.Status.*; + +import java.io.*; +import java.util.Iterator; + +/** + * Builds FastaSequenceIndex from fasta file. + * Produces fai file with same output as samtools faidx + */ +public class FastaSequenceIndexBuilder { + public File fastaFile; + ReferenceDataSourceProgressListener progress; // interface that provides a method for updating user on progress of reading file + public FastaSequenceIndex sequenceIndex = new FastaSequenceIndex(); + + // keep track of location in file + long bytesRead, endOfLastLine, lastTimestamp, fileLength; // initialized to -1 to keep 0-indexed position in file; + + // vars that store information about the contig that is currently being read + String contig; + long location, size, bytesPerLine, basesPerLine, basesThisLine; + + // vars that keep loop state + byte lastByte = 0, currentByte = 0, nextByte = 0; + public enum Status { NONE, CONTIG, FIRST_SEQ_LINE, SEQ_LINE, COMMENT } + Status status = Status.NONE; // keeps state of what is currently being read. better to use int instead of enum? + + public FastaSequenceIndexBuilder(File fastaFile, ReferenceDataSourceProgressListener progress) { + this.progress = progress; + this.fastaFile = fastaFile; + fileLength = fastaFile.length(); + parseFastaFile(); + } + + /** + * Creates fasta sequence index from fasta file + */ + private void parseFastaFile() { + bytesRead = -1; + endOfLastLine = -1; + contig = ""; + location = 0; + size = 0; + bytesPerLine = 0; + basesPerLine = 0; + basesThisLine = 0; + lastTimestamp = System.currentTimeMillis(); + + // initialize input stream + DataInputStream in; + try { + in = new DataInputStream(new BufferedInputStream(new FileInputStream(fastaFile))); + } + catch (Exception e) { + throw new StingException(String.format("Could not read fasta file %s", fastaFile.getAbsolutePath())); + } + + /* + * iterate through each character in file one at a time, but must account for variance in line terminators + * strategy is to check if current character is a line terminator (\n, \r), then check next character + * only treat as end of line if next character is NOT a line terminator + */ + try { + // intialize iterators + nextByte = in.readByte(); + currentByte = '\n'; + while(currentByte != -1) { + + bytesRead ++; // update position in file + lastByte = currentByte; + currentByte = nextByte; + try { + nextByte = in.readByte(); + } + catch (EOFException e) { + nextByte = -1; + } + + switch(status) { + + // if not currently reading anything + // only thing that can trigger action is '>' (start of contig) or ';' (comment) + case NONE: + if (currentByte == '>') + status = CONTIG; + else if (currentByte == ';') + status = COMMENT; + break; + + // if reading a comment, just ignore everything until end of line + case COMMENT: + if (isEol(currentByte)) { + if (!isEol(nextByte)) + status = Status.NONE; + } + break; + + // if reading a contig, add char to contig string + // contig string can be anything, including spaces + case CONTIG: + if (isEol(currentByte)) { + if (!isEol(nextByte)) { + status = Status.FIRST_SEQ_LINE; + location = bytesRead + 1; + } + } + else + contig += (char) currentByte; + break; + + // record bases and bytes of first sequence line, to validate against later lines + case FIRST_SEQ_LINE: + + if (isEol(currentByte)) { + + // record bases per line if last character was a base + if (!isEol(lastByte)) { + basesPerLine = bytesRead - location; + basesThisLine = basesPerLine; + size += basesPerLine; + } + + // next character is start of next line, now know bytes per line + if (!isEol(nextByte)) { // figure out what to do if there is only one data line + bytesPerLine = bytesRead - location + 1; + status = Status.SEQ_LINE; + endOfLastLine = bytesRead; + + // if next char is ';' or '>', then there is only one contig => + if (nextByte == ';' || nextByte == '>') + finishReadingContig(); + } + } + + // validate base character + else { + if (!isValidBase(currentByte)) + throw new StingException(String.format("An invalid base was found in the contig: %s", contig)); + } + break; + + + case SEQ_LINE: + + if (isEol(currentByte)) { + + // record bases per line if last character was a base + if (!isEol(lastByte)) { + basesThisLine = bytesRead - endOfLastLine - 1; + size += basesThisLine; + } + + // reached end of line - check if end of contig + if (!isEol(nextByte)) { + + // if comment or new contig, definitely end of sequence + if (nextByte == ';' || nextByte == '>') + finishReadingContig(); + + // if this line has different # of bases OR same # of bases and different # of bytes: + // error if next char is a valid base, end of contig otherwise + else if (basesThisLine != basesPerLine || bytesPerLine != bytesRead - endOfLastLine) { + if (isValidBase(nextByte) && nextByte != -1) { + throw new StingException(String.format("An invalid line was found in the contig: %s", contig)); + } + else + finishReadingContig(); + } + endOfLastLine = bytesRead; + } + } + + // validate base character + else { + if (!isValidBase(currentByte)) + throw new StingException(String.format("An invalid base was found in the contig: %s", contig)); + } + break; + } + } + } + catch (IOException e) { + throw new StingException(String.format("Could not read fasta file %s", fastaFile.getAbsolutePath()), e); } + catch (Exception e) { + throw new StingException(e.getMessage(), e); + } + } + + /** + * Checks if character is an end of line character, \n or \r + * @param currentByte Character to check, as a byte + * @return True if current character is \n or \r' false otherwise + */ + private boolean isEol(byte currentByte) { + return (currentByte == '\n' || currentByte == '\r'); + } + + + /** + * checks that character is valid base + * only checks that the base isn't whitespace, like picard does + * could add check that character is A/C/G/T/U if wanted + * @param currentByte Character to check, as a byte + * @return True if character is not whitespace, false otherwise + */ + private boolean isValidBase(byte currentByte) { + return (!Character.isWhitespace(currentByte) && currentByte != ';' && currentByte != '>'); + } + + /* + * When reader reaches the end of a contig + * Reset iterators and add contig to sequence index + */ + private void finishReadingContig() { + sequenceIndex.addIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine); + status = Status.NONE; + contig = ""; + size = 0; + + if (System.currentTimeMillis() - lastTimestamp > 10000) { + int percentProgress = (int) (100*bytesRead/fileLength); + if (progress != null) + progress.percentProgress(percentProgress); + lastTimestamp = System.currentTimeMillis(); + } + } + + /** + * Stores FastaSequenceIndex as a .fasta.fai file on local machine + * Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder + */ + public void saveAsFaiFile() { + File indexFile = new File(fastaFile.getAbsolutePath() + ".fai"); + + if (indexFile.exists()) { + throw new StingException(String.format("Fai file could not be created, because a file with name %s already exists." + + "Please delete or rename this file and try again.", indexFile.getAbsolutePath())); + } + + BufferedWriter out; + try { + out = new BufferedWriter(new FileWriter(indexFile)); + } + catch (Exception e) { + throw new StingException(String.format("Could not open file %s for writing. Check that GATK is permitted to write to disk.", + indexFile.getAbsolutePath()), e); + } + + Iterator iter = sequenceIndex.iterator(); + + try { + while (iter.hasNext()) { + out.write(iter.next().toIndexFileLine()); + out.newLine(); + } + out.close(); + } + catch (Exception e) { + throw new StingException(String.format("An error occurred while writing file %s", e)); + } + } +} diff --git a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java index ef9ad6a12..f3f5db8a0 100755 --- a/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java +++ b/java/src/org/broadinstitute/sting/utils/fasta/IndexedFastaSequenceFile.java @@ -64,6 +64,24 @@ public class IndexedFastaSequenceFile implements ReferenceSequenceFile { sanityCheckDictionaryAgainstIndex(); } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * @param file The file to open. + * @param sequenceIndex FastaSequenceIndex that was previously created + * @throws FileNotFoundException If the fasta or any of its supporting files cannot be found. + */ + public IndexedFastaSequenceFile(File file, FastaSequenceIndex sequenceIndex) throws FileNotFoundException { + this.file = file; + FileInputStream in = new FileInputStream(file); + channel = in.getChannel(); + + loadDictionary(file); + // Temporary change: sequenceIndex is passed in directly. See comments in ReferenceDataSource. + index = sequenceIndex; + sanityCheckDictionaryAgainstIndex(); + } + /** * Loads a dictionary, if available. * @param fastaFile File to check for a match. diff --git a/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilderUnitTest.java b/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilderUnitTest.java new file mode 100644 index 000000000..107fb6d4e --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/fasta/FastaSequenceIndexBuilderUnitTest.java @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.fasta; + +import net.sf.picard.PicardException; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSourceProgressListener; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.File; +import java.io.FileNotFoundException; + +/** + * Test the fasta sequence index reader. + */ +public class FastaSequenceIndexBuilderUnitTest extends BaseTest { + + private FastaSequenceIndexBuilder builder; + private ReferenceDataSourceProgressListener progress; + private File fastaFile; + private FastaSequenceIndex sequenceIndex; + + @Before + public void doForEachTest() throws FileNotFoundException { + sequenceIndex = new FastaSequenceIndex(); + } + + /** + * Tests basic unix file with one contig + * File is the exampleFASTA.fasta shipped with GATK + */ + @Test + public void unixFileTest() { + logger.warn("Executing unixFileTest"); + + fastaFile = new File(validationDataLocation + "exampleFASTA.fasta"); + builder = new FastaSequenceIndexBuilder(fastaFile, progress); + sequenceIndex.addIndexEntry("chr1", 6, 100000, 60, 61); + + Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex)); + } + + + /** + * Tests basic windows file with one contig + * File is a simple fasta file + */ + @Test + public void windowsFileTest() { + logger.warn("Executing windowsFileTest"); + + fastaFile = new File(validationDataLocation + "exampleFASTA-windows.fasta"); + builder = new FastaSequenceIndexBuilder(fastaFile, progress); + sequenceIndex.addIndexEntry("chr2", 7, 29, 7, 9); + + Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex)); + } + + /** + * Tests fasta with the two contigs from above combined + * File is the exampleFASTA.fasta shipped with GATK + */ + @Test + public void combinedWindowsUnix() { + logger.warn("Executing combinedWindowsUnix"); + + fastaFile = new File(validationDataLocation + "exampleFASTA-combined.fasta"); + builder = new FastaSequenceIndexBuilder(fastaFile, progress); + sequenceIndex.addIndexEntry("chr1", 6, 100000, 60, 61); + sequenceIndex.addIndexEntry("chr2", 101680, 29, 7, 9); + + Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex)); + } + + /** + * Tests fasta with the two contigs from above combined + * File is the exampleFASTA.fasta shipped with GATK + */ + @Test + public void threeVariableLengthContigs() { + logger.warn("Executing threeVariableLengthContigs"); + + fastaFile = new File(validationDataLocation + "exampleFASTA-3contigs.fasta"); + builder = new FastaSequenceIndexBuilder(fastaFile, progress); + sequenceIndex.addIndexEntry("chr1", 6, 17, 5, 6); + sequenceIndex.addIndexEntry("chr2", 35, 21, 7, 8); + sequenceIndex.addIndexEntry("chr3", 66, 100, 10, 11); + + Assert.assertTrue(sequenceIndex.equals(builder.sequenceIndex)); + } +} \ No newline at end of file