diff --git a/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java new file mode 100644 index 000000000..4b1c7a999 --- /dev/null +++ b/public/java/src/net/sf/picard/sam/MergingSamRecordIterator.java @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package net.sf.picard.sam; + +import net.sf.picard.PicardException; + +import java.util.*; +import java.lang.reflect.Constructor; + +import net.sf.samtools.*; +import net.sf.samtools.util.CloseableIterator; + +/** + * Provides an iterator interface for merging multiple underlying iterators into a single + * iterable stream. The underlying iterators/files must all have the same sort order unless + * the requested output format is unsorted, in which case any combination is valid. + */ +public class MergingSamRecordIterator implements CloseableIterator { + private final PriorityQueue pq; + private final SamFileHeaderMerger samHeaderMerger; + private final Collection readers; + private final SAMFileHeader.SortOrder sortOrder; + private final SAMRecordComparator comparator; + + private boolean initialized = false; + private boolean iterationStarted = false; + + /** + * Constructs a new merging iterator with the same set of readers and sort order as + * provided by the header merger parameter. + * @param headerMerger The merged header and contents of readers. + * @param forcePresorted True to ensure that the iterator checks the headers of the readers for appropriate sort order. + * @deprecated replaced by (SamFileHeaderMerger, Collection, boolean) + */ + public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, final boolean forcePresorted) { + this(headerMerger, headerMerger.getReaders(), forcePresorted); + } + + /** + * Constructs a new merging iterator with the same set of readers and sort order as + * provided by the header merger parameter. + * @param headerMerger The merged header and contents of readers. + * @param assumeSorted false ensures that the iterator checks the headers of the readers for appropriate sort order. + */ + public MergingSamRecordIterator(final SamFileHeaderMerger headerMerger, Collection readers, final boolean assumeSorted) { + this.samHeaderMerger = headerMerger; + this.sortOrder = headerMerger.getMergedHeader().getSortOrder(); + this.comparator = getComparator(); + this.readers = readers; + + this.pq = new PriorityQueue(readers.size()); + + for (final SAMFileReader reader : readers) { + if (!assumeSorted && this.sortOrder != SAMFileHeader.SortOrder.unsorted && + reader.getFileHeader().getSortOrder() != this.sortOrder){ + throw new PicardException("Files are not compatible with sort order"); + } + } + } + + /** + * Add a given SAM file iterator to the merging iterator. Use this to restrict the merged iteration to a given genomic interval, + * rather than iterating over every read in the backing file or stream. + * @param reader Reader to add to the merging iterator. + * @param iterator Iterator traversing over reader contents. + */ + public void addIterator(final SAMFileReader reader, final CloseableIterator iterator) { + if(iterationStarted) + throw new PicardException("Cannot add another iterator; iteration has already begun"); + if(!samHeaderMerger.containsHeader(reader.getFileHeader())) + throw new PicardException("All iterators to be merged must be accounted for in the SAM header merger"); + final ComparableSamRecordIterator comparableIterator = new ComparableSamRecordIterator(reader,iterator,comparator); + addIfNotEmpty(comparableIterator); + initialized = true; + } + + private void startIterationIfRequired() { + if(initialized) + return; + for(SAMFileReader reader: readers) + addIterator(reader,reader.iterator()); + iterationStarted = true; + } + + /** + * Close down all open iterators. + */ + public void close() { + // Iterators not in the priority queue have already been closed; only close down the iterators that are still in the priority queue. + for(CloseableIterator iterator: pq) + iterator.close(); + } + + /** Returns true if any of the underlying iterators has more records, otherwise false. */ + public boolean hasNext() { + startIterationIfRequired(); + return !this.pq.isEmpty(); + } + + /** Returns the next record from the top most iterator during merging. */ + public SAMRecord next() { + startIterationIfRequired(); + + final ComparableSamRecordIterator iterator = this.pq.poll(); + final SAMRecord record = iterator.next(); + addIfNotEmpty(iterator); + record.setHeader(this.samHeaderMerger.getMergedHeader()); + + // Fix the read group if needs be + if (this.samHeaderMerger.hasReadGroupCollisions()) { + final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.READ_GROUP_ID); + if (oldGroupId != null ) { + final String newGroupId = this.samHeaderMerger.getReadGroupId(iterator.getReader().getFileHeader(),oldGroupId); + record.setAttribute(ReservedTagConstants.READ_GROUP_ID, newGroupId); + } + } + + // Fix the program group if needs be + if (this.samHeaderMerger.hasProgramGroupCollisions()) { + final String oldGroupId = (String) record.getAttribute(ReservedTagConstants.PROGRAM_GROUP_ID); + if (oldGroupId != null ) { + final String newGroupId = this.samHeaderMerger.getProgramGroupId(iterator.getReader().getFileHeader(),oldGroupId); + record.setAttribute(ReservedTagConstants.PROGRAM_GROUP_ID, newGroupId); + } + } + + // Fix up the sequence indexes if needs be + if (this.samHeaderMerger.hasMergedSequenceDictionary()) { + if (record.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + record.setReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getReferenceIndex())); + } + + if (record.getReadPairedFlag() && record.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + record.setMateReferenceIndex(this.samHeaderMerger.getMergedSequenceIndex(iterator.getReader().getFileHeader(),record.getMateReferenceIndex())); + } + } + + return record; + } + + /** + * Adds iterator to priority queue. If the iterator has more records it is added + * otherwise it is closed and not added. + */ + private void addIfNotEmpty(final ComparableSamRecordIterator iterator) { + if (iterator.hasNext()) { + pq.offer(iterator); + } + else { + iterator.close(); + } + } + + /** Unsupported operation. */ + public void remove() { + throw new UnsupportedOperationException("MergingSAMRecorderIterator.remove()"); + } + + /** + * Get the right comparator for a given sort order (coordinate, alphabetic). In the + * case of "unsorted" it will return a comparator that gives an arbitrary but reflexive + * ordering. + */ + private SAMRecordComparator getComparator() { + // For unsorted build a fake comparator that compares based on object ID + if (this.sortOrder == SAMFileHeader.SortOrder.unsorted) { + return new SAMRecordComparator() { + public int fileOrderCompare(final SAMRecord lhs, final SAMRecord rhs) { + return System.identityHashCode(lhs) - System.identityHashCode(rhs); + } + + public int compare(final SAMRecord lhs, final SAMRecord rhs) { + return fileOrderCompare(lhs, rhs); + } + }; + } + if (samHeaderMerger.hasMergedSequenceDictionary() && sortOrder.equals(SAMFileHeader.SortOrder.coordinate)) { + return new MergedSequenceDictionaryCoordinateOrderComparator(); + } + + // Otherwise try and figure out what kind of comparator to return and build it + return this.sortOrder.getComparatorInstance(); + } + + /** Returns the merged header that the merging iterator is working from. */ + public SAMFileHeader getMergedHeader() { + return this.samHeaderMerger.getMergedHeader(); + } + + /** + * Ugh. Basically does a regular coordinate compare, but looks up the sequence indices in the merged + * sequence dictionary. I hate the fact that this extends SAMRecordCoordinateComparator, but it avoids + * more copy & paste. + */ + private class MergedSequenceDictionaryCoordinateOrderComparator extends SAMRecordCoordinateComparator { + + public int fileOrderCompare(final SAMRecord samRecord1, final SAMRecord samRecord2) { + final int referenceIndex1 = getReferenceIndex(samRecord1); + final int referenceIndex2 = getReferenceIndex(samRecord2); + if (referenceIndex1 != referenceIndex2) { + if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return 1; + } else if (referenceIndex2 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return -1; + } else { + return referenceIndex1 - referenceIndex2; + } + } + if (referenceIndex1 == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + // Both are unmapped. + return 0; + } + return samRecord1.getAlignmentStart() - samRecord2.getAlignmentStart(); + } + + private int getReferenceIndex(final SAMRecord samRecord) { + if (samRecord.getReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getReferenceIndex()); + } + if (samRecord.getMateReferenceIndex() != SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) { + return samHeaderMerger.getMergedSequenceIndex(samRecord.getHeader(), samRecord.getMateReferenceIndex()); + } + return SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX; + } + } +} diff --git a/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java new file mode 100644 index 000000000..f78cd81da --- /dev/null +++ b/public/java/src/net/sf/picard/sam/SamFileHeaderMerger.java @@ -0,0 +1,744 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package net.sf.picard.sam; + +import java.util.*; + +import net.sf.picard.PicardException; +import net.sf.samtools.AbstractSAMHeaderRecord; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMProgramRecord; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMSequenceDictionary; +import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.SequenceUtil; + +/** + * Merges SAMFileHeaders that have the same sequences into a single merged header + * object while providing read group translation for cases where read groups + * clash across input headers. + */ +public class SamFileHeaderMerger { + //Super Header to construct + private final SAMFileHeader mergedHeader; + private Collection readers; + private final Collection headers; + + //Translation of old group ids to new group ids + private final Map> samReadGroupIdTranslation = + new IdentityHashMap>(); + + //the read groups from different files use the same group ids + private boolean hasReadGroupCollisions = false; + + //the program records from different files use the same program record ids + private boolean hasProgramGroupCollisions = false; + + //Translation of old program group ids to new program group ids + private Map> samProgramGroupIdTranslation = + new IdentityHashMap>(); + + private boolean hasMergedSequenceDictionary = false; + + // Translation of old sequence dictionary ids to new dictionary ids + // This is an IdentityHashMap because it can be quite expensive to compute the hashCode for + // large SAMFileHeaders. It is possible that two input files will have identical headers so that + // the regular HashMap would fold them together, but the value stored in each of the two + // Map entries will be the same, so it should not hurt anything. + private final Map> samSeqDictionaryIdTranslationViaHeader = + new IdentityHashMap>(); + + //HeaderRecordFactory that creates SAMReadGroupRecord instances. + private static final HeaderRecordFactory READ_GROUP_RECORD_FACTORY = new HeaderRecordFactory() { + public SAMReadGroupRecord createRecord(String id, SAMReadGroupRecord srcReadGroupRecord) { + return new SAMReadGroupRecord(id, srcReadGroupRecord); + } + }; + + //HeaderRecordFactory that creates SAMProgramRecord instances. + private static final HeaderRecordFactory PROGRAM_RECORD_FACTORY = new HeaderRecordFactory() { + public SAMProgramRecord createRecord(String id, SAMProgramRecord srcProgramRecord) { + return new SAMProgramRecord(id, srcProgramRecord); + } + }; + + //comparator used to sort lists of program group and read group records + private static final Comparator RECORD_ID_COMPARATOR = new Comparator() { + public int compare(AbstractSAMHeaderRecord o1, AbstractSAMHeaderRecord o2) { + return o1.getId().compareTo(o2.getId()); + } + }; + + /** + * Create SAMFileHeader with additional information. Required that sequence dictionaries agree. + * + * @param readers sam file readers to combine + * @param sortOrder sort order new header should have + * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean) + */ + public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder) { + this(readers, sortOrder, false); + } + + /** + * Create SAMFileHeader with additional information. + * + * @param readers sam file readers to combine + * @param sortOrder sort order new header should have + * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that + * all input sequence dictionaries be identical. + * @deprecated replaced by SamFileHeaderMerger(Collection, SAMFileHeader.SortOrder, boolean) + */ + public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder, final boolean mergeDictionaries) { + this(sortOrder, getHeadersFromReaders(readers), mergeDictionaries); + this.readers = readers; + } + + /** + * Create SAMFileHeader with additional information.. This is the preferred constructor. + * + * @param sortOrder sort order new header should have + * @param headers sam file headers to combine + * @param mergeDictionaries If true, merge sequence dictionaries in new header. If false, require that + * all input sequence dictionaries be identical. + */ + public SamFileHeaderMerger(final SAMFileHeader.SortOrder sortOrder, final Collection headers, final boolean mergeDictionaries) { + this.headers = headers; + this.mergedHeader = new SAMFileHeader(); + + SAMSequenceDictionary sequenceDictionary; + try { + sequenceDictionary = getSequenceDictionary(headers); + this.hasMergedSequenceDictionary = false; + } + catch (SequenceUtil.SequenceListsDifferException pe) { + if (mergeDictionaries) { + sequenceDictionary = mergeSequenceDictionaries(headers); + this.hasMergedSequenceDictionary = true; + } + else { + throw pe; + } + } + + this.mergedHeader.setSequenceDictionary(sequenceDictionary); + + // Set program that creates input alignments + for (final SAMProgramRecord program : mergeProgramGroups(headers)) { + this.mergedHeader.addProgramRecord(program); + } + + // Set read groups for merged header + final List readGroups = mergeReadGroups(headers); + this.mergedHeader.setReadGroups(readGroups); + this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none); + + this.mergedHeader.setSortOrder(sortOrder); + + for (final SAMFileHeader header : headers) { + for (final String comment : header.getComments()) { + this.mergedHeader.addComment(comment); + } + } + } + + // Utilility method to make use with old constructor + private static List getHeadersFromReaders(Collection readers) { + List headers = new ArrayList(readers.size()); + for (SAMFileReader reader : readers) { + headers.add(reader.getFileHeader()); + } + return headers; + } + + + /** + * Checks to see if there are clashes where different readers are using the same read + * group IDs. If yes, then those IDs that collided are remapped. + * + * @param headers headers to combine + * @return new list of read groups constructed from all the readers + */ + private List mergeReadGroups(final Collection headers) { + //prepare args for mergeHeaderRecords(..) call + final HashSet idsThatAreAlreadyTaken = new HashSet(); + + final List> readGroupsToProcess = new LinkedList>(); + for (final SAMFileHeader header : headers) { + for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { + //verify that there are no existing id collisions in this input file + if(!idsThatAreAlreadyTaken.add(readGroup.getId())) + throw new PicardException("Input file: " + header + " contains more than one RG with the same id (" + readGroup.getId() + ")"); + + readGroupsToProcess.add(new HeaderRecordAndFileHeader(readGroup, header)); + } + idsThatAreAlreadyTaken.clear(); + } + + final List result = new LinkedList(); + + hasReadGroupCollisions = mergeHeaderRecords(readGroupsToProcess, READ_GROUP_RECORD_FACTORY, idsThatAreAlreadyTaken, samReadGroupIdTranslation, result); + + //sort the result list by record id + Collections.sort(result, RECORD_ID_COMPARATOR); + + return result; + } + + + /** + * Checks to see if there are clashes where different readers are using the same program + * group IDs. If yes, then those IDs that collided are remapped. + * + * @param headers headers to combine + * @return new list of program groups constructed from all the readers + */ + private List mergeProgramGroups(final Collection headers) { + + final List overallResult = new LinkedList(); + + //this Set will accumulate all SAMProgramRecord ids that have been encountered so far. + final HashSet idsThatAreAlreadyTaken = new HashSet(); + + //need to process all program groups + List> programGroupsLeftToProcess = new LinkedList>(); + for (final SAMFileHeader header : headers) { + for (final SAMProgramRecord programGroup : header.getProgramRecords()) { + //verify that there are no existing id collisions in this input file + if(!idsThatAreAlreadyTaken.add(programGroup.getId())) + throw new PicardException("Input file: " + header + " contains more than one PG with the same id (" + programGroup.getId() + ")"); + + programGroupsLeftToProcess.add(new HeaderRecordAndFileHeader(programGroup, header)); + } + idsThatAreAlreadyTaken.clear(); + } + + //A program group header (lets say ID=2 PN=B PP=1) may have a PP (previous program) attribute which chains it to + //another program group header (lets say ID=1 PN=A) to indicate that the given file was + //processed by program A followed by program B. These PP attributes potentially + //connect headers into one or more tree structures. Merging is done by + //first merging all headers that don't have PP attributes (eg. tree roots), + //then updating and merging all headers whose PPs point to the tree-root headers, + //and so on until all program group headers are processed. + + //currentProgramGroups is the list of records to merge next. Start by merging the programGroups that don't have a PP attribute (eg. the tree roots). + List< HeaderRecordAndFileHeader > currentProgramGroups = new LinkedList>(); + for(final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { + final HeaderRecordAndFileHeader pair = programGroupsLeftToProcessIterator.next(); + if(pair.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG) == null) { + programGroupsLeftToProcessIterator.remove(); + currentProgramGroups.add(pair); + } + } + + //merge currentProgramGroups + while(!currentProgramGroups.isEmpty()) + { + final List currentResult = new LinkedList(); + + hasProgramGroupCollisions |= mergeHeaderRecords(currentProgramGroups, PROGRAM_RECORD_FACTORY, idsThatAreAlreadyTaken, samProgramGroupIdTranslation, currentResult); + + //add currentResults to overallResults + overallResult.addAll(currentResult); + + //apply the newly-computed id translations to currentProgramGroups and programGroupsLeftToProcess + currentProgramGroups = translateIds(currentProgramGroups, samProgramGroupIdTranslation, false); + programGroupsLeftToProcess = translateIds(programGroupsLeftToProcess, samProgramGroupIdTranslation, true); + + //find all records in programGroupsLeftToProcess whose ppId points to a record that was just processed (eg. a record that's in currentProgramGroups), + //and move them to the list of programGroupsToProcessNext. + LinkedList> programGroupsToProcessNext = new LinkedList>(); + for(final Iterator> programGroupsLeftToProcessIterator = programGroupsLeftToProcess.iterator(); programGroupsLeftToProcessIterator.hasNext(); ) { + final HeaderRecordAndFileHeader pairLeftToProcess = programGroupsLeftToProcessIterator.next(); + final Object ppIdOfRecordLeftToProcess = pairLeftToProcess.getHeaderRecord().getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); + //find what currentProgramGroups this ppId points to (NOTE: they have to come from the same file) + for(final HeaderRecordAndFileHeader justProcessedPair : currentProgramGroups) { + String idJustProcessed = justProcessedPair.getHeaderRecord().getId(); + if(pairLeftToProcess.getFileHeader() == justProcessedPair.getFileHeader() && ppIdOfRecordLeftToProcess.equals(idJustProcessed)) { + programGroupsLeftToProcessIterator.remove(); + programGroupsToProcessNext.add(pairLeftToProcess); + break; + } + } + } + + currentProgramGroups = programGroupsToProcessNext; + } + + //verify that all records were processed + if(!programGroupsLeftToProcess.isEmpty()) { + StringBuffer errorMsg = new StringBuffer(programGroupsLeftToProcess.size() + " program groups weren't processed. Do their PP ids point to existing PGs? \n"); + for( final HeaderRecordAndFileHeader pair : programGroupsLeftToProcess ) { + SAMProgramRecord record = pair.getHeaderRecord(); + errorMsg.append("@PG ID:"+record.getProgramGroupId()+" PN:"+record.getProgramName()+" PP:"+record.getPreviousProgramGroupId() +"\n"); + } + throw new PicardException(errorMsg.toString()); + } + + //sort the result list by record id + Collections.sort(overallResult, RECORD_ID_COMPARATOR); + + return overallResult; + } + + + /** + * Utility method that takes a list of program groups and remaps all their + * ids (including ppIds if requested) using the given idTranslationTable. + * + * NOTE: when remapping, this method creates new SAMProgramRecords and + * doesn't mutate any records in the programGroups list. + * + * @param programGroups The program groups to translate. + * @param idTranslationTable The translation table. + * @param translatePpIds Whether ppIds should be translated as well. + * + * @return The list of translated records. + */ + private List> translateIds( + List> programGroups, + Map> idTranslationTable, + boolean translatePpIds) { + + //go through programGroups and translate any IDs and PPs based on the idTranslationTable. + List> result = new LinkedList>(); + for(final HeaderRecordAndFileHeader pair : programGroups ) { + final SAMProgramRecord record = pair.getHeaderRecord(); + final String id = record.getProgramGroupId(); + final String ppId = (String) record.getAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG); + + final SAMFileHeader header = pair.getFileHeader(); + final Map translations = idTranslationTable.get(header); + + //see if one or both ids need to be translated + SAMProgramRecord translatedRecord = null; + if(translations != null) + { + String translatedId = translations.get( id ); + String translatedPpId = translatePpIds ? translations.get( ppId ) : null; + + boolean needToTranslateId = translatedId != null && !translatedId.equals(id); + boolean needToTranslatePpId = translatedPpId != null && !translatedPpId.equals(ppId); + + if(needToTranslateId && needToTranslatePpId) { + translatedRecord = new SAMProgramRecord(translatedId, record); + translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); + } else if(needToTranslateId) { + translatedRecord = new SAMProgramRecord(translatedId, record); + } else if(needToTranslatePpId) { + translatedRecord = new SAMProgramRecord(id, record); + translatedRecord.setAttribute(SAMProgramRecord.PREVIOUS_PROGRAM_GROUP_ID_TAG, translatedPpId); + } + } + + if(translatedRecord != null) { + result.add(new HeaderRecordAndFileHeader(translatedRecord, header)); + } else { + result.add(pair); //keep the original record + } + } + + return result; + } + + + /** + * Utility method for merging a List of AbstractSAMHeaderRecords. If it finds + * records that have identical ids and attributes, it will collapse them + * into one record. If it finds records that have identical ids but + * non-identical attributes, this is treated as a collision. When collision happens, + * the records' ids are remapped, and an old-id to new-id mapping is added to the idTranslationTable. + * + * NOTE: Non-collided records also get recorded in the idTranslationTable as + * old-id to old-id. This way, an idTranslationTable lookup should never return null. + * + * @param headerRecords The header records to merge. + * @param headerRecordFactory Constructs a specific subclass of AbstractSAMHeaderRecord. + * @param idsThatAreAlreadyTaken If the id of a headerRecord matches an id in this set, it will be treated as a collision, and the headRecord's id will be remapped. + * @param idTranslationTable When records collide, their ids are remapped, and an old-id to new-id + * mapping is added to the idTranslationTable. Non-collided records also get recorded in the idTranslationTable as + * old-id to old-id. This way, an idTranslationTable lookup should never return null. + * + * @param result The list of merged header records. + * + * @return True if there were collisions. + */ + private boolean mergeHeaderRecords(final List> headerRecords, HeaderRecordFactory headerRecordFactory, + final HashSet idsThatAreAlreadyTaken, Map> idTranslationTable, List result) { + + //The outer Map bins the header records by their ids. The nested Map further collapses + //header records which, in addition to having the same id, also have identical attributes. + //In other words, each key in the nested map represents one or more + //header records which have both identical ids and identical attributes. The List of + //SAMFileHeaders keeps track of which readers these header record(s) came from. + final Map>> idToRecord = + new HashMap>>(); + + //Populate the idToRecord and seenIds data structures + for (final HeaderRecordAndFileHeader pair : headerRecords) { + final RecordType record = pair.getHeaderRecord(); + final SAMFileHeader header = pair.getFileHeader(); + final String recordId = record.getId(); + Map> recordsWithSameId = idToRecord.get(recordId); + if(recordsWithSameId == null) { + recordsWithSameId = new LinkedHashMap>(); + idToRecord.put(recordId, recordsWithSameId); + } + + List fileHeaders = recordsWithSameId.get(record); + if(fileHeaders == null) { + fileHeaders = new LinkedList(); + recordsWithSameId.put(record, fileHeaders); + } + + fileHeaders.add(header); + } + + //Resolve any collisions between header records by remapping their ids. + boolean hasCollisions = false; + for (final Map.Entry>> entry : idToRecord.entrySet() ) + { + final String recordId = entry.getKey(); + final Map> recordsWithSameId = entry.getValue(); + + + for( Map.Entry> recordWithUniqueAttr : recordsWithSameId.entrySet()) { + final RecordType record = recordWithUniqueAttr.getKey(); + final List fileHeaders = recordWithUniqueAttr.getValue(); + + String newId; + if(!idsThatAreAlreadyTaken.contains(recordId)) { + //don't remap 1st record. If there are more records + //with this id, they will be remapped in the 'else'. + newId = recordId; + idsThatAreAlreadyTaken.add(recordId); + } else { + //there is more than one record with this id. + hasCollisions = true; + + //find a unique newId for this record + int idx=1; + while(idsThatAreAlreadyTaken.contains(newId = recordId + "." + Integer.toString(idx++))) + ; + + idsThatAreAlreadyTaken.add( newId ); + } + + for(SAMFileHeader fileHeader : fileHeaders) { + Map readerTranslationTable = idTranslationTable.get(fileHeader); + if(readerTranslationTable == null) { + readerTranslationTable = new HashMap(); + idTranslationTable.put(fileHeader, readerTranslationTable); + } + readerTranslationTable.put(recordId, newId); + } + + result.add( headerRecordFactory.createRecord(newId, record) ); + } + } + + return hasCollisions; + } + + + /** + * Get the sequences off the SAMFileHeader. Throws runtime exception if the sequence + * are different from one another. + * + * @param headers headers to pull sequences from + * @return sequences from files. Each file should have the same sequence + */ + private SAMSequenceDictionary getSequenceDictionary(final Collection headers) { + SAMSequenceDictionary sequences = null; + for (final SAMFileHeader header : headers) { + + if (sequences == null) { + sequences = header.getSequenceDictionary(); + } + else { + final SAMSequenceDictionary currentSequences = header.getSequenceDictionary(); + SequenceUtil.assertSequenceDictionariesEqual(sequences, currentSequences); + } + } + + return sequences; + } + + /** + * Get the sequences from the SAMFileHeader, and merge the resulting sequence dictionaries. + * + * @param headers headers to pull sequences from + * @return sequences from files. Each file should have the same sequence + */ + private SAMSequenceDictionary mergeSequenceDictionaries(final Collection headers) { + SAMSequenceDictionary sequences = new SAMSequenceDictionary(); + for (final SAMFileHeader header : headers) { + final SAMSequenceDictionary currentSequences = header.getSequenceDictionary(); + sequences = mergeSequences(sequences, currentSequences); + } + // second pass, make a map of the original seqeunce id -> new sequence id + createSequenceMapping(headers, sequences); + return sequences; + } + + /** + * They've asked to merge the sequence headers. What we support right now is finding the sequence name superset. + * + * @param mergeIntoDict the result of merging so far. All SAMSequenceRecords in here have been cloned from the originals. + * @param mergeFromDict A new sequence dictionary to merge into mergeIntoDict. + * @return A new sequence dictionary that resulting from merging the two inputs. + */ + private SAMSequenceDictionary mergeSequences(SAMSequenceDictionary mergeIntoDict, SAMSequenceDictionary mergeFromDict) { + + // a place to hold the sequences that we haven't found a home for, in the order the appear in mergeFromDict. + LinkedList holder = new LinkedList(); + + // Return value will be created from this. + LinkedList resultingDict = new LinkedList(); + for (final SAMSequenceRecord sequenceRecord : mergeIntoDict.getSequences()) { + resultingDict.add(sequenceRecord); + } + + // Index into resultingDict of previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict. + int prevloc = -1; + // Previous SAMSequenceRecord from mergeFromDict that already existed in mergeIntoDict. + SAMSequenceRecord previouslyMerged = null; + + for (SAMSequenceRecord sequenceRecord : mergeFromDict.getSequences()) { + // Does it already exist in resultingDict? + int loc = getIndexOfSequenceName(resultingDict, sequenceRecord.getSequenceName()); + if (loc == -1) { + // If doesn't already exist in resultingDict, save it an decide where to insert it later. + holder.add(sequenceRecord.clone()); + } else if (prevloc > loc) { + // If sequenceRecord already exists in resultingDict, but prior to the previous one + // from mergeIntoDict that already existed, cannot merge. + throw new PicardException("Cannot merge sequence dictionaries because sequence " + + sequenceRecord.getSequenceName() + " and " + previouslyMerged.getSequenceName() + + " are in different orders in two input sequence dictionaries."); + } else { + // Since sequenceRecord already exists in resultingDict, don't need to add it. + // Add in all the sequences prior to it that have been held in holder. + resultingDict.addAll(loc, holder); + // Remember the index of sequenceRecord so can check for merge imcompatibility. + prevloc = loc + holder.size(); + previouslyMerged = sequenceRecord; + holder.clear(); + } + } + // Append anything left in holder. + if (holder.size() != 0) { + resultingDict.addAll(holder); + } + return new SAMSequenceDictionary(resultingDict); + } + + /** + * Find sequence in list. + * @param list List to search for the sequence name. + * @param sequenceName Name to search for. + * @return Index of SAMSequenceRecord with the given name in list, or -1 if not found. + */ + private static int getIndexOfSequenceName(final List list, final String sequenceName) { + for (int i = 0; i < list.size(); ++i) { + if (list.get(i).getSequenceName().equals(sequenceName)) { + return i; + } + } + return -1; + } + + /** + * create the sequence mapping. This map is used to convert the unmerged header sequence ID's to the merged + * list of sequence id's. + * @param headers the collections of headers. + * @param masterDictionary the superset dictionary we've created. + */ + private void createSequenceMapping(final Collection headers, SAMSequenceDictionary masterDictionary) { + LinkedList resultingDictStr = new LinkedList(); + for (SAMSequenceRecord r : masterDictionary.getSequences()) { + resultingDictStr.add(r.getSequenceName()); + } + for (final SAMFileHeader header : headers) { + Map seqMap = new HashMap(); + SAMSequenceDictionary dict = header.getSequenceDictionary(); + for (SAMSequenceRecord rec : dict.getSequences()) { + seqMap.put(rec.getSequenceIndex(), resultingDictStr.indexOf(rec.getSequenceName())); + } + this.samSeqDictionaryIdTranslationViaHeader.put(header, seqMap); + } + } + + + + /** + * Returns the read group id that should be used for the input read and RG id. + * + * @deprecated replaced by getReadGroupId(SAMFileHeader, String) + * */ + public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { + return getReadGroupId(reader.getFileHeader(), originalReadGroupId); + } + + /** Returns the read group id that should be used for the input read and RG id. */ + public String getReadGroupId(final SAMFileHeader header, final String originalReadGroupId) { + return this.samReadGroupIdTranslation.get(header).get(originalReadGroupId); + } + + /** + * @param reader one of the input files + * @param originalProgramGroupId a program group ID from the above input file + * @return new ID from the merged list of program groups in the output file + * @deprecated replaced by getProgramGroupId(SAMFileHeader, String) + */ + public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) { + return getProgramGroupId(reader.getFileHeader(), originalProgramGroupId); + } + + /** + * @param header one of the input headers + * @param originalProgramGroupId a program group ID from the above input file + * @return new ID from the merged list of program groups in the output file + */ + public String getProgramGroupId(final SAMFileHeader header, final String originalProgramGroupId) { + return this.samProgramGroupIdTranslation.get(header).get(originalProgramGroupId); + } + + /** Returns true if there are read group duplicates within the merged headers. */ + public boolean hasReadGroupCollisions() { + return this.hasReadGroupCollisions; + } + + /** Returns true if there are program group duplicates within the merged headers. */ + public boolean hasProgramGroupCollisions() { + return hasProgramGroupCollisions; + } + + /** @return if we've merged the sequence dictionaries, return true */ + public boolean hasMergedSequenceDictionary() { + return hasMergedSequenceDictionary; + } + + /** Returns the merged header that should be written to any output merged file. */ + public SAMFileHeader getMergedHeader() { + return this.mergedHeader; + } + + /** Returns the collection of readers that this header merger is working with. May return null. + * @deprecated replaced by getHeaders() + */ + public Collection getReaders() { + return this.readers; + } + + /** Returns the collection of readers that this header merger is working with. + */ + public Collection getHeaders() { + return this.headers; + } + + /** + * Tells whether this header merger contains a given SAM file header. Note that header presence + * is confirmed / blocked by == equality, rather than actually testing SAMFileHeader.equals(), for + * reasons of performance. + * @param header header to check for. + * @return True if the header exists in this HeaderMerger. False otherwise. + */ + boolean containsHeader(SAMFileHeader header) { + for(SAMFileHeader headerMergerHeader: headers) { + if(headerMergerHeader == header) + return true; + } + return false; + } + + /** + * returns the new mapping for a specified reader, given it's old sequence index + * @param reader the reader + * @param oldReferenceSequenceIndex the old sequence (also called reference) index + * @return the new index value + * @deprecated replaced by getMergedSequenceIndex(SAMFileHeader, Integer) + */ + public Integer getMergedSequenceIndex(SAMFileReader reader, Integer oldReferenceSequenceIndex) { + return this.getMergedSequenceIndex(reader.getFileHeader(), oldReferenceSequenceIndex); + } + + /** + * Another mechanism for getting the new sequence index, for situations in which the reader is not available. + * Note that if the SAMRecord has already had its header replaced with the merged header, this won't work. + * @param header The original header for the input record in question. + * @param oldReferenceSequenceIndex The original sequence index. + * @return the new index value that is compatible with the merged sequence index. + */ + public Integer getMergedSequenceIndex(final SAMFileHeader header, Integer oldReferenceSequenceIndex) { + final Map mapping = this.samSeqDictionaryIdTranslationViaHeader.get(header); + if (mapping == null) { + throw new PicardException("No sequence dictionary mapping available for header: " + header); + } + + final Integer newIndex = mapping.get(oldReferenceSequenceIndex); + if (newIndex == null) { + throw new PicardException("No mapping for reference index " + oldReferenceSequenceIndex + " from header: " + header); + } + + return newIndex; + } + + + /** + * Implementations of this interface are used by mergeHeaderRecords(..) to instantiate + * specific subclasses of AbstractSAMHeaderRecord. + */ + private static interface HeaderRecordFactory { + + /** + * Constructs a new instance of RecordType. + * @param id The id of the new record. + * @param srcRecord Except for the id, the new record will be a copy of this source record. + */ + public RecordType createRecord(final String id, RecordType srcRecord); + } + + /** + * Struct that groups together a subclass of AbstractSAMHeaderRecord with the + * SAMFileHeader that it came from. + */ + private static class HeaderRecordAndFileHeader { + private RecordType headerRecord; + private SAMFileHeader samFileHeader; + + public HeaderRecordAndFileHeader(RecordType headerRecord, SAMFileHeader samFileHeader) { + this.headerRecord = headerRecord; + this.samFileHeader = samFileHeader; + } + + public RecordType getHeaderRecord() { + return headerRecord; + } + public SAMFileHeader getFileHeader() { + return samFileHeader; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index d5d8ab2ef..75a851d65 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -64,6 +64,9 @@ import java.util.concurrent.*; public class SAMDataSource { final private static GATKSamRecordFactory factory = new GATKSamRecordFactory(); + /** If true, we will load SAMReaders in parallel */ + final private static boolean USE_PARALLEL_LOADING = false; + /** Backing support for reads. */ protected final ReadProperties readProperties; @@ -259,6 +262,11 @@ public class SAMDataSource { validationStringency = strictness; if(readBufferSize != null) ReadShard.setReadBufferSize(readBufferSize); + else { + // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively + // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. + ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000)); + } resourcePool = new SAMResourcePool(Integer.MAX_VALUE); SAMReaders readers = resourcePool.getAvailableReaders(); @@ -479,10 +487,13 @@ public class SAMDataSource { // Cache the most recently viewed read so that we can check whether we've reached the end of a pair. SAMRecord read = null; + Map positionUpdates = new IdentityHashMap(); + CloseableIterator iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate); while(!shard.isBufferFull() && iterator.hasNext()) { read = iterator.next(); - addReadToBufferingShard(shard,getReaderID(readers,read),read); + shard.addRead(read); + noteFilePositionUpdate(positionUpdates,read); } // If the reads are sorted in queryname order, ensure that all reads @@ -492,11 +503,21 @@ public class SAMDataSource { SAMRecord nextRead = iterator.next(); if(read == null || !read.getReadName().equals(nextRead.getReadName())) break; - addReadToBufferingShard(shard,getReaderID(readers,nextRead),nextRead); + shard.addRead(nextRead); + noteFilePositionUpdate(positionUpdates,nextRead); } } iterator.close(); + + // Make the updates specified by the reader. + for(Map.Entry positionUpdate: positionUpdates.entrySet()) + readerPositions.put(readers.getReaderID(positionUpdate.getKey()),positionUpdate.getValue()); + } + + private void noteFilePositionUpdate(Map positionMapping, SAMRecord read) { + GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); + positionMapping.put(read.getFileSource().getReader(),endChunk); } public StingSAMIterator seek(Shard shard) { @@ -574,18 +595,6 @@ public class SAMDataSource { readProperties.defaultBaseQualities()); } - /** - * Adds this read to the given shard. - * @param shard The shard to which to add the read. - * @param id The id of the given reader. - * @param read The read to add to the shard. - */ - private void addReadToBufferingShard(Shard shard,SAMReaderID id,SAMRecord read) { - GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); - shard.addRead(read); - readerPositions.put(id,endChunk); - } - /** * Filter reads based on user-specified criteria. * @@ -714,68 +723,98 @@ public class SAMDataSource { * @param validationStringency validation stringency. */ public SAMReaders(Collection readerIDs, SAMFileReader.ValidationStringency validationStringency) { - final int N_THREADS = 8; - int totalNumberOfFiles = readerIDs.size(); + final int totalNumberOfFiles = readerIDs.size(); int readerNumber = 1; + final SimpleTimer timer = new SimpleTimer().start(); - ExecutorService executor = Executors.newFixedThreadPool(N_THREADS); - final List inits = new ArrayList(totalNumberOfFiles); - Queue> futures = new LinkedList>(); - for (SAMReaderID readerID: readerIDs) { - logger.debug("Enqueuing for initialization: " + readerID.samFile); - final ReaderInitializer init = new ReaderInitializer(readerID); - inits.add(init); - futures.add(executor.submit(init)); - } - - final SimpleTimer timer = new SimpleTimer(); - try { - final int MAX_WAIT = 30 * 1000; - final int MIN_WAIT = 1 * 1000; - - timer.start(); - while ( ! futures.isEmpty() ) { - final int prevSize = futures.size(); - final double waitTime = prevSize * (0.5 / N_THREADS); // about 0.5 seconds to load each file - final int waitTimeInMS = Math.min(MAX_WAIT, Math.max((int) (waitTime * 1000), MIN_WAIT)); - Thread.sleep(waitTimeInMS); - - Queue> pending = new LinkedList>(); - for ( final Future initFuture : futures ) { - if ( initFuture.isDone() ) { - final ReaderInitializer init = initFuture.get(); - if (threadAllocation.getNumIOThreads() > 0) { - inputStreams.put(init.readerID, init.blockInputStream); // get from initializer - } - logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, init.readerID)); - readers.put(init.readerID, init.reader); - } else { - pending.add(initFuture); - } + if ( totalNumberOfFiles > 0 ) logger.info("Initializing SAMRecords " + (USE_PARALLEL_LOADING ? "in parallel" : "in serial")); + if ( ! USE_PARALLEL_LOADING ) { + final int tickSize = 50; + int nExecutedTotal = 0; + long lastTick = timer.currentTime(); + for(final SAMReaderID readerID: readerIDs) { + final ReaderInitializer init = new ReaderInitializer(readerID).call(); + if (threadAllocation.getNumIOThreads() > 0) { + inputStreams.put(init.readerID, init.blockInputStream); // get from initializer } - final int pendingSize = pending.size(); - final int nExecutedInTick = prevSize - pendingSize; - final int nExecutedTotal = totalNumberOfFiles - pendingSize; - final double totalTimeInSeconds = timer.getElapsedTime(); - final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); - final int nRemaining = pendingSize; - final double estTimeToComplete = pendingSize / nTasksPerSecond; - logger.info(String.format("Init %d BAMs in last %d s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m", - nExecutedInTick, (int)(waitTimeInMS / 1000.0), - nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, - nRemaining, estTimeToComplete, estTimeToComplete / 60)); - - futures = pending; + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, readerID.samFile)); + readers.put(init.readerID,init.reader); + if ( ++nExecutedTotal % tickSize == 0) { + double tickInSec = (timer.currentTime() - lastTick) / 1000.0; + printReaderPerformance(nExecutedTotal, tickSize, totalNumberOfFiles, timer, tickInSec); + lastTick = timer.currentTime(); + } } - } catch ( InterruptedException e ) { - throw new ReviewedStingException("Interrupted SAMReader initialization", e); - } catch ( ExecutionException e ) { - throw new ReviewedStingException("Execution exception during SAMReader initialization", e); + } else { + final int N_THREADS = 8; + + final ExecutorService executor = Executors.newFixedThreadPool(N_THREADS); + final List inits = new ArrayList(totalNumberOfFiles); + Queue> futures = new LinkedList>(); + for (final SAMReaderID readerID: readerIDs) { + logger.debug("Enqueuing for initialization: " + readerID.samFile); + final ReaderInitializer init = new ReaderInitializer(readerID); + inits.add(init); + futures.add(executor.submit(init)); + } + + try { + final int MAX_WAIT = 30 * 1000; + final int MIN_WAIT = 1 * 1000; + + while ( ! futures.isEmpty() ) { + final int prevSize = futures.size(); + final double waitTime = prevSize * (0.5 / N_THREADS); // about 0.5 seconds to load each file + final int waitTimeInMS = Math.min(MAX_WAIT, Math.max((int) (waitTime * 1000), MIN_WAIT)); + Thread.sleep(waitTimeInMS); + + Queue> pending = new LinkedList>(); + for ( final Future initFuture : futures ) { + if ( initFuture.isDone() ) { + final ReaderInitializer init = initFuture.get(); + if (threadAllocation.getNumIOThreads() > 0) { + inputStreams.put(init.readerID, init.blockInputStream); // get from initializer + } + logger.debug(String.format("Processing file (%d of %d) %s...", readerNumber++, totalNumberOfFiles, init.readerID)); + readers.put(init.readerID, init.reader); + } else { + pending.add(initFuture); + } + } + + final int nExecutedTotal = totalNumberOfFiles - pending.size(); + final int nExecutedInTick = prevSize - pending.size(); + printReaderPerformance(nExecutedTotal, nExecutedInTick, totalNumberOfFiles, timer, waitTimeInMS / 1000.0); + futures = pending; + } + } catch ( InterruptedException e ) { + throw new ReviewedStingException("Interrupted SAMReader initialization", e); + } catch ( ExecutionException e ) { + throw new ReviewedStingException("Execution exception during SAMReader initialization", e); + } + + executor.shutdown(); } - logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); - executor.shutdown(); + if ( totalNumberOfFiles > 0 ) logger.info(String.format("Done initializing BAM readers: total time %.2f", timer.getElapsedTime())); + } + + final private void printReaderPerformance(final int nExecutedTotal, + final int nExecutedInTick, + final int totalNumberOfFiles, + final SimpleTimer timer, + final double tickDurationInSec) { + final int pendingSize = totalNumberOfFiles - nExecutedTotal; + final double totalTimeInSeconds = timer.getElapsedTime(); + final double nTasksPerSecond = nExecutedTotal / (1.0*totalTimeInSeconds); + final int nRemaining = pendingSize; + final double estTimeToComplete = pendingSize / nTasksPerSecond; + logger.info(String.format("Init %d BAMs in last %.2f s, %d of %d in %.2f s / %.2f m (%.2f tasks/s). %d remaining with est. completion in %.2f s / %.2f m", + nExecutedInTick, tickDurationInSec, + nExecutedTotal, totalNumberOfFiles, totalTimeInSeconds, totalTimeInSeconds / 60, nTasksPerSecond, + nRemaining, estTimeToComplete, estTimeToComplete / 60)); + } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 339b8e0e6..7cc5b1625 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -175,7 +175,7 @@ public class VariantRecalibrator extends RodWalker implements TreeR @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; - @Hidden - @Argument(fullName="family_structure_file", shortName="familyFile", doc="use -family unless you know what you're doing", required=false) - private File FAMILY_STRUCTURE_FILE = null; - - /** - * String formatted as dad+mom=child where these parameters determine which sample names are examined. - */ - @Argument(fullName="family_structure", shortName="family", doc="string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false) - private String FAMILY_STRUCTURE = ""; - /** * This activates the mendelian violation module that will select all variants that correspond to a mendelian violation following the rules given by the family structure. */ @@ -286,13 +278,21 @@ public class SelectVariants extends RodWalker implements TreeR private double fractionGenotypes = 0; /** - * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. + * This argument select particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria. * When specified one or more times, a particular type of variant is selected. * - */ + */ @Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false) private List TYPES_TO_INCLUDE = new ArrayList(); + /** + * If provided, we will only include variants whose ID field is present in this list of ids. The matching + * is exact string matching. The file format is just one ID per line + * + */ + @Argument(fullName="keepIDs", shortName="IDs", doc="Only emit sites whose ID is found in this file (one ID per line)", required=false) + private File rsIDFile = null; + @Hidden @Argument(fullName="outMVFile", shortName="outMVFile", doc="", required=false) @@ -313,9 +313,9 @@ public class SelectVariants extends RodWalker implements TreeR } public enum NumberAlleleRestriction { - ALL, - BIALLELIC, - MULTIALLELIC + ALL, + BIALLELIC, + MULTIALLELIC } private ArrayList selectedTypes = new ArrayList(); @@ -339,17 +339,13 @@ public class SelectVariants extends RodWalker implements TreeR private int positionToAdd = 0; private RandomVariantStructure [] variantArray; - - /* Variables used for random selection with AF boosting */ - private ArrayList afBreakpoints = null; - private ArrayList afBoosts = null; - double bkDelta = 0.0; - private PrintStream outMVFileStream = null; - //Random number generator for the genotypes to remove + //Random number generator for the genotypes to remove private Random randomGenotypes = new Random(); + private Set IDsToKeep = null; + /** * Set up the VCF writer, the sample expressions and regexs, and the JEXL matcher */ @@ -437,7 +433,18 @@ public class SelectVariants extends RodWalker implements TreeR if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); - + /** load in the IDs file to a hashset for matching */ + if ( rsIDFile != null ) { + IDsToKeep = new HashSet(); + try { + for ( final String line : new XReadLines(rsIDFile).readLines() ) { + IDsToKeep.add(line.trim()); + } + logger.info("Selecting only variants with one of " + IDsToKeep.size() + " IDs from " + rsIDFile); + } catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(rsIDFile, e); + } + } } /** @@ -460,20 +467,23 @@ public class SelectVariants extends RodWalker implements TreeR } for (VariantContext vc : vcs) { + if ( IDsToKeep != null && ! IDsToKeep.contains(vc.getID()) ) + continue; + if (MENDELIAN_VIOLATIONS && mv.countViolations(this.getSampleDB().getFamilies(samples),vc) < 1) - break; + break; if (outMVFile != null){ for( String familyId : mv.getViolationFamilies()){ for(Sample sample : this.getSampleDB().getFamily(familyId)){ if(sample.getParents().size() > 0){ - outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + - "childG=%s childGL=%s\n",vc.getChr(), vc.getStart(), - vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getCalledChrCount(vc.getAlternateAllele(0)), - sample.getMaternalID(), sample.getPaternalID(), sample.getID(), - vc.getGenotype(sample.getMaternalID()).toBriefString(), vc.getGenotype(sample.getMaternalID()).getLikelihoods().getAsString(), - vc.getGenotype(sample.getPaternalID()).toBriefString(), vc.getGenotype(sample.getPaternalID()).getLikelihoods().getAsString(), - vc.getGenotype(sample.getID()).toBriefString(),vc.getGenotype(sample.getID()).getLikelihoods().getAsString() ); + outMVFileStream.format("MV@%s:%d. REF=%s, ALT=%s, AC=%d, momID=%s, dadID=%s, childID=%s, momG=%s, momGL=%s, dadG=%s, dadGL=%s, " + + "childG=%s childGL=%s\n",vc.getChr(), vc.getStart(), + vc.getReference().getDisplayString(), vc.getAlternateAllele(0).getDisplayString(), vc.getCalledChrCount(vc.getAlternateAllele(0)), + sample.getMaternalID(), sample.getPaternalID(), sample.getID(), + vc.getGenotype(sample.getMaternalID()).toBriefString(), vc.getGenotype(sample.getMaternalID()).getLikelihoods().getAsString(), + vc.getGenotype(sample.getPaternalID()).toBriefString(), vc.getGenotype(sample.getPaternalID()).getLikelihoods().getAsString(), + vc.getGenotype(sample.getID()).toBriefString(),vc.getGenotype(sample.getID()).getLikelihoods().getAsString() ); } } @@ -513,10 +523,7 @@ public class SelectVariants extends RodWalker implements TreeR else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { vcfWriter.add(sub); } - - } - } return 1; @@ -647,7 +654,7 @@ public class SelectVariants extends RodWalker implements TreeR // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate) if ( vc.getAlleles().size() != sub.getAlleles().size() ) - newGC = VariantContextUtils.stripPLs(sub.getGenotypes()); + newGC = VariantContextUtils.stripPLs(sub.getGenotypes()); //Remove a fraction of the genotypes if needed if(fractionGenotypes>0){ @@ -655,10 +662,10 @@ public class SelectVariants extends RodWalker implements TreeR for ( Genotype genotype : newGC ) { //Set genotype to no call if it falls in the fraction. if(fractionGenotypes>0 && randomGenotypes.nextDouble() alleles = new ArrayList(2); - alleles.add(Allele.create((byte)'.')); - alleles.add(Allele.create((byte)'.')); - genotypes.add(new Genotype(genotype.getSampleName(),alleles, Genotype.NO_LOG10_PERROR,genotype.getFilters(),new HashMap(),false)); + ArrayList alleles = new ArrayList(2); + alleles.add(Allele.create((byte)'.')); + alleles.add(Allele.create((byte)'.')); + genotypes.add(new Genotype(genotype.getSampleName(),alleles, Genotype.NO_LOG10_PERROR,genotype.getFilters(),new HashMap(),false)); } else{ genotypes.add(genotype); diff --git a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java index d8176ff4e..d753da1c8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java +++ b/public/java/src/org/broadinstitute/sting/utils/R/RScriptExecutor.java @@ -109,7 +109,7 @@ public class RScriptExecutor { List tempFiles = new ArrayList(); try { - File tempLibDir = IOUtils.tempDir("R.", ".lib"); + File tempLibDir = IOUtils.tempDir("Rlib.", ""); tempFiles.add(tempLibDir); StringBuilder expression = new StringBuilder("tempLibDir = '").append(tempLibDir).append("';"); diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b79770eb5..10bc050da 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -654,4 +654,18 @@ public class Utils { // handle exception } } + + + public static byte [] arrayFromArrayWithLength(byte[] array, int length) { + byte [] output = new byte[length]; + for (int j = 0; j < length; j++) + output[j] = array[(j % array.length)]; + return output; + } + + public static void fillArrayWithByte(byte[] array, byte value) { + for (int i=0; i read.getReadLength() - 1) throw new ReviewedStingException("Trying to clip before the start or after the end of a read"); if ( start > stop ) throw new ReviewedStingException("START > STOP -- this should never happen -- call Mauricio!"); + if ( start > 0 && stop < read.getReadLength() - 1) + throw new ReviewedStingException(String.format("Trying to clip the middle of the read: start %d, stop %d, cigar: %s", start, stop, read.getCigarString())); + this.addOp(new ClippingOp(start, stop)); GATKSAMRecord clippedRead = clipRead(ClippingRepresentation.HARDCLIP_BASES); this.ops = null; @@ -76,6 +82,9 @@ public class ReadClipper { } public GATKSAMRecord hardClipByReadCoordinates(int start, int stop) { + if (read.isEmpty() || (start == 0 && stop == read.getReadLength() - 1)) + return new GATKSAMRecord(read.getHeader()); + this.addOp(new ClippingOp(start, stop)); return clipRead(ClippingRepresentation.HARDCLIP_BASES); } @@ -172,7 +181,7 @@ public class ReadClipper { //check if the clipped read can still be clipped in the range requested if (op.start < clippedRead.getReadLength()) { ClippingOp fixedOperation = op; - if (op.stop > clippedRead.getReadLength()) + if (op.stop >= clippedRead.getReadLength()) fixedOperation = new ClippingOp(op.start, clippedRead.getReadLength() - 1); clippedRead = fixedOperation.apply(algorithm, clippedRead); @@ -195,16 +204,12 @@ public class ReadClipper { for(CigarElement cigarElement : read.getCigar().getCigarElements()) { if (cigarElement.getOperator() != CigarOperator.HARD_CLIP && cigarElement.getOperator() != CigarOperator.SOFT_CLIP && - cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.DELETION) + cigarElement.getOperator() != CigarOperator.INSERTION) break; - else if (cigarElement.getOperator() == CigarOperator.INSERTION) { + else if (cigarElement.getOperator() == CigarOperator.INSERTION) this.addOp(new ClippingOp(0, cigarElement.getLength() - 1)); - } - else if (cigarElement.getOperator() == CigarOperator.DELETION) { - throw new ReviewedStingException("No read should start with a deletion. Aligner bug?"); - } } return clipRead(ClippingRepresentation.HARDCLIP_BASES); } diff --git a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java index 6f0573b02..b3fdb93d3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java @@ -79,7 +79,7 @@ public class IOUtils { tempDirParent = FileUtils.getTempDirectory(); if (!tempDirParent.exists() && !tempDirParent.mkdirs()) throw new UserException.BadTmpDir("Could not create temp directory: " + tempDirParent); - File temp = File.createTempFile(prefix + "-", suffix, tempDirParent); + File temp = File.createTempFile(prefix, suffix, tempDirParent); if (!temp.delete()) throw new UserException.BadTmpDir("Could not delete sub file: " + temp.getAbsolutePath()); if (!temp.mkdir()) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 8d9018045..ac1f00437 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -910,4 +910,9 @@ public class ReadUtils { return comp.compare(read1, read2); } + // TEST UTILITIES + + + + } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 91a018c4e..c9a4965c1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -619,8 +619,9 @@ public class VariantContextUtils { if (vc.alleles.size() == 1) continue; if ( hasPLIncompatibleAlleles(alleles, vc.alleles)) { - logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", - genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); + if ( ! genotypes.isEmpty() ) + logger.warn(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. single=%s", + genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); genotypes = stripPLs(genotypes); // this will remove stale AC,AF attributed from vc calculateChromosomeCounts(vc, attributes, true); diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 54d34fcfa..8e218f950 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -1,18 +1,12 @@ package org.broadinstitute.sting; -import org.apache.commons.io.FileUtils; import org.apache.log4j.*; import org.apache.log4j.spi.LoggingEvent; import org.broadinstitute.sting.commandline.CommandLineUtils; -import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.testng.Assert; +import org.broadinstitute.sting.utils.io.IOUtils; -import javax.swing.*; import java.io.*; -import java.math.BigInteger; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; import java.util.*; /** @@ -78,8 +72,8 @@ public abstract class BaseTest { public static final String hg19Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list"; public static final String hg19Chr20Intervals = intervalsLocation + "whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.chr20.interval_list"; - public static final String networkTempDir = "/broad/shptmp/" + System.getProperty("user.name") + "/"; - public static final File networkTempDirFile = new File(networkTempDir); + public static final String networkTempDir; + public static final File networkTempDirFile; public static final File testDirFile = new File("public/testdata/"); public static final String testDir = testDirFile.getAbsolutePath() + "/"; @@ -99,6 +93,10 @@ public abstract class BaseTest { // Set the Root logger to only output warnings. logger.setLevel(Level.WARN); + networkTempDirFile = IOUtils.tempDir("temp.", ".dir", new File("/broad/shptmp/" + System.getProperty("user.name"))); + networkTempDirFile.deleteOnExit(); + networkTempDir = networkTempDirFile.getAbsolutePath() + "/"; + // find our file sources // if (!fileExist(hg18Reference) || !fileExist(hg19Reference) || !fileExist(b36KGReference)) { // logger.fatal("We can't locate the reference directories. Aborting!"); @@ -233,18 +231,12 @@ public abstract class BaseTest { /** * Creates a temp file that will be deleted on exit after tests are complete. - * @param name Prefix of the file. - * @param extension Extension to concat to the end of the file. - * @return A file in the network temporary directory starting with name, ending with extension, which will be deleted after the program exits. + * @param name Name of the file. + * @return A file in the network temporary directory with name, which will be deleted after the program exits. */ - public static File createNetworkTempFile(String name, String extension) { - try { - FileUtils.forceMkdir(networkTempDirFile); - File file = File.createTempFile(name, extension, networkTempDirFile); - file.deleteOnExit(); - return file; - } catch (IOException ex) { - throw new ReviewedStingException("Cannot create temp file: " + ex.getMessage(), ex); - } + public static File createNetworkTempFile(String name) { + File file = new File(networkTempDirFile, name); + file.deleteOnExit(); + return file; } } diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java index 48f4c3777..f68a96d26 100644 --- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSessionIntegrationTest.java @@ -62,7 +62,7 @@ public class JnaSessionIntegrationTest extends BaseTest { return; } - File outFile = createNetworkTempFile("JnaSessionIntegrationTest-", ".out"); + File outFile = createNetworkTempFile("JnaSessionIntegrationTest.out"); Session session = factory.getSession(); session.init(null); try { diff --git a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java index d98281ad3..4c7d4ce06 100644 --- a/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/drmaa/v1_0/LibDrmaaIntegrationTest.java @@ -86,7 +86,7 @@ public class LibDrmaaIntegrationTest extends BaseTest { @Test(dependsOnMethods = { "testDrmaa" }) public void testSubmitEcho() throws Exception { - if (implementation.indexOf("LSF") >= 0) { + if (implementation.contains("LSF")) { System.err.println(" *********************************************************"); System.err.println(" ***********************************************************"); System.err.println(" **** ****"); @@ -101,7 +101,7 @@ public class LibDrmaaIntegrationTest extends BaseTest { Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER); int errnum; - File outFile = createNetworkTempFile("LibDrmaaIntegrationTest-", ".out"); + File outFile = createNetworkTempFile("LibDrmaaIntegrationTest.out"); errnum = LibDrmaa.drmaa_init(null, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN); diff --git a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java index b4fb5cfa3..21339eb46 100644 --- a/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/jna/lsf/v7_0_6/LibBatIntegrationTest.java @@ -93,7 +93,7 @@ public class LibBatIntegrationTest extends BaseTest { @Test public void testSubmitEcho() throws Exception { String queue = "hour"; - File outFile = createNetworkTempFile("LibBatIntegrationTest-", ".out"); + File outFile = createNetworkTempFile("LibBatIntegrationTest.out"); submit req = new submit(); diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/CigarStringTestPair.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/CigarStringTestPair.java deleted file mode 100644 index cc9021fae..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/CigarStringTestPair.java +++ /dev/null @@ -1,18 +0,0 @@ -package org.broadinstitute.sting.utils.clipreads; - -/** - * Created by IntelliJ IDEA. - * User: roger - * Date: 11/29/11 - * Time: 4:53 PM - * To change this template use File | Settings | File Templates. - */ -public class CigarStringTestPair { - public String toTest; - public String expected; - - public CigarStringTestPair(String ToTest, String Expected) { - this.toTest = ToTest; - this.expected = Expected; - } -} diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java index de9d8fb50..048d9d8e0 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClipReadsTestUtils.java @@ -2,8 +2,8 @@ package org.broadinstitute.sting.utils.clipreads; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.TextCigarCodec; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.Assert; @@ -21,9 +21,149 @@ public class ClipReadsTestUtils { //Should contain all the utils needed for tests to mass produce //reads, cigars, and other needed classes - final static String BASES = "ACTG"; - final static String QUALS = "!+5?"; //ASCII values = 33,43,53,63 + final static byte [] BASES = {'A', 'C', 'T', 'G'}; + final static byte [] QUALS = {2, 15, 25, 30}; + final static String CIGAR = "4M"; + final static CigarElement[] cigarElements = { new CigarElement(1, CigarOperator.HARD_CLIP), + new CigarElement(1, CigarOperator.SOFT_CLIP), + new CigarElement(1, CigarOperator.INSERTION), + new CigarElement(1, CigarOperator.DELETION), + new CigarElement(1, CigarOperator.MATCH_OR_MISMATCH)}; + + public static GATKSAMRecord makeReadFromCigar(Cigar cigar) { + return ArtificialSAMUtils.createArtificialRead(Utils.arrayFromArrayWithLength(BASES, cigar.getReadLength()), Utils.arrayFromArrayWithLength(QUALS, cigar.getReadLength()), cigar.toString()); + } + + /** + * This function generates every valid permutation of cigar strings with a given length. + * + * A valid cigar object obeys the following rules: + * - No Hard/Soft clips in the middle of the read + * - No deletions in the beginning / end of the read + * - No repeated adjacent element (e.g. 1M2M -> this should be 3M) + * + * @param maximumLength the maximum number of elements in the cigar + * @return a list with all valid Cigar objects + */ + public static List generateCigarList(int maximumLength) { + int numCigarElements = cigarElements.length; + LinkedList cigarList = new LinkedList(); + byte [] cigarCombination = new byte[maximumLength]; + + Utils.fillArrayWithByte(cigarCombination, (byte) 0); // we start off with all 0's in the combination array. + int currentIndex = 0; + while (true) { + Cigar cigar = createCigarFromCombination(cigarCombination); // create the cigar + cigar = combineAdjacentCigarElements(cigar); // combine adjacent elements + if (isCigarValid(cigar)) { // check if it's valid + cigarList.add(cigar); // add it + } + + boolean currentIndexChanged = false; + while (currentIndex < maximumLength && cigarCombination[currentIndex] == numCigarElements - 1) { + currentIndex++; // find the next index to increment + currentIndexChanged = true; // keep track of the fact that we have changed indices! + } + + if (currentIndex == maximumLength) // if we hit the end of the array, we're done. + break; + + cigarCombination[currentIndex]++; // otherwise advance the current index + + if (currentIndexChanged) { // if we have changed index, then... + for (int i = 0; i < currentIndex; i++) + cigarCombination[i] = 0; // reset everything from 0->currentIndex + currentIndex = 0; // go back to the first index + } + } + + return cigarList; + } + + private static boolean isCigarValid(Cigar cigar) { + if (cigar.isValid(null, -1) == null) { // This should take care of most invalid Cigar Strings (picard's "exhaustive" implementation) + + Stack cigarElementStack = new Stack(); // Stack to invert cigar string to find ending operator + CigarOperator startingOp = null; + CigarOperator endingOp = null; + + // check if it doesn't start with deletions + boolean readHasStarted = false; // search the list of elements for the starting operator + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (!readHasStarted) { + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + readHasStarted = true; + startingOp = cigarElement.getOperator(); + } + } + cigarElementStack.push(cigarElement); + } + + readHasStarted = false; // search the inverted list of elements (stack) for the stopping operator + while (!cigarElementStack.empty()) { + CigarElement cigarElement = cigarElementStack.pop(); + if (cigarElement.getOperator() != CigarOperator.SOFT_CLIP && cigarElement.getOperator() != CigarOperator.HARD_CLIP) { + readHasStarted = true; + endingOp = cigarElement.getOperator(); + break; + } + } + + if (startingOp != CigarOperator.DELETION && endingOp != CigarOperator.DELETION && startingOp != CigarOperator.INSERTION && endingOp != CigarOperator.INSERTION) + return true; // we don't accept reads starting or ending in deletions (add any other constraint here) + } + + return false; + } + + private static Cigar createCigarFromCombination(byte[] cigarCombination) { + Cigar cigar = new Cigar(); + for (byte i : cigarCombination) { + cigar.add(cigarElements[i]); + } + return cigar; + } + + + /** + * Combines equal adjacent elements of a Cigar object + * + * @param rawCigar the cigar object + * @return a combined cigar object + */ + private static Cigar combineAdjacentCigarElements(Cigar rawCigar) { + Cigar combinedCigar = new Cigar(); + CigarElement lastElement = null; + int lastElementLength = 0; + for (CigarElement cigarElement : rawCigar.getCigarElements()) { + if (lastElement != null && lastElement.getOperator() == cigarElement.getOperator()) + lastElementLength += cigarElement.getLength(); + else + { + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); + + lastElement = cigarElement; + lastElementLength = cigarElement.getLength(); + } + } + if (lastElement != null) + combinedCigar.add(new CigarElement(lastElementLength, lastElement.getOperator())); + + return combinedCigar; + } + + public static GATKSAMRecord makeRead() { + return ArtificialSAMUtils.createArtificialRead(BASES, QUALS, CIGAR); + } + + /** + * Asserts that the two reads have the same bases, qualities and cigar strings + * + * @param actual the calculated read + * @param expected the expected read + */ public static void assertEqualReads(GATKSAMRecord actual, GATKSAMRecord expected) { // If they're both not empty, test their contents if(!actual.isEmpty() && !expected.isEmpty()) { @@ -36,162 +176,16 @@ public class ClipReadsTestUtils { Assert.assertEquals(actual.isEmpty(), expected.isEmpty()); } - public static void testBaseQualCigar(GATKSAMRecord read, byte[] readBases, byte[] baseQuals, String cigar) { - // Because quals to char start at 33 for visibility - baseQuals = subtractToArray(baseQuals, 33); + public static Cigar invertCigar (Cigar cigar) { + Stack cigarStack = new Stack(); + for (CigarElement cigarElement : cigar.getCigarElements()) + cigarStack.push(cigarElement); - Assert.assertEquals(read.getReadBases(), readBases); - Assert.assertEquals(read.getBaseQualities(), baseQuals); - Assert.assertEquals(read.getCigarString(), cigar); + Cigar invertedCigar = new Cigar(); + while (!cigarStack.isEmpty()) + invertedCigar.add(cigarStack.pop()); + + return invertedCigar; } - public static void testCigar(GATKSAMRecord read, String cigar) { - Assert.assertEquals(read.getCigarString(), cigar); - } - - public static void testBaseQual(GATKSAMRecord read, byte[] readBases, byte[] baseQuals) { - // Because quals to chars start at 33 for visibility - baseQuals = subtractToArray(baseQuals, 33); - - if (readBases.length > 0 && baseQuals.length > 0) { - Assert.assertEquals(read.getReadBases(), readBases); - Assert.assertEquals(read.getBaseQualities(), baseQuals); - } else - Assert.assertTrue(read.isEmpty()); - } - - public static byte[] subtractToArray(byte[] array, int n) { - if (array == null) - return null; - - byte[] output = new byte[array.length]; - - for (int i = 0; i < array.length; i++) - output[i] = (byte) (array[i] - n); - - return output; - } - - // What the test read looks like - // Ref: 10 11 12 13 14 15 16 17 - // Read: 0 1 2 3 - - - - - // -------------------------------- - // Bases: A C T G - - - - - // Quals: ! + 5 ? - - - - - - public static GATKSAMRecord makeRead() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - GATKSAMRecord output = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, BASES.length()); - output.setReadBases(new String(BASES).getBytes()); - output.setBaseQualityString(new String(QUALS)); - - return output; - } - - public static GATKSAMRecord makeReadFromCigar(Cigar cigar) { - - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - GATKSAMRecord output = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 10, cigar.getReadLength()); - output.setReadBases(cycleString(BASES, cigar.getReadLength()).getBytes()); - output.setBaseQualityString(cycleString(QUALS, cigar.getReadLength())); - output.setCigar(cigar); - - return output; - } - - private static String cycleString(String string, int length) { - String output = ""; - int cycles = (length / string.length()) + 1; - - for (int i = 1; i < cycles; i++) - output += string; - - for (int j = 0; output.length() < length; j++) - output += string.charAt(j % string.length()); - - return output; - } - - public static Set generateCigars() { - - // This function generates every permutation of cigar strings we need. - - LinkedHashSet output = new LinkedHashSet(); - - List clippingOptionsStart = new LinkedList(); - clippingOptionsStart.add(new Cigar()); - clippingOptionsStart.add(TextCigarCodec.getSingleton().decode("1H1S")); - clippingOptionsStart.add(TextCigarCodec.getSingleton().decode("1S")); - clippingOptionsStart.add(TextCigarCodec.getSingleton().decode("1H")); - - LinkedList clippingOptionsEnd = new LinkedList(); - clippingOptionsEnd.add(new Cigar()); - clippingOptionsEnd.add(TextCigarCodec.getSingleton().decode("1S1H")); - clippingOptionsEnd.add(TextCigarCodec.getSingleton().decode("1S")); - clippingOptionsEnd.add(TextCigarCodec.getSingleton().decode("1H")); - - - LinkedList indelOptions1 = new LinkedList(); - indelOptions1.add(new Cigar()); - //indelOptions1.add( TextCigarCodec.getSingleton().decode("1I1D")); - //indelOptions1.add( TextCigarCodec.getSingleton().decode("1D1I") ); - indelOptions1.add(TextCigarCodec.getSingleton().decode("1I")); - indelOptions1.add(TextCigarCodec.getSingleton().decode("1D")); - - LinkedList indelOptions2 = new LinkedList(); - indelOptions2.add(new Cigar()); - indelOptions2.add(TextCigarCodec.getSingleton().decode("1I")); - indelOptions2.add(null); - - - // Start With M as base CigarElements, M, - - LinkedList base = new LinkedList(); - base.add(TextCigarCodec.getSingleton().decode("1M")); - base.add(TextCigarCodec.getSingleton().decode("5M")); - base.add(TextCigarCodec.getSingleton().decode("25M")); - // Should indel be added as a base? - - // Nested loops W00t! - for (Cigar Base : base) { - for (Cigar indelStart : indelOptions1) { - for (Cigar indelEnd : indelOptions2) { - for (Cigar clipStart : clippingOptionsStart) { - for (Cigar clipEnd : clippingOptionsEnd) { - // Create a list of Cigar Elements and construct Cigar - List CigarBuilder = new ArrayList(); - // add starting clipping (H/S) - CigarBuilder.addAll(clipStart.getCigarElements()); - // add first base (M) - CigarBuilder.addAll(Base.getCigarElements()); - // add first indel - CigarBuilder.addAll(indelStart.getCigarElements()); - // add second base (M) - CigarBuilder.addAll(Base.getCigarElements()); - // add another indel or nothing (M) - if (indelEnd != null) - CigarBuilder.addAll(indelEnd.getCigarElements()); - // add final clipping (S/H) - CigarBuilder.addAll(clipEnd.getCigarElements()); - - - output.add(new Cigar(removeConsecutiveElements(CigarBuilder))); - - } - } - } - } - } - - return output; - } - - private static List removeConsecutiveElements(List cigarBuilder) { - LinkedList output = new LinkedList(); - for (CigarElement E : cigarBuilder) { - if (output.isEmpty() || output.getLast().getOperator() != E.getOperator()) - output.add(E); - } - return output; - } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClippingOpUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClippingOpUnitTest.java index 719d04287..dd674ff1c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ClippingOpUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ClippingOpUnitTest.java @@ -5,8 +5,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; -import java.util.LinkedList; -import java.util.List; /** * Created by IntelliJ IDEA. @@ -27,43 +25,43 @@ public class ClippingOpUnitTest extends BaseTest { @Test private void testHardClip() { - List testList = new LinkedList(); - testList.add(new TestParameter(0, 0, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(3, 3, 0, 3, "3M1H"));//clip 1 base at end - testList.add(new TestParameter(0, 1, 2, 4, "2H2M"));//clip 2 bases at start - testList.add(new TestParameter(2, 3, 0, 2, "2M2H"));//clip 2 bases at end - testList.add(new TestParameter(0, 2, 3, 4, "3H1M"));//clip 3 bases at start - testList.add(new TestParameter(1, 3, 0, 1, "1M3H"));//clip 3 bases at end - - for (TestParameter p : testList) { - init(); - clippingOp = new ClippingOp(p.inputStart, p.inputStop); - logger.warn("Testing Parameters: " + p.inputStart + "," + p.inputStop + "," + p.substringStart + "," + p.substringStop + "," + p.cigar); - ClipReadsTestUtils.testBaseQualCigar(clippingOp.apply(ClippingRepresentation.HARDCLIP_BASES, read), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); - } +// List testList = new LinkedList(); +// testList.add(new TestParameter(0, 0, 1, 4, "1H3M"));//clip 1 base at start +// testList.add(new TestParameter(3, 3, 0, 3, "3M1H"));//clip 1 base at end +// testList.add(new TestParameter(0, 1, 2, 4, "2H2M"));//clip 2 bases at start +// testList.add(new TestParameter(2, 3, 0, 2, "2M2H"));//clip 2 bases at end +// testList.add(new TestParameter(0, 2, 3, 4, "3H1M"));//clip 3 bases at start +// testList.add(new TestParameter(1, 3, 0, 1, "1M3H"));//clip 3 bases at end +// +// for (TestParameter p : testList) { +// init(); +// clippingOp = new ClippingOp(p.inputStart, p.inputStop); +// logger.warn("Testing Parameters: " + p.inputStart + "," + p.inputStop + "," + p.substringStart + "," + p.substringStop + "," + p.cigar); +// ClipReadsTestUtils.testBaseQualCigar(clippingOp.apply(ClippingRepresentation.HARDCLIP_BASES, read), +// ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), +// ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), +// p.cigar); +// } } @Test private void testSoftClip() { - List testList = new LinkedList(); - testList.add(new TestParameter(0, 0, -1, -1, "1S3M"));//clip 1 base at start - testList.add(new TestParameter(3, 3, -1, -1, "3M1S"));//clip 1 base at end - testList.add(new TestParameter(0, 1, -1, -1, "2S2M"));//clip 2 bases at start - testList.add(new TestParameter(2, 3, -1, -1, "2M2S"));//clip 2 bases at end - testList.add(new TestParameter(0, 2, -1, -1, "3S1M"));//clip 3 bases at start - testList.add(new TestParameter(1, 3, -1, -1, "1M3S"));//clip 3 bases at end - - for (TestParameter p : testList) { - init(); - clippingOp = new ClippingOp(p.inputStart, p.inputStop); - logger.warn("Testing Parameters: " + p.inputStart + "," + p.inputStop + "," + p.cigar); - ClipReadsTestUtils.testBaseQualCigar(clippingOp.apply(ClippingRepresentation.SOFTCLIP_BASES, read), - ClipReadsTestUtils.BASES.getBytes(), ClipReadsTestUtils.QUALS.getBytes(), p.cigar); - } +// List testList = new LinkedList(); +// testList.add(new TestParameter(0, 0, -1, -1, "1S3M"));//clip 1 base at start +// testList.add(new TestParameter(3, 3, -1, -1, "3M1S"));//clip 1 base at end +// testList.add(new TestParameter(0, 1, -1, -1, "2S2M"));//clip 2 bases at start +// testList.add(new TestParameter(2, 3, -1, -1, "2M2S"));//clip 2 bases at end +// testList.add(new TestParameter(0, 2, -1, -1, "3S1M"));//clip 3 bases at start +// testList.add(new TestParameter(1, 3, -1, -1, "1M3S"));//clip 3 bases at end +// +// for (TestParameter p : testList) { +// init(); +// clippingOp = new ClippingOp(p.inputStart, p.inputStop); +// logger.warn("Testing Parameters: " + p.inputStart + "," + p.inputStop + "," + p.cigar); +// ClipReadsTestUtils.testBaseQualCigar(clippingOp.apply(ClippingRepresentation.SOFTCLIP_BASES, read), +// ClipReadsTestUtils.BASES.getBytes(), ClipReadsTestUtils.QUALS.getBytes(), p.cigar); +// } } } diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java index f998a4e78..fcde5f360 100644 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/clipreads/ReadClipperUnitTest.java @@ -28,212 +28,181 @@ package org.broadinstitute.sting.utils.clipreads; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import net.sf.samtools.TextCigarCodec; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; -import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; -import java.util.LinkedList; import java.util.List; /** - * Created by IntelliJ IDEA. * User: roger * Date: 9/28/11 - * Time: 9:54 PM - * To change this template use File | Settings | File Templates. */ public class ReadClipperUnitTest extends BaseTest { - // TODO: exception testing, make cases that should fail will fail + List cigarList; + int maximumCigarSize = 6; // 6 is the minimum necessary number to try all combinations of cigar types with guarantee of clipping an element with length = 2 - // TODO: add indels to all test cases - - ReadClipper readClipper; - - @BeforeMethod + @BeforeClass public void init() { - readClipper = new ReadClipper(ClipReadsTestUtils.makeRead()); + cigarList = ClipReadsTestUtils.generateCigarList(maximumCigarSize); } @Test(enabled = true) public void testHardClipBothEndsByReferenceCoordinates() { - - logger.warn("Executing testHardClipBothEndsByReferenceCoordinates"); - //int debug = 1; - //Clip whole read - Assert.assertEquals(readClipper.hardClipBothEndsByReferenceCoordinates(10, 10), new GATKSAMRecord(readClipper.read.getHeader())); - - //clip 1 base - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipBothEndsByReferenceCoordinates(10, 13), - ClipReadsTestUtils.BASES.substring(1, 3).getBytes(), ClipReadsTestUtils.QUALS.substring(1, 3).getBytes(), - "1H2M1H"); - - List cigarStringTestPairs = new LinkedList(); - cigarStringTestPairs.add(new CigarStringTestPair("5M1D1M2I4M5I6M1D3M2I100M", "1H4M1D1M2I4M5I6M1D3M2I99M1H")); - //cigarStringTestPairs.add( new CigarStringTestPair("5M1I1M2I1M","1H4M1I1M2I1H")); - cigarStringTestPairs.add(new CigarStringTestPair("1S1M1I1M1I1M1I1M1I1M1I1M1S", "1H1M1I1M1I1M1I1M1I1M1I1M1H")); - cigarStringTestPairs.add(new CigarStringTestPair("1S1M1D1M1D1M1D1M1D1M1D1M1S", "1H1M1D1M1D1M1D1M1D1M1D1M1H")); - - for (CigarStringTestPair pair : cigarStringTestPairs) { - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar(TextCigarCodec.getSingleton().decode(pair.toTest))); - ClipReadsTestUtils.testCigar(readClipper.hardClipBothEndsByReferenceCoordinates( - ReadUtils.getRefCoordSoftUnclippedStart(readClipper.read), - ReadUtils.getRefCoordSoftUnclippedEnd(readClipper.read)), - pair.expected); - } - /* - for ( Cigar cigar: ClipReadsTestUtils.generateCigars() ) { - // The read has to be long enough to clip one base from each side - // This also filters a lot of cigars - if ( cigar.getReadLength() > 26 ) { - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar( cigar )); - System.out.println( "Testing Cigar: "+cigar.toString() ) ; - //cigar length reference plus soft clip - - ClipReadsTestUtils.testBaseQual( - readClipper.hardClipBothEndsByReferenceCoordinates( - ReadUtils.getRefCoordSoftUnclippedStart(readClipper.read), - ReadUtils.getRefCoordSoftUnclippedEnd(readClipper.read) ), - readClipper.read.getReadString().substring(1, (cigar.getReadLength() - 1)).getBytes(), - readClipper.read.getBaseQualityString().substring(1, (cigar.getReadLength() - 1)).getBytes()); + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + int readLength = alnStart - alnEnd; + for (int i=0; i= alnStart + i, String.format("Clipped alignment start is less than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); + Assert.assertTrue(clippedRead.getAlignmentEnd() <= alnEnd + i, String.format("Clipped alignment end is greater than original read (minus %d): %s -> %s", i, read.getCigarString(), clippedRead.getCigarString())); } } - */ } @Test(enabled = true) public void testHardClipByReadCoordinates() { + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + int readLength = read.getReadLength(); + for (int i=0; i %s", i, read.getCigarString(), clipLeft.getCigarString())); - logger.warn("Executing testHardClipByReadCoordinates"); - - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReadCoordinates(0, 3), new GATKSAMRecord(readClipper.read.getHeader())); - - List testList = new LinkedList(); - testList.add(new TestParameter(0, 0, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(3, 3, 0, 3, "3M1H"));//clip 1 base at end - testList.add(new TestParameter(0, 1, 2, 4, "2H2M"));//clip 2 bases at start - testList.add(new TestParameter(2, 3, 0, 2, "2M2H"));//clip 2 bases at end - - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipByReadCoordinates(p.inputStart, p.inputStop), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReadCoordinates(i, readLength-1); + Assert.assertTrue(clipRight.getReadLength() <= i, String.format("Clipped read length is greater than original read length (minus %d): %s -> %s", i, read.getCigarString(), clipRight.getCigarString())); + } } - } @Test(enabled = true) public void testHardClipByReferenceCoordinates() { - logger.warn("Executing testHardClipByReferenceCoordinates"); - //logger.warn(debug); - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReferenceCoordinates(10, 13), new GATKSAMRecord(readClipper.read.getHeader())); + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + for (int i=alnStart; i<=alnEnd; i++) { + if (ReadUtils.getRefCoordSoftUnclippedStart(read) == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); + if (!clipLeft.isEmpty()) + Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + } - List testList = new LinkedList(); - testList.add(new TestParameter(-1, 10, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(13, -1, 0, 3, "3M1H"));//clip 1 base at end - testList.add(new TestParameter(-1, 11, 2, 4, "2H2M"));//clip 2 bases at start - testList.add(new TestParameter(12, -1, 0, 2, "2M2H"));//clip 2 bases at end - - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStart+","+p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipByReferenceCoordinates(p.inputStart, p.inputStop), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); - } - - List cigarStringTestPairs = new LinkedList(); - cigarStringTestPairs.add(new CigarStringTestPair("5M1D1M2I4M5I6M1D3M2I100M", "1H4M1D1M2I4M5I6M1D3M2I100M")); - //cigarStringTestPairs.add( new CigarStringTestPair("5M1I1M2I1M","1H4M1I1M2I1M")); - cigarStringTestPairs.add(new CigarStringTestPair("1S1M1I1M1I1M1I1M1I1M1I1M1S", "1H1M1I1M1I1M1I1M1I1M1I1M1S")); - cigarStringTestPairs.add(new CigarStringTestPair("1S1M1D1M1D1M1D1M1D1M1D1M1S", "1H1M1D1M1D1M1D1M1D1M1D1M1S")); - - //Clips only first base - for (CigarStringTestPair pair : cigarStringTestPairs) { - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar(TextCigarCodec.getSingleton().decode(pair.toTest))); - ClipReadsTestUtils.testCigar(readClipper.hardClipByReadCoordinates(0, 0), pair.expected); - } - /* - for ( Cigar cigar: ClipReadsTestUtils.generateCigars() ) { - // The read has to be long enough to clip one base - // This also filters a lot of cigars - if ( cigar.getReadLength() > 26 ) { - readClipper = new ReadClipper(ClipReadsTestUtils.makeReadFromCigar( cigar )); - System.out.println( "Testing Cigar: "+cigar.toString() ) ; - //cigar length reference plus soft clip - - // Clip first read - ClipReadsTestUtils.testBaseQual( - readClipper.hardClipByReadCoordinates(0,0), - readClipper.read.getReadString().substring(1, cigar.getReadLength()).getBytes(), - readClipper.read.getBaseQualityString().substring(1, cigar.getReadLength()).getBytes()); + if (ReadUtils.getRefCoordSoftUnclippedEnd(read) == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. + Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + } } } - */ } @Test(enabled = true) public void testHardClipByReferenceCoordinatesLeftTail() { - init(); - logger.warn("Executing testHardClipByReferenceCoordinatesLeftTail"); - - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesLeftTail(13), new GATKSAMRecord(readClipper.read.getHeader())); - - List testList = new LinkedList(); - testList.add(new TestParameter(10, -1, 1, 4, "1H3M"));//clip 1 base at start - testList.add(new TestParameter(11, -1, 2, 4, "2H2M"));//clip 2 bases at start - - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStart+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipByReferenceCoordinatesLeftTail(p.inputStart), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + if (ReadUtils.getRefCoordSoftUnclippedStart(read) == alnStart) { // we can't test left clipping if the read has hanging soft clips on the left side + for (int i=alnStart; i<=alnEnd; i++) { + GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipByReferenceCoordinates(alnStart, i); + if (!clipLeft.isEmpty()) + Assert.assertTrue(clipLeft.getAlignmentStart() >= i + 1, String.format("Clipped alignment start (%d) is less the expected (%d): %s -> %s", clipLeft.getAlignmentStart(), i + 1, read.getCigarString(), clipLeft.getCigarString())); + } + } } - } @Test(enabled = true) public void testHardClipByReferenceCoordinatesRightTail() { - init(); - logger.warn("Executing testHardClipByReferenceCoordinatesRightTail"); - - //Clip whole read - Assert.assertEquals(readClipper.hardClipByReferenceCoordinatesRightTail(10), new GATKSAMRecord(readClipper.read.getHeader())); - - List testList = new LinkedList(); - testList.add(new TestParameter(-1, 13, 0, 3, "3M1H"));//clip 1 base at end - testList.add(new TestParameter(-1, 12, 0, 2, "2M2H"));//clip 2 bases at end - - for (TestParameter p : testList) { - init(); - //logger.warn("Testing Parameters: " + p.inputStop+","+p.substringStart+","+p.substringStop+","+p.cigar); - ClipReadsTestUtils.testBaseQualCigar(readClipper.hardClipByReferenceCoordinatesRightTail(p.inputStop), - ClipReadsTestUtils.BASES.substring(p.substringStart, p.substringStop).getBytes(), - ClipReadsTestUtils.QUALS.substring(p.substringStart, p.substringStop).getBytes(), - p.cigar); + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + int alnStart = read.getAlignmentStart(); + int alnEnd = read.getAlignmentEnd(); + if (ReadUtils.getRefCoordSoftUnclippedEnd(read) == alnEnd) { // we can't test right clipping if the read has hanging soft clips on the right side + for (int i=alnStart; i<=alnEnd; i++) { + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipByReferenceCoordinates(i, alnEnd); + if (!clipRight.isEmpty() && clipRight.getAlignmentStart() <= clipRight.getAlignmentEnd()) // alnStart > alnEnd if the entire read is a soft clip now. We can't test those. + Assert.assertTrue(clipRight.getAlignmentEnd() <= i - 1, String.format("Clipped alignment end (%d) is greater than expected (%d): %s -> %s", clipRight.getAlignmentEnd(), i - 1, read.getCigarString(), clipRight.getCigarString())); + } + } } - } @Test(enabled = true) public void testHardClipLowQualEnds() { - logger.warn("Executing testHardClipLowQualEnds"); + final byte LOW_QUAL = 2; + final byte HIGH_QUAL = 30; - // Testing clipping that ends inside an insertion + // create a read for every cigar permutation + for (Cigar cigar : cigarList) { + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + int readLength = read.getReadLength(); + byte [] quals = new byte[readLength]; + + for (int nLowQualBases = 0; nLowQualBases < readLength; nLowQualBases++) { + + // create a read with nLowQualBases in the left tail + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addLeft = 0; addLeft < nLowQualBases; addLeft++) + quals[addLeft] = LOW_QUAL; + read.setBaseQualities(quals); + GATKSAMRecord clipLeft = (new ReadClipper(read)).hardClipLowQualEnds(LOW_QUAL); + + // Tests + + // Make sure the low qualities are gone + assertNoLowQualBases(clipLeft, LOW_QUAL); + + // Can't run this test with the current contract of no hanging insertions + //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipLeft.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipLeft.getCigarString())); + + // create a read with nLowQualBases in the right tail + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addRight = 0; addRight < nLowQualBases; addRight++) + quals[readLength - addRight - 1] = LOW_QUAL; + read.setBaseQualities(quals); + GATKSAMRecord clipRight = (new ReadClipper(read)).hardClipLowQualEnds(LOW_QUAL); + + // Tests + + // Make sure the low qualities are gone + assertNoLowQualBases(clipRight, LOW_QUAL); + + // Make sure we haven't clipped any high quals -- Can't run this test with the current contract of no hanging insertions + //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - nLowQualBases, read.getCigarString(), clipRight.getCigarString())); + + // create a read with nLowQualBases in the both tails + if (nLowQualBases <= readLength/2) { + Utils.fillArrayWithByte(quals, HIGH_QUAL); + for (int addBoth = 0; addBoth < nLowQualBases; addBoth++) { + quals[addBoth] = LOW_QUAL; + quals[readLength - addBoth - 1] = LOW_QUAL; + } + read.setBaseQualities(quals); + GATKSAMRecord clipBoth = (new ReadClipper(read)).hardClipLowQualEnds(LOW_QUAL); + + // Tests + + // Make sure the low qualities are gone + assertNoLowQualBases(clipBoth, LOW_QUAL); + + // Can't run this test with the current contract of no hanging insertions + //Assert.assertEquals(clipLeft.getReadLength(), readLength - nLowQualBases, String.format("Clipped read size (%d) is different than the number high qual bases (%d) -- Cigars: %s -> %s", clipRight.getReadLength(), readLength - (2*nLowQualBases), read.getCigarString(), clipBoth.getCigarString())); + } + } +// logger.warn(String.format("Testing %s for all combinations of low/high qual... PASSED", read.getCigarString())); + } + + // ONE OFF Testing clipping that ends inside an insertion ( Ryan's bug ) final byte[] BASES = {'A','C','G','T','A','C','G','T'}; final byte[] QUALS = {2, 2, 2, 2, 20, 20, 20, 2}; final String CIGAR = "1S1M5I1S"; @@ -248,17 +217,15 @@ public class ReadClipperUnitTest extends BaseTest { ReadClipper lowQualClipper = new ReadClipper(read); ClipReadsTestUtils.assertEqualReads(lowQualClipper.hardClipLowQualEnds((byte) 2), expected); - - } @Test(enabled = true) public void testHardClipSoftClippedBases() { // Generate a list of cigars to test - for (Cigar cigar : ClipReadsTestUtils.generateCigars()) { + for (Cigar cigar : cigarList) { GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); - readClipper = new ReadClipper(read); + ReadClipper readClipper = new ReadClipper(read); GATKSAMRecord clippedRead = readClipper.hardClipSoftClippedBases(); int sumHardClips = 0; @@ -301,7 +268,59 @@ public class ReadClipperUnitTest extends BaseTest { Assert.assertTrue( sumMatches == 0, String.format("Cigar %s -> %s -- FAILED (number of matches mismatched by %d)", read.getCigarString(), clippedRead.getCigarString(), sumMatches)); - logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); +// logger.warn(String.format("Cigar %s -> %s -- PASSED!", read.getCigarString(), clippedRead.getCigarString())); } } + + @Test(enabled = false) + public void testHardClipLeadingInsertions() { + for (Cigar cigar : cigarList) { + if (startsWithInsertion(cigar)) { + GATKSAMRecord read = ClipReadsTestUtils.makeReadFromCigar(cigar); + GATKSAMRecord clippedRead = (new ReadClipper(read)).hardClipLeadingInsertions(); + + int expectedLength = read.getReadLength() - leadingInsertionLength(read.getCigar()); + if (cigarHasElementsDifferentThanInsertionsAndHardClips(read.getCigar())) + expectedLength -= leadingInsertionLength(ClipReadsTestUtils.invertCigar(read.getCigar())); + + if (! clippedRead.isEmpty()) { + Assert.assertEquals(expectedLength, clippedRead.getReadLength(), String.format("%s -> %s", read.getCigarString(), clippedRead.getCigarString())); // check that everything else is still there + Assert.assertFalse(startsWithInsertion(clippedRead.getCigar())); // check that the insertions are gone + } + else + Assert.assertTrue(expectedLength == 0, String.format("expected length: %d", expectedLength)); // check that the read was expected to be fully clipped + } + } + } + + + + private void assertNoLowQualBases(GATKSAMRecord read, byte low_qual) { + if (!read.isEmpty()) { + byte [] quals = read.getBaseQualities(); + for (int i=0; i 0; + } + + private int leadingInsertionLength(Cigar cigar) { + for (CigarElement cigarElement : cigar.getCigarElements()) { + if (cigarElement.getOperator() == CigarOperator.INSERTION) + return cigarElement.getLength(); + if (cigarElement.getOperator() != CigarOperator.HARD_CLIP) + break; + } + return 0; + } + + private boolean cigarHasElementsDifferentThanInsertionsAndHardClips (Cigar cigar) { + for (CigarElement cigarElement : cigar.getCigarElements()) + if (cigarElement.getOperator() != CigarOperator.INSERTION && cigarElement.getOperator() != CigarOperator.HARD_CLIP) + return true; + return false; + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/clipreads/TestParameter.java b/public/java/test/org/broadinstitute/sting/utils/clipreads/TestParameter.java deleted file mode 100644 index 155fe094e..000000000 --- a/public/java/test/org/broadinstitute/sting/utils/clipreads/TestParameter.java +++ /dev/null @@ -1,24 +0,0 @@ -package org.broadinstitute.sting.utils.clipreads; - -/** - * Created by IntelliJ IDEA. - * User: roger - * Date: 11/28/11 - * Time: 4:07 PM - * To change this template use File | Settings | File Templates. - */ -public class TestParameter { - int inputStart; - int inputStop; - int substringStart; - int substringStop; - String cigar; - - public TestParameter(int InputStart, int InputStop, int SubstringStart, int SubstringStop, String Cigar) { - inputStart = InputStart; - inputStop = InputStop; - substringStart = SubstringStart; - substringStop = SubstringStop; - cigar = Cigar; - } -} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala index 4ca3cbb89..149376018 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountLoci.scala @@ -19,7 +19,7 @@ class ExampleCountLoci extends QScript { @Output var out: File = _ - def script = { + def script() { val countLoci = new CountLoci countLoci.reference_sequence = referenceFile countLoci.input_file = bamFiles diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala index 9fdd1ba4c..7f9d3f87a 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala @@ -24,7 +24,7 @@ class ExampleCountReads extends QScript { /** * In script, you create and then add() functions to the pipeline. */ - def script = { + def script() { // Run CountReads for all bams jointly. @@ -41,6 +41,9 @@ class ExampleCountReads extends QScript { // matches the full form of the argument, but will actually be a scala List[] jointCountReads.input_file = bamFiles + // Set the memory limit. Also acts as a memory request on LSF and GridEngine. + jointCountReads.memoryLimit = 1 + // Add the newly created function to the pipeline. add(jointCountReads) @@ -51,6 +54,7 @@ class ExampleCountReads extends QScript { singleCountReads.reference_sequence = referenceFile // ':+' is the scala List append operator singleCountReads.input_file :+= bamFile + singleCountReads.memoryLimit = 1 add(singleCountReads) } } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala index d3796d350..d30668c19 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCustomWalker.scala @@ -24,7 +24,7 @@ class ExampleCustomWalker extends QScript { /** * In script, you create and then add() functions to the pipeline. */ - def script = { + def script() { val customWalker = new CommandLineGATK { // Set the name of your walker, for example this will be passed as -T MyCustomWalker this.analysis_type = "MyCustomWalker" diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala index 2f4aea755..8cb86db0b 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala @@ -33,7 +33,6 @@ class ExampleUnifiedGenotyper extends QScript { @Argument(doc="An optional list of filter expressions.", shortName="filterExpression", required=false) var filterExpressions: List[String] = Nil - // This trait allows us set the variables below in one place, // and then reuse this trait on each CommandLineGATK function below. trait UnifiedGenotyperArguments extends CommandLineGATK { diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 913bd243c..32913deb4 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -89,7 +89,7 @@ class QCommandLine extends CommandLineProgram with Logging { private var shuttingDown = false private lazy val pluginManager = { - qScriptClasses = IOUtils.tempDir("Q-Classes", "", settings.qSettings.tempDirectory) + qScriptClasses = IOUtils.tempDir("Q-Classes-", "", settings.qSettings.tempDirectory) qScriptManager.loadScripts(scripts, qScriptClasses) new PluginManager[QScript](classOf[QScript], List(qScriptClasses.toURI.toURL)) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala index bb14bb6e6..73d1c028a 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -81,6 +81,10 @@ trait QJobReport extends Logging { this.reportFeatures = features.mapValues(_.toString) } + def addJobReportBinding(key: String, value: Any) { + this.reportFeatures += (key -> value.toString) + } + // copy the QJobReport information -- todo : what's the best way to do this? override def copySettingsTo(function: QFunction) { self.copySettingsTo(function) diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala index 5901cab46..f657e4be1 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala @@ -30,7 +30,7 @@ import org.broadinstitute.sting.BaseTest class ExampleCountLociPipelineTest { @Test - def testCountLoci { + def testCountLoci() { val testOut = "count.out" val spec = new PipelineTestSpec spec.name = "countloci" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala new file mode 100644 index 000000000..8b286f090 --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.pipeline.examples + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class ExampleCountReadsPipelineTest { + @Test + def testCountReads() { + val spec = new PipelineTestSpec + spec.name = "countreads" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleCountReads.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -I " + BaseTest.testDir + "exampleBAM.bam").mkString + PipelineTest.executeTest(spec) + } +} diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala new file mode 100644 index 000000000..d50673a1a --- /dev/null +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.queue.pipeline.examples + +import org.testng.annotations.Test +import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} +import org.broadinstitute.sting.BaseTest + +class ExampleUnifiedGenotyperPipelineTest { + @Test + def testUnifiedGenotyper() { + val spec = new PipelineTestSpec + spec.name = "unifiedgenotyper" + spec.args = Array( + " -S public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleUnifiedGenotyper.scala", + " -R " + BaseTest.testDir + "exampleFASTA.fasta", + " -I " + BaseTest.testDir + "exampleBAM.bam", + " -filter QD", + " -filterExpression 'QD < 2.0'").mkString + PipelineTest.executeTest(spec) + } +}