/* * The Broad Institute * SOFTWARE COPYRIGHT NOTICE AGREEMENT * This software and its documentation are copyright 2009 by the * Broad Institute/Massachusetts Institute of Technology. All rights are reserved. * * This software is supplied without any warranty or guaranteed support whatsoever. * Neither the Broad Institute nor MIT can be responsible for its use, misuse, or * functionality. */ package edu.mit.broad.picard.sam; import edu.mit.broad.sam.*; import edu.mit.broad.picard.PicardException; import java.util.*; /** * Merges SAMFileHeaders that have the same sequences into a single merged header * object while providing read group translation for cases where read groups * clash across input headers. * * @author Dave Tefft */ public class SamFileHeaderMerger { //Super Header to construct private final SAMFileHeader mergedHeader; private final Collection readers; //Translation of old group ids to new group ids private final Map> samGroupIdTranslation = new HashMap>(); //the groups from different files use the same group ids private boolean hasGroupIdDuplicates = false; //Translation of old program group ids to new program group ids private final Map> samProgramGroupIdTranslation = new HashMap>(); //Letters to construct new ids from a counter private static final String ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; /** * Create SAMFileHeader with additional information * * @param readers same file readers to combine * @param sortOrder sort order new header should have */ public SamFileHeaderMerger(final Collection readers, final SAMFileHeader.SortOrder sortOrder) { this.readers = readers; this.mergedHeader = new SAMFileHeader(); // Set sequences first because if it throws exception there is no need to continue final List sequences = getSAMSequences(readers); this.mergedHeader.setSequences(sequences); // Set program that creates input alignments for (final SAMProgramRecord program : mergeSAMProgramRecordLists(readers)) { this.mergedHeader.addProgramRecord(program); } // Set read groups for merged header final List readGroups = getReadGroups(readers); this.mergedHeader.setReadGroups(readGroups); this.mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.none); this.mergedHeader.setSortOrder(sortOrder); } /** * Checks to see if there are clashes where different readers are using the same read * group IDs. If they are then a new set of unique read group IDs are generated (across all * read groups) otherwise the original read group headers are returned. * * @param readers readers to combine * @return new list of readgroups constructed from all the readers */ private List getReadGroups(final Collection readers) { // Read groups as read from the readers final List orginalReadGroups = new ArrayList(); // Read group with new ids that don't confict final List modifiedReadGroups = new ArrayList(); //set to see if there are duplicate group ids and whether or not we need to modify them final Set groupIdsSeenBefore = new HashSet(); int x = 0; this.hasGroupIdDuplicates = false; for (final SAMFileReader reader : readers) { final SAMFileHeader header = reader.getFileHeader(); final Map idTranslation = new HashMap(); // Iterate over read groups to find conflicting ids for (final SAMReadGroupRecord readGroup : header.getReadGroups()) { final String groupId = readGroup.getReadGroupId(); final String newGroupId = createNewId(x++); // Check to see if same group id is used in two different readers if (groupIdsSeenBefore.contains(groupId)) { hasGroupIdDuplicates = true; } groupIdsSeenBefore.add(groupId); // Creates a new read group with the new id and copies all it's attributes final SAMReadGroupRecord groupRecordWithNewId = copyReadGroup(readGroup, newGroupId); orginalReadGroups.add(readGroup); modifiedReadGroups.add(groupRecordWithNewId); idTranslation.put(groupId, newGroupId); } // Add id tranlation for updating SamRecords with new ids if neccessary this.samGroupIdTranslation.put(reader, idTranslation); } // return approriate readgroups whether or not the new ids have to be used if (this.hasGroupIdDuplicates) { return modifiedReadGroups; } else { return orginalReadGroups; } } /** * Get the sequences off the SAMFileReader header. Throws runtime exception if the sequence * are different from one another * * @param readers readers to pull sequences from * @return sequences from files. Each file should have the same sequence */ private List getSAMSequences(final Collection readers) { List sequences = null; for (final SAMFileReader reader : readers) { final SAMFileHeader header = reader.getFileHeader(); if (sequences == null) { sequences = header.getSequences(); } else { final List currentSequences = header.getSequences(); if (!sequenceListsEqual(sequences, currentSequences)) { throw new PicardException("Files are not compatible with each other. They can not be combined"); } } } return sequences; } /** * Checks the equality of two lists of sequence records using the isSameSequence * method instead of the equals method which is a more strict identity check. * @param s1 a list of sequence headers * @param s2 a second list of sequence headers * @return true if the two lists match otherwise false */ private boolean sequenceListsEqual(final List s1, final List s2) { if (s1.size() != s2.size()) { return false; } for (int i = 0; i < s1.size(); ++i) { if (!s1.get(i).isSameSequence(s2.get(i))) { return false; } } return true; } /** * Find the alignment program that produced the readers. If there are more than one * generate a new program represents that * * @param readers SAMFileReaders to pull program information from * @return SAMProgram record that represents all the readers */ // TODO: this needs to be fixed up to support multiple program records (PIC-15) private List mergeSAMProgramRecordLists(final Collection readers) { final boolean programMixed = false; final List ret = new ArrayList(); int nextProgramGroupId = 0; for (final SAMFileReader reader : readers) { final SAMFileHeader header = reader.getFileHeader(); final Map idTranslation = new HashMap(); for (final SAMProgramRecord oldProgramRecord : header.getProgramRecords()) { boolean foundMatch = false; for (final SAMProgramRecord newProgramRecord : ret) { if (newProgramRecord.equivalent(oldProgramRecord)) { idTranslation.put(oldProgramRecord.getProgramGroupId(), newProgramRecord.getProgramGroupId()); foundMatch = true; break; } } if (!foundMatch) { final SAMProgramRecord newProgramRecord = new SAMProgramRecord(Integer.toString(nextProgramGroupId++)); copyProgramGroupAttributes(oldProgramRecord, newProgramRecord); ret.add(newProgramRecord); idTranslation.put(oldProgramRecord.getProgramGroupId(), newProgramRecord.getProgramGroupId()); } } samProgramGroupIdTranslation.put(reader, idTranslation); } return ret; } private void copyProgramGroupAttributes(final SAMProgramRecord oldProgramRecord, final SAMProgramRecord newProgramRecord) { for (final Map.Entry entry : oldProgramRecord.getAttributes()) { newProgramRecord.setAttribute(entry.getKey(), entry.getValue()); } } /** * Copies all the attribute of a readgroup to a new readgroup with a new id * * @param readGroup the group to be copied * @param modifiedId the id for the new readgroup * @return new read group */ private SAMReadGroupRecord copyReadGroup(final SAMReadGroupRecord readGroup, final String modifiedId) { final SAMReadGroupRecord retval = new SAMReadGroupRecord(modifiedId); retval.setLibrary(readGroup.getLibrary()); retval.setSample(readGroup.getSample()); for (final Map.Entry attr : readGroup.getAttributes()) { retval.setAttribute(attr.getKey(), attr.getValue()); } return retval; } /** * Creates a base 26 representation of an int * * @param n int to covert to letter representation * @return string rep for an int eg 0 = A 27 = AB */ protected static String createNewId(int n) { final int base = ALPHABET.length(); String s = ""; while (true) { final int r = n % base; s = ALPHABET.charAt(r) + s; n = n / base; if (n == 0) { return s; } n -= 1; } } /** Returns the read group id that should be used for the input read and RG id. */ public String getReadGroupId(final SAMFileReader reader, final String originalReadGroupId) { return this.samGroupIdTranslation.get(reader).get(originalReadGroupId); } /** * @param reader one of the input files * @param originalProgramGroupId a program group ID from the above input file * @return new ID from the merged list of program groups in the output file */ public String getProgramGroupId(final SAMFileReader reader, final String originalProgramGroupId) { return this.samProgramGroupIdTranslation.get(reader).get(originalProgramGroupId); } /** Returns true if there are read group duplicates within the merged headers. */ public boolean hasGroupIdDuplicates() { return this.hasGroupIdDuplicates; } /** Returns the merged header that should be written to any output merged file. */ public SAMFileHeader getMergedHeader() { return this.mergedHeader; } /** Returns the collection of readers that this header merger is working with. */ public Collection getReaders() { return this.readers; } }