diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 468bdcfca..3fc76c8a5 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -88,7 +88,7 @@ public class VariantContextAdaptors { return null; Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true); - if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || dbsnp.getVariantType().contains("mixed") ) { + if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || DbSNPHelper.isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) { // add the reference allele List alleles = new ArrayList(); alleles.add(refAllele); diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java index f82b6511f..ff053385d 100644 --- a/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelGenotyperV2Walker.java @@ -45,6 +45,9 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.SAMReaderID; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.interval.IntervalFileMergingIterator; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.sam.AlignmentUtils; @@ -84,6 +87,17 @@ public class IndelGenotyperV2Walker extends ReadWalker { // @Argument(fullName="vcf_format", shortName="vcf", doc="generate output file in VCF format", required=false) // boolean FORMAT_VCF = false; + @Argument(fullName = "genotype_intervals", shortName = "genotype", + doc = "Calls will be made at each position within the specified interval(s), whether there is an indel or it's the ref", required = false) + public String genotypeIntervalsFile = null; + + @Argument(fullName="genotypeIntervalsAreNotSorted", shortName="giNotSorted", required=false, + doc="This tool assumes that the genotyping interval list (--genotype_intervals) is sorted; "+ + "if the list turns out to be unsorted, it will throw an exception. "+ + "Use this argument when your interval list is not sorted to instruct the IndelGenotyper "+ + "to sort and keep it in memory (increases memory usage!).") + protected boolean GENOTYPE_NOT_SORTED = false; + @Argument(fullName="somatic", shortName="somatic", doc="Perform somatic calls; two input alignment files (-I option) must be specified. Calls are performed from the second file (\"tumor\") against the first one (\"normal\").", required=false) boolean call_somatic = false; @@ -167,6 +181,11 @@ public class IndelGenotyperV2Walker extends ReadWalker { private SAMRecord lastRead; private byte[] refBases; private ReferenceDataSource refData; + private Iterator genotypeIntervals = null; + private GenomeLocSortedSet traverseIntervals = null; // these are the traversal intervals passed with -L option (if any) + + // the current interval in the list of intervals, for which we want to do full genotyping + private GenomeLoc currentGenotypeInterval = null; // "/humgen/gsa-scr1/GATK_Data/refGene.sorted.txt" @@ -272,6 +291,22 @@ public class IndelGenotyperV2Walker extends ReadWalker { normalSamples.add(rg.getSample()); } } + if ( genotypeIntervalsFile != null ) { + + traverseIntervals = getToolkit().getIntervals(); + + if ( ! GENOTYPE_NOT_SORTED && IntervalUtils.isIntervalFile(genotypeIntervalsFile)) { + // prepare to read intervals one-by-one, as needed (assuming they are sorted). + genotypeIntervals = new IntervalFileMergingIterator( + new java.io.File(genotypeIntervalsFile), IntervalMergingRule.OVERLAPPING_ONLY ); + } else { + // read in the whole list of intervals for cleaning + GenomeLocSortedSet locs = IntervalUtils.sortAndMergeIntervals( + IntervalUtils.parseIntervalArguments(Arrays.asList(genotypeIntervalsFile)), IntervalMergingRule.OVERLAPPING_ONLY); + genotypeIntervals = locs.iterator(); + } + currentGenotypeInterval = genotypeIntervals.hasNext() ? genotypeIntervals.next() : null; + } } diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentInfo.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentInfo.java new file mode 100644 index 000000000..c56ee5ea2 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentInfo.java @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.utils; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Aug 3, 2010 + * Time: 4:10:42 PM + * To change this template use File | Settings | File Templates. + */ + +/** A simple utility class that encapsulates information about a single alignment (offset, strand, overlap, mismatch count). + * + */ +public class AlignmentInfo { + private int offset; + private int overlap; + private int mm ; + private Assembly a = null; + + private static int RIDICULOUSLY_LARGE_NUMBER = 1000000000; + + public AlignmentInfo() { + offset = 0; + mm = RIDICULOUSLY_LARGE_NUMBER; + overlap = -1; + a = null; + } + + public AlignmentInfo(int mm, int offset, boolean isRc, int overlap, Assembly a) { + this.offset = (isRc ? (-offset-1) : offset ); + this.overlap = overlap; + this.mm = mm; + this.a = a; + } + + boolean isAligned() { return mm < RIDICULOUSLY_LARGE_NUMBER; } + + public boolean isNegativeStrand() { return offset < 0; } + public Assembly getAssembly() { return a; } + public int getOffset() { return ( offset < 0 ? (-offset-1) : offset ); } + public int getMismatchCount() { return mm; } + public int getOverlap() { return overlap; } + public double getMismatchRate() { return isAligned() ? ((double)mm)/overlap : 1.0; } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentList.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentList.java new file mode 100644 index 000000000..85dfa8984 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentList.java @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.utils; + +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.util.List; +import java.util.ArrayList; +import java.util.Iterator; + +public class AlignmentList implements Iterable { + private int best_mm = 1000000000; + private int next_best_mm = 1000000000; + private List als = null; + private int next_best_count = 0; + private int best_overlap = 0; + + private AlignmentStrategy strategy = null; + + public AlignmentList(AlignmentStrategy s) { + this.strategy = s; + best_mm = 1000000000; + next_best_mm = 1000000000; + best_overlap = 0; + als = new ArrayList(1); + } + + public boolean isAligned() { + return best_mm < 1000000000; + } + + public List getAlignments() { return als; } + + public int size() { return als.size(); } + + public Iterator iterator() { return als.iterator(); } + + // public void tryAdd(int mm, int offset, boolean isRc, int overlap) { + // tryAdd(new AlignmentInfo(mm,offset,isRc,overlap)); + // } + + public void tryAdd(AlignmentInfo ai) { + AlignmentStrategy.Action a = strategy.action(ai,this) ; + switch ( a ) { + case DISCARD: break; + case REPLACE_BEST: + next_best_mm = best_mm; + next_best_count = size(); + als.clear(); + als.add(ai); + best_mm = ai.getMismatchCount(); + best_overlap = ai.getOverlap(); + break; + case ADD_BEST: + als.add(ai); + if ( ai.getMismatchCount() < best_mm ) best_mm = ai.getMismatchCount(); + if ( ai.getOverlap() > best_overlap) best_overlap = ai.getOverlap(); + break; + case REPLACE_NEXTBEST: + next_best_mm = ai.getMismatchCount(); + next_best_count = 1; + break; + case ADD_NEXTBEST: + next_best_count++; + if ( ai.getMismatchCount() < next_best_mm ) next_best_mm = ai.getMismatchCount(); + break; + default: throw new StingException("Unrecognized action requested: "+a); + } + } + + public void tryAddAll(AlignmentList al) { + for( AlignmentInfo ai : al) { + tryAdd(ai); + } + } + + public int getBestMMCount() { return best_mm; } + public int getBestOverlap() { return best_overlap; } + public int getBestHitCount() { return als.size() ; } + public int getNextBestHitCount() { return next_best_count; } + public int getNextBestMMCount() { return next_best_mm; } +// public int getOverlap() { return overlap; } +// public int getOffset() { return offset; } +// public boolean isNegativeStrand() { return rc; } + +// public double getMismatchRate() { return isAligned() ? ((double)best_mm)/overlap : 1.0 ; } + +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentStrategy.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentStrategy.java new file mode 100644 index 000000000..559a9ff16 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AlignmentStrategy.java @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.utils; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Aug 3, 2010 + * Time: 2:53:43 PM + * To change this template use File | Settings | File Templates. + */ +public interface AlignmentStrategy { + enum Action { + DISCARD, + REPLACE_BEST, + ADD_BEST, + REPLACE_NEXTBEST, + ADD_NEXTBEST + }; + + public Action action(AlignmentInfo alignment, AlignmentList currentList); +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/Assembly.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/Assembly.java new file mode 100644 index 000000000..265cf498e --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/Assembly.java @@ -0,0 +1,442 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.utils; + +import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.Utils; + +import java.util.List; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +import net.sf.samtools.util.StringUtil; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Aug 3, 2010 + * Time: 2:20:22 PM + * To change this template use File | Settings | File Templates. + */ +public class Assembly { + private byte[] consensus; + private short[] coverage; + private short[] mismatches; + private short [][] base_counts; + + private boolean debug = false; + private List seq_ids; + private List seqs; + private List seq_offsets; + + private KmerIndex lookup; // assembled consensus sequence is indexed here + + private int hookedAt = -1; // if set, specifies start on the ref of the assembled consensus sequence + + private static List EMPTY_KMER_LIST = new ArrayList(0); + + private int K = 15; + + private AlignmentStrategy strategy = null; + + /** Creates new assembly seeded with the specified sequence; default key length (15) is used. + * + * @param seq + * @param id + */ + public Assembly(final byte[] seq, String id) { + this(15,seq,id); + } + + /** Creates new assembly seeded with the specified sequence and sets kmer (key) length K for the internally maintained + * lookup index tables. + * @param K + * @param seq + * @param id + */ + public Assembly(int K, final byte[] seq, String id) { + this.K = K; + seq_ids = new ArrayList(); + seq_offsets = new ArrayList(); + seqs = new ArrayList(); + seq_ids.add(id); + seq_offsets.add(0); + seqs.add(seq); + consensus = Arrays.copyOf(seq,seq.length); + coverage = new short[seq.length]; + Arrays.fill(coverage,(short)1); + mismatches = new short[seq.length]; // filled with 0's + base_counts = new short[4][seq.length]; + for ( int i = 0 ; i < seq.length ; i++ ) { + int j = BaseUtils.simpleBaseToBaseIndex(seq[i]); + if ( j != -1) base_counts[j][i] = 1; + } + lookup = new KmerIndex(K,seq); + strategy = new DefaultAlignmentStrategy(); + } + + /** Creates new assembly seeded with the specified sequence; default key length (15) is used and the position on the + * reference of the entire assembly is set (as assemblly grows, position on the ref will be updated properly). + * + * @param seq + * @param id + */ + public Assembly(final byte[] seq, String id, int posOnRef) { + this(seq,id); + hookedAt = posOnRef; + } + + /** Creates new assembly seeded the specified sequence and sets kmer (key) length K for the internally maintained + * lookup index tables. Parameter posOnRef specifies the (initial) position of the entire assembly on the + * ref; as the assembly grows, the position on ref will be updated properly. + * @param K + * @param seq + * @param id + */ + public Assembly(int K, final byte[] seq, String id, int posOnRef) { + this(K,seq,id); + hookedAt = posOnRef; + } + + /** Returns total number of sequences currently held by this assembly. + * + * @return + */ + public int getNumSeqs() { return seqs.size() ; } + + /** Attempts to align seq to this assembly's consensus. Does NOT add + * the sequence to the consensus even if it aligns! This methods returns a list of alternative + * best alignments found (according to the strategy used) in a newly allocated AlignmentList object. + * @param seq sequence to align to this consensus + * @param tryRC if true, will try aligning both seq and its reverse complement; otherwise + * only forward alignment will be attempted (i.e. best placement of the seq, as it is provided, + * along the assembled consensus sequence) + * @return a newly allocated alignment list; returned list can be empty if no alignments are found + */ + public AlignmentList align(final byte[] seq, boolean tryRC) { + return align(seq,tryRC,null); + } + + /** Attempts to align seq to this assembly's consensus. Does NOT add + * the sequence to the consensus even if it aligns! This method uses existing list of alignments + * (which can contain alignments to a different assembly) and updates it as necessary if a better alignment + * (or multiple better alignments) than the one(s) already held in the list is found. Reference to the + * same alignment list object is returned: this method modifies it's argument. If alignment list argument + * is null, new alignment list object will be allocated and returned by this method. + * + * @param seq sequence to align to this consensus + * @param tryRC if true, will try aligning both seq and its reverse complement; otherwise + * only forward alignment will be attempted (i.e. best placement of the seq, as it is provided, + * along the assembled consensus sequence) + * @return a newly allocated alignment list; returned list can be empty if no alignments are found + */ + public AlignmentList align(final byte[] seq, boolean tryRC, AlignmentList a) { + if ( debug ) System.out.println("Assembly:: aligning sequence of length "+seq.length+"; tryRC="+tryRC+"; K="+K); + + List fw_kmers = KmerIndex.toKeyOffsetList(K,seq); + + if ( debug ) { + for( PrimitivePair.Int kmer: fw_kmers) { + System.out.println("id="+kmer.getFirst()+" seq="+new String(KmerIndex.idToSeq(K,kmer.getFirst()))+" offset on seq="+kmer.getSecond()); + } + } + + byte [] rc_seq = (tryRC ? BaseUtils.simpleReverseComplement(seq) : null ); + List rc_kmers = (tryRC ? KmerIndex.toKeyOffsetList(K,rc_seq) : EMPTY_KMER_LIST ); + + if ( a == null ) a = new AlignmentList(strategy); + + // i is the position on the sequence seq or on its reverse complement + for(PrimitivePair.Int kmer : fw_kmers ) { + + List offsets = lookup.getOffsets(kmer.first); + if ( offsets != null ) { + // kmer present in consensus sequence + for ( int s : offsets ) { // s=offset of the current kmer on the assembled consensus + int trial_offset = s - kmer.second; // offset of the seq on the assembled consensus suggested by current kmer/offset + int trial_mm = countMismatches(seq,trial_offset,a.getNextBestMMCount()); + a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,false,overlap(trial_offset,seq.length),this)); + } + } + } + + for ( PrimitivePair.Int kmer : rc_kmers ) { + + List offsets = lookup.getOffsets(kmer.first); + if ( offsets != null ) { + // kmer present in consensus sequence + for ( int s : offsets ) { + int trial_offset = s - kmer.second; + int trial_mm = countMismatches(rc_seq,trial_offset,a.getNextBestMMCount()); + a.tryAdd(new AlignmentInfo(trial_mm,trial_offset,true,overlap(trial_offset,seq.length),this)); + } + } + } + return a; + } + + public void setDebug(boolean d) { this.debug = d; lookup.setDebug(d);} + + public int numSequences() { return seq_ids.size(); } + + private int overlap(int offset, int seq_length ) { + return Math.min(consensus.length,offset+seq_length)-Math.max(0,offset); + } + + private int countMismatches(final byte seq[], int offset, int cutoff) { + int mm = 0; + + int i ; + if ( offset >= 0 ) i = 0; + else { i = (-offset); offset = 0; } + for ( ; i < seq.length && offset < consensus.length ; i++ , offset++ ) { + if ( seq[i] != consensus[offset] ) { + mm++; + if ( mm > cutoff ) break; + } + } + + return mm; + } + + public byte[] getConsensus() { return consensus; } + + public int getPosOnRef() { return hookedAt; } + + public int getConsensusLength() { return consensus.length; } + + public List getOffsets() { return seq_offsets; } + public int getOffset(int i) { return seq_offsets.get(i); } + + public List getSeqIds() { return Collections.unmodifiableList(seq_ids); } + + /** Adds specified sequence to this assembly according to the provided + * alignment information. Will properly update consensus sequence of this assembly + * and all associated information (mismatches, base counts etc) + * @param seq + * @param id + * @param a + */ + public void add(final byte[] seq, String id, AlignmentInfo a) { + + if ( ! a.isAligned() ) throw new StingException("Can not add sequence to the assembly: provided alignment is empty"); + + seq_ids.add(id); + + int offset = a.getOffset(); + int oldConsensusLength = consensus.length; + + byte [] seq_to_add = ( a.isNegativeStrand() ? BaseUtils.simpleReverseComplement(seq) : seq); + + seqs.add(seq_to_add); + + int pos_on_seq = 0; + int pos_on_cons = 0; + + int leftExtension = 0; // how many bases we added to the consensus on the left + int rightExtension = 0; // how many bases we added to the consensus on the right + + if ( offset < 0 ) { + // if sequence sticks out to the left of the current consensus: + + leftExtension = -offset; + for(int i = 0 ; i < seq_offsets.size() ; i++ ) { + // we are going to extend consensus to the left, so we need to update all current offsets: + seq_offsets.set(i,seq_offsets.get(i)+leftExtension); + } + + if ( hookedAt > 0 ) hookedAt -= leftExtension; + // extend consensus and associated arrays to the left : + + consensus = Utils.extend(consensus,offset,(byte)0); // remember, offset is negative here, extending to the left + coverage = Utils.extend(coverage,offset,(short)1) ; + mismatches = Utils.extend(mismatches,offset,(short)0); + for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],offset,(short)0); + + for ( int j = 0 ; j < -offset ; j++ ) { + consensus[j] = seq_to_add[j]; + int b = BaseUtils.simpleBaseToBaseIndex(seq_to_add[j]); + if ( b != -1 ) base_counts[b][j]=1; + } + + pos_on_seq = pos_on_cons = -offset; + + offset = 0; + } + if ( offset > 0 ) pos_on_cons = offset; + + seq_offsets.add(offset); + + boolean consensus_changed = false; + + for ( ; pos_on_seq < seq_to_add.length && pos_on_cons < consensus.length ; pos_on_seq++, pos_on_cons++ ) { + coverage[pos_on_cons]++; + final byte base = seq_to_add[pos_on_seq]; + final int b = BaseUtils.simpleBaseToBaseIndex(base); + if ( b != -1 ) { + // if base on seq is not a regular base, there is nothing to do; + // otherwise count mismatches and optionally update consensus if current base tips the balance + base_counts[b][pos_on_cons]++; + int maxcount = 0; + int maxb = -1; + for ( int j = 0 ; j < 4 ; j++ ) { + if ( base_counts[j][pos_on_cons] > maxcount ) { + maxcount = base_counts[j][pos_on_cons]; + maxb = j; + } + } + // we are guaranteed here that maxb != -1 since we just added one regular base (the current one) + // few lines above... + byte newbase = BaseUtils.baseIndexToSimpleBase(maxb); + if ( newbase != consensus[pos_on_cons] ) { // need to change the consensus base (will recompute mismatches) + consensus[pos_on_cons] = newbase; + consensus_changed = true; + mismatches[pos_on_cons] = 0; + for ( int i = 0 ; i < 4 ; i++ ) { + if ( i == maxb ) continue; + mismatches[pos_on_cons] += base_counts[i][pos_on_cons]; + } + } else { // consensus base did not change; just increment mismatches if current sequence's base differs from consensus + if ( base != consensus[pos_on_cons]) mismatches[pos_on_cons]++; + } + } + + } + + // Last step: if sequence sticks out of current consensus on the right, we need to extend the latter: + + if ( pos_on_seq < seq_to_add.length ) { + // sequence sticks out of consensus to the right + rightExtension = seq_to_add.length - pos_on_seq; + consensus = Utils.extend(consensus,rightExtension,(byte)0); + coverage = Utils.extend(coverage,rightExtension,(short)1); + mismatches = Utils.extend(mismatches,rightExtension,(short)0); + for ( int i = 0 ; i < 4 ; i++ ) base_counts[i] = Utils.extend(base_counts[i],rightExtension,(short)0); + for ( ; pos_on_seq < seq_to_add.length ; pos_on_seq++, pos_on_cons++ ) { + byte base = seq_to_add[pos_on_seq]; + consensus[pos_on_cons] = base; + int b = BaseUtils.simpleBaseToBaseIndex(base); + if ( b != -1 ) base_counts[b][pos_on_cons] = base; + } + } + + // finally, the new sequence we just added could have mismatches that tip some consensus bases into new values; + // let's catch those cases: + + + for ( int i = 0 ; i < consensus.length ; i++ ) { + byte cons_base = consensus[i]; + int b = BaseUtils.simpleBaseToBaseIndex(cons_base); + } + + // there is probably a better way, but for now we just recompute the whole lookup table when consensus + // changes somewhere in the middle (if we want to be samrt we need to identify just the kmers that changed + // and find/change them in the lookup table). + if ( consensus_changed ) { + lookup.clear(); + lookup.index(consensus); + } else { + if ( leftExtension > 0 || rightExtension > 0 ) lookup.updateIndex(consensus,leftExtension,oldConsensusLength); + } + + + } + + public String toAlignmentString(boolean mismatchesOnly, boolean printNames) { + + int maxNameLength = 0; + int spacing=3; + + if ( printNames ) { + for ( String n : seq_ids ) if ( n.length() > maxNameLength ) maxNameLength++; + } + + StringBuilder b = new StringBuilder(); + if ( printNames ) b.append(Utils.dupString(' ',maxNameLength+spacing)); + b.append(new String(consensus)); + b.append('\n'); + + for ( int j = 0; j < seqs.size() ; j++ ) { + int offset = seq_offsets.get(j); + byte [] seq = seqs.get(j); + + if ( printNames ) { + b.append(seq_ids.get(j)); + b.append(Utils.dupString(' ',maxNameLength-seq_ids.get(j).length()+spacing)); + } + + for ( int i = 0 ; i < offset ; i++ ) b.append(' '); + + for ( int i = 0 ; i < seq.length ; i++ ) { + + byte base = seq[i]; + if ( mismatchesOnly && base == consensus[i+offset] ) { + b.append('.'); + } else b.append((char)base); + } + b.append('\n'); + } + return b.toString(); + } + + + public static void testMe(String [] argv ) { + byte [] seq1 = "ACGTTGCGTGGTTCACTGCAGTAACTGACTGATGCA".getBytes(); + byte [] seq2 = "GCGTGGTTTACTGCAGTAACTGACTGATGCAACGTGTTTG".getBytes(); + byte [] seq3 = "GGNTGACGTTGCGTGGTTTACTGCAGTAACTGACT".getBytes(); + byte [] seq4 = "NNNTTNCGTGGTTTACTGCAGTAACTGACTGATGCA".getBytes(); + + Assembly a = new Assembly(seq1,"1"); + + AlignmentList al = a.align(seq2,false); + if ( al.isAligned() ) System.out.println("seq 2 aligned"); + else System.out.println("seq 2 did NOT align"); + + if ( al.size() == 1 ) a.add(seq2,"2",al.getAlignments().get(0)); + else System.out.println("Multiple alignments found for seq 2"); + + al = a.align(seq3,false); + if ( al.isAligned() ) System.out.println("seq 3 aligned"); + else System.out.println("seq 3 did NOT align"); + + if ( al.size() == 1 ) a.add(seq3,"3",al.getAlignments().get(0)); + else System.out.println("Multiple alignments found for seq 3"); + + al = a.align(seq4,false); + if ( al.isAligned() ) System.out.println("seq 4 aligned"); + else System.out.println("seq 4 did NOT align"); + + if ( al.size() == 1 ) a.add(seq4,"4",al.getAlignments().get(0)); + else System.out.println("Multiple alignments found for seq 4"); + + System.out.println(a.toAlignmentString(true, true)); + + } + +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/AssemblyGraph.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AssemblyGraph.java new file mode 100644 index 000000000..302b35c5f --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/AssemblyGraph.java @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.utils; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; + +import java.util.List; +import java.util.LinkedList; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Sep 13, 2010 + * Time: 5:53:33 PM + * To change this template use File | Settings | File Templates. + */ +public class AssemblyGraph { + + private List sources; + + public AssemblyGraph(Assembly a) { + sources = new LinkedList(); + sources.add(a); + } + + /** Initializes assembly from the single specified read, and sets this assembly as the root of this + * assembly graph + * @param r read; must be aligned, otherwise exception will be thrown + * @param K index (Kmer) size of the assembly that will be initialized with the read r + */ + public AssemblyGraph(SAMRecord r, int K) { + if (AlignmentUtils.isReadUnmapped(r)) + throw new StingException("Can not initialize assembly graph with unaligned read"); + sources = new LinkedList(); + sources.add( new Assembly(K,r.getReadBases(),r.getReadName(), r.getAlignmentStart()) ); + } + + public void add(SAMRecord r) { + + } + +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/DefaultAlignmentStrategy.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/DefaultAlignmentStrategy.java new file mode 100644 index 000000000..35ceef4a4 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/DefaultAlignmentStrategy.java @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.utils; + +import org.broadinstitute.sting.utils.exceptions.StingException; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Aug 3, 2010 + * Time: 4:53:01 PM + * To change this template use File | Settings | File Templates. + */ +public class DefaultAlignmentStrategy implements AlignmentStrategy { + + public Action action(AlignmentInfo alignment, AlignmentList currentList) { + if ( currentList.size() == 0 ) return Action.REPLACE_BEST; + + if ( alignment.getMismatchCount() > currentList.getNextBestMMCount() ) return Action.DISCARD; + + if ( alignment.getMismatchCount() < currentList.getBestMMCount() ) return Action.REPLACE_BEST; + if ( alignment.getMismatchCount() == currentList.getBestMMCount() ) return Action.ADD_BEST; + if ( alignment.getMismatchCount() < currentList.getNextBestMMCount() ) return Action.REPLACE_NEXTBEST; + if ( alignment.getMismatchCount() == currentList.getNextBestMMCount() ) return Action.ADD_NEXTBEST; + + throw new StingException("Unexpected case found and left unprocessed"); +// return null; //To change body of implemented methods use File | Settings | File Templates. + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/KmerIndex.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/KmerIndex.java new file mode 100644 index 000000000..179c09655 --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/KmerIndex.java @@ -0,0 +1,332 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.utils; + +import org.broadinstitute.sting.utils.collections.PrimitivePair; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.BaseUtils; + +import java.util.List; +import java.util.HashMap; +import java.util.ArrayList; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. +* User: asivache +* Date: Aug 3, 2010 +* Time: 1:31:15 PM +* To change this template use File | Settings | File Templates. +*/ +public class KmerIndex { + private HashMap > lookup; + private int K = -1; + private int mask = 0; + private boolean debug = false; + + /** + * Translates sequence seq into the list of all valid kmers paired + * with their offsets on that sequence. Valid kmer is a kmer that contains only ACGT bases; if seq + * contains any other symbols, no kmers overlapping with such symbols will be generated. This method + * returns a linear (possibly gapped if non-ACGT symbols are present) representation of the sequence as its kmers + * with corresponding offsets, NOT a lookup index. If a specific kmer occurs on the sequence N times, + * the returned list will have N occurences of this kmer, each paired with one unique location on the sequence. + * Kmers themselves are represented as integer kmer ids here, see #idToSeq() if string (ACGT bases) representation + * of kmers is needed. Empty list if returned if no valid kmers are found on the sequence (i.e. too many non-ACGT bases) + * + * @param K key (kmer) length + * @param seq sequence to translate into kmer/offset representation + * @return list of kmer/offsets + */ + public static List toKeyOffsetList(int K, byte seq[]) { + return toKeyOffsetList(K,seq,0,seq.length); + } + + /** Same as #toKeyOffsetList(int K, byte [] seq) (see docs), except that this method is not static and + * uses key length K associated with the specific instance of the KmerIndex class. + * @param seq + * @return + */ + public List toKeyOffsetList(byte [] seq) { + return toKeyOffsetList(this.K,seq); + } + + /** Returns an ordered sequence of overlapping (1-shift in the ideal case) k-mers of length K found in the subsequence of + * length length of sequence seq starting at position start. All returned kmers + * are fully subsumed by the interval [start, start+length) on the sequence seq (no partial overlaps). + * Each kmer is paired with its offset on the (full length) seq in the returned list. + * Note that only k-mers on the forward strand are returned. You need to manually rc the string and + * call toKeyOffsetList() again to get rc k-mers. If sequence contains any other symbols than ACGT, all k-mers + * that would overlap with those symbols will be skipped (not present in the returned list). See also + * #toKeyOffsetList(int K, byte [] seq) which translates the whole sequence seq. + * + * @param K index key (k-mer) length + * @param seq sequence to compute k-mers from + * @param start compute kmers for subsequence [start, start+length) of seq + * @param length compute kmers for subsequence [start, start+length) of seq + * @return a list of pairs (kmer,offset_on_the_seq) for each valid kmer (i.e. kmer that does not overlap with + * non-ACGT bases); if no valid kmers exist, the returned list will be empty. + */ + public static List toKeyOffsetList(int K, byte[] seq, int start, int length) { + + if ( length < K ) throw new StingException("Can not index sequence that is shorter than key length: total seq="+seq.length+"; start="+start+"; length="+length); + + int mask = 0; + if ( length > K ) { + for ( int i = 0; i < K ; i++ ) { + mask <<= 2; + mask |= 0x03; + } + } + + int key = 0; + int i ; + final int final_pos = start+length; // first base *after* the last position we want to index + + ArrayList l = new ArrayList(length-K+1); + + PrimitivePair.Int firstK = toFirstKey(K,seq,start,final_pos); + if ( firstK == null ) { + // ooops, too many non-ACGT bases, we were not able to find a single valid k-mer on the whole sequence! + return l; + } + + l.add(firstK); + + start = firstK.getSecond(); + i = start + K; // i points to the first base after the returned kmer firstK + key = firstK.getFirst(); + + // now let's try recomputing next kmers in an efficient way: we reuse previous kmer and add only the new last base. + // This will break if we encounter a non-ACGT base, in which case we will have to start over. + + for ( start++ ; i < final_pos ; i++, start++ ) { + int d = BaseUtils.simpleBaseToBaseIndex(seq[i]); + if ( d == -1 ) { + // ooops, we ran into a bad base; let's jump over it completely and reinitialize the key + // (since all kmers overlapping with the current base are invalid) + firstK = toFirstKey(K,seq,i+1,final_pos); + if ( firstK == null ) break; // no more valid kmers + l.add(firstK); + start = firstK.getSecond(); + i = start+K; // points to the base right after the Kmer we just found + key = firstK.getFirst(); // reset key to the new kmer we just found + } else { + // the base is good, so we can compute our new kmer very efficiently using the old one: + key <<= 2; + key &= mask; + key += d; + l.add(new PrimitivePair.Int(key,start)); + } + } + return l; + } + + /** Non-static version of #toKeyOffsetList(int K, byte [] seq, int start, int length) (see docs), which + * uses key length K associated with this instance of the KmerIndex object. + * @param seq + * @param start + * @param length + * @return + */ + public List toKeyOffsetList(byte[] seq, int start, int length) { + return toKeyOffsetList(this.K,seq,start,length); + } + + + /** Computes index (key) of the first valid kmer in the interval [start,stop) of the sequence seq. Kmer is valid + * if it contains only valid (ACGT) bases. Returns key and actual offset of first such kmer found, or null + * if such kmer does not exist (i.e. if seq does not contain a continuous span of ACGT bases at least K bases long). + * @param K + * @param seq + * @param start + * @param stop + * @return + */ + private static PrimitivePair.Int toFirstKey(int K, byte[] seq, int start, int stop) { + int d = -1; + int key = 0 ; + while ( d == -1 && start < stop - K + 1) { + key = 0; + for ( int i = start ; i < start+K; i++ ) { + key <<= 2; + d = BaseUtils.simpleBaseToBaseIndex(seq[i]); + if ( d == -1) { + // ooops, non-ACGT base found, abort and start over. Next kmer that + // have a chance to be valid (contain only ACGT bases) can start only after the current position: + start = i+1; + break; + } + key += d; + } + } // got the first key + + if ( d != -1 ) return new PrimitivePair.Int(key,start); + else return null; + } + + /** Creates an empty kmer index table with specified key length + * + * @param K + */ + public KmerIndex(final int K) { + if ( K > 16 ) throw new StingException("Lookup keys longer than 16 bases are currently not supported"); + if ( K % 2 == 0 ) throw new StingException("Even keys require additional processing of palindromes, currently not supported. Please use odd key."); + this.K = K; + + mask = 0; + for ( int i = 0; i < K; i++ ) { + mask <<= 2; + mask |= 0x03; + } // got the first key + + lookup = new HashMap>(); + } + + /** Builds kmer index table with key length K for the sequence seq. + * + * @param K + * @param seq + */ + public KmerIndex(final int K, final byte[] seq) { + this(K); + + if ( seq.length < K ) throw new StingException("Sequence is shorter than requested lookup index key length"); + + addToIndex(toKeyOffsetList(K,seq,0,seq.length)); + } + + public void setDebug(boolean d) { this.debug = d; } + + /** Clears current lookup index table completely (but preserves the key length previously set). + * + */ + public void clear() { lookup.clear(); } + + /** Builds complete index for the sequence seq. This method can be used only when lookup table is + * empty (i.e. use #clear() first), otherwise an exception will be thrown. + * @param seq + */ + public void index(final byte[] seq) { + if ( ! lookup.isEmpty() ) { + throw new StingException("Can not index new sequence: lookup table is already non-empty"); + } + addToIndex(toKeyOffsetList(K,seq,0,seq.length)); + } + + /** + * Updates existing index. It is assumed that the sequence that was already indexed by this KmerIndex object is + * the exact subsequence of length old_length of the new sequence seq, starting at + * position old_start. No checks are performed, so it is the responsibility of the caller to ensure + * that this is indeed the case, otherwise the index will be inconsistent. Since the old sequence is a part + * of the new one, this method will keep all the already computed kmers (and update their offsets as needed), + * and compute and add kmers/offsets for all the novel bases added to the sequence seq compared + * to the old, already indexed subsequnce. If old_length is less than + * K (i.e. old sequence could not be and was not indexed at all), the new sequence seq will + * be fully indexed from start to end. + * @param seq + * @param old_start already indexed subsequence starts at this position in seq + * @param old_length length of the already indexed subsequence + */ + public void updateIndex(final byte[] seq, final int old_start, final int old_length) { + + if ( old_length < K ) { + if ( ! lookup.isEmpty()) + throw new StingException("It is claimed that old indexed sequence is shorter than K (i.e. it could not be indexed), but index is non empty"); + addToIndex( toKeyOffsetList(K,seq,0,seq.length)); + return; + } + + if ( old_start > 0 ) { + // update positions of previously indexed k-mers: + for ( Map.Entry> e : lookup.entrySet() ) { + List l = e.getValue(); + for ( int i = 0 ; i < l.size(); i++ ) l.set(i,l.get(i)+old_start); + } + // take care of additional k-mers appearing *before* the already indexed subsequence: + // if already indexed subsequence starts at 'start', the first k-mer from that sequence + // ends at start+K-1 (inclusive) and it is obviously already indexed. So the last k-mer we want to index now ends at + // start+K-2 (inclusive), the length of [0,start+K-2] interval that we need to index is + // start+K-1. + addToIndex( toKeyOffsetList(K,seq,0,old_start+K-1) ); + } + + // the last k-mer we already indexed ends at start+length-1 (inclusive); so it starts at start+length-1-(K-1)=start+length-K. + // Hence, the first k-mer that is not indexed yet starts at start+length-K+1. The length of the subsequence that + // we need to index, [start+length-K+1,seq.length) is seq.length - start - length +K - 1 + + int pos = old_start+old_length-K+1; + addToIndex( toKeyOffsetList(K,seq,pos,seq.length-pos) ); + + + } + + /** Convenience shortcut: takes the list of keys/offsets and pushes offsets into the lookup index for the keys that + * do exist already, or first creates the new entry and then pushes the offset for keys that are novel. This method + * is quiet: if keys is null or an empty list, it does nothing. + * @param keys + */ + private void addToIndex(final List keys ) { + if ( keys == null ) return; + for ( PrimitivePair.Int key: keys ) { + List l = lookup.get(key.getFirst()); + if ( l == null ) { + l = new ArrayList(); + lookup.put(key.getFirst(),l); + } + l.add(key.getSecond()); + } + + } + + /** + * Converts kmer (integer key) of length K into its sequence representation. Returns a sequence (over ACGT alphabet) + * of length K that corresponds to the specified key. + * @param K + * @param kmer + * @return + */ + public static byte [] idToSeq(int K, int kmer) { + byte [] seq = new byte[K]; + for ( int i = K-1; i >=0 ; i-- ) { + seq[i] = BaseUtils.baseIndexToSimpleBase(kmer & 0x3); + kmer >>= 2; + } + return seq; + } + + /** Returns all offsets for the specified kmer (key) on the sequence indexed by this KmerIndex object. Returns + * null if specified kmer is not present on the indexed sequence. + * @param key + * @return + */ + public List getOffsets(int key) { return lookup.get(key); } +// public List getOffsets(byte[] seq) { +// if ( seq.length != K ) throw new StingException("Can not perform direct lookup of a sequence with length different from key size"); +// +// return getOffsets( toKey(seq) ) ; +// } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/utils/ReadPair.java b/java/src/org/broadinstitute/sting/oneoffprojects/utils/ReadPair.java new file mode 100644 index 000000000..ce5dea0ec --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/utils/ReadPair.java @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.utils; + +import net.sf.samtools.SAMRecord; +import net.sf.samtools.CigarElement; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; + +/** + * Created by IntelliJ IDEA. +* User: asivache +* Date: Aug 6, 2010 +* Time: 6:18:01 PM +* To change this template use File | Settings | File Templates. +*/ +public class ReadPair { + + public enum PairType { + UNKNOWN, + BOTH_UNMAPPED, + ONE_UNMAPPED, + PROPER, + LEFT, + RIGHT, + OUTER, + INTER + }; + + + private SAMRecord end1 = null; + private SAMRecord end2 = null; + private PairType pType = PairType.UNKNOWN; + private int leftStart = -1; + private int rightStart = -1; + private SAMRecord leftRead = null; + private SAMRecord rightRead = null; + + + + /** Creates an empty read pair object */ + public ReadPair() {} + + /** Creates a read pair objects initialized with the specified read */ + public ReadPair(SAMRecord read) { + addRead(read); + } + + /** Returns name of the paired read (it is assumed that both individual reads in the pair share same name). + * + * @return + */ + public String getName() { return ( end1 != null ? end1.getReadName() : (end2 != null ? end2.getReadName() : null) ); } + + /** Returns true if both ends are recorded in this read pair object. Note that because SAM records carry + * mate information, a pair can be (partially) initialized from one end. This method verifies that this is not the case + * and both records are actually present. + * @return + */ + public boolean hasBothEnds() { return end1 != null && end2 != null ; } + + /** Returns true if this pair object was initialized with at least one end. Since SAM records carry mate information, + * it is sometimes sufficient to have only one read (fragment end) actually recorded in the pair object, at which + * point some useful information can be retrieved for the pair already. + * @return + */ + public boolean hasAnyData() { return end1 != null || end2 != null ; } + + /** Returns true if both ends in the pair are mapped. The pair object must be at least partially initialized (i.e. + * it has to hold a reference to at least one end of the pair), otherwise an exception will be thrown. + * @return + */ + public boolean bothEndsMapped() { + if ( pType == PairType.UNKNOWN ) throw new StingException("ReadPair object was not initialized yet, method can not be applied"); + + if ( pType == PairType.BOTH_UNMAPPED || pType == PairType.ONE_UNMAPPED ) return false; + return true; + } + + /** Returns true if both ends in the pair are mapped uniquely. This method requires both ends being already registered + * in this pair object (i.e. hasBothEnds() is true), otherwise an exception will be thrown. + * @return + */ + public boolean bothEndsUniquelyMapped() { + if ( ! hasBothEnds() ) throw new StingException("Can not determine if both ends are uniquely mapped until both ends are recorded"); + return bothEndsMapped() && end1.getMappingQuality() > 0 && end2.getMappingQuality() > 0; + } + + /** Returns true if this pair is in proper orientation, i.e. ---> <--- on the same contig */ + public boolean isProper() { return pType == PairType.PROPER; } + + /* Returns true if this pair is in outer orientation, i.e. <--- ---> on the same chromosome */ + public boolean isOuter() { return pType == PairType.OUTER; } + + /** Returns left (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map + * onto the same contig, otherwise an exception will be thrown. + * @return + */ + public SAMRecord getLeftRead() { + if ( ! bothEndsMapped() || pType == PairType.INTER ) + throw new StingException("Left read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName()); + if ( leftRead == null ) + throw new StingException("Left read is not recorded. Maybe we have not seen it yet? Pair: "+getName()); + return leftRead; + } + + /** Returns right (coordinate-wise) read in the pair. Both ends need to be mapped, and they should map + * onto the same contig, otherwise an exception will be thrown. + * @return + */ + public SAMRecord getRightRead() { + if ( ! bothEndsMapped() || pType == PairType.INTER ) + throw new StingException("Right read can be identified only when both reads are mapped onto the same contig, and the are not for "+getName()); + if ( rightRead == null ) + throw new StingException("Right read is not recorded. Maybe we have not seen it yet? Pair: "+getName()); + return rightRead; + } + + public SAMRecord getEnd1() { return end1; } + public SAMRecord getEnd2() { return end2; } + + public PairType getPairType() { return pType ; } + + public void addRead(SAMRecord r) { + if ( ! r.getReadPairedFlag() ) throw new StingException("Read "+r.getReadName() +" is unpaired"); + if ( r.getFirstOfPairFlag() ) { + if ( end1 != null ) throw new StingException("Read "+r.getReadName()+" is first of pair and the pair already has first read recorded"); + end1 = r; + if ( end2 != null && ! end1.getReadName().equals(end2.getReadName()) ) + throw new StingException("The pair already has read "+end2.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" ); + } else { + if ( r.getSecondOfPairFlag() ) { + if ( end2 != null ) throw new StingException("Read "+r.getReadName()+" is second of pair and the pair already has second read recorded"); + end2 = r; + if ( end1 != null && ! end1.getReadName().equals(end2.getReadName()) ) + throw new StingException("The pair already has read "+end1.getReadName() +"; the read being added does not match by name ("+r.getReadName()+")" ); + } else { + throw new StingException("The read "+r.getReadName()+" is marked as paired, but the first/second of pair flag is not set"); + } + } + setPairInfo(r); + } + + /** If pair type has not been set yet, then sets it to t. Otherwise (pair type already set), + * just checks if the pair type is t. If it is, the method returns quietly; if it is not (inconsistency detected), + * throws an exception. + * + */ + private void setCheckPairType(PairType t) { + if ( pType != PairType.UNKNOWN ) { + if ( pType != t ) + throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information"); + } else pType = t; + } + + private void setCheckLeftStart(int pos) { + if ( leftStart >= 0 ) { + if ( leftStart != pos ) + throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information"); + } else leftStart = pos; + } + + private void setCheckRightStart(int pos) { + if ( rightStart >= 0 ) { + if ( rightStart != pos ) + throw new StingException("In pair "+getName()+" two ends provide conflicting alignment information"); + } else rightStart = pos; + } + + private void setPairInfo(SAMRecord read) { + + setCheckPairType(getPairType(read)); + + // there is nothing left to do unless both ends are mapped onto the same contig: + if ( pType == PairType.INTER ) return; + + if ( pType == PairType.ONE_UNMAPPED ) { + // set putative left or right read depending on the orientation of the only mapped mate + if ( ! AlignmentUtils.isReadUnmapped(read ) ) { + // we can set left/right read only if it is the current read that is mapped; if we have the + // unmapped mate, skip and wait for the mapped read to come! + if ( read.getReadNegativeStrandFlag() ) { + setCheckRightStart(read.getAlignmentStart()); + if ( rightRead != null ) throw new StingException("Right read was already set for the pair"); + rightRead = read; + } else { + setCheckLeftStart(read.getAlignmentStart()); + if ( leftRead != null ) throw new StingException("Left read was already set for the pair"); + leftRead = read; + } + } + return; + } + + // we are here if both ends are mapped and they map onto the same contig + if ( read.getAlignmentStart() < read.getMateAlignmentStart() ) { //left/right = read/mate + + setCheckLeftStart(read.getAlignmentStart()); + setCheckRightStart(read.getMateAlignmentStart()); + + if ( leftRead != null ) throw new StingException("Left read was already set for the pair"); + leftRead = read; + } else { + // left/right = mate/read + + setCheckLeftStart(read.getMateAlignmentStart()); + setCheckRightStart(read.getAlignmentStart()); + + if ( rightRead != null ) throw new StingException("Right read was already set for the pair"); + rightRead = read; + } + } + + /** Returns pair type that describes this read and its mate. The alignment information for both the read itself + * and its mate is taken from the read's sam record passed as the argument, so the mate information is expected to be + * correctly set! + * @param read + * @return + */ + public static PairType getPairType(SAMRecord read) { + + if ( AlignmentUtils.isReadUnmapped(read) ) { + if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.BOTH_UNMAPPED; + else return PairType.ONE_UNMAPPED; + } + + return getWouldBePairType(read,read.getReferenceIndex(),read.getAlignmentStart(),read.getReadNegativeStrandFlag()); + } + + /** Returns pair type that would describe this read and its mate, if this read mapped onto refId:start in orientation + * given by rc (forward is rc=false, reverse is rc=true). The read's alignment information (if any, + * unmapped reads are allowed) present in the SAM record is completely ignored by this method, + * only mate's information is used. + * @param read + * @param refId + * @param start + * @param rc + * @return + */ + public static PairType getWouldBePairType(SAMRecord read, int refId, int start, boolean rc) { + + + if ( AlignmentUtils.isMateUnmapped(read) ) return PairType.ONE_UNMAPPED ; + + // both read and mate are mapped: + + if ( refId != read.getMateReferenceIndex() ) return PairType.INTER; + + // both read and its mate map onto the same chromosome + + if ( start < read.getMateAlignmentStart() ) { //left/right = read/mate + + if ( rc ) { + if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT; + else return PairType.OUTER; + } else { + if ( read.getMateNegativeStrandFlag() ) return PairType.PROPER; + else return PairType.RIGHT; + } + } else { + // left/right = mate/read + + if ( rc ) { + if ( read.getMateNegativeStrandFlag() ) return PairType.LEFT; + else return PairType.PROPER; + } else { + if ( read.getMateNegativeStrandFlag() ) return PairType.OUTER; + else return PairType.RIGHT; + } + } + } + + public int getLeftStart() { + if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied"); + return leftStart; + } + + public int getRightStart() { + if ( ! hasAnyData() ) throw new StingException("ReadPair object was not initialized yet, method can not be applied"); + return rightStart; + } + + public int getFragmentSize() { + if ( ! hasBothEnds() ) throw new StingException("Can not determine fragment size: pair object does not have both ends yet"); + if ( ! bothEndsMapped() ) throw new StingException("Can not determine fragment size: both ends must be mapped"); + if ( pType != PairType.PROPER ) throw new StingException("The pais is not in proper orientation, can not determine fragment size"); + + return getFragmentSize(leftRead,rightRead); + } + + /** Given a read (that must belong to this pair), returns the other end in the pair if it is already + * recorded, or null otherwise. + * @param read + * @return + */ + public SAMRecord getOtherEnd(SAMRecord read) { + if ( read.getFirstOfPairFlag() ) return end2; + else { + if ( read.getSecondOfPairFlag() ) return end1; + } + return null; + } + + public static int getFragmentSize(SAMRecord left, SAMRecord right) { + + if ( left == null || right == null || + AlignmentUtils.isReadUnmapped(left) || AlignmentUtils.isReadUnmapped(right) ) { + throw new StingException("No read (null) or unmapped read provided: fragment size is not defined"); + } + if ( left.getReferenceIndex() != right.getReferenceIndex() ) { + throw new StingException("Left/right reads map onto different contigs: fragment size is not defined"); + } + + int fragment_length = left.getReadLength(); // fragment is at least as long as the left read, duh! + int leftEnd = left.getAlignmentEnd(); + int rightStart = right.getAlignmentStart(); + + if ( rightStart > leftEnd ) { + // if reads are not overlapping, fragment length is lengths of both reads plus the distance (gap) between + // the reads. Note that if the sequence between the reads happens to have insirtions or deletions, + // our estimation of the actual distance between the reads (on the fragment) is incorrect, but we + // can not do better given just those reads. This estimation is, in particular, incorrect + // for left reads ending with 'I' and/or right reads starting with 'I' + // + // left right + // -------->...gap...<-------- fragment = left+gap+right + + return left.getReadLength() + right.getReadLength() + (rightStart - leftEnd-1); + } + + // if we are here, the reads do overlap; fragment length is lengths of the two reads less the overlap. + // in this case we can compute the actual overlap between the reads (on the fragment) taking into + // account indels, if any + // + // left **** right + // ------------> ****=overlap; fragment = left+right - overlap + // <-------------- + // + // with deletion: + // + // left ** ** right + // -----------ddd-> ****=overlap; fragment = left+right - overlap + // <-ddd------------- note that overlap != leftEnd - rightStart+1 + // instead, overlap = leftEnd-rightStart+1- length(D) + // with insertion: + // + // left ******* right ******* = overlap; fragment = left+right - overlap + // -------------iii-> note that overlap != leftEnd - rightStart +1 + // <-iii-------------- instead, overlap = leftEnd - rightStart +1 + length(I) + // (since 'i' bases are NOT on the ref) + + int posOnRef = rightStart; +// int posOnRightRead = 0; + + int overlap = leftEnd - rightStart + 1 ; + + for(CigarElement ce : left.getCigar().getCigarElements() ) { + switch(ce.getOperator()) { + case S: + case H: +// posOnRightRead+=ce.getLength(); + break; + case I: + overlap += ce.getLength(); + break; + case D: + case N: + overlap -= ce.getLength(); + case M: + posOnRef += ce.getLength(); + break; + default: + } + if ( posOnRef > leftEnd ) break; // we need to examine only overlapping part of the reads + } + return left.getReadLength() + right.getReadLength() - overlap; + } +} diff --git a/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DetectWGAWalker.java b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DetectWGAWalker.java new file mode 100644 index 000000000..995fd764c --- /dev/null +++ b/java/src/org/broadinstitute/sting/oneoffprojects/walkers/DetectWGAWalker.java @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.oneoffprojects.walkers; + +import org.broadinstitute.sting.gatk.walkers.WalkerName; +import org.broadinstitute.sting.gatk.walkers.Requires; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceDataSource; +import org.broadinstitute.sting.oneoffprojects.utils.ReadPair; +import org.broadinstitute.sting.oneoffprojects.utils.AlignmentInfo; +import org.broadinstitute.sting.oneoffprojects.utils.Assembly; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.sam.AlignmentUtils; +import net.sf.samtools.SAMRecord; + +import java.util.Map; +import java.util.HashMap; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Sep 13, 2010 + * Time: 12:45:42 PM + * To change this template use File | Settings | File Templates. + */ + +@WalkerName("DetectWGA") +@Requires(value={DataSource.REFERENCE_BASES}) + +public class DetectWGAWalker extends ReadWalker { + + private int TIP_LENGTH = 10; + private int TIP_MM_THRESHOLD = 1; + private double TIP_AV_QUAL_THRESHOLD = 15.0; + + private boolean DEBUG = true; + + Map pairCache = null; + Map fragmentSizeMap = null; // by library + private ReferenceDataSource refData; + private byte[] refBases; + + + + + @Override + public void initialize() { + refData = new ReferenceDataSource(getToolkit().getArguments().referenceFile); + pairCache = new HashMap(); + fragmentSizeMap = new HashMap(); + } + + + public Integer map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) { + + if ( ! read.getReadPairedFlag() ) return 0; // for now!! + + // read is paired + + cacheReadAsPair(read); + + /* + // if the read is already mapped (uniquely), we check if it may have the "colored tip" artifact on either side: + if ( AlignmentUtils.isReadUniquelyMapped(read) ) { + + TipInfo tips = countTipMismatches(read,TIP_LENGTH); + + if ( tips.leftMM() >= TIP_MM_THRESHOLD || tips.rightMM() >= TIP_MM_THRESHOLD ) { + if ( DEBUG ) { + out.println(" Read "+read.getReadName()+ " has "+tips.leftMM()+"/"+tips.rightMM()+" mismatches in the tips"); + out.println(" Pair orientation: "+pair.getPairType()); + } + // try adding read to existing assemblies: + AlignmentInfo al = alignToAllAddToBest(read,Math.min(3,tips.leftMM()+tips.rightMM())-1); + if ( al == null ) { + if ( tips.leftMM() >= TIP_MM_THRESHOLD && tips.leftQ() >= TIP_AV_QUAL_THRESHOLD || + tips.rightMM() >= TIP_MM_THRESHOLD && tips.rightQ() >= TIP_AV_QUAL_THRESHOLD ) { + if ( DEBUG ) out.println(" Initialized new assembly.") ; + Assembly a = new Assembly(read.getReadBases(),read.getReadName(),read.getAlignmentStart()); + tryAndAddUnmapped(a); // see if we got unmapped reads that would align nicely + assemblies.add(a); + } + } + } + return 1; + } +*/ + + + + return null; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Provide an initial value for reduce computations. + * + * @return Initial value of reduce. + */ + public Integer reduceInit() { + return null; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Reduces a single map with the accumulator provided as the ReduceType. + * + * @param value result of the map. + * @param sum accumulator for the reduce. + * @return accumulator with result of the map taken into account. + */ + public Integer reduce(Integer value, Integer sum) { + return null; //To change body of implemented methods use File | Settings | File Templates. + } + + /** little helper: if we already cached the pair object for this read, just add the read to that object; if we did not - instantiate + * new pair objetc first and register it in the map, then add the read; this method also updates other cache(s)/trackers as needed, e.g. + * fragment size map + */ + private void cacheReadAsPair(SAMRecord read) { + ReadPair pair = pairCache.get( read.getReadName() ); + if ( pair == null ) { + pair = new ReadPair(read); + pairCache.put(read.getReadName(),pair); + } + + pair.addRead(read); + + // if it's a good pair, add its fragment size to the stats: + if ( pair.hasBothEnds() && pair.bothEndsMapped() && pair.isProper() ) { + String lib = read.getReadGroup().getLibrary(); + MathUtils.RunningAverage fSize = fragmentSizeMap.get(lib); + if ( fSize == null ) { + fSize = new MathUtils.RunningAverage(); + fragmentSizeMap.put(lib,fSize); + } + fSize.add(pair.getFragmentSize()); + } + + } + + private TipInfo countTipMismatches(SAMRecord read, int tip_length) { + + AlignmentUtils.MismatchCount left_mm = AlignmentUtils.getMismatchCount(read,refBases,read.getAlignmentStart()-1,0,tip_length); + + int right_start = read.getReadLength()-tip_length; + AlignmentUtils.MismatchCount right_mm = AlignmentUtils.getMismatchCount(read,refBases,read.getAlignmentStart()-1,right_start,read.getReadLength()-right_start); + + return new TipInfo(left_mm,right_mm); + } + + class TipInfo { + AlignmentUtils.MismatchCount left_mm; + AlignmentUtils.MismatchCount right_mm; + double left_avQ; + double right_avQ; + + public TipInfo(AlignmentUtils.MismatchCount l,AlignmentUtils.MismatchCount r) { + left_mm = l; + right_mm = r; + left_avQ = (l.numMismatches ==0 ? 0 : ((double)l.mismatchQualities)/l.numMismatches ); + right_avQ = (r.numMismatches ==0 ? 0 : ((double)r.mismatchQualities)/r.numMismatches ); + } + + public int leftMM() { return left_mm.numMismatches; } + public int rightMM() { return right_mm.numMismatches; } + public double leftQ() { return left_avQ; } + public double rightQ() { return right_avQ; } + } + +} diff --git a/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java b/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java index b37530b8a..0292864a1 100644 --- a/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java +++ b/java/src/org/broadinstitute/sting/playground/tools/PairMaker.java @@ -363,7 +363,13 @@ public class PairMaker extends CommandLineProgram { } } if ( best.size() == 0 ) return null; // no unique alignment - if ( best.size() > 1 ) throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq); + if ( best.size() > 1 ) { + for ( SAMRecord r : best ) { + System.out.println("READ "+r.getReadName()+" mapQ="+r.getMappingQuality()+" at="+r.getReferenceName()+ + ":"+r.getAlignmentStart()+"("+(r.getReadNegativeStrandFlag()?"-":"+")+") cig="+r.getCigarString()); + } + throw new RuntimeException("Multiple alignments for read "+l.get(0).getReadName()+", all with Q>="+minq); + } return best.get(0); } diff --git a/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java b/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java new file mode 100644 index 000000000..a08bdc6da --- /dev/null +++ b/java/src/org/broadinstitute/sting/utils/interval/OverlappingIntervalIterator.java @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.interval; + +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.gatk.iterators.PushbackIterator; + +import java.util.Iterator; + +/** + * Created by IntelliJ IDEA. + * User: asivache + * Date: Oct 7, 2010 + * Time: 2:40:02 PM + * To change this template use File | Settings | File Templates. + */ + +/** This class provides an adapter to Iterator that returns only (parts of) underlying iterator's + * intervals overlapping with specified "master set" of bounding intervals. The underlying iterator must return + * NON-overlapping intervals in coordinate-sorted order, otherwise the behavior is unspecified. If the master set is represented by + * another interval iterator, it should return sorted and NON-overlapping intervals. + * + */ +public class OverlappingIntervalIterator implements Iterator { + PushbackIterator iter = null; + PushbackIterator boundBy = null; + + GenomeLoc prefetchedOverlap = null; + GenomeLoc currentBound = null; + GenomeLoc currentInterval = null; + + /** Creates new overlapping iterator that will internally traverse intervals and return only + * overlaps of those with set of intervals returned by boundBy. + * @param intervals + * @param boundBy + */ + public OverlappingIntervalIterator(Iterator intervals, Iterator boundBy) { + this.iter = new PushbackIterator(intervals); + this.boundBy = new PushbackIterator(boundBy); + + if ( iter.hasNext() && boundBy.hasNext() ) { + GenomeLoc currentInterval = iter.next(); // load first interval + GenomeLoc currentBound = boundBy.next(); // load first bounding interval + fetchNextOverlap(); + } + } + + /** Traverses both iterators in sync, until the first overlap between the two is reached. If no overlap is found + * until the end of the either of the two streams, leaves prefetchedOverlap set to null + */ + private void fetchNextOverlap() { + + prefetchedOverlap = null; + + while ( prefetchedOverlap == null ) { + + if ( currentInterval.isBefore(currentBound) ) { + if ( ! iter.hasNext() ) break; // no more intervals left; we are done + currentInterval = iter.next(); + continue; + } + + if ( currentInterval.isPast(currentBound) ) { + if ( ! boundBy.hasNext() ) break; // we are past the last available bounding interval, we are done! + currentBound = boundBy.next(); + continue; + } + + // we are at this point only if currentInterval overlaps with currentBound + + prefetchedOverlap = currentInterval.intersect(currentBound); + + // we still do not know if we are done with either current interval or current bound, because + // two special situations are possible: + // + // 1) next interval overlaps with 2) current interval also overlaps with + // the same bounding interval; next bounding interval; note that + // note that in this case next in this case next bound necessarily + // interval necessarily starts before starts before the next interval + // the next bound + // + // curr. int next int. curr. int + // ----- ------ -------------------------- + // ------------------- --------- ------------- + // curr. bound curr. bound next bound + + // To solve this issue we update either only currentInterval or only currentBound to their next value, + // whichever comes first; the rest of the traversal to the next overlap will be performed on the next invocation of + // fetchNextOverlap(). + + if ( ! iter.hasNext() ) { + + } + GenomeLoc nextInterval = iter.next(); + GenomeLoc nextBound = boundBy.next(); + + if ( nextInterval.startsBefore(nextBound)) { + currentInterval = nextInterval; + boundBy.pushback(nextBound); // in case next interval overlaps with the current bound + } else { + currentBound = nextBound; + iter.pushback(nextInterval); // in case current interval also overlaps with the next bound + } + } + + } + + /** + * Returns true if the iteration has more elements. (In other + * words, returns true if next would return an element + * rather than throwing an exception.) + * + * @return true if the iterator has more elements. + */ + public boolean hasNext() { + return false; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Returns the next element in the iteration. + * + * @return the next element in the iteration. + * @throws java.util.NoSuchElementException + * iteration has no more elements. + */ + public GenomeLoc next() { + return null; //To change body of implemented methods use File | Settings | File Templates. + } + + /** + * Removes from the underlying collection the last element returned by the + * iterator (optional operation). This method can be called only once per + * call to next. The behavior of an iterator is unspecified if + * the underlying collection is modified while the iteration is in + * progress in any way other than by calling this method. + * + * @throws UnsupportedOperationException if the remove + * operation is not supported by this Iterator. + * @throws IllegalStateException if the next method has not + * yet been called, or the remove method has already + * been called after the last call to the next + * method. + */ + public void remove() { + throw new UnsupportedOperationException("remove() method is not supported by OverlappingIntervalIterator"); + //To change body of implemented methods use File | Settings | File Templates. + } +} diff --git a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index c4f3338f0..dffb8060e 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -41,7 +41,7 @@ import java.util.Arrays; public class AlignmentUtils { - private static class MismatchCount { + public static class MismatchCount { public int numMismatches = 0; public long mismatchQualities = 0; } @@ -98,13 +98,13 @@ public class AlignmentUtils { return numMismatches(r, StringUtil.stringToBytes(refSeq), refIndex); } - private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { + public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex) { return getMismatchCount(r,refSeq,refIndex,0,r.getReadLength()); } // todo -- this code and mismatchesInRefWindow should be combined and optimized into a single // todo -- high performance implementation. We can do a lot better than this right now - private static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { + public static MismatchCount getMismatchCount(SAMRecord r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) { MismatchCount mc = new MismatchCount(); int readIdx = 0; @@ -387,6 +387,15 @@ public class AlignmentUtils { return true; } + /** Returns true is read is mapped and mapped uniquely (Q>0). + * + * @param read + * @return + */ + public static boolean isReadUniquelyMapped(SAMRecord read) { + return ( ! AlignmentUtils.isReadUnmapped(read) ) && read.getMappingQuality() > 0; + } + /** Returns the array of base qualitites in the order the bases were read on the machine (i.e. always starting from * cycle 1). In other words, if the read is unmapped or aligned in the forward direction, the read's own base * qualities are returned as stored in the SAM record; if the read is aligned in the reverse direction, the array