diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/graphalign/GraphReferenceAssessor.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/graphalign/GraphReferenceAssessor.java new file mode 100755 index 000000000..844c4778f --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/graphalign/GraphReferenceAssessor.java @@ -0,0 +1,252 @@ +package org.broadinstitute.sting.playground.gatk.walkers.graphalign; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.fasta.FastaReferenceWalker; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.genotype.Variation; +import org.jgrapht.Graph; +import org.jgrapht.graph.DefaultEdge; +import org.jgrapht.graph.SimpleGraph; +import org.jgrapht.graph.SimpleDirectedGraph; +import org.apache.log4j.Logger; + +import java.util.*; +import java.io.*; + +import net.sf.picard.reference.ReferenceSequence; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import net.sf.samtools.util.StringUtil; +import net.sf.samtools.SAMFileWriter; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; + +public class GraphReferenceAssessor extends ReadWalker { + @Argument(fullName="graphFile", shortName="GF", doc="", required=true) + String graphFile = null; + ObjectInputStream graphSerialStream = null; + + @Argument(fullName="MAX", shortName="MAX", doc="", required=false) + int MAXREADS = -1; + + @Argument(fullName="ignore0MM", shortName="I0", doc="", required=false) + boolean IGNORE_0_MM = false; + + @Argument(fullName="DEBUG", shortName="DB", doc="", required=false) + int DEBUG_LEVEL = 0; + + static boolean DEBUG = false; + static boolean DEBUG2 = false; // crazy level + + @Argument(fullName="read", doc="", required=false) + String onlyDoRead = null; + + ReferenceGraph graphRef = null; + + public void initialize() { + super.initialize(); + + DEBUG = DEBUG_LEVEL > 0; + DEBUG2 = DEBUG_LEVEL > 1; // crazy level + + try { + logger.info("Reading graph reference " + graphFile ); + graphSerialStream = new ObjectInputStream( new FileInputStream( graphFile ) ); + graphRef = (ReferenceGraph)graphSerialStream.readObject(); + graphRef.setDebugPrinting(DEBUG); + graphRef.validateGraph(); + logger.info(graphRef.toBriefString()); + } catch ( FileNotFoundException e ) { + throw new StingException("Couldn't open file " + graphFile, e); + } catch ( IOException e ) { + throw new StingException("Couldn't write to file " + graphFile, e); + } catch ( ClassNotFoundException e ) { + throw new StingException("Couldn't read ReferenceGraph from file " + graphFile, e); + } + } + + private static MismatchCounter countMismatches(byte[] ref, int refOffset, byte[] bases, byte[] quals, int basesOffset, int length) { + MismatchCounter mm = new MismatchCounter(); + + for ( int i = 0; i < length; i++ ) { + byte rawRefBase = ref[i + refOffset]; + byte rawReadBase = bases[i + basesOffset]; + int fragBase = BaseUtils.simpleBaseToBaseIndex((char)rawRefBase); + int readBase = BaseUtils.simpleBaseToBaseIndex((char)rawReadBase); + + boolean mmP = fragBase != -1 && readBase != -1 && fragBase != readBase; + if ( mmP ) { + mm.nMM++; + mm.qSum += quals != null ? quals[i + basesOffset] : 0; + } + + if ( GraphReferenceAssessor.DEBUG2 ) + System.out.printf("%s%d %c %c %s %b%n", Utils.dupString(' ', basesOffset + 2), basesOffset, (char)rawRefBase, (char)rawReadBase, mm, mmP); + } + + return mm; + } + + private static MismatchCounter countMismatches(byte[] ref, byte[] bases, byte[] quals) { + return countMismatches(ref, 0, bases, quals, 0, bases.length); + } + + private static MismatchCounter countMismatchesOnGraph( ReferenceGraph graph, Collection frags, int fragOffset, byte[] bases, byte[] quals, int readOffset ) { + if ( frags.size() == 0 ) + throw new RuntimeException("Fragment list is empty!"); + + MismatchCounter minNMM = MismatchCounter.MAX_VALUE; + + for ( Fragment next : frags ) { + MismatchCounter recNMM = countMismatchesOnGraph( graph, next, 0, bases, quals, readOffset ); + minNMM = minNMM.min( recNMM ); + } + + return minNMM; + } + + private static MismatchCounter countMismatchesOnGraph( ReferenceGraph graph, Fragment frag, int fragOffset, byte[] bases, byte[] quals, int readOffset ) { + if ( GraphReferenceAssessor.DEBUG )System.out.printf("%sfrag %s -> %d%n", Utils.dupString(' ', readOffset + 2), frag, readOffset); + + MismatchCounter mm = new MismatchCounter(); + + if ( readOffset < bases.length ) { + int nRemainingBases = bases.length - readOffset; + int cmpLength = frag.getBaseLengthFrom(fragOffset, nRemainingBases); // how many bases over in the fragment are we from the offset + MismatchCounter fragMM = countMismatches(frag.getUnderlyingBases(), frag.getUnderlyingOffset() + fragOffset, bases, quals, readOffset, cmpLength); + mm.add(fragMM); + +// // still have some counting to do +// for ( int i = 0; i < baseLength; i++ ) { +// int fragBaseOffset = fragOffset + i; +// int readBaseOffset = readOffset + i; +// +// byte rawFragBase = frag.getBase(fragBaseOffset); +// byte rawReadBase = bases[readBaseOffset]; +// int fragBase = BaseUtils.simpleBaseToBaseIndex((char)rawFragBase); +// int readBase = BaseUtils.simpleBaseToBaseIndex((char)rawReadBase); +// +// boolean mmP = fragBase != -1 && readBase != -1 && fragBase != readBase; +// if ( mmP ) nMM++; + + if ( nRemainingBases > cmpLength ) { + MismatchCounter recMM = countMismatchesOnGraph( graph, graph.outgoingFragments(frag), 0, bases, quals, readOffset + cmpLength ); + mm.add(recMM); + } + } + + if ( GraphReferenceAssessor.DEBUG ) System.out.printf("%s=> %s%n", Utils.dupString(' ', readOffset + 2), mm); + return mm; + } + + private static MismatchCounter countMismatchesOnGraph(ReferenceGraph graph, SAMRecord read) { + if ( GraphReferenceAssessor.DEBUG ) System.out.printf("countMismatchesOnGraph( read=%s%n", read.getReadName()); + GenomeLoc loc = GenomeLocParser.createGenomeLoc(read); + MismatchCounter minNMM = MismatchCounter.MAX_VALUE; + + for ( Fragment frag : graph.getStartingFragment(loc) ) { + int fragOffset = frag.getFragOffsetFrom(loc); // how many bases over in the fragment are we from the offset + + if ( GraphReferenceAssessor.DEBUG ) + System.out.printf(" countMismatchesOnGraph frag=%s loc=%s bases=%s offset=%d%n", frag, loc, read.getReadString(), fragOffset); + + MismatchCounter recNMM = countMismatchesOnGraph(graph, frag, fragOffset, read.getReadBases(), read.getBaseQualities(), 0); + minNMM = minNMM.min( recNMM ); + } + + return minNMM; + } + + public Integer map(char[] refArg, SAMRecord read) { + + if ( MAXREADS-- == 0 ) { + System.exit(0); + } else if ( onlyDoRead != null && ! read.getReadName().equals(onlyDoRead) ) { + ; + } else if ( ! read.getReadUnmappedFlag() && read.getCigar().numCigarElements() == 1 ) { + try { + byte[] ref = BaseUtils.charSeq2byteSeq(refArg); + // we're all XM + int nMMFromRead = (Short)read.getAttribute("NM"); + MismatchCounter nAlignedMM = countMismatches(ref, read.getReadBases(), read.getBaseQualities()); + if ( ! IGNORE_0_MM || nAlignedMM.nMM > 0 ) { + MismatchCounter nGraphMM = countMismatchesOnGraph(graphRef, read); + MismatchCounter deltaMM = nAlignedMM.minus(nGraphMM); + + out.printf("%50s with %5s at %10s: mismatches: %3d (delta %3d) -- %3d %3d -- %3d %3d -- delta %3d %3d%n", + read.getReadName(), read.getCigarString(), GenomeLocParser.createGenomeLoc(read), + nMMFromRead, nMMFromRead - nAlignedMM.nMM, + nAlignedMM.nMM, nAlignedMM.qSum, + nGraphMM.nMM, nGraphMM.qSum, + deltaMM.nMM, deltaMM.qSum); + + if ( deltaMM.nMM < 0 || deltaMM.qSum < 0 ) + throw new StingException(read.getReadName() + " is miscalculated"); + } + } catch ( Exception e ) { + System.out.printf("Exception at %s at %s%n", read.getReadName(), GenomeLocParser.createGenomeLoc(read)); + throw new RuntimeException(e); + } + } else { + ; // don't do anything + } + + return 0; + } + + /** + * reduceInit is called once before any calls to the map function. We use it here to setup the output + * bam file, if it was specified on the command line + * + * @return + */ + public Integer reduceInit() { + return 0; + } + + public Integer reduce(Integer readScore, Integer data) { + return data + readScore; + } + + public void onTraversalDone(Integer data) { + out.printf(data.toString()); + } +} + +class MismatchCounter { + int nMM = 0; + int qSum = 0; + + public static MismatchCounter MAX_VALUE = new MismatchCounter(Integer.MAX_VALUE, Integer.MAX_VALUE ); + + public MismatchCounter() {} + + public MismatchCounter(int nMM, int qSum) { + this.nMM = nMM; + this.qSum = qSum; + } + + public void add(MismatchCounter that) { + this.nMM += that.nMM; + this.qSum += that.qSum; + } + + public MismatchCounter min(MismatchCounter that) { + int cmpQSum = Integer.valueOf(this.qSum).compareTo(that.qSum); + if ( cmpQSum < 0 ) { return this; } + else if ( cmpQSum > 0 ) { return that; } + else if ( this.nMM < that.nMM ) { return this; } + else if ( this.nMM > that.nMM ) { return that; } + else { return this; } + } + + public MismatchCounter minus(MismatchCounter that) { + return new MismatchCounter(this.nMM - that.nMM, this.qSum - that.qSum); + } + + public String toString() { return String.format("[MM %d %d]", nMM, qSum); } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/graphalign/GraphReferenceBuilder.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/graphalign/GraphReferenceBuilder.java new file mode 100755 index 000000000..abcb24aa0 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/graphalign/GraphReferenceBuilder.java @@ -0,0 +1,130 @@ +package org.broadinstitute.sting.playground.gatk.walkers.graphalign; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.*; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.gatk.walkers.fasta.FastaReferenceWalker; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.cmdLine.Argument; +import org.broadinstitute.sting.utils.genotype.Variation; +import org.jgrapht.Graph; +import org.jgrapht.graph.DefaultEdge; +import org.jgrapht.graph.SimpleGraph; +import org.jgrapht.graph.SimpleDirectedGraph; +import org.apache.log4j.Logger; + +import java.util.*; +import java.io.ObjectOutputStream; +import java.io.FileOutputStream; +import java.io.FileNotFoundException; +import java.io.IOException; + +import net.sf.picard.reference.ReferenceSequence; +import net.sf.picard.reference.ReferenceSequenceFile; +import net.sf.picard.reference.ReferenceSequenceFileFactory; +import net.sf.samtools.util.StringUtil; + +@WalkerName("GraphReferenceBuilder") +@Requires(value={DataSource.REFERENCE}) +public class GraphReferenceBuilder extends RefWalker { + @Argument(fullName="graphFile", shortName="GF", doc="", required=true) + String graphFile = null; + + @Argument(fullName="DEBUG", shortName="DB", doc="", required=false) + boolean DEBUG = false; + + @Argument(fullName="VALIDATE", shortName="VD", doc="", required=false) + boolean VALIDATE_GRAPH = false; + + @Argument(fullName="printFrequency", shortName="F", doc="", required=false) + int printFrequency = 10000; + + ObjectOutputStream graphSerialStream = null; + + ReferenceGraph graphRef = null; + ReferenceSequenceFile flatReferenceFile = null; + + public void initialize() { + super.initialize(); + + graphRef = new ReferenceGraph(DEBUG); + + try { + graphSerialStream = new ObjectOutputStream( new FileOutputStream( graphFile ) ); + } catch ( FileNotFoundException e ) { + throw new StingException("Couldn't open file " + graphFile, e); + } catch ( IOException e ) { + throw new StingException("Couldn't write to file " + graphFile, e); + } + + flatReferenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.getToolkit().getArguments().referenceFile); + + ReferenceSequence refSeq = flatReferenceFile.nextSequence(); + do { + //logger.info("Read " + refSeq); + graphRef.bindRefenceSequence(refSeq); + logger.info(String.format("contig %s has length %d", refSeq.getName(), refSeq.length())); + refSeq = flatReferenceFile.nextSequence(); + } while ( refSeq != null ); + + System.out.println(graphRef.toBriefString()); + } + + int counter = printFrequency; + public Integer map(RefMetaDataTracker rodData, ReferenceContext ref, AlignmentContext context) { +// if ( context.getLocation().getStart() == 59384 ) { +// try { +// Thread.currentThread().sleep(5000); +// } catch (InterruptedException e) { +// ; +// } +// } + + boolean alreadyAddedAtThisLoc = false; + for ( ReferenceOrderedDatum rod : rodData.getAllRods() ) { + if ( rod instanceof Variation && ! alreadyAddedAtThisLoc ) { + // if we have multiple variants at a locus, just take the first damn one we see for now + Variation variant = (Variation) rod; + // todo -- getAlternativeBases should be getAlleles() + GenomeLoc loc = variant.getLocation(); + String[] allAllelesList = null; // variant.getAlternateBases().split(""); // todo fixme + if ( allAllelesList.length >= 3 ) { // bad dbSNP format :-( + List alleles = Arrays.asList(allAllelesList).subList(1,3); + //logger.info(String.format("Adding %s %s", loc, alleles)); + graphRef.addVariation(variant, loc, alleles); + //logger.info(String.format(" Added %s %s", loc, alleles)); + alreadyAddedAtThisLoc = true; + if ( counter-- == 0 ) { + logger.info(String.format("Added %s %s %s", loc, alleles, graphRef.toBriefString())); + counter = printFrequency; + if ( VALIDATE_GRAPH ) + graphRef.validateGraph(); + } + } + } + } + + return null; + } + + // todo -- graph should be the reduce result + public Integer reduceInit() { + return null; + } + + public Integer reduce(Integer value, Integer sum) { + return sum; + } + + public void onTraversalDone(Integer sum) { + super.onTraversalDone(sum); + try { + graphSerialStream.writeObject(graphRef); + graphSerialStream.close(); + } catch ( IOException e ) { + throw new StingException("Couldn't write to file " + graphFile, e); + } + } +} + diff --git a/java/src/org/broadinstitute/sting/playground/gatk/walkers/graphalign/ReferenceGraph.java b/java/src/org/broadinstitute/sting/playground/gatk/walkers/graphalign/ReferenceGraph.java new file mode 100755 index 000000000..62036ea57 --- /dev/null +++ b/java/src/org/broadinstitute/sting/playground/gatk/walkers/graphalign/ReferenceGraph.java @@ -0,0 +1,420 @@ +package org.broadinstitute.sting.playground.gatk.walkers.graphalign; + +import org.jgrapht.graph.DefaultEdge; +import org.jgrapht.graph.SimpleDirectedGraph; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.genotype.Variation; +import net.sf.picard.reference.ReferenceSequence; +import net.sf.samtools.util.StringUtil; + +import java.util.*; +import java.io.Serializable; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.io.ObjectInputStream; + +import edu.mit.broad.picard.util.IntervalTree; + +/** + * Created by IntelliJ IDEA. + * User: depristo + * Date: Oct 22, 2009 + * Time: 2:15:54 PM + * To change this template use File | Settings | File Templates. + */ +class ReferenceGraph extends SimpleDirectedGraph implements Serializable { + final private static boolean USE_IT = true; + final private static boolean THROW_ERRORS_ON_BAD_INPUTS = false; + + private boolean DEBUG = false; + int nSkippedIndels = 0; + int nSkippedBecauseOfContinguousVariation = 0; + int nBadPolymorphisms = 0; + int nMultiStateAlleles = 0; + + GenomeLoc initialLoc = null; + + private transient IntervalTree loc2Fragment = new IntervalTree(); + + public ReferenceGraph(boolean printDebuggingInfo) { + super(DefaultEdge.class); + DEBUG = printDebuggingInfo; + } + + public ReferenceGraph() { + this(false); + } + + public void setDebugPrinting(boolean enable) { + this.DEBUG = enable; + } + + public void bindRefenceSequence(ReferenceSequence seq) { + GenomeLoc refSeqLoc = GenomeLocParser.createGenomeLoc(seq.getContigIndex(), 1, seq.length()); + String refString = StringUtil.bytesToString(seq.getBases()).toUpperCase(); + Fragment frag = new Fragment(refSeqLoc, 1, StringUtil.stringToBytes(refString)); + addFragment(frag); + initialLoc = refSeqLoc; + } + + private void addFragmentToIntervalTree(Fragment frag) { + loc2Fragment.put((int)frag.getLocation().getStart(), (int)frag.getLocation().getStop(), frag); + } + + private void addFragment(Fragment frag) { + addFragmentToIntervalTree(frag); + addVertex(frag); + } + + private void removeFragment(Fragment frag) { + loc2Fragment.remove((int)frag.getLocation().getStart(), (int)frag.getLocation().getStop()); + removeVertex(frag); + } + + public void validateGraph() { + for ( Fragment v : this.vertexSet() ) { + if ( this.inDegreeOf(v) == 0 && v.getLocation().getStart() != initialLoc.getStart() ) { + throw new StingException(String.format("Fragment %s has no incoming edges but isn't at the start of the contig %s", v, initialLoc)); + } + if ( this.outDegreeOf(v) == 0 && v.getLocation().getStop() != initialLoc.getStop() ) { + throw new StingException(String.format("Fragment %s has no outgoing edges but isn't at the end of the contig %s", v, initialLoc)); + } + } + + //System.out.printf("Passed validation: %s%n", this.toBriefString()); + } + + private void rebuildIntervalTree() { + if ( DEBUG ) System.out.printf("rebuilding IntervalTree()%n"); + for ( Fragment v : this.vertexSet() ) { + if ( DEBUG ) System.out.printf(" adding interval tree: %s%n", v); + addFragmentToIntervalTree(v); + } + } + + private boolean allelesAreInExcisedFragment(Fragment cut, List alleles) { + boolean foundRef = false; + for ( String allele : alleles ) { + if ( allele.equals(cut.getBases()) ) + foundRef = true; + } + + if ( ! foundRef && THROW_ERRORS_ON_BAD_INPUTS ) + throw new StingException(String.format("Polymorphic alleles %s do not contain the reference sequence %s", alleles, cut.getBases())); + + return foundRef; + } + + public void addVariation(Variation variant, GenomeLoc loc, List alleles) { + if ( DEBUG ) System.out.printf("addVariation(%s, %s)%n", loc, alleles); + //validateGraph(); + + if ( variant.isSNP() ) { + Fragment frag = getContainingFragment(loc); + + if ( frag == null ) { + nMultiStateAlleles++; + return; + } + + if ( ! allelesAreInExcisedFragment(subFragment(frag, loc, 1), alleles)) { + nBadPolymorphisms++; + return; + } + + List split = exciseFragment(frag, loc); + if ( split != null ) { + Fragment left = split.get(0); + Fragment cut = split.get(1); + Fragment right = split.get(2); + + if ( DEBUG ) System.out.printf(" cutFrag(%s, %s)%n", loc, cut); + + for ( String allele : alleles ) { + byte[] bases = StringUtil.stringToBytes(allele); + double freq = 1.0 / alleles.size(); + Fragment alleleFrag = new Fragment(loc, freq, 0, bases.length, bases); + if ( DEBUG ) System.out.printf(" Creating allele fragment %s%n", alleleFrag); + addFragment(alleleFrag); + if ( left != null ) addEdge(left, alleleFrag); + if ( right != null ) addEdge(alleleFrag, right); + } + } else { + nSkippedBecauseOfContinguousVariation++; + } + } else { + nSkippedIndels++; + } + } + + + private Fragment subFragment(Fragment frag, GenomeLoc loc, double freq ) { + return new Fragment(loc, 1, frag.getUnderlyingBases()); + } + + public List exciseFragment(Fragment frag, GenomeLoc loc) { + if ( DEBUG ) System.out.printf(" exciseFragment(%s, %s)%n", frag, loc); + GenomeLoc fragLoc = frag.getLocation(); + + Fragment cut = subFragment(frag, loc, 1); + + Set inToFrag = incomingEdgesOf(frag); + Set outOfFrag = outgoingEdgesOf(frag); + + Fragment left = null; + if ( fragLoc.getStart() == loc.getStart() ) { + if ( ! inToFrag.isEmpty() ) { + if ( THROW_ERRORS_ON_BAD_INPUTS ) + throw new StingException(String.format("Attempting to create a variation at the start of a fragment %s %s", frag, loc)); + return null; + } + } else { + GenomeLoc leftLoc = GenomeLocParser.createGenomeLoc(fragLoc.getContigIndex(), fragLoc.getStart(), loc.getStart()-1); + left = new Fragment(leftLoc, 1, frag.getUnderlyingBases()); + addFragment(left); + + for ( DefaultEdge e : inToFrag ) { + addEdge(getEdgeSource(e), left); + } + + removeAllEdges(inToFrag); + } + + Fragment right = null; + if ( fragLoc.getStop() == loc.getStop() ) { + if ( ! outOfFrag.isEmpty() ) { + throw new StingException(String.format("Attempting to create a variation at the end of a fragment %s %s", frag, loc)); + } + } else { + GenomeLoc rightLoc = GenomeLocParser.createGenomeLoc(fragLoc.getContigIndex(), loc.getStop()+1, fragLoc.getStop()); + right = new Fragment(rightLoc, 1, frag.getUnderlyingBases()); + addFragment(right); + + for ( DefaultEdge e : outOfFrag ) { + addEdge(right, getEdgeTarget(e)); + } + + removeAllEdges(outOfFrag); + } + + if ( DEBUG ) System.out.printf(" removing %s%n", frag); + removeFragment(frag); + if ( DEBUG ) System.out.printf(" returning left=%s right=%s%n", left, right); + return Arrays.asList(left, cut, right); + } + + public Fragment getContainingFragment(GenomeLoc loc) { + Fragment frag = USE_IT ? getContainingFragmentIT(loc) : getContainingFragmentG(loc); + + + if ( frag == null ) { + if ( THROW_ERRORS_ON_BAD_INPUTS ) + throw new StingException("No spanning fragment was found for " + loc); + else + return null; + } + else if ( frag.getLocation().getStart() > loc.getStart() || frag.getLocation().getStop() < loc.getStop() ) + throw new StingException("BUG: bad spanning fragment found for " + loc + " was " + frag.getLocation() ); + else + return frag; + } + + public Fragment getContainingFragmentG(GenomeLoc loc) { + for ( Fragment v : this.vertexSet() ) { + if ( v.getLocation().containsP(loc) ) { + return v; + } + } + + return null; + } + + public Fragment getContainingFragmentIT(GenomeLoc loc) { + IntervalTree.Node node = loc2Fragment.minOverlapper((int)loc.getStart(), (int)loc.getStop()); + if ( node == null ) + return null; + else + return node.getValue(); + } + + public Collection getStartingFragment(GenomeLoc loc) { + Collection frags = USE_IT ? getStartingFragmentIT(loc) : getStartingFragmentG(loc); + //Collection frags = getStartingFragmentTest(loc); + + if ( frags == null || frags.size() == 0 ) + throw new StingException("No fragment contains location start of " + loc); + if ( frags.size() == 1 && MathUtils.compareDoubles(frags.iterator().next().getFrequency(), 1.0) != 0 ) { + Fragment bad = frags.iterator().next(); + throw new StingException(String.format("Only one fragment was found but it's frequency < 1 %s with %e", bad, bad.getFrequency())); + } + else + return frags; + } + + public Collection getStartingFragmentTest(GenomeLoc loc) { + Collection fragsFromIT = getStartingFragmentIT(loc); + Collection fragsFromG = getStartingFragmentG(loc); + + if ( fragsFromIT.size() != fragsFromG.size() ) { + throw new StingException(String.format("Fragment sizes differ %d from IntervalTree, %d from graph", fragsFromIT.size(), fragsFromG.size())); + } + + return USE_IT && false ? fragsFromIT : fragsFromG; + } + + public Collection getStartingFragmentIT(GenomeLoc loc) { + Collection frags = new HashSet(); + + Iterator> it = loc2Fragment.overlappers((int)loc.getStart(), (int)loc.getStart()); + IntervalTree.Node node = null; + while ( it.hasNext() ) { + node = it.next(); + frags.add(node.getValue()); + } + + // todo -- painful bug work around -- should be removed + if ( frags.size() == 1 && MathUtils.compareDoubles(node.getValue().getFrequency(), 1.0) != 0 ) { + System.out.printf(">>> Using IT workaround at %s <<<%n", loc); + return getStartingFragmentG(loc); + } + + return frags; +// IntervalTree.Node node = loc2Fragment.minOverlapper((int)loc.getStart(), (int)loc.getStart()); +// if ( node == null ) +// return null; +// else +// return node.getValue(); + } + + public Collection getStartingFragmentG(GenomeLoc loc) { + Collection frags = new HashSet(); + for ( Fragment v : this.vertexSet() ) { + //if ( v.getLocation().getStart() < loc.getStop() ) + // System.out.printf("Checking %s vs. %s%n", loc, v.getLocation()); + if ( v.getLocation().containsStartPosition(loc.getStart()) ) { + // System.out.printf(" Adding %s%n", v.getLocation()); + frags.add(v); + } + } + + return frags; + } + + public Set outgoingFragments( Fragment frag ) { + Set outgoingFrags = new HashSet(); + + for ( DefaultEdge e : outgoingEdgesOf(frag) ) { + outgoingFrags.add(getEdgeTarget(e)); + } + + if ( outgoingFrags.size() == 0 && frag.getLocation().getStop() != initialLoc.getStop() ) { + + } + + return outgoingFrags; + } + + public String toString() { + StringBuilder s = new StringBuilder(); + + for ( Fragment v : this.vertexSet() ) { + s.append(String.format("Fragment: %s%n", v.toString())); + for ( DefaultEdge e : this.incomingEdgesOf(v) ) { + s.append(String.format(" [IN FROM] %s%n", this.getEdgeSource(e))); + } + for ( DefaultEdge e : this.outgoingEdgesOf(v) ) { + s.append(String.format(" [OUT TO ] %s%n", this.getEdgeTarget(e))); + } + } + + return s.toString(); + } + + public String toBriefString() { + return String.format("GraphRef: %d fragments, %d edges, skipped %d contingous variants, %d indels, %d polymorphisms w/o ref allele, %d multi-state", + this.vertexSet().size(), this.edgeSet().size(), nSkippedBecauseOfContinguousVariation, nSkippedIndels, nBadPolymorphisms, nMultiStateAlleles); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // serialization + // + // -------------------------------------------------------------------------------------------------------------- + private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException { + //always perform the default de-serialization first + stream.defaultReadObject(); + loc2Fragment = new IntervalTree(); + rebuildIntervalTree(); + } +} + +class Fragment implements Serializable { + GenomeLoc loc = null; // Index position of this fragment into the reference + int offset, stop; + double freq = -1; + byte[] bases = null; + + public Fragment( GenomeLoc loc, double freq, int offset, int stop, byte[] bases ) { + this.loc = loc; + this.freq = freq; + this.bases = bases; + this.offset = offset; + this.stop = stop; + } + + public Fragment( GenomeLoc loc, double freq, byte[] bases ) { + this(loc, freq, (int)loc.getStart()-1, (int)loc.getStop(), bases); + } + + public String toString() { + //return String.format("%s:%.2f:%s", loc.toString(), getFrequency(), getBases()); + return String.format("%s:%.2f", loc.toString(), getFrequency()); + } + + public GenomeLoc getLocation() { + return loc; + } + + public double getFrequency() { + return freq; + } + + public int getUnderlyingOffset() { return offset; } + public int getStop() { return stop; } + public int getLength() { return getStop() - getUnderlyingOffset(); } + + public byte[] getUnderlyingBases() { + return bases; + } + + /** + * how many bases over in the fragment are we over in this fragment? + * + * @param loc + * @return + */ + public int getFragOffsetFrom(GenomeLoc loc) { + // todo -- ignores contigs -- can we fix this? + if ( getLocation().getStart() > loc.getStart() ) + throw new StingException("BUG: Request for offset from " + loc + " in frag at " + getLocation() + " but this is beyond the location of the fragment"); + return (int)(loc.getStart() - getLocation().getStart()); + } + + public int getBaseLengthFrom( int fragOffset, int maxLength ) { + int fragRemaining = getLength() - fragOffset; + + if ( fragRemaining < 0 ) + throw new StingException("BUG: Request for length from offset " + fragOffset + " but this is longer than the fragment itself"); + + return Math.min(fragRemaining, maxLength); + } + + public byte getBase(int fragOffset) { + return bases[getUnderlyingOffset() + fragOffset]; + } + + public String getBases() { + return StringUtil.bytesToString(getUnderlyingBases(), getUnderlyingOffset(), getLength()); + } +}