made a fake fasta generator, so we can now generate a complete bam / fasta combo of made up data.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1150 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
c2e5a68aaf
commit
d4d3af20f2
|
|
@ -0,0 +1,129 @@
|
|||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class ArtificialFastaUtils
|
||||
* <p/>
|
||||
* artificial fasta utility class, for generating fake fastas.
|
||||
*/
|
||||
public class ArtificialFastaUtils {
|
||||
public enum BASE_PATTERN {
|
||||
RANDOM, ALL_A, ALL_T, ALL_C, ALL_G;
|
||||
}
|
||||
|
||||
// what bases we support
|
||||
public enum BASES {
|
||||
A, T, C, G;
|
||||
}
|
||||
|
||||
// create an artificial fasta file
|
||||
public static void createArtificialFasta(String fileName,
|
||||
List<String> contigNames,
|
||||
List<Integer> contigSizes,
|
||||
BASE_PATTERN pattern) {
|
||||
PrintStream s;
|
||||
try {
|
||||
s = new PrintStream(new FileOutputStream(fileName));
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new StingException("Filename " + fileName + " passed to the ArtificialFastaUtils generated a FileNotFound exception", e);
|
||||
}
|
||||
generateFakeFasta(contigNames, contigSizes, pattern, s);
|
||||
}
|
||||
|
||||
// create an artificial fasta file
|
||||
public static void createArtificialFasta(PrintStream stream,
|
||||
List<String> contigNames,
|
||||
List<Integer> contigSizes,
|
||||
BASE_PATTERN pattern) {
|
||||
|
||||
generateFakeFasta(contigNames, contigSizes, pattern, stream);
|
||||
}
|
||||
|
||||
/**
|
||||
* create a fake fasta file
|
||||
*
|
||||
* @param contigNames the pile of contig names
|
||||
* @param contigSizes the pile of contig sizes
|
||||
* @param pattern the pattern to use for the base distrobution
|
||||
* @param s the print stream to write to
|
||||
*/
|
||||
private static void generateFakeFasta(List<String> contigNames, List<Integer> contigSizes, BASE_PATTERN pattern, PrintStream s) {
|
||||
if (contigNames.size() != contigSizes.size()) {
|
||||
throw new StingException("ArtificialContig name and size arrays are not equal sizes");
|
||||
}
|
||||
for (int x = 0; x < contigNames.size(); x++) {
|
||||
ArtificialContig tig = new ArtificialContig(contigNames.get(x), contigSizes.get(x), pattern);
|
||||
tig.write(s);
|
||||
}
|
||||
s.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/** the fake contig class, a fasta is made up of these */
|
||||
class ArtificialContig {
|
||||
public static final int COLUMN_WIDTH = 80;
|
||||
|
||||
final protected String mName;
|
||||
final protected int mSize;
|
||||
final protected ArtificialFastaUtils.BASE_PATTERN mPattern;
|
||||
|
||||
public ArtificialContig(String name, int size, ArtificialFastaUtils.BASE_PATTERN pat) {
|
||||
this.mName = name;
|
||||
this.mSize = size;
|
||||
this.mPattern = pat;
|
||||
}
|
||||
|
||||
/**
|
||||
* write out the contig to a stream
|
||||
*
|
||||
* @param stream
|
||||
*/
|
||||
public void write(PrintStream stream) {
|
||||
stream.println(">" + mName);
|
||||
int count = 0;
|
||||
while (count < mSize) {
|
||||
for (int x = 0; x < COLUMN_WIDTH; x++) {
|
||||
stream.print(generateAppropriateBase());
|
||||
count++;
|
||||
if (count >= mSize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
stream.println();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* generate the appropriate base, given the BASE_PATTERN
|
||||
*
|
||||
* @return a base, as a string
|
||||
*/
|
||||
public String generateAppropriateBase() {
|
||||
switch (mPattern) {
|
||||
case RANDOM:
|
||||
return (ArtificialFastaUtils.BASES.values()[(int) Math.round(Math.random() * 4)]).toString();
|
||||
case ALL_A:
|
||||
return "A";
|
||||
case ALL_T:
|
||||
return "T";
|
||||
case ALL_C:
|
||||
return "C";
|
||||
case ALL_G:
|
||||
return "G";
|
||||
default:
|
||||
throw new StingException("Unknown base pattern");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @author aaron
|
||||
*
|
||||
* Class ArtificialSAMGenerator
|
||||
*
|
||||
* This provides for an external utility, that creates sam files and associates fasta files
|
||||
*/
|
||||
public class ArtificialSAMGenerator {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class ArtificialFASTAUtils {
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -161,7 +161,7 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator implements
|
|||
contigIndex = rec.getSequenceIndex();
|
||||
}
|
||||
}
|
||||
if (contigIndex < 0) { throw new IllegalArgumentException("Contig" + contig + " doesn't exist"); }
|
||||
if (contigIndex < 0) { throw new IllegalArgumentException("ArtificialContig" + contig + " doesn't exist"); }
|
||||
while (super.hasNext() && this.peek().getReferenceIndex() < contigIndex) {
|
||||
super.next();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,30 +1,12 @@
|
|||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
|
||||
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.utils.StingException;
|
||||
|
||||
/**
|
||||
*
|
||||
* User: aaron
|
||||
* Date: May 21, 2009
|
||||
* Time: 2:57:48 PM
|
||||
*
|
||||
* The Broad Institute
|
||||
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
|
||||
* This software and its documentation are copyright 2009 by the
|
||||
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
|
||||
*
|
||||
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
|
||||
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
|
||||
*
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
|
|
@ -46,11 +28,11 @@ public class ArtificialSAMUtils {
|
|||
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
|
||||
File outFile = new File(filename);
|
||||
|
||||
SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, false, outFile);
|
||||
|
||||
SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile);
|
||||
|
||||
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
||||
for (int readNumber = 0; readNumber < readsPerChomosome; readNumber++) {
|
||||
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100));
|
||||
for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) {
|
||||
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -73,7 +55,7 @@ public class ArtificialSAMUtils {
|
|||
SAMFileWriter out = new SAMFileWriterFactory().makeSAMWriter(header, false, outFile);
|
||||
|
||||
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
||||
for (int readNumber = 0; readNumber < readsPerChomosome; readNumber++) {
|
||||
for (int readNumber = 1; readNumber <= readsPerChomosome; readNumber++) {
|
||||
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100));
|
||||
}
|
||||
}
|
||||
|
|
@ -92,6 +74,7 @@ public class ArtificialSAMUtils {
|
|||
*/
|
||||
public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) {
|
||||
SAMFileHeader header = new SAMFileHeader();
|
||||
header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate);
|
||||
SAMSequenceDictionary dict = new SAMSequenceDictionary();
|
||||
// make up some sequence records
|
||||
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
|
||||
|
|
@ -161,7 +144,7 @@ public class ArtificialSAMUtils {
|
|||
*/
|
||||
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
|
||||
if (alignmentStart == 0)
|
||||
throw new StingException("Invalid alignment start for artificial read");
|
||||
throw new StingException("Invalid alignment start for artificial read, start = " + alignmentStart);
|
||||
SAMRecord record = new SAMRecord(header);
|
||||
record.setReadName(name);
|
||||
record.setReferenceIndex(refIndex);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,43 @@
|
|||
package org.broadinstitute.sting.utils.fasta;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
* <p/>
|
||||
* Class ArtificialFastaUtilsTest
|
||||
* <p/>
|
||||
* test out the ArtificialFastaUtils functionality
|
||||
*/
|
||||
public class ArtificialFastaUtilsTest extends BaseTest {
|
||||
|
||||
/** generate a fake fasta */
|
||||
@Test
|
||||
public void testFastaGeneration() {
|
||||
List<String> names = new ArrayList<String>();
|
||||
List<Integer> sizes = new ArrayList<Integer>();
|
||||
|
||||
for (int x = 0; x < 5; x++) {
|
||||
sizes.add(1000);
|
||||
names.add("chr" + (x+1));
|
||||
}
|
||||
File temp = new File("tempFileFasta.fasta");
|
||||
ArtificialFastaUtils.createArtificialFasta(temp.getName(),names,sizes,ArtificialFastaUtils.BASE_PATTERN.ALL_A);
|
||||
|
||||
// using the fasta sequence file to test, in reality we should use the indexed version
|
||||
FastaSequenceFile2 fasta = new FastaSequenceFile2(temp);
|
||||
|
||||
Assert.assertEquals(5,fasta.getSequenceDictionary().getSequences().size());
|
||||
|
||||
ArtificialSAMUtils.createArtificialBamFile("tempFileBAM.bam",5,1,1000,600);
|
||||
//temp.delete();
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue