made a fake fasta generator, so we can now generate a complete bam / fasta combo of made up data.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@1150 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
aaron 2009-07-01 21:35:34 +00:00
parent c2e5a68aaf
commit d4d3af20f2
5 changed files with 211 additions and 28 deletions

View File

@ -0,0 +1,129 @@
package org.broadinstitute.sting.utils.fasta;
import org.broadinstitute.sting.utils.StingException;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.PrintStream;
import java.util.List;
/**
* @author aaron
* <p/>
* Class ArtificialFastaUtils
* <p/>
* artificial fasta utility class, for generating fake fastas.
*/
public class ArtificialFastaUtils {
public enum BASE_PATTERN {
RANDOM, ALL_A, ALL_T, ALL_C, ALL_G;
}
// what bases we support
public enum BASES {
A, T, C, G;
}
// create an artificial fasta file
public static void createArtificialFasta(String fileName,
List<String> contigNames,
List<Integer> contigSizes,
BASE_PATTERN pattern) {
PrintStream s;
try {
s = new PrintStream(new FileOutputStream(fileName));
} catch (FileNotFoundException e) {
throw new StingException("Filename " + fileName + " passed to the ArtificialFastaUtils generated a FileNotFound exception", e);
}
generateFakeFasta(contigNames, contigSizes, pattern, s);
}
// create an artificial fasta file
public static void createArtificialFasta(PrintStream stream,
List<String> contigNames,
List<Integer> contigSizes,
BASE_PATTERN pattern) {
generateFakeFasta(contigNames, contigSizes, pattern, stream);
}
/**
* create a fake fasta file
*
* @param contigNames the pile of contig names
* @param contigSizes the pile of contig sizes
* @param pattern the pattern to use for the base distrobution
* @param s the print stream to write to
*/
private static void generateFakeFasta(List<String> contigNames, List<Integer> contigSizes, BASE_PATTERN pattern, PrintStream s) {
if (contigNames.size() != contigSizes.size()) {
throw new StingException("ArtificialContig name and size arrays are not equal sizes");
}
for (int x = 0; x < contigNames.size(); x++) {
ArtificialContig tig = new ArtificialContig(contigNames.get(x), contigSizes.get(x), pattern);
tig.write(s);
}
s.close();
}
}
/** the fake contig class, a fasta is made up of these */
class ArtificialContig {
public static final int COLUMN_WIDTH = 80;
final protected String mName;
final protected int mSize;
final protected ArtificialFastaUtils.BASE_PATTERN mPattern;
public ArtificialContig(String name, int size, ArtificialFastaUtils.BASE_PATTERN pat) {
this.mName = name;
this.mSize = size;
this.mPattern = pat;
}
/**
* write out the contig to a stream
*
* @param stream
*/
public void write(PrintStream stream) {
stream.println(">" + mName);
int count = 0;
while (count < mSize) {
for (int x = 0; x < COLUMN_WIDTH; x++) {
stream.print(generateAppropriateBase());
count++;
if (count >= mSize) {
break;
}
}
stream.println();
}
}
/**
* generate the appropriate base, given the BASE_PATTERN
*
* @return a base, as a string
*/
public String generateAppropriateBase() {
switch (mPattern) {
case RANDOM:
return (ArtificialFastaUtils.BASES.values()[(int) Math.round(Math.random() * 4)]).toString();
case ALL_A:
return "A";
case ALL_T:
return "T";
case ALL_C:
return "C";
case ALL_G:
return "G";
default:
throw new StingException("Unknown base pattern");
}
}
}

View File

@ -0,0 +1,28 @@
package org.broadinstitute.sting.utils.sam;
/**
*
* @author aaron
*
* Class ArtificialSAMGenerator
*
* This provides for an external utility, that creates sam files and associates fasta files
*/
public class ArtificialSAMGenerator {
}
class ArtificialFASTAUtils {
}

View File

@ -161,7 +161,7 @@ public class ArtificialSAMQueryIterator extends ArtificialSAMIterator implements
contigIndex = rec.getSequenceIndex();
}
}
if (contigIndex < 0) { throw new IllegalArgumentException("Contig" + contig + " doesn't exist"); }
if (contigIndex < 0) { throw new IllegalArgumentException("ArtificialContig" + contig + " doesn't exist"); }
while (super.hasNext() && this.peek().getReferenceIndex() < contigIndex) {
super.next();
}

View File

@ -1,30 +1,12 @@
package org.broadinstitute.sting.utils.sam;
import net.sf.samtools.*;
import java.io.File;
import java.util.*;
import org.broadinstitute.sting.gatk.iterators.QueryIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.utils.StingException;
/**
*
* User: aaron
* Date: May 21, 2009
* Time: 2:57:48 PM
*
* The Broad Institute
* SOFTWARE COPYRIGHT NOTICE AGREEMENT
* This software and its documentation are copyright 2009 by the
* Broad Institute/Massachusetts Institute of Technology. All rights are reserved.
*
* This software is supplied without any warranty or guaranteed support whatsoever. Neither
* the Broad Institute nor MIT can be responsible for its use, misuse, or functionality.
*
*/
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* @author aaron
@ -46,11 +28,11 @@ public class ArtificialSAMUtils {
SAMFileHeader header = createArtificialSamHeader(numberOfChromosomes, startingChromosome, chromosomeSize);
File outFile = new File(filename);
SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, false, outFile);
SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(header, true, outFile);
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
for (int readNumber = 0; readNumber < readsPerChomosome; readNumber++) {
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100));
for (int readNumber = 1; readNumber < readsPerChomosome; readNumber++) {
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, DEFAULT_READ_LENGTH));
}
}
@ -73,7 +55,7 @@ public class ArtificialSAMUtils {
SAMFileWriter out = new SAMFileWriterFactory().makeSAMWriter(header, false, outFile);
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
for (int readNumber = 0; readNumber < readsPerChomosome; readNumber++) {
for (int readNumber = 1; readNumber <= readsPerChomosome; readNumber++) {
out.addAlignment(createArtificialRead(header, "Read_" + readNumber, x - startingChromosome, readNumber, 100));
}
}
@ -92,6 +74,7 @@ public class ArtificialSAMUtils {
*/
public static SAMFileHeader createArtificialSamHeader( int numberOfChromosomes, int startingChromosome, int chromosomeSize ) {
SAMFileHeader header = new SAMFileHeader();
header.setSortOrder(net.sf.samtools.SAMFileHeader.SortOrder.coordinate);
SAMSequenceDictionary dict = new SAMSequenceDictionary();
// make up some sequence records
for (int x = startingChromosome; x < startingChromosome + numberOfChromosomes; x++) {
@ -161,7 +144,7 @@ public class ArtificialSAMUtils {
*/
public static SAMRecord createArtificialRead( SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) {
if (alignmentStart == 0)
throw new StingException("Invalid alignment start for artificial read");
throw new StingException("Invalid alignment start for artificial read, start = " + alignmentStart);
SAMRecord record = new SAMRecord(header);
record.setReadName(name);
record.setReferenceIndex(refIndex);

View File

@ -0,0 +1,43 @@
package org.broadinstitute.sting.utils.fasta;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.junit.Assert;
import org.junit.Test;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
/**
* @author aaron
* <p/>
* Class ArtificialFastaUtilsTest
* <p/>
* test out the ArtificialFastaUtils functionality
*/
public class ArtificialFastaUtilsTest extends BaseTest {
/** generate a fake fasta */
@Test
public void testFastaGeneration() {
List<String> names = new ArrayList<String>();
List<Integer> sizes = new ArrayList<Integer>();
for (int x = 0; x < 5; x++) {
sizes.add(1000);
names.add("chr" + (x+1));
}
File temp = new File("tempFileFasta.fasta");
ArtificialFastaUtils.createArtificialFasta(temp.getName(),names,sizes,ArtificialFastaUtils.BASE_PATTERN.ALL_A);
// using the fasta sequence file to test, in reality we should use the indexed version
FastaSequenceFile2 fasta = new FastaSequenceFile2(temp);
Assert.assertEquals(5,fasta.getSequenceDictionary().getSequences().size());
ArtificialSAMUtils.createArtificialBamFile("tempFileBAM.bam",5,1,1000,600);
//temp.delete();
}
}