Minor renamign changes.

PlinkRodWithGenomeLoc now supports .bed file parsing (and doesn't require |c#_p# conventions for SNPs -- still requires _g[I/D] for indels)



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2663 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-01-22 23:06:32 +00:00
parent fd223e955c
commit fbf82526cb
3 changed files with 653 additions and 308 deletions

View File

@ -0,0 +1,651 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import java.io.*;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Jan 19, 2010
* Time: 10:24:18 AM
* To change this template use File | Settings | File Templates.
*/
public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum {
private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex",
"Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE"));
private final byte SNP_MAJOR_MODE = 0x00000001;
private ArrayList<PlinkVariantInfo> variants;
private PlinkVariantInfo currentVariant;
private ListIterator<PlinkVariantInfo> variantIterator;
private PlinkFileType plinkFileType;
public enum PlinkFileType {
STANDARD_PED,RAW_PED,BINARY_PED
}
// // // CONSTRUCTOR // // //
public PlinkRodWithGenomeLoc(String name) {
super(name);
}
@Override
public Object initialize(final File plinkFile) throws FileNotFoundException {
if ( ! plinkFile.exists() ) {
throw new FileNotFoundException("File "+plinkFile.getAbsolutePath()+" does not exist.");
}
variants = parsePlinkFile(plinkFile);
if ( variants != null ) {
variantIterator = variants.listIterator();
currentVariant = variantIterator.next();
}
assertNotNull();
return null;
}
private void assertNotNull() {
if ( currentVariant == null ) {
throw new UnsupportedOperationException ( "Current sequenom variant information was set to null" );
}
}
@Override
public boolean parseLine(Object obj, String[] args) {
if ( variantIterator.hasNext() ) {
currentVariant = variantIterator.next();
return true;
} else {
return false;
}
}
@Override
public GenomeLoc getLocation() {
return currentVariant.getLocation();
}
@Override
public String toString() {
return currentVariant == null ? "" : currentVariant.toString();
}
public String getVariantName() {
return currentVariant.getName();
}
public ArrayList<String> getVariantSampleNames() {
return currentVariant.getSampleNames();
}
public ArrayList<Genotype> getGenotypes() {
return currentVariant.getGenotypes();
}
// AM I PARSING A TEXT OR A BINARY FILE ??
private ArrayList<PlinkVariantInfo> parsePlinkFile(File file) {
String[] splitFileName = file.getName().split(".");
String extension = splitFileName[splitFileName.length-1];
if ( extension.equals("ped") || extension.equals("raw") ) {
return parseTextFormattedPlinkFile(file);
} else if ( extension.equals("bed") || extension.equals("bim") || extension.equals("fam") ) {
plinkFileType = PlinkFileType.BINARY_PED;
return parseBinaryFormattedPlinkFile(file);
} else {
System.out.println("Warning -- Plink file does not have a standard extension (ped/raw for text, bed/bim/fam for binary) -- assuming ped format");
return parseTextFormattedPlinkFile(file);
}
}
/* *** *** *** *** *** ** *** ** *** ** *** ** *** ** ***
* * PARSING STANDARD TEXT PED FILES * **
* *** *** *** *** *** ** *** ** *** ** *** ** *** ** ***/
private ArrayList<PlinkVariantInfo> parseTextFormattedPlinkFile( File file ) {
try {
BufferedReader reader = new BufferedReader( new FileReader ( file ) );
String header = reader.readLine();
ArrayList<PlinkVariantInfo> seqVars = instantiateVariantListFromHeader(header);
ArrayList<Integer> snpOffsets = getSNPOffsetsFromHeader(header);
String line = null;
do {
line = reader.readLine();
incorporateInfo(seqVars,snpOffsets,line);
} while ( line != null );
java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these
// are sorted in standard reference order
return seqVars;
} catch ( FileNotFoundException e ) {
throw new StingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e);
} catch ( IOException e ) {
throw new StingException("Error reading file "+file.getAbsolutePath()+".",e);
}
}
private void incorporateInfo(List<PlinkVariantInfo> vars, List<Integer> offsets, String plinkLine) {
String[] plinkInfo;
if ( plinkFileType == PlinkFileType.STANDARD_PED) {
plinkInfo = plinkLine.split("\t");
} else {
throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file.");
}
String individualName = plinkInfo[1];
int snpNumber = 0;
if ( plinkFileType == PlinkFileType.STANDARD_PED ) {
for ( int i : offsets ) {
vars.get(snpNumber).addGenotypeEntry(plinkInfo[i], individualName);
snpNumber++;
}
}
}
private ArrayList<PlinkVariantInfo> instantiateVariantListFromHeader(String header) {
if ( header.startsWith("#") ) {// first line is a comment; what we're used to seeing
plinkFileType = PlinkFileType.STANDARD_PED;
} else {// first line is the raw header; comes from de-binary-ing a .bed file
plinkFileType = PlinkFileType.RAW_PED;
throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file.");
}
ArrayList<PlinkVariantInfo> seqVars = new ArrayList<PlinkVariantInfo>();
String[] headerFields;
if ( plinkFileType == PlinkFileType.STANDARD_PED ) {
headerFields = header.split("\t");
} else {
throw new StingException("Plink file is likely of .raw or recoded format. Please use an uncoded .ped file.");
}
for ( String field : headerFields ) {
if ( ! headerEntries.contains(field) ) {
// not a standard header, so a variant
seqVars.add(new PlinkVariantInfo(field));
}
}
return seqVars;
}
private ArrayList<Integer> getSNPOffsetsFromHeader(String header) {
ArrayList<Integer> offsets = new ArrayList<Integer>();
String[] headerFields;
if ( plinkFileType == PlinkFileType.STANDARD_PED ) {
headerFields = header.split("\t");
} else {
headerFields = header.split("\\s+");
}
int offset = 0;
for ( String field : headerFields ) {
if ( ! headerEntries.contains(field) ) {
offsets.add(offset);
}
offset++;
}
return offsets;
}
/* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
* * PARSING BINARY PLINK BED/BIM/FAM FILES * *
* *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***/
private ArrayList<PlinkVariantInfo> parseBinaryFormattedPlinkFile(File file) {
PlinkBinaryTrifecta binaryFiles = getPlinkBinaryTriplet(file);
ArrayList<PlinkVariantInfo> parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile);
ArrayList<String> sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile);
ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile);
return updatedVariants;
}
private PlinkBinaryTrifecta getPlinkBinaryTriplet(File file) {
// just gonna parse the name
PlinkBinaryTrifecta trifecta = new PlinkBinaryTrifecta();
String absolute_path = file.getAbsolutePath();
String[] directory_tree = absolute_path.split("/");
String file_name = directory_tree[directory_tree.length-1].split(".")[0];
StringBuilder pathBuilder = new StringBuilder();
for ( String folder : directory_tree ) {
pathBuilder.append(String.format("%s/",folder));
}
String path = pathBuilder.toString();
trifecta.bedFile = new File(path+file_name+".bed");
trifecta.bimFile = new File(path+file_name+".bim");
trifecta.famFile = new File(path+file_name+".fam");
return trifecta;
}
private ArrayList<PlinkVariantInfo> instantiateVariantsFromBimFile(File bimFile) {
BufferedReader reader;
try {
reader = new BufferedReader( new FileReader( bimFile ));
} catch ( FileNotFoundException e) {
throw new StingException("The SNP information file accompanying the binary ped file was not found (the .bim file). "+
"Please ensure that it is in the same directory as the .bed and .fam files. The file we "+
"Were looking for was "+bimFile.getAbsolutePath(),e);
}
ArrayList<PlinkVariantInfo> variants = new ArrayList<PlinkVariantInfo>();
try {
String line = null;
do {
line = reader.readLine();
if ( line != null ) {
String[] snpInfo = line.split("\\s+");
PlinkVariantInfo variant = new PlinkVariantInfo(snpInfo[1],true);
variant.setGenomeLoc(GenomeLocParser.parseGenomeLoc(snpInfo[0],Long.valueOf(snpInfo[2]), Long.valueOf(snpInfo[2])+1));
variant.setAlleles(snpInfo[4],snpInfo[5]);
variants.add(variant);
}
} while ( line != null );
} catch ( IOException e ) {
throw new StingException("There was an error reading the .bim file "+bimFile.getAbsolutePath(), e);
}
return variants;
}
private ArrayList<String> getSampleNameOrderingFromFamFile(File famFile) {
BufferedReader reader;
try {
reader = new BufferedReader( new FileReader( famFile ));
} catch ( FileNotFoundException e) {
throw new StingException("The Family information file accompanying the binary ped file was not found (the .fam file). "+
"Please ensure that it is in the same directory as the .bed and .bim files. The file we "+
"Were looking for was "+famFile.getAbsolutePath(),e);
}
ArrayList<String> sampleNames = new ArrayList<String>();
try {
String line = null;
do {
line = reader.readLine();
if ( line != null ) {
sampleNames.add(line.split("\\s+")[1]);
}
} while ( line != null );
} catch (IOException e) {
throw new StingException("There was an error reading the .fam file "+famFile.getAbsolutePath(),e);
}
return sampleNames;
}
private ArrayList<PlinkVariantInfo> getGenotypesFromBedFile(ArrayList<PlinkVariantInfo> variants, ArrayList<String> samples, File bedFile) {
FileInputStream inStream;
try {
inStream = new FileInputStream(bedFile);
} catch (FileNotFoundException e) {
throw new StingException("The Binary pedigree file file accompanying the family file was not found (the .bed file). "+
"Please ensure that it is in the same directory as the .bim and .fam files. The file we "+
"Were looking for was "+bedFile.getAbsolutePath(),e);
}
try {
byte genotype = -1;
long bytesRead = 0;
int snpOffset = 0;
int sampleOffset = 0;
boolean snpMajorMode = true;
do {
genotype = (byte) inStream.read();
bytesRead++;
if ( genotype != -1 ) {
if ( bytesRead > 3 ) {
addGenotypeByte(genotype,variants,samples,snpOffset,sampleOffset, snpMajorMode);
if ( snpMajorMode ) {
sampleOffset = sampleOffset + 4;
while ( sampleOffset > samples.size() ) {
snpOffset ++;
sampleOffset = sampleOffset % samples.size();
}
} else {
snpOffset = snpOffset + 4;
while ( snpOffset > variants.size() ) {
sampleOffset ++;
snpOffset = snpOffset % samples.size();
}
}
} else {
if ( bytesRead == 2) {
snpMajorMode = genotype == SNP_MAJOR_MODE;
}
}
}
} while ( genotype != -1 );
} catch ( IOException e) {
throw new StingException("Error reading binary ped file "+bedFile.getAbsolutePath(), e);
}
return variants;
}
private void addGenotypeByte(byte genotype, ArrayList<PlinkVariantInfo> variants, ArrayList<String> sampleNames, int snpOffset, int sampleOffset, boolean major) {
// four genotypes encoded in this byte
int[] genotypes = parseGenotypes(genotype);
for ( int g : genotypes ) {
variants.get(snpOffset).addBinaryGenotypeEntry(g,sampleNames.get(sampleOffset));
if ( major ) {
sampleOffset++;
while ( sampleOffset > sampleNames.size() ) {
snpOffset++;
sampleOffset = sampleOffset % sampleNames.size();
}
if ( snpOffset >= variants.size() ) {
// done with file; early return
return;
}
} else {
snpOffset++;
while( snpOffset > variants.size() ) {
sampleOffset++;
snpOffset = snpOffset % variants.size();
}
if ( sampleOffset >= sampleNames.size() ) {
// done with file; early return
return;
}
}
}
}
private int[] parseGenotypes(byte genotype) {
int[] genotypes = new int[4];
genotypes[0] = ( genotype & 0x00000011 );
genotypes[1] = ( ( genotype & 0x00001100 ) >>> 2 );
genotypes[2] = ( ( genotype & 0x00110000 ) >>> 4 );
genotypes[3] = ( ( genotype & 0x11000000 ) >>> 6 );
return genotypes;
}
}
class PlinkVariantInfo implements Comparable {
private String variantName;
private GenomeLoc loc;
private ArrayList<Genotype> genotypes = new ArrayList<Genotype>();
private ArrayList<String> sampleNames = new ArrayList<String>();
private ArrayList<Allele> indelHolder;
private ArrayList<String> sampleHolder;
private int siteIndelLength;
private Allele.AlleleType indelType;
// for binary parsing
private String locAllele1;
private String locAllele2;
public GenomeLoc getLocation() {
return loc;
}
public String getName() {
return variantName;
}
public ArrayList<String> getSampleNames() {
return sampleNames;
}
public ArrayList<Genotype> getGenotypes() {
return genotypes;
}
private boolean isSNP() {
return this.indelType == null;
}
public void setGenomeLoc(GenomeLoc loc) {
this.loc = loc;
}
public void setAlleles(String al1, String al2) {
locAllele1 = al1;
locAllele2 = al2;
if ( ! isSNP() ) {
siteIndelLength = Math.max(locAllele1.length(),locAllele2.length());
}
}
// CONSTRUCTOR
public PlinkVariantInfo(String variantName) {
this.variantName = variantName;
}
public PlinkVariantInfo(String variantName, boolean onlyLookForIndelInfo ) {
this.variantName = variantName;
if ( onlyLookForIndelInfo ) {
parseNameForIndels();
} else {
parseName();
}
}
private void parseName() {
String chrom = this.variantName.split("_c")[1].split("_")[0];
String pos = this.variantName.split("_p")[1].split("_")[0];
this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
this.parseNameForIndels();
}
private void parseNameForIndels() {
if ( this.variantName.contains("_gI") || this.variantName.contains("_gD") ) {
this.instantiateIndel();
}
}
private void instantiateIndel() {
this.indelHolder = new ArrayList<Allele>();
this.sampleHolder = new ArrayList<String>();
this.siteIndelLength = -1;
this.indelType = this.variantName.contains("_gI") ? Allele.AlleleType.INSERTION : Allele.AlleleType.DELETION;
}
public void addGenotypeEntry(String genotypeString, String sampleName) {
// identify if we're dealing with a deletion
if ( this.isSNP() ) {
this.addSNP(genotypeString.split("\\s+"),sampleName);
} else {
this.addIndel(genotypeString.split("\\s+"),sampleName);
}
}
public void addBinaryGenotypeEntry( int genoTYPE, String sampleName ) {
String[] alleleStr = new String[2];
if ( genoTYPE == 0 ) {
alleleStr[0] = locAllele1;
alleleStr[1] = locAllele1;
} else if (genoTYPE == 1) {
alleleStr[0] = locAllele1;
alleleStr[1] = locAllele2;
} else {
alleleStr[0] = locAllele2;
alleleStr[1] = locAllele2;
}
if ( this.isSNP() ) {
this.addSNP(alleleStr,sampleName);
} else {
this.addIndel(alleleStr,sampleName);
}
}
private void addSNP(String[] alleleStrings, String sampleName) {
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
for ( String alStr : alleleStrings ) {
alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr));
}
}
private void addIndel(String[] alleleStrings, String sampleName) {
String alleleStr1 = alleleStrings[0];
String alleleStr2 = alleleStrings[1];
if ( alleleStr1.contains("-") ^ alleleStr2.contains("-") ) {
// heterozygous indel
if ( alleleStr1.contains("-") ) {
this.addHetIndel(alleleStr2,sampleName) ;
} else {
this.addHetIndel(alleleStr1,sampleName);
}
} else {
this.addHomIndel(alleleStr1, alleleStr2, sampleName);
}
}
private void addHetIndel(String baseStr, String sampleName) {
Allele ref;
Allele alt;
if ( indelType == Allele.AlleleType.INSERTION ) {
ref = new Allele(Allele.AlleleType.REFERENCE,"");
alt = new Allele(indelType,baseStr);
} else {
alt = new Allele(indelType,"");
ref = new Allele(Allele.AlleleType.REFERENCE,baseStr);
}
this.setIndelLength(alt,baseStr.length());
if ( ! indelHolder.isEmpty() ) {
siteIndelLength = baseStr.length();
this.addHeldIndels();
}
Genotype indel = new Genotype(Arrays.asList(ref,alt), sampleName, 20.0);
this.setIndelGenotypeLength(indel,siteIndelLength);
this.genotypes.add(indel);
this.sampleNames.add(sampleName);
}
private void addHomIndel(String strand1, String strand2, String sampleName) {
Allele allele1;
Allele allele2;
boolean reference;
if ( indelType == Allele.AlleleType.DELETION ) {
if ( strand1.contains("-") ) {
// homozygous deletion
allele1 = new Allele(indelType,"");
allele2 = new Allele(indelType,"");
reference = false;
} else {
allele1 = new Allele(Allele.AlleleType.REFERENCE,strand1);
allele2 = new Allele(Allele.AlleleType.REFERENCE,strand2);
reference = true;
}
} else {
if ( strand1.contains("-") ) {
// homozygous reference
allele1 = new Allele(Allele.AlleleType.REFERENCE,"");
allele2 = new Allele(Allele.AlleleType.REFERENCE,"");
reference = true;
} else {
allele1 = new Allele(indelType,strand1);
allele2 = new Allele(indelType,strand2);
reference = false;
}
}
if ( reference ) {
if ( ! indelHolder.isEmpty() ) {
siteIndelLength = strand1.length();
this.addHeldIndels();
}
}
if ( reference || siteIndelLength != -1 ) { // if we're ref or know the insertion/deletion length of the site
Genotype gen = new Genotype(Arrays.asList(allele1,allele2), sampleName, 20.0);
this.genotypes.add(gen);
this.sampleNames.add(sampleName);
} else { // hold on the variants until we *do* know the in/del length at this site
this.indelHolder.add(allele1);
this.indelHolder.add(allele2);
this.sampleHolder.add(sampleName);
}
}
private void setIndelGenotypeLength(Genotype g, int length) {
g.setAttribute(Genotype.StandardAttributes.DELETION_LENGTH,length);
}
private void addHeldIndels() {
Allele del1;
Allele del2;
int startingSize = indelHolder.size();
for ( int i = 0; i < startingSize ; i+=2 ) {
del1 = indelHolder.get(i);
del2 = indelHolder.get(i+1);
this.addIndelFromCache(del1,del2,sampleHolder.get(i/2));
if ( indelHolder.size() != startingSize ) {
throw new StingException("Halting algorithm -- possible infinite loop");
}
}
indelHolder.clear();
sampleHolder.clear();
}
private void addIndelFromCache ( Allele indel1, Allele indel2, String sampleName ) {
this.setIndelLength(indel1,siteIndelLength);
this.setIndelLength(indel2,siteIndelLength);
Genotype indel = new Genotype(Arrays.asList(indel1,indel2),sampleName, 20.0);
this.setIndelGenotypeLength(indel,siteIndelLength);
this.genotypes.add(indel);
this.sampleNames.add(sampleName);
}
public int compareTo(Object obj) {
if ( ! ( obj instanceof PlinkVariantInfo) ) {
return 1;
}
return loc.compareTo(((PlinkVariantInfo) obj).getLocation());
}
private void setIndelLength(Allele al, int length) {
// Todo -- once alleles support deletion lengths add that information
// Todo -- into the object; for now this can just return
return;
}
}
class PlinkBinaryTrifecta {
public PlinkBinaryTrifecta() {
}
public File bedFile;
public File bimFile;
public File famFile;
}

View File

@ -1,307 +0,0 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.StingException;
import java.io.*;
import java.util.*;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: Jan 19, 2010
* Time: 10:24:18 AM
* To change this template use File | Settings | File Templates.
*/
public class SequenomRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum {
private final String[] SEQUENOM_HEADER_FIELDS = { "#Family ID", "Individual ID", "Paternal ID", "Maternal ID", "Sex", "Phenotype" } ;
private ArrayList<SequenomVariantInfo> variants;
private SequenomVariantInfo currentVariant;
private ListIterator<SequenomVariantInfo> variantIterator;
private HashSet headerEntries;
// // // CONSTRUCTOR // // //
public SequenomRodWithGenomeLoc(String name) {
super(name);
}
@Override
public Object initialize(final File seqFile) throws FileNotFoundException {
if ( ! seqFile.exists() ) {
throw new FileNotFoundException("File "+seqFile.getAbsolutePath()+" does not exist.");
}
headerEntries = new HashSet<String>(Arrays.asList(SEQUENOM_HEADER_FIELDS));
variants = parseSequenomFile(seqFile);
if ( variants != null ) {
variantIterator = variants.listIterator();
currentVariant = variantIterator.next();
}
assertNotNull();
return null;
}
private void assertNotNull() {
if ( currentVariant == null ) {
throw new UnsupportedOperationException ( "Current sequenom variant information was set to null" );
}
}
@Override
public boolean parseLine(Object obj, String[] args) {
if ( variantIterator.hasNext() ) {
currentVariant = variantIterator.next();
return true;
} else {
return false;
}
}
@Override
public GenomeLoc getLocation() {
return currentVariant.getLocation();
}
@Override
public String toString() {
return currentVariant == null ? "" : currentVariant.toString();
}
public String getVariantName() {
return currentVariant.getName();
}
public ArrayList<String> getVariantSampleNames() {
return currentVariant.getSampleNames();
}
public ArrayList<Genotype> getGenotypes() {
return currentVariant.getGenotypes();
}
private ArrayList<SequenomVariantInfo> parseSequenomFile(File file) {
try {
BufferedReader reader = new BufferedReader( new FileReader ( file ) );
String header = reader.readLine();
ArrayList<SequenomVariantInfo> seqVars = instantiateVariantListFromHeader(header);
ArrayList<Integer> snpOffsets = getSNPOffsetsFromHeader(header);
String line = null;
do {
line = reader.readLine();
incorporateInfo(seqVars,snpOffsets,line);
} while ( line != null );
java.util.Collections.sort(seqVars); // because the comparable uses the GenomeLoc comparable; these
// are sorted in standard reference order
return seqVars;
} catch ( FileNotFoundException e ) {
throw new StingException("File "+file.getAbsolutePath()+" could not be found. This was checked earlier. Should never happen.",e);
} catch ( IOException e ) {
throw new StingException("Error reading file "+file.getAbsolutePath()+".",e);
}
}
private void incorporateInfo(List<SequenomVariantInfo> vars, List<Integer> offsets, String seqLine) {
String[] genotypes = seqLine.split("\t");
String individualName = genotypes[1];
int snpNumber = 0;
for ( int i : offsets ) {
vars.get(snpNumber).addGenotypeEntry(genotypes[i], individualName);
snpNumber++;
}
}
private ArrayList<SequenomVariantInfo> instantiateVariantListFromHeader(String header) {
ArrayList<SequenomVariantInfo> seqVars = new ArrayList<SequenomVariantInfo>();
String[] headerFields = header.split("\t");
for ( String field : headerFields ) {
if ( ! headerEntries.contains(field) ) {
// not a standard header, so a variant
seqVars.add(new SequenomVariantInfo(field));
}
}
return seqVars;
}
private ArrayList<Integer> getSNPOffsetsFromHeader(String header) {
ArrayList<Integer> offsets = new ArrayList<Integer>();
String[] headerFields = header.split("\t");
int offset = 0;
for ( String field : headerFields ) {
if ( ! headerEntries.contains(field) ) {
offsets.add(offset);
}
offset++;
}
return offsets;
}
}
class SequenomVariantInfo implements Comparable {
private String variantName;
private GenomeLoc loc;
private ArrayList<Genotype> genotypes;
private ArrayList<String> sampleNames;
private ArrayList<Allele> deletionHolder = new ArrayList<Allele>();
private ArrayList<String> sampleHolder = new ArrayList<String>();
private int siteDeletionLength = -1;
public GenomeLoc getLocation() {
return loc;
}
public String getName() {
return variantName;
}
public ArrayList<String> getSampleNames() {
return sampleNames;
}
public ArrayList<Genotype> getGenotypes() {
return genotypes;
}
// CONSTRUCTOR
public SequenomVariantInfo(String variantName) {
this.variantName = variantName;
this.parseNameToLoc();
}
private void parseNameToLoc() {
String chrom = this.variantName.split("_c")[1].split("_")[0];
String pos = this.variantName.split("_p")[1].split("_")[0];
this.loc = GenomeLocParser.parseGenomeLoc(chrom+":"+pos);
}
public void addGenotypeEntry(String genotypeString, String sampleName) {
String[] alleleStrs = genotypeString.split(" ");
// identify if we're dealing with a deletion
if ( genotypeString.contains("-") ) {
this.addDeletion(alleleStrs, sampleName);
} else {
// simple SNP or indel (easier to handle)
this.addIndelOrSNP(alleleStrs,sampleName);
}
}
private void addIndelOrSNP(String[] alleleStrings, String sampleName) {
ArrayList<Allele> alleles = new ArrayList<Allele>(2);
if ( alleleStrings[0].length() > 1 || alleleStrings[1].length() > 1 ) {
// insertion
for ( String alStr : alleleStrings ) {
if ( alStr.length() > 1 ) {
alleles.add(new Allele(Allele.AlleleType.INSERTION,alStr));
} else {
alleles.add(new Allele(Allele.AlleleType.REFERENCE, alStr));
}
}
} else {
// SNP
for ( String alStr : alleleStrings ) {
alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr));
}
}
}
private void addDeletion(String[] alleleStrings, String sampleName) {
String alleleStr1 = alleleStrings[0];
String alleleStr2 = alleleStrings[1];
Allele allele1 = null;
Allele allele2 = null;
if ( alleleStr1.contains("-") && alleleStr2.contains("-") ) {
// homozygous deletion
this.addHomDeletion(allele1,allele2, sampleName);
} else {
// heterozygous deletion
if ( alleleStr1.contains("-") ) {
this.addHetDeletion(allele1,allele2, alleleStr1, alleleStr2, sampleName);
} else {
this.addHetDeletion(allele2,allele1, alleleStr2, alleleStr1, sampleName); // note the order change
}
}
}
private void addHetDeletion(Allele del, Allele ref, String delStr, String refStr, String sampleName) {
del = new Allele(Allele.AlleleType.DELETION,"");
ref = new Allele(Allele.AlleleType.REFERENCE,refStr.substring(0,1));
this.setDeletionLength(del,refStr.length());
if ( ! deletionHolder.isEmpty() ) {
siteDeletionLength = refStr.length();
this.addHeldDeletions();
}
Genotype indel = new Genotype(Arrays.asList(ref,del), sampleName, 20.0);
this.setIndelGenotypeLength(indel,siteDeletionLength);
this.genotypes.add(indel);
this.sampleNames.add(sampleName);
}
private void addHomDeletion(Allele allele1, Allele allele2, String sampleName) {
allele1 = new Allele(Allele.AlleleType.DELETION,"");
allele2 = new Allele(Allele.AlleleType.DELETION,"");
if ( siteDeletionLength != -1 ) {
this.setDeletionLength(allele1,siteDeletionLength);
this.setDeletionLength(allele2,siteDeletionLength);
Genotype indel = new Genotype(Arrays.asList(allele1,allele2), sampleName, 20.0);
this.setIndelGenotypeLength(indel, siteDeletionLength);
this.genotypes.add(indel);
this.sampleNames.add(sampleName);
} else {
deletionHolder.add(allele1);
deletionHolder.add(allele2);
sampleHolder.add(sampleName);
}
}
private void setIndelGenotypeLength(Genotype g, int length) {
g.setAttribute(Genotype.StandardAttributes.DELETION_LENGTH,length);
}
private void addHeldDeletions() {
Allele del1;
Allele del2;
int startingSize = deletionHolder.size();
for ( int i = 0; i < startingSize ; i+=2 ) {
del1 = deletionHolder.get(i);
del2 = deletionHolder.get(i+1);
this.addHomDeletion(del1,del2,sampleHolder.get(i/2));
if ( deletionHolder.size() != startingSize ) {
throw new StingException("Halting algorithm -- possible infinite loop");
}
}
deletionHolder.clear();
sampleHolder.clear();
}
public int compareTo(Object obj) {
if ( ! ( obj instanceof SequenomVariantInfo ) ) {
return 1;
}
return loc.compareTo(((SequenomVariantInfo) obj).getLocation());
}
private void setDeletionLength(Allele al, int length) {
// Todo -- once alleles support deletion lengths add that information
// Todo -- into the object; for now this can just return
return;
}
}

View File

@ -40,7 +40,8 @@ public class PlinkToVCF extends RefWalker<VCFRecord,Integer> {
@Argument(fullName="maxHomNonref", doc="Maximum homozygous-nonreference rate (as a proportion) to consider an assay valid", required = false)
public double maxHomNonref = 1.1;
private final Set<String> HEADER_FIELDS = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex","Paternal ID","Maternal ID","Phenotype"));
private final Set<String> HEADER_FIELDS = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex","Paternal ID","Maternal ID","Phenotype",
"FID","IID","PAT","MAT","SEX","PHENOTYPE"));
private final int INIT_NUMBER_OF_POPULATIONS = 10;
private final int DEFAULT_QUALITY = 20;
private HashMap<String, SequenomVariantInfo> sequenomResults = new HashMap<String,SequenomVariantInfo>();