PlinkRod now correctly parses binary files without indels; unit test added for this behavior.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2669 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-01-23 17:34:06 +00:00
parent 94dc09c865
commit ae22d35212
2 changed files with 56 additions and 19 deletions

View File

@ -273,7 +273,7 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
if ( line != null ) {
String[] snpInfo = line.split("\\s+");
PlinkVariantInfo variant = new PlinkVariantInfo(snpInfo[1],true);
variant.setGenomeLoc(GenomeLocParser.parseGenomeLoc(snpInfo[0],Long.valueOf(snpInfo[2]), Long.valueOf(snpInfo[2])+1));
variant.setGenomeLoc(GenomeLocParser.parseGenomeLoc(snpInfo[0],Long.valueOf(snpInfo[3]), Long.valueOf(snpInfo[3])));
variant.setAlleles(snpInfo[4],snpInfo[5]);
variants.add(variant);
}
@ -337,15 +337,15 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
if ( snpMajorMode ) {
sampleOffset = sampleOffset + 4;
while ( sampleOffset > samples.size() -1 ) {
if ( sampleOffset > samples.size() -1 ) {
snpOffset ++;
sampleOffset = sampleOffset % samples.size();
sampleOffset = 0;
}
} else {
snpOffset = snpOffset + 4;
while ( snpOffset > variants.size() -1 ) {
if ( snpOffset > variants.size() -1 ) {
sampleOffset ++;
snpOffset = snpOffset % samples.size();
snpOffset = 0;
}
}
@ -371,22 +371,12 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
if ( major ) {
sampleOffset++;
while ( sampleOffset > sampleNames.size()-1 ) { //using offsets for comparison; size 5 == offset 4
snpOffset++;
sampleOffset = sampleOffset % sampleNames.size();
}
if ( snpOffset > variants.size()-1) {
// done with file; early return
if ( sampleOffset > sampleNames.size()-1 ) { //using offsets for comparison; size 5 == offset 4
return;
}
} else {
snpOffset++;
while( snpOffset > variants.size()-1 ) {
sampleOffset++;
snpOffset = snpOffset % variants.size();
}
if ( sampleOffset > sampleNames.size()-1 ) {
// done with file; early return
if( snpOffset > variants.size()-1 ) {
return;
}
}
@ -444,11 +434,17 @@ class PlinkVariantInfo implements Comparable {
}
public void setAlleles(String al1, String al2) {
locAllele1 = al1;
if ( al1.equals("0") ) {
// encoding for a site at which no variants were detected
locAllele1 = al2;
} else {
locAllele1 = al1;
}
locAllele2 = al2;
if ( ! isSNP() ) {
siteIndelLength = Math.max(locAllele1.length(),locAllele2.length());
}
}
// CONSTRUCTOR
@ -501,7 +497,7 @@ class PlinkVariantInfo implements Comparable {
if ( genoTYPE == 0 ) {
alleleStr[0] = locAllele1;
alleleStr[1] = locAllele1;
} else if (genoTYPE == 1) {
} else if (genoTYPE == 2) {
alleleStr[0] = locAllele1;
alleleStr[1] = locAllele2;
} else if (genoTYPE == 3 ) {

View File

@ -161,5 +161,46 @@ public class PlinkRodTest extends BaseTest {
} catch (FileNotFoundException e) {
throw new StingException("Test file for testBinaryPedFileNoIndels() could not be found",e);
}
// iterate through the ROD and get stuff
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
ArrayList<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
ArrayList<ArrayList<String>> samplesInRod = new ArrayList<ArrayList<String>>();
do {
lociInRod.add(rod.getLocation());
genotypesInRod.add(rod.getGenotypes());
samplesInRod.add(rod.getVariantSampleNames());
} while ( rod.parseLine(null,null) );
List<String> expecLoc = Arrays.asList("1:123456","1:14327877","2:22074511","3:134787","3:178678","4:829645","4:5234132","12:1268713");
for ( int i = 0; i < expecLoc.size(); i ++ ) {
Assert.assertEquals("That locus "+(i+1)+" in the rod is correct", expecLoc.get(i), lociInRod.get(i).toString());
}
List<String> expecAlleles = Arrays.asList("AA","AA","AA","GG","GG","GG","AA","TA","TT","CC","CC","GC","TC","CC","TT",
"GG","GG","AG","TT","CC","CT","TG","GG","GG");
List<Boolean> expecHet = Arrays.asList(false,false,false,false,false,false,false,true,false,false,false,true,true,false,
false,false,false,true,false,false,true,true,false,false);
List<String> expecName = Arrays.asList("NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000",
"NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000","NA12878","NA12890","NA07000",
"NA12878","NA12890","NA07000");
int snpNo = 1;
int indiv = 1;
int alleleOffset = 0;
for ( ArrayList<Genotype> snp : genotypesInRod ) {
for ( Genotype gen : snp ) {
String alStr = gen.getAlleles().get(0).getBases()+gen.getAlleles().get(1).getBases();
Assert.assertEquals("That the allele of person "+indiv+" for snp "+snpNo+" is correct "+
"(allele offset "+alleleOffset+")", expecAlleles.get(alleleOffset),alStr);
Assert.assertEquals("That the genotype of person "+indiv+" for snp "+snpNo+" is properly set", expecHet.get(alleleOffset),gen.isHet());
Assert.assertEquals("That the name of person "+indiv+" for snp "+snpNo+" is correct", expecName.get(alleleOffset),samplesInRod.get(snpNo-1).get(indiv-1));
indiv++;
alleleOffset++;
}
indiv = 1;
snpNo++;
}
}
}