PlinkRodWithGenomeLoc now properly handles indels.
There is now a DELETION_REFERENCE allele type to allow for the storage of multi-base references rather than point-mutation references. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@2667 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
42fb85e7f3
commit
01db93299c
|
|
@ -17,7 +17,7 @@ public class Allele {
|
|||
|
||||
// the types of variants we currently allow
|
||||
public enum AlleleType {
|
||||
REFERENCE, SNP, INSERTION, DELETION, INVERSION, UNKNOWN_POINT_ALLELE
|
||||
REFERENCE, SNP, INSERTION, DELETION, INVERSION, UNKNOWN_POINT_ALLELE, DELETION_REFERENCE
|
||||
}
|
||||
|
||||
public Allele(AlleleType type, String bases) {
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@ import net.sf.samtools.SAMFileHeader;
|
|||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements ReferenceOrderedDatum {
|
||||
private boolean allowTest = false;
|
||||
private final Set<String> headerEntries = new HashSet<String>(Arrays.asList("#Family ID","Individual ID","Sex",
|
||||
"Paternal ID","Maternal ID","Phenotype", "FID","IID","PAT","MAT","SEX","PHENOTYPE"));
|
||||
private final byte SNP_MAJOR_MODE = 0x00000001;
|
||||
|
|
@ -97,6 +96,10 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
|
|||
return currentVariant.getGenotypes();
|
||||
}
|
||||
|
||||
public boolean variantIsSNP() {
|
||||
return currentVariant.isSNP();
|
||||
}
|
||||
|
||||
|
||||
// AM I PARSING A TEXT OR A BINARY FILE ??
|
||||
|
||||
|
|
@ -226,6 +229,9 @@ public class PlinkRodWithGenomeLoc extends BasicReferenceOrderedDatum implements
|
|||
ArrayList<PlinkVariantInfo> parsedVariants = instantiateVariantsFromBimFile(binaryFiles.bimFile);
|
||||
ArrayList<String> sampleNames = getSampleNameOrderingFromFamFile(binaryFiles.famFile);
|
||||
ArrayList<PlinkVariantInfo> updatedVariants = getGenotypesFromBedFile(parsedVariants,sampleNames,binaryFiles.bedFile);
|
||||
|
||||
java.util.Collections.sort(updatedVariants);
|
||||
|
||||
return updatedVariants;
|
||||
}
|
||||
|
||||
|
|
@ -429,7 +435,7 @@ class PlinkVariantInfo implements Comparable {
|
|||
return genotypes;
|
||||
}
|
||||
|
||||
private boolean isSNP() {
|
||||
public boolean isSNP() {
|
||||
return this.indelType == null;
|
||||
}
|
||||
|
||||
|
|
@ -516,6 +522,7 @@ class PlinkVariantInfo implements Comparable {
|
|||
for ( String alStr : alleleStrings ) {
|
||||
alleles.add(new Allele(Allele.AlleleType.UNKNOWN_POINT_ALLELE,alStr));
|
||||
}
|
||||
|
||||
genotypes.add(new Genotype(alleles,sampleName,20.0) );
|
||||
sampleNames.add(sampleName);
|
||||
}
|
||||
|
|
@ -544,7 +551,7 @@ class PlinkVariantInfo implements Comparable {
|
|||
alt = new Allele(indelType,baseStr);
|
||||
} else {
|
||||
alt = new Allele(indelType,"");
|
||||
ref = new Allele(Allele.AlleleType.REFERENCE,baseStr);
|
||||
ref = new Allele(Allele.AlleleType.DELETION_REFERENCE,baseStr);
|
||||
}
|
||||
|
||||
this.setIndelLength(alt,baseStr.length());
|
||||
|
|
@ -571,8 +578,8 @@ class PlinkVariantInfo implements Comparable {
|
|||
allele2 = new Allele(indelType,"");
|
||||
reference = false;
|
||||
} else {
|
||||
allele1 = new Allele(Allele.AlleleType.REFERENCE,strand1);
|
||||
allele2 = new Allele(Allele.AlleleType.REFERENCE,strand2);
|
||||
allele1 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand1);
|
||||
allele2 = new Allele(Allele.AlleleType.DELETION_REFERENCE,strand2);
|
||||
reference = true;
|
||||
}
|
||||
} else {
|
||||
|
|
@ -597,6 +604,7 @@ class PlinkVariantInfo implements Comparable {
|
|||
|
||||
if ( reference || siteIndelLength != -1 ) { // if we're ref or know the insertion/deletion length of the site
|
||||
Genotype gen = new Genotype(Arrays.asList(allele1,allele2), sampleName, 20.0);
|
||||
setIndelGenotypeLength(gen,siteIndelLength);
|
||||
this.genotypes.add(gen);
|
||||
this.sampleNames.add(sampleName);
|
||||
} else { // hold on the variants until we *do* know the in/del length at this site
|
||||
|
|
|
|||
|
|
@ -85,7 +85,7 @@ public class PlinkRodTest extends BaseTest {
|
|||
ArrayList<Genotype> snp3 = genotypesInRod.get(2);
|
||||
|
||||
Assert.assertEquals("That there are three Genotypes in SNP 1",3,snp1.size());
|
||||
Assert.assertEquals("That there are three samples in SNP 1", 3, sampleNamesInRod.get(0).size());
|
||||
Assert.assertEquals("That there are three samples in SNP 2", 3, sampleNamesInRod.get(1).size());
|
||||
Assert.assertEquals("That there are three Genotypes in SNP 3",3,snp3.size());
|
||||
|
||||
|
||||
|
|
@ -114,4 +114,42 @@ public class PlinkRodTest extends BaseTest {
|
|||
lociCorrect = lociCorrect && lociInRod.get(i).toString().equals(expectedLoci.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testStandardPedFileWithIndels() {
|
||||
PlinkRodWithGenomeLoc rod = new PlinkRodWithGenomeLoc("test");
|
||||
try {
|
||||
rod.initialize(new File("/humgen/gsa-hpprojects/GATK/data/Validation_Data/test/plink_rod_test/standard_plink_with_indels.ped") );
|
||||
} catch ( FileNotFoundException e) {
|
||||
throw new StingException("Test file for testStandardPedFileWithIndels() could not be found", e);
|
||||
}
|
||||
|
||||
// Iterate through the rod
|
||||
|
||||
List<ArrayList<Genotype>> genotypesInRod = new ArrayList<ArrayList<Genotype>>();
|
||||
ArrayList<ArrayList<String>> sampleNamesInRod = new ArrayList<ArrayList<String>>();
|
||||
ArrayList<GenomeLoc> lociInRod = new ArrayList<GenomeLoc>();
|
||||
ArrayList<Boolean> snpSites = new ArrayList<Boolean>();
|
||||
do {
|
||||
genotypesInRod.add(rod.getGenotypes());
|
||||
sampleNamesInRod.add(rod.getVariantSampleNames());
|
||||
lociInRod.add(rod.getLocation());
|
||||
snpSites.add(rod.variantIsSNP());
|
||||
} while ( rod.parseLine(null,null) );
|
||||
|
||||
boolean snpOrder = true;
|
||||
List<Boolean> expectedOrder = Arrays.asList(true,false,true,false);
|
||||
for ( int i = 0; i < 4; i ++ ) {
|
||||
snpOrder = snpOrder && ( expectedOrder.get(i) == snpSites.get(i) );
|
||||
}
|
||||
|
||||
Assert.assertTrue("That the variant type order is as expected", snpOrder);
|
||||
Assert.assertTrue("That the second genotype of second variant is not a point mutation", ! genotypesInRod.get(1).get(1).isPointGenotype() );
|
||||
Assert.assertTrue("That the second genotype of fourth variant is not a point mutation", ! genotypesInRod.get(3).get(1).isPointGenotype() );
|
||||
Assert.assertTrue("That the second genotype of fourth variant is homozygous", genotypesInRod.get(3).get(1).isHom());
|
||||
Assert.assertTrue("That the fourth genotype of fourth variant is heterozygous",genotypesInRod.get(3).get(3).isHet());
|
||||
Assert.assertEquals("That the reference deletion genotype has the correct string", "ATTTAT",genotypesInRod.get(3).get(2).getAlleles().get(0).getBases());
|
||||
Assert.assertEquals("That the insertion bases are correct","CTC",genotypesInRod.get(1).get(2).getAlleles().get(0).getBases());
|
||||
Assert.assertEquals("That the snp bases are correct","GC",genotypesInRod.get(2).get(2).getAlleles().get(0).getBases()+genotypesInRod.get(2).get(2).getAlleles().get(1).getBases());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue