## Variability due to sampling (sampling variance and the standard error of the estimate)
# let's say we have a population of 20000 individuals.
population <- c(rep("A", 16000), rep("a", 4000))
# We can check the allele frequency
table(population)/length(population)
# Let's say we sample 20 individuals and estimate allele frequency.
our_sample <- sample(population, size = 20, replace = FALSE)
our_sample
# What is the allele frequency?
(table(our_sample)/length(our_sample))["A"]
# Instead of repeating this over and over, we can write a function to encapsulate this (like we did for the geometric mean last week)
allele_sampler <- function(x = population, N = 20) {
our_sample <- sample(x, size = N, replace = FALSE)
as.numeric((table(our_sample)/length(our_sample))["A"])
}
allele_sampler()
allele_sampler()
allele_sampler()
allele_sampler()
allele_sampler()
# This gets tedious.... Let's use the replicate function.
# This allows us to repeatedly call this function n times, with n currently set to 50.
sampled_many_times <- replicate( n = 50, allele_sampler())
sampled_many_times
# plot the distribution of these allele frequencies (use hist() )
# Calculate the mean for the estimated allele frequencies calculated over the many samples from this population. How close is this to the true value of the allele frequency for the population?
# Calculate the standard deviation over the many samples from this population. What does this quantity tell us?
# Compare the estimated mean and standard deviation to the two people on either side of you. Are your estimates identical? Explain why.
# Now go back and alter the allele frequencies in the population so it is 0.97 and 0.03, and repeat this process. What has changed? What explains the difference?
# Repeat this one last time (back to allele frequency of 0.2 and 0.8), but change the number of individuals sampled from 20 to 100.
# - How has the mean changed? Explain why we observe any differences we do?
# - How has this changed the standard deviation among samples? Explain why we observe any differences we do?
# What are the take aways from this part of the tutorial?
## Hardy Weinberg demonstration
# Let's start with the following number of sampled individuals
AA <- 350
Aa <- 172
aa <- 306
N <- AA+Aa+aa
# Create a variable called N, with the number of individuals in this sample?
# Observed frequencies
fAA_observed <- AA/N
fAa_observed <- Aa/N
faa_observed <- aa/N
print(paste("the observed frequency of AA is:", signif(fAA_observed, 3)))
print(paste("the observed frequency of aa is:", signif(fAa_observed, 3)))
print(paste("the observed frequency of aa is:", signif(faa_observed, 3)))
# How do we check that our genotypic frequencies make sense?
# Allele frequencies
fA_observed <- (2*AA + Aa)/(2*N)
fa_observed <- (2*aa + Aa)/(2*N)
print(paste("the observed frequency of A is:", signif(fA_observed, 3)))
print(paste("the observed frequency of a is:", signif(fa_observed, 3)))
# How do we check if our allele frequencies make sense
# Let's calculate our expected genotypic frequencies assuming HWE
fAA_expected <- fA_observed^2
faa_expected <- fa_observed^2
fAa_expected <- 2*fA_observed*fa_observed
# How can we check our calculations make sense?
sum(fAA_expected, faa_expected, fAa_expected)
print(paste("the expected frequency of AA is:", signif(fAA_expected, 3)))
print(paste("the expected frequency of aa is:", signif(fAa_expected, 3)))
print(paste("the expected frequency of aa is:", signif(faa_expected, 3)))
# How many individuals do we expect to see for each genotype?
AA_expected <- fAA_expected*N
Aa_expected <- fAa_expected*N
aa_expected <- faa_expected*N
# Check that sum of the expected number of individuals equals the observed (think about what operator we need to use!!!).
# Examine our expected and our observed
AA_expected; AA
Aa_expected; Aa
aa_expected; aa
observed <- c(AA, Aa, aa)
expected <- c(AA_expected, Aa_expected, aa_expected)
# chi squared function to determine whether this locus is currently in Hardy Weinberg Equilibrium
chisq <- function(observed, expected) {
chisqval <- sum((observed - expected)^2/expected)
return(chisqval)
}
chi_value <- chisq(observed, expected)
chi_value
## Degrees of freedom.
### How many (free observations do we have)
# Compare our observed value for the chi squared VS the distribution (to get a sense if our estimate is particularly extreme relative to what we would expect from random sampling).
# We also need to figure out the degrees of freedom. This is based on two things how many observed data classes we have (in our case 3) and how many parameters we need to estimate (in this case two). Assuming we were using a molecular marker so we can tell AA, Aa and aa individuals apart we have three "observations" in terms of classes. However we really only have one "free" classes as once we know two of the the three genotypes (i.e. AA, aa), then the third can be determined by 1 - fAA - faa. So we effectively have one free observations of data.
### How many parameters are we estimating?
# Then we need to estimate something from the data. In this case we are estimating the allele frequencies (fA and fa) from the data. However, again we only need to estimate one of these two values (say fA), and the other can be then computed directly as fa = 1 - fA. So we only need to estimate one thing.
### So how many degrees of freedom do we have?
# the number of phenotypic classes -1 (3 - 1 = 2) then subtract how many parameters we need to estimate (2-1 =1 ). This means we have 2 - 1 degrees of freedom. In general for n alleles we have (n*(n-1))/2 degrees of freedom.
numAlleles <- 2
degfree <- (2*(2-1))/2
degfree
# We can now compare our chi square value we obtained above (from the comparison of expected and observed) to a chi-square distribution to see if the value we obtained is much higher than we would expect from random sampling
pchisq(chi_value, degfree, lower.tail = FALSE)
## Writing a compact function to do this. It would also be a pain to have to go through and change all of the parameters. So we can write a compact function to do this.
## THis is all a complex way to write it out (and way too long), so we can write this more compactly
HWE_2alleles <- function(observedGenotypes = c( 20, 50, 30), num_alleles = 2) {
NumberGenotypes = length(observedGenotypes)
N = sum(observedGenotypes)
observed_genotype_frequencies <- observedGenotypes/N
fA_observed = (2*observedGenotypes[1] + observedGenotypes[2])/(2*N)
fa_observed = 1 - fA_observed
expected_genotype_frequences = c(fA_observed^2, 2*fA_observed*fa_observed, fa_observed^2)
expected_genotypes = expected_genotype_frequences*N
chi_value <- chisq(observed = observedGenotypes, expected = expected_genotypes)
degfree <- (num_alleles*(num_alleles -1))/2
pval <- pchisq(chi_value, degfree, lower.tail = FALSE)
print(paste("the observed genotypic frequencies are:", signif(observed_genotype_frequencies, 3)))
print(paste("the observed allele frequencies are:", signif(c(fA_observed, fa_observed), 3)))
print(paste("the expected genotypic frequencies are:", signif(expected_genotype_frequences, 3)))
print(paste("the chi sq value is:", signif(chi_value, 3)))
print(paste("the p value associated with chi sq value is:", signif(pval, 3)))
}
HWE_2alleles(c(20, 50, 30))
HWE_2alleles(c(20, 40, 30))
HWE_2alleles(c(0, 0, 30))
# ooopps failed.
# Lets fix that and simplify the output.
HWE_1to2alleles <- function(observedGenotypes = c( 20, 50, 30), num_alleles = 2) {
NumberGenotypes = length(observedGenotypes)
N = sum(observedGenotypes)
observed_genotype_frequencies <- observedGenotypes/N
fA_observed = (2*observedGenotypes[1] + observedGenotypes[2])/(2*N)
fa_observed = 1 - fA_observed
if(fA_observed == 0) return(1)
if(fa_observed == 0) return(1)
expected_genotype_frequences = c(fA_observed^2, 2*fA_observed*fa_observed, fa_observed^2)
expected_genotypes = expected_genotype_frequences*N
chi_value <- chisq(observed = observedGenotypes, expected = expected_genotypes)
degfree <- (num_alleles*(num_alleles -1))/2
pval <- pchisq(chi_value, degfree, lower.tail = FALSE)
}
answer<-HWE_1to2alleles(c(20, 50, 30)); answer
answer<-HWE_1to2alleles(c(20, 40, 30)); answer
answer<-HWE_1to2alleles(c(0, 0, 30)); answer