############################################################################## # A Note on using R scripts: Start R, select "File > Open script..." from # # the pull-down menus, and browse to this file. You can now place the cursor # # in a line and press Ctrl-R to run the line. (Or select "Edit > Run line # # or selection" from the pull-down menus, or right-click and select "Run # # line or selection" from the context menu.) # ############################################################################## # ESTIMATING THE POPULATION MEAN FROM A SAMPLE # This script accompanies the wcsmalaysia.org web page on population # parameters and sample statistics. It uses the same simulated squirrel # data. Run the following line to open the web page in your browser: shell.exec("http://www.wcsmalaysia.org/stats/Squirrels.htm") # Run these lines to create the object 'squirrels' containing # the whole population: squirrels <- c(809, 900, 1177, 922, 1013, 1208, 760, 878, 1176, 850, 858, 1055, 825, 1167, 954, 819, 999, 1069, 875, 866, 1067, 1073, 631, 964, 1027, 860, 826, 946, 1238, 1043, 1198, 916, 1132, 886, 1036, 993, 806, 975, 1004, 1000, 1125, 794, 801, 886, 1007, 951, 998, 940, 1143, 1047, 1321, 746, 932, 905, 1155, 988, 1016, 1020, 1261, 594, 867, 911, 984, 943, 992, 914, 948, 882, 1134, 1385, 1203, 1016, 1238, 1244, 1025, 1147, 1270, 957, 1185, 1137, 1055, 839, 1100, 979, 830, 1147, 1007, 1107, 897, 1020, 1015, 1192, 1004, 1026, 1018, 1212, 973, 862, 1122, 930) length(squirrels) range(squirrels) # ESTIMATING THE MEAN # The true mean is: (mu <- mean(squirrels)) # We use the function 'sample' to draw a random sample from 'squirrels': (x <- sample(squirrels, 4)) # Run this line several times and you will get a different sample each time # The mean of the sample is: (x.bar <- mean(x)) # Now we'll use a loop to draw 500 different samples and get the mean of each, # which we'll store in the vector 'x.bar': x.bar <- NULL for(i in 1:500) { x <- sample(squirrels, 4) x.bar[i] <- mean(x) } ############################################################################# # An explanation of loops: the command "for(i in 1:500) {...}" tells R to # # begin with i = 1 and then do everything between the {curly braces}; so a # # sample is drawn and 'x.bar[1]' gets the mean. Then R set i = 2 and # # does {...} again; a new sample is drawn and 'x.bar[2]' gets the mean of # # this one. This continues for i = 3, 4, ... up 500. # ############################################################################# # Look at the first half-dozen items in 'x.bar': head(x.bar) # A better way to look at 'x.bar' is with a histogram: hist(x.bar) # Add the true population mean and the mean of 'x.bar' to the histogram: abline(v=mu, col='red') # the true value for the population abline(v=mean(x.bar), col='blue', lty=2) # the mean of sample means # (Look carefully as the lines may be on top of each other, in which case the # resulting red/blue line will appear to be purple.) # Individual sample means can be a long way from the population mean, # but for a large number of samples, the mean of means is close. # Note the range along the x axis, then try it again with n = 8 instead of 4. # Do the means of the samples with n = 8 appear to be more or less scattered # than for n = 4? # ESTIMATING SCATTER # First we'll make a little function of our own to calculate SD; this is # equivalent to the columns in the spreadsheet described in the web page: sdev <- function(x, df) { deviation <- x-mean(x) sum.sq <- sum(deviation^2) mean.sq <- sum.sq/df sqrt(mean.sq) } # Use it to get the SD for the population, with all 100 squirrels: (sigma <- sdev(squirrels, 100)) # Should be 145.7379 # Get the root mean square deviation for a sample with n = 4, # dividing the sum of squared deviations by 4: (x <- sample(squirrels, 4)) sdev(x, 4) # Run it a few times to make sure it works # Now try it for 500 samples of 4: rmsd <- NULL for(i in 1:500) { x <- sample(squirrels, 4) rmsd[i] <- sdev(x, 4) } hist(rmsd) abline(v=sigma, col='red') # The true value for the population abline(v=mean(rmsd), col='blue', lty=2) # The value from the samples # The estimate from the samples is too small: it's biased low. # How big is the difference: sigma - mean(rmsd) # Now use 'mu' instead of 'x.bar' to calculate a "true" deviation; # this time we can't use our 'sdev' function but must write out the steps: td <- NULL for(i in 1:500) { x <- sample(squirrels, 4) tdev <- x - mu sum.sq <- sum(tdev^2) mean.sq <- sum.sq/4 td[i] <- sqrt(mean.sq) } hist(td) abline(v=sigma, col='red') abline(v=mean(td), col='blue', lty=2) # How big is the difference: sigma - mean(td) # This estimate is very close to the true value, sigma. # Unfortunately, we usually don't know 'mu', so we can't use this method. # Instead, we adjust the degrees of freedom. # Now try it again, using x.bar but dividing by n - 1 = 3 instead of 4 sd <- NULL for(i in 1:500) { x <- sample(squirrels, 4) sd[i] <- sdev(x, 3) } hist(sd) abline(v=sigma, col='red') # The true value for the population abline(v=mean(sd), col='blue', lty=2) sigma - mean(sd) # How does the mean of this compare with the true value, 'sigma'? # The sample standard deviation of the mean is calculated using n -1 degrees # of freedom, and this is also used by R's built-in 'sd' function: x <- sample(squirrels, 4) sdev(x, 3) sd(x) # Mike Meredith, updated 18 Sept 2007