\documentclass[11pt]{article}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{multicol}
\usepackage{latexsym}
\usepackage[pdftex]{graphicx}
\usepackage{enumitem}
\usepackage[table]{xcolor}
\usepackage{url}
\usepackage{mathdots}
\usepackage{mathrsfs}
\usepackage[all,arc]{xy}
\usepackage{hyperref}
\hypersetup{colorlinks=true}
\setlength{\oddsidemargin}{.25in}
\setlength{\evensidemargin}{.25in}
\setlength{\textwidth}{6in}
\setlength{\topmargin}{-0.4in}
\setlength{\textheight}{8.5in}
\input{preamble.tex}
\begin{document}
\lecture{10: Everything Else}{Week 10}
This is the tenth week of the Mathematics Subject Test GRE prep course; here, we quickly review a handful of useful concepts from the fields of \textbf{probability}, \textbf{combinatorics}, and \textbf{set theory}!
As always, each of these fields is something you could spend years studying; we present here a few very small slices of each topic that are particularly key to each.
\section{A Crash Course in Set Theory}
Sets are, in a sense, what mathematics are founded on. For the GRE, much of the intricate parts of set theory (the axioms of set theory, the axiom of choice, etc.) aren't particularly relevant; instead, most of what we need to review is simply the \textbf{language} and \textbf{notation} for set theory.
\begin{defn}
A set $S$ is simply any collection of elements. We denote a set $S$ by its elements, which we enclose in a set of curly braces. For example, the set of female characters in \textbf{Measure for Measure} is
\begin{align*}
\{ \textrm{Isabella, Mariana, Juliet, Francisca}\}.
\end{align*}
Another way to describe a set is not by listing its elements, but by listing its properties. For example, the even integers greater than $\pi$ can be described as follows:
\begin{align*}
\{ x ~|~ \pi < x, x \in \mathbb{Z}, 2 \textrm{ divides } x\}.
\end{align*}
\end{defn}
\begin{defn}
If we have two sets $A, B$, we write $A \subseteq B$ if every element of $A$ is an element of $B$. For example, $\mathbb{Z} \subseteq \mathbb{R}$. \end{defn}
\begin{defn}
A specific set that we often care about is the \textbf{empty set}, $\emptyset$ i.e.\ the set containing no elements. One particular quirk of the empty set is that any statement of the form $\forall x \in \emptyset \ldots $ will always be vacuously true, as it is impossible to disprove (as we disprove $\forall$ claims by using $\exists$ quantifiers!) For example, the statement ``every element of the empty set is delicious'' is true. Dumb, but true!
Some other frequently-occurring sets are the \textbf{open} intervals $(a,b) = \{x ~|~ x \in \mathbb{R}, a < x < b\}$ and \textbf{closed} intervals $[a,b] = \{x ~|~ x \in \mathbb{R}, a \leq x \leq b\}$.
\end{defn}
\begin{defn}
Given two sets $A, B$, we can form several other particularly useful sets:
\begin{itemize}
\item The \textbf{difference} of $A$ and $B$, denoted $A - B$ or $A \setminus B$. This is the set $\{x ~|~ x \in A, x \notin B\}$.
\item The \textbf{intersection} of $A$ and $B$, denoted $A \cap B$, is the set $\{x ~|~ x \in A \textrm{ and } x \in B\}$.
\item The \textbf{union} of $A$ and $B$, denoted $A \cup B$, is the set $\{x ~|~ x \in A \textrm{ or } x \in B, \textrm{ or possibly both.}\}$.
\item The \textbf{cartesian product} of $A$ and $B$, denoted $A \times B$, is the set of all ordered pairs of the form $(a,b)$; that is, $\{(a,b) ~|~ a \in A, b \in B\}$.
\item Sometimes, we will have some larger set $B$ (like $\mathbb{R}$) out of which we will be picking some subset $A$ (like $[0,1]$.) In this case, we can form the \textbf{complement} of $A$ with respect to $B$, namely $A^c = \{b \in B ~|~ b \notin A\}$.
\end{itemize}
\end{defn}
With the concept of a set defined, we can define functions as well:
\begin{defn}
A \textbf{function} $f$ with domain $A$ and codomain $B$, formally speaking, is a collection of pairs $(a,b)$, with $a \in A$ and $ b \in B,$ such that there is exactly one pair $(a,b)$ for every $a \in A$. More informally, a function $f: A \to B$ is just a map which takes each element in $A$ to some element of $B$.
\end{defn}
\begin{exmps}~~\\
\begin{itemize}
\item $f: \mathbb{Z} \to \mathbb{N}$ given by $f(n) = 2|n| + 1$ is a function.
\item $g: \mathbb{N} \to \mathbb{N}$ given by $g(n) = 2|n| + 1$ is also a function. It is in fact a different function than $f$, because it has a different domain!
\item The function $h$ depicted below by the three arrows is a function, with domain $\{1, \lambda, \varphi\}$ and codomain $\{ 24, \gamma, \textrm{ Batman}\}:$
\begin{center} \leavevmode
\xymatrix{
1 \ar[rd] & 24 \\
\lambda \ar[ru] & \gamma \\
\varphi \ar@/^/[ruu] & \textrm{Batman}\\
}
\end{center}
\end{itemize}
This may seem like a silly example, but it's illustrative of one key concept: functions are just \textbf{maps between sets!} Often, people fall into the trap of assuming that functions have to have some nice ``closed form'' like $x^3 - \sin(x)$ or something, but that's not true! Often, functions are either defined piecewise, or have special cases, or are generally fairly ugly/awful things; in these cases, the best way to think of them is just as a collection of arrows from one set to another, like we just did above.
\end{exmps}
Functions have several convenient properties:
\begin{defn}
We call a function $f$ \textbf{injective} if it never hits the same point twice -- i.e. for every $b \in B$, there is
\textbf{at most one} $a \in A$ such that $f(a) = b$.
\end{defn}
\begin{exmp}
The function $h$ from before is not injective, as it sends both $\lambda$ and $\varphi$ to 24:
\begin{center} \leavevmode
\xymatrix{
1 \ar[rd] & 24 \\
\lambda \ar[ru] & \gamma \\
\varphi \ar@/^/[ruu] & \textrm{Batman}\\
}
\end{center}
However, if we add a new element $\pi$ to our codomain, and make $\varphi$ map to $\pi$, our function is now injective, as no two elements in the domain are sent to the same place:
\begin{center} \leavevmode
\xymatrix{
1 \ar[rd] & 24 \\
\lambda \ar[ru] & \gamma \\
\varphi \ar@/_/[rd] & \textrm{Batman}\\
& \pi \\
}
\end{center}
\end{exmp}
\begin{defn}
We call a function $f$ \textbf{surjective} if it hits every single point in its codomain -- i.e. if for every $b \in B$, there is \textbf{at least one} $a \in A$ such that $f(a) = b$.
Alternately: define the \textbf{image} of a function as the collection of all points that it maps to. That is, for a function $f: A \to B$, define the image of $f$, denoted $f(A)$, as the set $\{b \in B ~|~\exists a \in A \textrm{ such that }f(a)=b\}$.
Then a surjective function is any map whose image is equal to its codomain: i.e. $f:A \to B$ is surjective if and only if $f(A) = B$.
\end{defn}
\begin{exmp}
The function $h$ from before is not surjective, as it doesn't send anything to Batman:
\begin{center} \leavevmode
\xymatrix{
1 \ar[rd] & 24 \\
\lambda \ar[ru] & \gamma \\
\varphi \ar@/^/[ruu] & \textrm{Batman}\\
}
\end{center}
However, if we add a new element $\rho$ to our domain, and make $\rho$ map to Batman, our function is now surjective, as it hits all of the elements in its codomain:
\begin{center} \leavevmode
\xymatrix{
1 \ar[rd] & 24 \\
\lambda \ar[ru] & \gamma \\
\varphi \ar@/^/[ruu] & \textrm{Batman}\\
\rho \ar@/_/[ru] & \\
}
\end{center}
\end{exmp}
\begin{defn}
A function is called \textbf{bijective} if it is injective and surjective.
\end{defn}
\begin{defn}
We say that two sets $A, B$ are the same size (formally, we say that they are of the same \textbf{cardinality},) and write $|A| = |B|$, if and only if there is a bijection $f: A \to B$.
\end{defn}
Not all sets are the same size:
\begin{obs}
If $A, B$ are a pair of finite sets that contain different numbers of elements, then $|A| \neq |B|$.
If $A$ is a finite set and $B$ is infinite, then $|A| \neq |B|$.
If $A$ is an infinite set such that there is a bijection $A \to \mathbb{N}$, call $A$ countable. If $A$ is countable and $B$ is a set that has a bijection to $\mathbb{R}$, then $|A| \neq |B|$.
\end{obs}
We can use this notion of size to make some more definitions:
\begin{defn}
We say that $|A| \leq |B|$ if and only if there is an injection $f: A \to B$. Similarly, we say that $|A| \geq |B|$ if and only if there is a surjection $f: A \to B$.
\end{defn}
This motivates the following theorem:
\begin{thm}
(Cantor-Schr\"oder-Bernstein): Suppose that $A, B$ are two sets such that there are injective functions $f: A \to B, g: B\to A$. Then $|A| = |B|$; i.e. there is some bijection $h: A \to B$.
\end{thm}
\section{A Crash Course in Combinatorics}
Combinatorics, very loosely speaking, is the art of how to \textbf{count things}. For the GRE, a handful of fairly simple techniques will come in handy:
\begin{itemize}
\item
(\textbf{Multiplication principle.}) Suppose that you have a set $A$, each element $\vec{a}$ of which can be broken up into $n$ ordered pieces $(a_1, \ldots a_n)$. Suppose furthermore that the $i$-th piece has $k_i$ total possible states for each $i$, and that our choices for the $i$-th stage do not interact with our choices for any other stage. Then there are
\begin{align*}
k_1 \cdot k_2 \cdot\ldots \cdot k_n = \prod_{i=1}^n k_i
\end{align*}
total elements in $A$.
To give an example, consider the following problem:
\begin{problem}
Suppose that we have $n$ friends and $k$ different kinds of postcards (with arbitrarily many postcards of each kind.) In how many ways can we mail out all of our postcards to our friends?
\end{problem}
A valid ``way'' to mail postcards to friends is some way to assign each friend to a postcard, so that each friend is assigned to at least one postcard (because we're mailing each of our friends a postcard) and no friend is assigned to two different postcards at the same time. In other words, a ``way'' to mail postcards is just a function from the set\footnote{Some useful notation: $[n]$ denotes the collection of all integers from 1 to $n$, i.e.\ $\{1,2,\ldots n\}$.} $[n] = \{1,2,3,\ldots n\}$ of friends to our set $[k] = \{1,2,3,\ldots k\}$ of kinds of postcards!
In other words, we want to find the size of the following set:
\begin{align*}
A = \Big\{\textrm{all of the functions that map }[n] \textrm{ to }[k]\Big\}.
\end{align*}
We can do this! Think about how any function $f: [n] \to [k]$ is constructed. For each value in $[n] = \{1,2,\ldots n\}$, we have to pick exactly one value from $[k]$. Doing this for each value in $[n]$ completely determines our function; furthermore, any two functions $f,g$ are different if and only if there is some value $m \in [n]$ at which we made a different choice (i.e. where $f(m) \neq g(m)$.)
\begin{align*}
\underbrace{\fbox{$k$ choices} \cdot \fbox{$k$ choices} \cdot \ldots \cdot \fbox{$k$ choices} }_{n \textrm{ total slots}}
\end{align*}
Consequently, we have
\begin{align*}
\underbrace{k \cdot k \cdot \ldots \cdot k}_{n} = k^n
\end{align*}
total ways in which we can construct distinct functions. This gives us the answer $k^n$ to our problem!
\item (\textbf{Summation principle}.) Suppose that you have a set $A$ that you can write as the \textbf{union}\footnote{Given two sets $A, B$, we denote their \textbf{union}, $A \cup B$, as the set containing all of the elements in either $A$ or $B$, or both. For example, $\{2\} \cup \{$lemur$\} = \{2, $ lemur$\}$, while $\{1,\alpha\} \cup \{ \alpha, $lemur$\} = \{1, \alpha, $ lemur$\}$.} of several smaller disjoint\footnote{Sets are called \textbf{disjoint} if they have no elements in common. For example, $\{2\}$ and $\{$lemur$\}$ are disjoint, while $\{1,\alpha\}$ and $\{ \alpha, $lemur$\}$ are not disjoint.} sets $A_1, \ldots A_n$.
Then the number of elements in $A$ is just the summed number of elements in the $A_i$ sets. If we let $|S|$ denote the number of elements in a set $S$, then we can express this in a formula:
\begin{align*}
|A| = |A_1| + |A_2| + \ldots + |A_n|.
\end{align*}
We work one simple example:
\begin{question}
Pizzas! Specifically, suppose Pizza My Heart (a local chain/great pizza place) has the following deal on pizzas: for \$7, you can get a pizza with any two different vegetable toppings, or any one meat topping. There are $m$ meat choices and $v$ vegetable choices. As well, with any pizza you can pick one of $c$ cheese choices.
How many different kinds of pizza are covered by this sale?
\end{question}
Using the summation principle, we can break our pizzas into two types: pizzas with one meat topping, or pizzas with two vegetable toppings.
For the meat pizzas, we have $m \cdot c$ possible pizzas, by the multiplication principle (we pick one of $m$ meats and one of $c$ cheeses.)
For the vegetable pizzas, we have $\binom{v}{2} \cdot c$ possible pizzas (we pick two different vegetables out of $v$ vegetable choices, and the order doesn't matter in which we choose them; we also choose one of $c$ cheeses.)
Therefore, in total, we have $c \cdot \left(m + \binom{v}{2}\right)$ possible pizzas!
\item (\textbf{Double-counting principle}.) Suppose that you have a set $A$, and two different expressions that count the number of elements in $A$. Then those two expressions are equal.
Again, we work a simple example:
\begin{question}
Without using induction, prove the following equality:
\begin{align*}
\sum_{i=1}^n i = \frac{n(n+1)}{2}
\end{align*}
\end{question}
First, make a $(n+1) \times (n+1)$ grid of dots:
\begin{center}
\includegraphics[width=2in]{gridofdots.pdf}
\end{center}
How many dots are in this grid? On one hand, the answer is easy to calculate: it's $(n+1)\cdot(n+1) = n^2 + 2n + 1$.
On the other hand, suppose that we group dots by the following diagonal lines:
\begin{center}
\includegraphics[width=2in]{gridofdots2.pdf}
\end{center}
The number of dots in the top-left line is just one; the number in the line directly beneath that line is two, the number directly beneath that line is three, and so on/so forth until we get to the line containing the bottom-left and top-right corners, which contains $n+1$ dots. From there, as we keep moving right, our lines go down by one in size each time until we get to the line containing only the bottom-right corner, which again has just one point.
So, if we use the summation principle, we have that there are
\begin{align*}
1+2+3+\ldots + (n-1) + n + (n+1) + n + (n-1) + \ldots + 3 + 2 + 1
\end{align*}
points in total.
Therefore, by our double-counting principle, we have just shown that
\begin{align*}
n^2 + 2n + 1 = 1+2+3+\ldots + (n-1) + n + (n+1) + n + (n-1) + \ldots + 3 + 2 + 1.
\end{align*}
Rearranging the right-hand side using summation notation lets us express this as
\begin{align*}
n^2 + 2n + 1 = (n+1) + 2\sum_{i=1}^n i;
\end{align*}
subtracting $n+1$ from both sides and dividing by 2 gives us finally
\begin{align*}
\frac{n^2 + n}{2} = \sum_{i=1}^n i,
\end{align*}
which is our claim!
\item (Pigeonhole principle, simple version): Suppose that $kn+1$ pigeons are placed into $n$ pigeonholes. Then some hole has at least $k+1$ pigeons in it. (In general, replace ``pigeons'' and ``pigeonholes'' with any collection of objects that you're placing in various buckets.)
We look at an example:
\begin{question}
Suppose that ``friendship'' is\footnote{Magic!} a symmetric relation: i.e. that whenever a person $A$ is friends with a person $B$, $B$ is also friends with $A$. Also, suppose that you are never friends with yourself\footnote{Just for this problem. Be friends with yourself in real life.} (i.e. that friendship is antireflexive.)
Then, in any set $S$ of at least two people, there are at least two people with the same number of friends in $S$.
\end{question}
Let $|S| = n$. Then every person in $S$ has between $0$ and $n-1$ friends in $S$. Also notice that we can never simultaneously have one person with 0 friends and one person with $n-1$ friends at the same time, because if someone has $n-1$ friends in $S$, they must be friends with everyone besides themselves.
Therefore, each person has at most $n-1$ possible numbers of friends, and there are $n$ people total: by the pigeonhole principle, if we think of people as the ``pigeons'' and group them by their numbers of friends (i.e.\ the ``pigeonholes'' are this grouping by numbers of friends,) there must be some pair of people whose friendship numbers are equal.
\end{itemize}
Some sorts of sets are very frequently counted:
\begin{itemize}
\item If we have a set of $n$ objects, there are $n! = n \cdot (n-1) \cdot \ldots \cdot 1$ many ways to \textbf{order} this set. For example, the set $\{a,b,c\}$ has $3! = 6$ orderings:
\begin{align*}
abc, acb, bac, bca, cab, cba.
\end{align*}
\item Suppose that we have a set of $n$ objects, and we want to pick $k$ of them without repetition in order. Then there are $n \cdot (n-1) \cdot \ldots \cdot (n-(k-1))$ many ways to choose them: we have $n$ choices for the first, $n-1$ for the second, and so on/so forth until our $k$-th choice (for which we have $n- (k-1)$ choices.) We can alternately express this as $\frac{n!}{(n-k)!}$; you can see this algebraically by dividing $n!$ by $(n-k)!$, or conceptually by thinking of our choice process as actually ordering all $n$ elements (the $n!$ in our fraction) and then forgetting about the ordering on all of the elements after the first $k$, as we didn't pick them (this divides by $(n-k)!$.)
\item Suppose that we have a set of $n$ objects, and we want to pick $k$ of them without repetition and without caring about the order in which we pick these $k$ elements. Then there are $\frac{n!}{k!(n-k)!}$ many ways for this to happen. We denote this number as the \textbf{binomial coefficient} $\binom{n}{k}$.
\item Finally, suppose that we have a set of $n$ objects, and we want to pick $k$ of them, where we can pick an element multiple times (i.e.\ with repetition.) Then there are $n^k$ many ways to do this, by our multiplication principle from before.
\end{itemize}
\section{A Crash Course in Probability}
We give the basics of probability here.
\begin{defn}
A \textbf{finite probability space} consists of two things:
\begin{itemize}
\item A finite set $\Omega$.
\item A \textbf{measure} $Pr$ on $\Omega$, such that $Pr(\Omega) = 1$. In case you haven't seen this before, saying that $Pr$ is a measure is a way of saying that $Pr$ is a function $\mathcal{P}(\Omega) \to \mathbb{R}^+$, such that the following properties are satisfied:
\begin{itemize}
\item $Pr(\emptyset) = 0$.
\item For any collection $\{X_i\}_{i=1}^\infty$ of subsets of $\Omega$, $\displaystyle Pr\left(\bigcup_{i=1}^\infty X_i\right) \leq \sum_{i=1}^\infty Pr(X_i)$.
\item For any collection $\{X_i\}_{i=1}^\infty$ of \textbf{disjoint} subsets of $\Omega$, $\displaystyle Pr\left(\bigcup_{i=1}^\infty X_i\right) = \sum_{i=1}^\infty Pr(X_i)$.
\end{itemize}
\end{itemize}
For a \textbf{general} probability space, i.e.\ one that may not be finite, the definition is almost completely the same: the only difference is that $\Omega$ is not restricted to be finite, while $Pr$ becomes a function defined only on the ``measurable'' subsets of $\Omega$. (For the GRE, you can probably assume that any set you run into is ``measurable.'' There are some pathological constructions in set theory that can be nonmeasurable; talk to me to learn more about these!)
\end{defn}
For example, one probability distribution on $\Omega = \{1,2,3,4,5,6\}$ could be the distribution that believes that $Pr(\{i\}) = 1/6$ for each individual $i$, and more generally that $Pr(S) = |S|/6$ for any subset $S$ of $\Omega$. In this sense, this probability distribution is capturing the idea of rolling a fair six-sided die, and seeing what comes up.
This sort of ``fair'' distribution has a name: namely, the \textbf{uniform} distribution!
\begin{defn}
The \textbf{uniform} distribution on a finite space $\Omega$ is the probability space that assigns the measure $|S|/|\Omega|$ to every subset $S$ of $\Omega$. In a sense, this measure thinks that any two elements in $\Omega$ are ``equally likely;'' think about why this is true!
\end{defn}
We have some useful notation and language for working with probability spaces:
\begin{defn}
An \textbf{event} $S$ is just any subset of a probability space. For example, in the six-sided die probability distribution discussed earlier, the set $\{2,4,6\}$ is an event; you can think of this as the event where our die comes up as an even number. The probability of an event $S$ occurring is just $Pr(S)$; i.e.\ the probability that our die when rolled is even is just $Pr(\{2,4,6\}) = 3/6 = 1/2$, as expected.
Notice that by definition, as $Pr$ is a measure, for any two events $A, B$, we always have $Pr(A \cup B) \leq Pr(A) + Pr(B)$. In other words, given two events $A, B$, the probability of either A or B happening (or both!) is at most the probability that $A$ happens, plus the probability that $B$ happens.
\end{defn}
\begin{defn}
A real-valued \textbf{random variable} $X$ on a probability space $\Omega$ is simply any function $\Omega \to \mathbb{R}$.
Given any random variable $X$, we can talk about the \textbf{expected value} of $X$; that is, the ``average value'' of $X$ on $\Omega$, where we use $Pr$ to give ourselves a good notion of what ``average'' should mean. Formally, we define this as the following sum:
\begin{align*}
\sum_{\omega \in \Omega} Pr(\omega) \cdot X(\omega).
\end{align*}
For example, consider our six-sided die probability space again, and the random variable $X$ defined by $X(i) = i$ (in other words, $X$ is the random variable that outputs the top face of the die when we roll it.)
The expected value of $X$ would be
\begin{align*}
\sum_{\omega \in \Omega} Pr(\omega) \cdot X(\omega) = \frac{1}{6} \cdot 1 + \frac{1}{6} \cdot 2 + \frac{1}{6} \cdot 3 + \frac{1}{6} \cdot 4 + \frac{1}{6} \cdot 5 + \frac{1}{6} \cdot 6 = \frac{21}{6} = \frac72.
\end{align*}
In other words, rolling a fair six-sided die once yields an average face value of 3.5.
\end{defn}
\begin{defn}
Given a random variable $X$, if $\mu = \mathbb{E}(X)$, the \textbf{variance} $\sigma^2(X)$ of $X$ is just $\mathbb{E}((X - \mu)^2)$. This can also be expressed as $\mathbb{E}(X^2) - (\mathbb{E}(X))^2$, via some simple algebraic manipulations.
The \textbf{standard deviation} of $X$, $\sigma(X)$, is just the square root of the variance.
\end{defn}
\begin{defn}
Given a random variable $X: \Omega \to \mathbb{R}$ on a probability space $\Omega$, we can define the \textbf{(cumulative) distribution function} for $X$, denoted $F_X(t)$, as
\begin{align*}
F_X(t) = Pr( X(\omega) \leq t).
\end{align*}
Given this function, we can define the \textbf{probability density function} $f_X(t)$ as $\frac{d}{dt}F_X(t)$. Notice that for any $a,b$, we have
\begin{align*}
Pr(a < X(\omega) \leq b) = \int_a^b f_X(t)dt.
\end{align*}
A random variable has a \textbf{uniform distribution} if its probability density function is a constant; this expresses the idea that uniformly-distributed things ``don't care'' about the differences between elements of our probability space.
A random variable $X$ with standard deviation $\sigma$, expectation $\mu$ has a \textbf{normal distribution} if its probability density function has the form
\begin{align*}
f_X(t) = \frac{1}{\sigma\sqrt{2\pi}} \cdot e^{-(t-\mu)^2/2\sigma^2}.
\end{align*}
This generates the standard ``bell-curve'' picture that you've seen in tons of different settings. One useful observation about normally-distributed events is that about $68\%$ of the events occur within one standard deviation of the mean (i.e. $Pr(\mu - \sigma < X \leq \mu + \sigma) \approx .68$), about $95\%$ of events occur within two standard deviations of the mean, and about $99.7\%$ of events occur within three standard deviations of the mean.
\end{defn}
\begin{defn}
For any two events $A, B$ that occur with nonzero probability, define $Pr(A$ given $B)$, denoted $Pr(A|B)$, as the likelihood that $A$ happens given that $B$ happens as well. Mathematically, we define this as follows:
\begin{align*}
Pr(A|B) = \frac{Pr(A \cap B)}{Pr(B)}.
\end{align*}
In other words, we are taking as our probability space all of the events for which $B$ happens, and measuring how many of them also have $A$ happen.
\end{defn}
\begin{defn}
Take any two events $A, B$ that occur with nonzero probability. We say that $A$ and $B$ are \textbf{independent} if knowledge about $A$ is useless in determining knowledge about $B$. Mathematically, we can express this as follows:
\begin{align*}
Pr(A) = Pr(A |B).
\end{align*}
Notice that this is equivalent to asking that
\begin{align*}
Pr(A) \cdot Pr(B) = Pr(A \cap B).
\end{align*}
\end{defn}
\begin{defn}
Take any $n$ events $A_1, A_2, \ldots A_n$ that each occur with nonzero probability. We say that these $n$ events are are \textbf{mutually independent} if knowledge about any of these $A_i$ events is useless in determining knowledge about any other $A_j$. Mathematically, we can express this as follows: for any $i_1, \ldots i_k$ and $j \neq i_1, \ldots i_k$, we have
\begin{align*}
Pr(A_j) = Pr(A_j |A_{i_1} \cap \ldots \cap A_{i_k}).
\end{align*}
\end{defn}
It is not hard to prove the following results:
\begin{thm}
A collection of $n$ events $A_1, A_2, \ldots A_n$ are mutually independent if and only if for any distinct $i_1, \ldots i_k \subset \{1, \ldots n\}$, we have
\begin{align*}
Pr(A_{i_1} \cap \ldots \cap A_{i_k}) = \prod_{j=1}^k Pr(A_{i_j}).
\end{align*}
\end{thm}
\begin{thm}
Given any event $A$ in a probability space $\Omega$, let $A^c = \{\omega \in \Omega ~|~ \omega \notin A\}$ denote the \textbf{complement} of $A$.
A collection of $n$ events $A_1, A_2, \ldots A_n$ are mutually independent if and only if their complements $\{A_1^c, \ldots A_n^c\}$ are mutually independent.
\end{thm}
It is also useful to note the following non-result:
\begin{notthm}
Pairwise independence does not imply independence! In other words, it is possible for a collection of events $A_1, \ldots A_n$ to all be \textbf{pairwise independent} (i.e.\ $Pr(A_i \cap A_j) = Pr(A_i)Pr(A_j)$ for any $i, j$) but not mutually independent!
\end{notthm}
\begin{exmp}
There are many, many examples. One of the simplest is the following: consider the probability space generated by rolling two fair six-sided dice, where any pair $(i,j)$ of faces comes up with probability $1/36$.
Consider the following three events:
\begin{itemize}
\item $A$, the event that the first die comes up even.
\item $B$, the event that the second die comes up even.
\item $C$, the event that the sum of the two dice is odd.
\end{itemize}
Each of these events clearly has probability $1/2$. Moreover, the probability of $A \cap B, A \cap C$ and $B \cap C$ are all clearly $1/4$; in the first case we are asking that both dice come up even, in the second we are asking for (even, odd) and in the third asking for (odd, even), all of which happen $1/4$ of the time. So these events are pairwise independent, as the probability that any two happen is just the products of their individual probabilities.
However, $A \cap B \cap C$ is impossible: if $A \cap B$ holds, then both dice are even, and so the sum of our two dice is even! So $Pr(A \cap B \cap C) = 0 \neq Pr(A)Pr(B)Pr(C) = 1/8$, and therefore these events are not mutually independent.
\end{exmp}
\section{Example GRE problems}
We work a few GRE problems to give a feel for how these concepts can come up:
\begin{problem}
Which of the following five sets has the greatest cardinality?
\begin{enumerate}[label=(\alph*)]
\item $\mathbb{R}$.
\item The set of all functions $\mathbb{Z} \to \mathbb{Z}$.
\item The set of all functions $\mathbb{R} \to \{ 0, 1\}$.
\item The set of all finite subsets of $\mathbb{R}$.
\item The set of all polynomials with coefficients in $\mathbb{R}$.
\end{enumerate}
\end{problem}
\begin{answer}
\end{answer}
\begin{problem}
For how many values of $k$ does the number $k!$, when written in decimal notation, end with exactly 99 zeroes?
\begin{enumerate}[label=(\alph*)]
\item None.
\item One.
\item Four.
\item Five.
\item Twenty-four.
\end{enumerate}
\end{problem}
\begin{answer}
\end{answer}
\begin{problem}
A fair coin is tossed 100 times, with each toss resulting in either a head or a tail. Let $H$ denote the number of heads, and $T$ the number of tails. Which of the following five events has the greatest probability?
\begin{enumerate}[label=(\alph*)]
\item $H = 50$.
\item $T \geq 60$.
\item $51 \leq H \leq 55$.
\item $H \geq 48$ and $T \geq 48$.
\item $H \leq 5$ or $H \geq 95$.
\end{enumerate}
\end{problem}
\begin{answer}
\end{answer}
\end{document}