\documentclass[11pt]{article}
\usepackage{latexsym}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
% \handout{#1}{#2}{#3}{#4}{#5}: typesets the framed header box at the top
% of the notes.  #2 = course instructor, #3 = left line (lecturer),
% #4 = right line (scribe), #5 = centered title.  #1 is not used in the
% body; it is consumed by the \lecture wrapper defined below.
\newcommand{\handout}[5]{
\noindent
\begin{center}
\framebox{
\vbox{
\hbox to 5.78in { {\bf Complexity Theory II } \hfill Course Instructor: #2 }
\vspace{4mm}
\hbox to 5.78in { {\Large \hfill #5 \hfill} }
\vspace{2mm}
\hbox to 5.78in { {\em #3 \hfill #4} }
}
}
\end{center}
\vspace*{4mm}
}
% \lecture{number/date}{instructor}{lecturer}{scribe}: convenience wrapper
% around \handout that prefixes the standard "Lecturer:"/"Scribe:" labels
% and builds the "Lecture #1" title.
\newcommand{\lecture}[4]{\handout{#1}{#2}{Lecturer: #3}{Scribe:
#4}{Lecture #1}}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newcommand{\N}{\mathbb{N}}
\begin{document}
\lecture{6 : Aug 23, 2006 }{V. Arvind}{V. Arvind}{Vipul Naik}
\section{Overview}
In the last lecture we ...
\section{Today's topic: Error correcting codes}
\subsection{Goal for the day}
In this lecture, we shall look at:
\begin{itemize}
\item The notion of communication across a {\em binary symmetric channel} in the presence
of {\em noise}
\item The use of {\em error-correcting codes} with a view to optimizing communication with
a good {\em rate} and good {\em capacity for correcting errors}
\item Upper bounds on how good an error-correcting code can be: the {\em Shannon capacity}
and the {\em Singleton bound}.
\item Lower bounds that guarantee the existence of ``good'' codes, in fact, that guarantee
that the {\em average} code is good: the {\em Gilbert bound} and the {\em Varshmov bound}.
\item Codes that can be encoded and decoded efficiently: the {\em Reed Solomon code}
and the {\em Berlekamp Welch decoder}
\end{itemize}
\subsection{Binary symmetric channel}
%draw picture of sender and receiver
A binary channel is a communications channel from a {\em sender}
to a {\em receiver} where the sender sends {\em binary data} (a stream of $0$s and $1$s)
to the receiver, and there is a likelihood that some of the bits may get flipped.
A binary {\em symmetric} channel is a binary channel where the bit flip probabilities
are:
\begin{itemize}
\item Same for flip from $0$ to $1$ and $1$ to $0$ (hence the term {\em symmetric})
\item Same for all bits (probability $p < 1/2$)
\item {\em Independent} for all bits. That is, the flipping of one bit in no way
affects the flipping of another bit.
\end{itemize}
In the BSC setup, the {\em expected number of bit flips} in the received message
is $p$ times the length of the message.
\subsection{Use of error-correcting codes}
Although the BSC specifically talks of a binary channel, we shall develop the theory
over a general finite alphabet.
The idea of error-correcting codes is to use the fact that if the total number of bit flips
(in general, changes in letters)
is less than a certain threshold, the original message should be recovered perfectly.
This is achieved by introducing {\em redundancy} into the original message.
Here's how it's done.
\begin{itemize}
\item The original piece of data, called the {\em message},
is what the sender is trying to convey. The collection of all possible messages
is the {\em message space} and the length of the message is termed
the {\em message length}.
\item The sender uses an {\em encoding algorithm} to convert the
message into a (somewhat larger) message called the {\em code} or
{\em encoded message}. This is what is sent across the channel. The
collection of all possible codes is termed the {\em code space}.
\item The channel flips some bits of the encoded message.
\item The receiver receives a {\em corrupted encoded message}. The receiver applies
a {\em decoding algorithm} to this corrupted message. If the message is completely
uncorrupted, the decoding algorithm must retrieve the original message. Hopefully,
if only a few of the letters of the encoded message have been changed, the decoding
algorithm still recovers the message accurately.
\end{itemize}
Three parameters used to measure the effectiveness of the encoding-decoding algorithms:
\begin{itemize}
\item Comparison between the size of the message space and the code
space. Ideally, the code space should not be much larger than the
message space. A reasonable thing to hope for is that the {\em code
length} is linear in the {\em message length}.
\item The maximum number of bit flips (letter changes) that can be tolerated
still giving the correct decoding.
\item The efficiency of the encoding and decoding algorithms, as a
function of the length of the original message.
\end{itemize}
We shall use the following symbols throughout the discussion of codes:
\begin{tabular}{ll}
$\Sigma$ & the alphabet\\
$k$ & message length\\
$\Sigma^k$ & message space\\
$n$ & code length \\
$\Sigma^n$ & code space\\
$C:\Sigma^k \to \Sigma^n$ & encoding algorithm\\
$D:\Sigma^n \to \Sigma^k$ & decoding algorithm
\end{tabular}
\subsection{Hamming distance}
The Hamming distance between two words in $\Sigma^l$ is the number of
coordinates where they differ. The Hamming distance between $x$ and
$y$ in $\Sigma^l$ is denoted as $d_H(x,y)$. Some properties of the
Hamming distance:
\begin{itemize}
\item The Hamming distance is a metric. That is, $d_H$ is symmetric, nonnegative, and satisfies
the triangle inequality.
\item When $\Sigma = F_q$ (a finite field)
the Hamming distance is translation-invariant on the vector space $F_q^n$.
That is, $d_H(x,y) = d_H(x+z,y+z)$ for all $z \in F_q^n$.
\end{itemize}
For binary codes, the Hamming distance between two words $x$ and $y$ measures the {\em minimum}
number of bit flips to go from $x$ to $y$. Thus, Hamming distance is a realistic
measure of distance via bit flips.
For a code, the minimum distance $d$ is defined as the minimum of distances between
pairs of codewords:
\begin{equation}
d = \min_{x \ne y \in \Sigma^k} d_H(C(x),C(y))
\end{equation}
\subsection{Error correcting/decoding radius}
The idea in coding theory as we shall study it is to replace the probabilistic BSC
model with a deterministic model:
\begin{itemize}
\item The sender takes the message $m \in \Sigma^k$ and uses $C$ to convert $m$ to a codeword
$c \in \Sigma^n$.
\item $c$ is transmitted over the channel. The channel has the property that it will alter
at most $e$ coordinates of $c'$.
\item The receiver obtains $c'$, with the guarantee that $d_H(c,c') \leq e$,
and is expected to recover $m$ using the decoding algorithm $D$.
\end{itemize}
Two easy observations:
\begin{observation}[error-correcting radius and minimum distance]\rm
Let $c=C(m)$. The encoding-decoding algorithm
is said to be capable of correcting up to $e$ errors
if $D(c') = m$ whenever $d_H(c',c) \leq e$. The maximum $e$
for which this holds is termed the {\em error-correcting radius}
or {\em decoding radius}
of the algorithm. Then, if $e$ is the error-correcting radius and $d$
is the minimum distance:
$$e \leq \frac{d-1}{2}$$
\end{observation}
\begin{proof}
The intuitive idea is that if $e$ is the error-correcting radius, then the closed
Hamming spheres of radius $e$ about the codewords must be pairwise disjoint. Let $x$ and $y$
be codewords such that $d_H(x,y) = d$. Then, $x$ and $y$ differ in exactly $d$ coordinates.
Suppose $2e \geq d$. Then, consider $z$ obtained by changing $e$ of the differing
coordinates from $x$ to $y$. Clearly, $d(x,z) = e$ and $d(y,z) = d - e \leq e$. Hence
$z$ lies in the Hamming balls of radius $e$ centered at $x$ and $y$, causing a contradiction.
Hence, the assumption $2e \geq d$ is flawed, giving the result.
\end{proof}
The converse is also true, albeit only at a theoretical level:
\begin{observation}
If $e \leq \frac{d-1}{2}$, then there exists a decoding algorithm that corrects
up to $e$ errors. However, we cannot say {\em a priori} whether this algorithm will
be polynomial time, or even efficient.
\end{observation}
\begin{proof}
If $e \leq (d-1)/2$, triangle inequality forces the Hamming balls about codewords
to be pairwise disjoint. Thus, whatever word the receiver gets lies in {\em at most one}
Hamming ball. The receiver can now apply the brute-force algorithm: compute distance
from every codeword, and find the codeword from which
the distance is not greater than the error-correcting radius.
\end{proof}
\subsection{Formal measures of goodness of codes}
We can now formally define two measures of the efficiency of the code:
\begin{itemize}
\item {\em Rate} $R$ of the code: This is the ratio of message length to code length. Using symbols as above,
$R = k/n$.
\item {\em Minimum distance} $d$ of the code: This is $\min_{x \ne y}
d_H(x,y)$ where $x$ and $y$ range over codewords.
\item {\em Relative minimum distance} $\delta$ of the code: This is
$d/n$ where $d$ is the minimum distance and $n$ is the code length.
\end{itemize}
The four challenges in front of us:
\begin{itemize}
\item Make $R$ as large as possible
\item Make $\delta$ as large as possible
\item Make $C$ as efficient as possible (in terms of time)
\item Make $D$ as efficient as possible (in terms of time),
and try to make sure that $D$ decodes up to the full
error-correcting radius.
\end{itemize}
A code over an alphabet of size $q$
with message length $k$, code length $n$ and minimum distance $\geq d$ is termed
an $(n,k,d)_q$ code.
\subsection{Families of codes}
A {\em family of codes} is a collection of codes parametrized by $i \in \N$,
wherein the $i^{th}$ code is a $(n_i,k_i,d_i)$ code. We can have two kinds of families of codes:
those with constant alphabet size, and those where the alphabet size also depends on $i$.
Our ultimate aim is to construct families of codes over a constant sized alphabet (in fact,
a binary alphabet) that are {\em good}. Further, we assume, for simplicity, that the $k_i$
are increasing.
A first consideration for families of codes is the relative spacing of valid message lengths.
Ideally, we would like to be able to encode and decode words of any length.
Thus, we would like to be able to {\em pad} any message to a message length $k_i$.
Because complexities of encoding and decoding algorithms are given as functions
of $k_i$, we want that the expansion in the message to reach a valid message length is by
a constant factor. Thus, we would like that the ratios $k_{i+1}/k_i$ are bounded by a constant.
The four measures of goodness of a code can be extended to give
measures of goodness for a family of codes:
\begin{itemize}
\item The rate $R$: We look at the limit (inferior) of the rates of all codes in the family.
\item The relative minimum distance $\delta$: We look at the limit
(inferior) of the relative minimum distance of all codes in the
family.
\item The encoding algorithm $C$: We measure the bound on the running
time of $C_i$ (the encoding algorithm for the $i^{th}$ code) as a
function of $k_i$ (the $i^{th}$ message length). Note that because
the ratios $k_{i+1}/k_i$ are constant, the running time of the
algorithm as a function of actual message length differs only by a
constant factor (if the running time is polynomial).
\item The decoding algorithm $D$: We measure the bound of the running time of $D_i$
(the decoding algorithm for the $i^{th}$ code) as a function of $n_i$ (the $i^{th}$ code length).
Note that if the rate $R$ is bounded below by a constant, this growth rate is roughly
the same as the growth rate with respect to $k_i$ (The uncoded message).
\end{itemize}
\subsection{Probabilistic model}
\begin{fact}\rm
A code with error-correcting radius $e$ for code length $n$ is ``good'' for binary symmetric channels
with bit flip probability $\leq ce/n$, where $c \leq 1$ is a suitable constant.
\end{fact}
The heuristic justification for the above: the {\em expected number of errors} in the received code word
is $pn$ where $p$ is the bit flip probability. Hence, if $pn \leq ce$, the expected number of errors is $ce$,
which is quite small with respect to $e$. By using Chernoff or Chebyshev bounds, and the nature of the probability
distribution, we can actually show that the probability that the number of errors is bounded above
by $e$ is quite high, depending on the choice of $c$. Moreover, the suitable choice of $c$ depends only on $p$
and not on $n$.
The {\em Shannon capacity} of a channel is the maximum possible value of
a probabilistic analogue of what we called the {\em rate}.
\begin{fact}[Shannon]\rm
The Shannon capacity of a binary symmetric channel with bit flip probability $p$
is $1 - H_2(p)$ where $H_2(p)$ is the Shannon entropy of the probability distribution of the
distribution $(p,1-p)$.
\end{fact}
\section{Bounds both ways}
\subsection{Singleton bound: an upper bound}
The Singleton bound (named after a person called Singleton)
establishes an {\em additive tradeoff} between the rate $R$ and
relative minimum distance of a code. It works for codes over all finite alphabets.
\begin{theorem}[Singleton bound]\rm
For a code with message length $k$, code length $n$ and minimum distance $d$:
$$d + k \leq n + 1$$
In particular, for a family of codes, as $n \to \infty$, we have:
$$\limsup_{n \to \infty} \, (R + \delta) \leq 1$$
\end{theorem}
\begin{proof}
Consider the map $\Sigma^n \to \Sigma^{k-1}$ obtained by projecting
onto the first $k-1$ coordinates. Composing this with the encoding
map, we obtain a map $\Sigma^k \to \Sigma^{k-1}$. The map cannot be
injective, hence there are two distinct messages whose corresponding
codewords are equal in the first $k-1$ positions. The Hamming
distance between these two codewords is bounded by $n - k + 1$,
giving that $d \leq n - k + 1$.
\end{proof}
{\em Prima facie}, the Singleton bound seems a very weak upper bound. Surprisingly, however,
it is possible to obtain a {\em lower bound} such that the gap between the two is not much.
Even more surprisingly, a code satisfying the lower bound can be found {\em randomly}.
\subsection{Volume of the Hamming ball}
We are interested in estimating the number of points in the Hamming ball of radius $d-1$
in $\Sigma^n$ where $|\Sigma| = q$.
For any $r$, the number of points of weight exactly equal to $r$ is:
$$ (q - 1)^r \binom{n}{r}$$
To find the number of points in the Hamming ball of radius $d - 1$, we need to sum up
for $0 \leq r \leq d-1$. If we denote this by $V_q(n,d)$, we have
$$V_q(n,d) = \sum_{r=0}^{d-1} (q-1)^r \binom{n}{r}$$
We want to find a good upper bound on $V_2(n,d)$. Here goes:
\begin{proposition}[Volume bound]\label{volumebound}\rm
$$V_2(n,d) \leq n2^{nH_2(\delta)}$$
\end{proposition}
\begin{proof}
\begin{eqnarray*}
V_2(n,d) & \leq & \sum_{r=0}^{d-1} \binom{n}{r}\\
\implies V_2(n,d) & \leq & n \binom{n}{d}\\
\implies V_2(n,d) & \leq & n \binom{n}{\delta n}\\
\implies V_2(n,d) & \leq & n \exp{n \log n - (\delta n) \log (\delta n) - ((1 - \delta) n) \log((1 - \delta) n)}\\
\implies V_2(n,d) & \leq & n \exp{-n (\delta \log \delta + (1 - \delta)\log(1 - \delta))}\\
\implies V_2(n,d) & \leq & n2^{nH_2(\delta)}
\end{eqnarray*}
\end{proof}
\subsection{Gilbert bound}
\begin{theorem}[Gilbert bound]\rm
For $d < n/2$, there is an $[n,k,d]_2$ code with $k \geq n (1 - H_2(\delta)) - \log n - c$, where $c > 0$ is a suitable positive constant.
\end{theorem}
\begin{proof}
A greedy algorithm produces the code successfully with high
probability. Let $C: \{ 0,1 \}^k \to \{ 0,1 \}^n$ be defined as
follows. Enumerate the elements of the message space as $x_1,x_2
\ldots$.
Traverse over the $x_i$ and to each $x_i$, assign a value $y_i = C(x_i)$ that satisfies
the constraints imposed by $y_j$ for $j < i$. In other words, at each step, pick
$y_i$ such that for all $j < i$:
$$d_H(y_i,C(x_j)) \geq d$$
To show that the greedy algorithm works, we need to show that {\em at each stage},
there exists at least one candidate for $y_i$. Equivalently, it suffices
to show that the union of Hamming balls of radius $(d-1)$ about the previous $y_i$s
does not cover the whole region. The volume bound (proposition \ref{volumebound})
tells us that:
\begin{eqnarray*}
\text{Volume of first $i-1$ Hamming balls} & \leq & \sum_{j=1}^{i-1} \text{Volume of $j^{th}$ Hamming ball}\\
\implies \text{Volume of first $i-1$ Hamming balls} & \leq & (i - 1) V_2(n,d)\\
\implies \text{Volume of first $i-1$ Hamming balls} & \leq & 2^k V_2(n,d)\\
\implies \text{Volume of first $i-1$ Hamming balls} & \leq & 2^k n2^{nH_2(\delta)}\\
\end{eqnarray*}
Putting $k \leq n(1 - H_2(\delta)) - \log_2(n) - c$ gives:
\begin{eqnarray*}
\text{Volume of first $i-1$ Hamming balls} & \leq & n2^{n - \log_2(n) - c}\\
\implies \text{Volume of first $i-1$ Hamming balls} & \leq & \frac{2^n}{2^c}
\end{eqnarray*}
The first $i-1$ balls cover at most a $2^{-c}$ fraction of the points.
Thus, there is always a candidate for $y_i$ outside the Hamming
balls. In fact, the probability that a randomly picked point lies
outside the Hamming balls is nonnegligible.
\end{proof}
\begin{observation}[Randomized algorithm also works]
Note that the above gives an algorithm in two senses:
\begin{itemize}
\item It gives a randomized polynomial-time
algorithm, because at each stage, the number of candidates
for $y_i$ is a nonnegligible fraction of the total number of words,
and the {\em checking} for each $y_i$ based on the past $y_j$s is a polynomial-time
subroutine.
\item It gives a deterministic algorithm, if at each stage,
we do a brute-force search (lexicographically) over the candidates, and always pick
the {\em least} one. The brute-force algorithm, however, may not (in fact,
will not) run in polynomial time.
\end{itemize}
\end{observation}
\subsection{Linear codes}
Let $\Sigma$ be identified with $F_q$, the finite field of $q$
elements. Then the alphabet $\Sigma^n$ can be treated as a vector
space $F_q^n$ with addition coordinatewise. The code is termed a {\em
linear code} if the encoding map $C: \Sigma^k \to \Sigma^n$ is a
linear map. Note that for linear codes, the code space is a
$k$-dimensional vector subspace of $\Sigma^n$. Since the Hamming
distance is translation-invariant, we have $d_H(x,y) = d_H(x-y,0)$
and hence the minimum distance is equal to the distance between $0$ and the closest
codeword.
The {\em weight} of a codeword is defined as the number of nonzero coordinates in that
codeword. The weight of a codeword equals its distance from zero. The above observations
tell us that for a linear code, the minimum distance equals the minimum of weights over all
possible codewords.
\subsection{Varshamov bound}
The Varshamov bound establishes the existence of, and in fact, gives a
method to probabilistically obtain, a {\em good} linear code.
\begin{theorem}[Varshamov bound]\rm
For $d < n/2$, there is an $[n,k,d]_2$ linear code with $k \geq n (1 - H_2(\delta)) - 3 \log n$.
\end{theorem}
\begin{proof}
Pick a random linear transformation $C: F_2^k \to F_2^n$. We argue that $C$ has a significant
probability of defining a linear code of minimum distance $\geq d$. In other words,
the probability that the image of $C$ has a vector of weight strictly smaller than $d$ is less than $1$.
For each vector of weight less than $d$, the probability that it occurs in the code space is $2^{k-n}$.
By the {\em union bound}, the probability that the Hamming ball of radius $d-1$ intersects the code space
is bounded above by $2^{k-n} V_2(n,d)$. But from the same computation used for the Gilbert bound,
we have that $2^{k-n} V_2(n,d) < 1$ whenever $k \leq n(1 - H_2(\delta)) - 3 \log_2 n$.
\end{proof}
\section{Reed-Solomon code}
\subsection{Setting of the code}
The Reed-Solomon code is constructed over a suitable finite field of $q$ elements.
\subsection{Description of the code}
Idea of the Reed-Solomon code:
\begin{itemize}
\item The message word goes into constructing the {\em coefficients}
of a degree $k$ polynomial (over a suitable finite field)
\item The code word stores the {\em value} taken by the polynomial at
a fixed collection of $n$ points. This collection of points of
evaluation is {\em fixed beforehand} and is independent of the
message (hence, in particular, it is known to the receiver with certainty).
\end{itemize}
The $n$ points of evaluation are denoted $x_1, x_2 \ldots x_n$. The message (whose letters form the coefficients)
is written $c_0c_1\ldots c_{k-1}$.
\subsection{Decoding: Berlekamp-Welch}
\begin{proposition}[Berlekamp-Welch decoding]\rm
The Reed-Solomon code can be decoded up to the maximum decoding radius, in polynomial time.
\end{proposition}
\begin{proof}
We first outline an algorithm, and then prove its correctness:
\begin{enumerate}
\item Find polynomials $N$ and $E$ such that:
\begin{enumerate}
\item $\deg(E) \leq e \leq (n-k-1)/2$
\item $\deg(N) \leq (n+k-1)/2$
\item For each $i$, $N(x_i) = E(x_i)y_i$
\end{enumerate}
\item If $E|N$ output the polynomial $N(x)/E(x)$, otherwise output ``error'' (indicating that the code word
went outside a Hamming ball).
\end{enumerate}
Let us examine step $(1)$ carefully. We begin by proving that there do exist candidates for $E$ and $N$.
Let $I$ be the set of coordinates at which the polynomial value got corrupted. Then the following are possible
candidates for $N$ and $E$:
\begin{eqnarray*}
E(x) & \equiv & \prod_{i \in I} (x - x_i)\\
N(x) & \equiv & E(x)P(x)
\end{eqnarray*}
Let's see why this works.
\begin{enumerate}
\item Since $|I| \leq e$, $\deg(E) \leq e$.
\item Since $\deg(P) = k$, we have $\deg (N) \leq k + e$. But $e =
(d-1)/2$ and $d = n-k$, so simplifying, we get $\deg(N) \leq
(n+k-1)/2$.
\item The condition $N(x_i) = E(x_i)y_i$ is clearly satisfied for $i
\in I$, because $E(x_i) = 0$ and $N(x_i) = 0$ as well. For $i
\notin I$, $N(x_i) = E(x_i) P(x_i)$. But $P(x_i)= y_i$ (because
this is an uncorrupted letter) and hence $N(x_i) = E(x_i) y_i$.
\end{enumerate}
Suppose $(N,E)$ and $(N',E')$ are two pairs satisfying the definition.
Then we claim that $NE' = N'E$. To show this, we observe
that $NE'$ and $N'E$ agree on $n$ values: namely all the $x_i$s.
Because the total
degree on both sides is at most $(n-1)$, the two polynomials are identically
equal.
Thus we are guaranteed two things: there exists a pair $(N,E)$
satisfying the three conditions
with the property that $P(x) = N(x)/E(x)$, and that any other pair $(N',E')$
satisfying the three conditions also gives the same value of $P(x)$.
\end{proof}
\end{document}