\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,amsthm}

\DeclareMathOperator*{\E}{\mathbb{E}}
\let\Pr\relax
\DeclareMathOperator*{\Pr}{\mathbb{P}}
\newcommand{\eps}{\epsilon}
\newcommand{\inprod}[1]{\left\langle #1 \right\rangle}
\newcommand{\R}{\mathbb{R}}

\newcommand{\handout}[5]{
 \noindent
 \begin{center}
 \framebox{
 \vbox{
 \hbox to 5.78in { {\bf CS 395T: Sublinear Algorithms } \hfill #2 }
 \vspace{4mm}
 \hbox to 5.78in { {\Large \hfill #5 \hfill} }
 \vspace{2mm}
 \hbox to 5.78in { {\em #3 \hfill #4} }
 }
 }
 \end{center}
 \vspace*{4mm}
}
\newcommand{\lecture}[4]{\handout{#1}{#2}{#3}{Scribe: #4}{Lecture #1}}

\newtheorem{theorem}{Theorem}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{example}[theorem]{Example}

% 1-inch margins, from fullpage.sty by H.Partl, Version 2, Dec. 15, 1988.
\topmargin 0pt
\advance \topmargin by -\headheight
\advance \topmargin by -\headsep
\textheight 8.9in
\oddsidemargin 0pt
\evensidemargin \oddsidemargin
\marginparwidth 0.5in
\textwidth 6.5in
\parindent 0in
\parskip 1.5ex

\begin{document}

\lecture{10\&11 --- Sept 30 \& Oct 2, 2014}{Fall 2014}{Prof.\ Eric Price}{Xue Chen}

\section{Overview}

These notes cover two lectures. The first gives an introduction to $\epsilon$-covers, $\epsilon$-packings, and RIP matrices. The second describes compressed sensing and Iterative Hard Thresholding.

\section{$\epsilon$-covers and $\epsilon$-packings}

First, we define metric spaces, $\epsilon$-covers, and the covering number.

\begin{definition}[Metric Space]
A metric space is an ordered pair $(X,d)$ where $X$ is a set and $d : X \times X \to \mathbb{R}$ is a metric on $X,ドル i.e., for all $x,y \in X$:
\begin{enumerate}
\item $d(x,y)\ge 0$.
\item $d(x,y)=0 \Leftrightarrow x=y$.
\item $d(x,y)=d(y,x)$.
\item $d(x,y)\le d(x,z)+d(z,y)$ for all $z \in X$.
\end{enumerate}
\end{definition}

\begin{definition}
An $\epsilon$-cover of $X$ with respect to $d$ is a collection of points $\{x_1,\cdots,x_n\} \subseteq X$ such that $\forall y\in X,\ \min_{1\le i \le n} d(y,x_i)\le \epsilon.$
\end{definition}

\begin{definition}
The covering number $N(\epsilon,X,d)$ is the minimum size of an $\epsilon$-cover of $X$ w.r.t.\ $d$.
\end{definition}

We abuse notation and write $N$ for $N(\epsilon,X,d)$ when $X$ and $d$ are clear from context. The quantity $\log N(\epsilon,X,d)$ is called the metric entropy of $(X,d)$; it measures the number of bits needed to specify a point of $X$ up to distance $\epsilon$ in $d$.

\begin{example}
$X=[-1,1],\ d(x,y)=|x-y|.$ Then $\{0, \pm 2\epsilon, \pm 4\epsilon, \cdots \}\cap X$ is an $\epsilon$-cover of $X,ドル so $N(\epsilon,X,d)\le \frac{2}{2\epsilon}+1=1+1/\epsilon$.
\end{example}

\begin{example}
$X=[-1,1]^m,\ d(x,y)=\|x-y\|_{\infty}$. From the above example, $\big(\{0, \pm 2\epsilon, \pm 4\epsilon,\cdots \}\cap[-1,1]\big)^m$ is an $\epsilon$-cover of $X$. So $N(\epsilon, X, d)\le (1+1/\epsilon)^m$ and the metric entropy is $\log N=\Theta(m\log(\frac{1}{\epsilon}))$.
\end{example}

A closely related concept is the packing number.

\begin{definition}
An $\epsilon$-packing of $X$ w.r.t.\ $d$ is a collection of points $\{x_1,\cdots, x_n\}\subseteq X$ such that $d(x_i,x_j)\ge \epsilon$ for all $i\neq j$. The packing number $M(\epsilon,X,d)$ is the maximum size of an $\epsilon$-packing of $(X,d)$.
\end{definition}
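To make these definitions concrete, here is a small Python sketch (an illustration only; the values of $m$ and $\epsilon$ are arbitrary choices) that builds the grid $\epsilon$-cover of $[-1,1]^m$ in $\ell_\infty$ from the example above and greedily extracts an $\epsilon$-packing from a set of random points:

\begin{verbatim}
import itertools
import numpy as np

def grid_cover(m, eps):
 # Grid points {0, +-2eps, +-4eps, ...}^m restricted to [-1,1]^m: an
 # eps-cover of [-1,1]^m in the l_infinity metric (exact when 1/(2*eps)
 # is an integer, as in the demo below).
 ticks = np.arange(-1.0, 1.0 + 2 * eps, 2 * eps)
 return [np.array(p) for p in itertools.product(ticks, repeat=m)]

def greedy_packing(points, eps):
 # Keep a point only if it is at l_infinity distance >= eps from every
 # point kept so far; the result is an eps-packing of the input set.
 packing = []
 for x in points:
 if all(np.max(np.abs(x - y)) >= eps for y in packing):
 packing.append(x)
 return packing

if __name__ == "__main__":
 m, eps = 2, 0.25
 cover = grid_cover(m, eps)
 rng = np.random.default_rng(0)
 sample = rng.uniform(-1, 1, size=(2000, m))
 # Every sampled point should be within eps of some cover point.
 max_dist = max(min(np.max(np.abs(x - c)) for c in cover) for x in sample)
 print("cover size:", len(cover), " max distance to cover:", max_dist)
 print("greedy packing size:", len(greedy_packing(list(sample), eps)))
\end{verbatim}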
\begin{lemma}\label{sandwich}
$$M(2\epsilon,X,d)\le N(\epsilon,X,d) \le M(\epsilon,X,d).$$
\end{lemma}

(Proof sketch: for the first inequality, each ball of radius $\epsilon$ around a cover point contains at most one point of a 2ドル\epsilon$-packing; for the second, a maximal $\epsilon$-packing is automatically an $\epsilon$-cover, since any point at distance more than $\epsilon$ from every packing point could be added to the packing.)

In general, the difference between $\epsilon$ and 2ドル\epsilon$ will only affect constants, which we will not care about in this course. So we will freely switch between the packing number and the covering number of $(X,d)$.

\section{$N(\epsilon,B^d_q,\|\cdot\|_p)$}

Now let us consider covers of $L_q$ balls in the $L_p$ norm.

\begin{definition}
$\|x\|_p=(\sum_{i} |x_i|^p)^{1/p}$ for any $p>0$. The unit ball of dimension $d$ in $L_p$ is $B^d_p=\{x\in \mathbb{R}^d \mid \|x\|_p\le 1\}$.
\end{definition}

For $p\ge 1,ドル $d(x,y)=\|x-y\|_p$ is a metric; the triangle inequality is Minkowski's inequality, which follows from H\"older's inequality. We abuse notation and write $\|\cdot\|$ when $p$ is clear from context. One basic property of $L_p$ norms we will use in this lecture is that $\| x \|_{p_0} \le \| x \| _{p_1}$ for $p_0 \ge p_1$. We also use $Vol(S)$ to denote the volume of $S$. Another property often used in this lecture is that $\frac{Vol(\alpha B^d_p)}{Vol(\beta B^d_p)}=(\frac{\alpha}{\beta})^d$, i.e., scaling a ball by $\alpha$ in each of $d$ dimensions multiplies its volume by $\alpha^d$.

\begin{fact}
$\frac{1}{\epsilon^d} \frac{Vol(B^d_q)}{Vol(B^d_p)} \le N(\epsilon,B^d_q,\|\cdot\|_p) \le (\frac{2}{\epsilon})^d \frac{ Vol(B^d_q + \frac{\epsilon}{2}B^d_p) }{Vol(B^d_p)}$.
\end{fact}

\begin{proof}
Lower bound: Let $\{x_1,x_2,\cdots, x_N\}$ be an $\epsilon$-cover of $B^d_q$. Because $B^d_q \subseteq \cup_i (x_i+\epsilon B^d_p),ドル we have $Vol(B^d_q) \le N \cdot Vol(\epsilon B^d_p) = N \epsilon^d\, Vol(B^d_p)$.

Upper bound: Let $\{x_1,x_2,\cdots, x_M\}$ be an $\epsilon$-packing of $B^d_q$. Because all the balls $x_i + \frac{\epsilon}{2} B^d_p$ are disjoint, $\cup_i (x_i + \frac{\epsilon}{2} B^d_p) \subseteq B^d_q + \frac{\epsilon}{2} B^d_p$ (some $x_i$ may be on the surface). Therefore $M \cdot Vol(\frac{\epsilon}{2} B^d_p)\le Vol(B^d_q + \frac{\epsilon}{2} B^d_p),ドル and an upper bound on the packing number is also an upper bound on the covering number by Lemma~\ref{sandwich}.
\end{proof}

To make this simpler, let's look at a couple of cases.

\paragraph{Same norm.} If $p=q,ドル the upper bound becomes
$$\left(\frac{2}{\epsilon}\right)^d \frac{ Vol(B^d_q + \frac{\epsilon}{2}B^d_p) }{Vol(B^d_p)}=\left(\frac{2}{\epsilon}\right)^d\frac{Vol( (1 + \frac{\epsilon}{2})B^d_p )}{Vol(B^d_p)}=\left(\frac{2}{\epsilon}\right)^d\left(1+\frac{\epsilon}{2}\right)^d=\left(1+\frac{2}{\epsilon}\right)^d.$$
Therefore $\frac{1}{\epsilon^d}\le N \le (1+\frac{2}{\epsilon})^d$.

\paragraph{When $q=1$ and $p=2$.} Because $B^d_1\subseteq B^d_2$ by the norm inequality above, one upper bound is
$$\left(\frac{2}{\epsilon}\right)^d\frac{ Vol(B^d_1 + \frac{\epsilon}{2}B^d_2) }{Vol(B^d_2)}\le \left(\frac{2}{\epsilon}\right)^d\frac{ Vol(( 1 + \frac{\epsilon}{2})B^d_2) }{Vol(B^d_2)} = \left(1+\frac{2}{\epsilon}\right)^d.$$
For the lower bound, $Vol(B^d_1)=\frac{2^d}{d!}$: the ball $B^d_1$ splits into 2ドル^d$ orthants, and its intersection with each orthant is a copy of the standard simplex, which has volume $\frac{1}{d!}$. Also $Vol(B^d_2)=\frac{\pi^{d/2}}{(d/2)!}$ for even $d$~\cite{Sphere}. Therefore $\frac{1}{\epsilon^d}\cdot \frac{2^d/d!}{\pi^{d/2}/(d/2)!}\le N,ドル which gives (up to $O(d)$ additive terms)
$$d\log(1/\epsilon) - \frac{d}{2}\log d \le \log N \le d \log\left(1+\frac{2}{\epsilon}\right).$$
This gives a tight bound of $\log N = \Theta(d \log (1/\eps))$ when $\eps < 1/d$. For $\eps> 1/\sqrt{d},ドル however, the lower bound becomes trivial. In fact, the volume argument is loose in the ``large $\eps$'' setting.

We can also show that $\log N(\epsilon, B^d_1, \|\cdot\|_2) \le O(\frac{\log d}{\epsilon ^2})$ by Maurey's empirical method (see, for example, \cite{NPW2012}). For any $\vec{x}=(x_1,\cdots,x_d) \in B^d_1,ドル consider the following experiment, where $e_1,e_2,\cdots, e_d$ is the standard basis of $\mathbb{R}^d$:
\begin{enumerate}
\item Sample $z_i$ independently for $i=1,\cdots, t$: set $z_i={\rm sign}(x_j)\, e_j$ with probability $|x_j|$ for each $j\in\{1,\dots,d\},ドル and $z_i=\vec{0}$ with the remaining probability 1ドル-\|x\|_1$.
\item Let $z=\frac{1}{t} \sum_i z_i$. Then $\E[z]=x$ and
$$\E[\|x-z\|^2_2]=\sum_{j=1}^d \E\Big[\big(x_j-\tfrac{1}{t}\textstyle\sum_i (z_i)_j\big)^2\Big]=\frac{1}{t}\sum_{j=1}^d |x_j|(1-|x_j|) \le \frac{1}{t} \sum_j |x_j|\le 1/t.$$
\item Choosing $t=1/\epsilon^2,ドル we get $\E[\|z-x\|^2_2]\le \epsilon^2,ドル so some realization of $z$ satisfies $\|z-x\|_2\le \epsilon$. Every realization of $z$ is an average of $t$ vectors from $\{\pm e_1,\cdots,\pm e_d,\vec{0}\},ドル so the set of possible values of $z$ is an $\epsilon$-cover of $B^d_1$ of size at most $(2d+1)^t$. Therefore $\log N \le t\log(2d+1) = O(\frac{\log d}{\epsilon^2})$.
\end{enumerate}
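The following Python sketch (an illustration only; the dimension, $\epsilon,ドル and number of trials are arbitrary choices) simulates this experiment and checks that the average $z$ is typically within roughly $\epsilon$ of $x$ in $\ell_2$:

\begin{verbatim}
import numpy as np

def maurey_approx(x, t, rng):
 # Approximate x in the l_1 ball by an average of t vectors, each a signed
 # standard basis vector (chosen with probability |x_j|) or the zero
 # vector (with the remaining probability 1 - ||x||_1).
 d = len(x)
 probs = np.append(np.abs(x), max(0.0, 1.0 - np.sum(np.abs(x))))
 probs /= probs.sum() # guard against floating-point drift
 choices = rng.choice(d + 1, size=t, p=probs) # index d means "zero vector"
 z = np.zeros(d)
 for c in choices:
 if c < d:
 z[c] += np.sign(x[c])
 return z / t

if __name__ == "__main__":
 rng = np.random.default_rng(0)
 d, eps = 1000, 0.1
 t = int(1 / eps**2)
 x = rng.standard_normal(d)
 x /= np.sum(np.abs(x)) # put x on the boundary of the l_1 ball
 errs = [np.linalg.norm(x - maurey_approx(x, t, rng)) for _ in range(50)]
 # E[||x - z||_2^2] <= 1/t = eps^2, so typical errors should be around eps.
 print("eps =", eps, " mean l_2 error =", np.mean(errs))
\end{verbatim}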
\section{Sparse Vectors and RIP Matrices}

Let us start with some definitions.

\begin{definition}
We use $supp(x)=\{i \mid x_i\neq 0,\ 1\le i \le n\}$ to denote the support of a vector $x,ドル and we define $\|x\|_0=|supp(x)|$ and the $k$-sparse space
$$T_k=\{x : \|x\|_2\le 1, \|x\|_0\le k\}.$$
\end{definition}

It is not difficult to see that $N(\epsilon,T_k,\|\cdot \|_2)\le {n \choose k}(1+\frac{2}{\epsilon})^k$: take the union, over all ${n\choose k}$ coordinate subsets of size $k,ドル of an $\epsilon$-cover of the unit ball in that subspace (each of size at most $(1+\frac{2}{\epsilon})^k$ by the same-norm bound above). Therefore $\log N(\epsilon, T_k, \|\cdot \|_2)\le O(k \log \frac{n}{k\epsilon}).$

Now we are interested in finding a matrix $A$ with ``few'' rows that preserves the norm of \emph{every vector $x \in T_k$}, i.e., such that $\frac{\|Ax\|_2}{\|x\|_2}$ is close to 1ドル$ for every nonzero $x \in T_k$. Recall what we proved in the construction of a JL matrix: if we sample a matrix $A\in \mathbb{R}^{m\times n}$ by independently sampling each entry $a_{i,j}\sim N(0,1/m),ドル then for any \emph{fixed} $x \in \mathbb{R}^n,ドル $\|Ax\|^2_2 = (1 \pm \epsilon) \|x\|^2_2$ with probability at least 1ドル - 2e^{-\epsilon^2 m/C}$ for some constant $C$ (see Lecture 2). If $T_k$ were finite, we could take a union bound to argue that the same way of generating $A$ also works for every vector $x \in T_k$. However, $T_k$ is infinite, so we need another argument.

Instead, let $S=\{x^{(1)},\cdots, x^{(N)}\}$ be an $\epsilon$-cover (a.k.a.\ ``net'') of $T_k$ with size $N\le {n \choose k}(1+\frac{2}{\epsilon})^k$. We can decompose any $x \in T_k$ as follows:
\begin{enumerate}
\item Find $x_1 \in S$ such that $x=x_1+\epsilon x'$ for $\|x'\|_2\le 1$ and $|supp(x')|\le k$. Since $S$ is an $\epsilon$-cover, there always exists $x_1 \in S$ such that $\|x-x_1\|_2\le\epsilon$ by definition. The bound $|supp(x')|\le k$ follows from a special property of our net: because it is a union of covers over all coordinate subsets of size $k,ドル we can choose $x_1$ from the same $k$-dimensional coordinate subspace as $x$.
\item If $x' \neq 0,ドル apply the above procedure to $x'$ again to get $x'=x_2+\epsilon x''$ such that $\|x''\|_2\le 1$ and $|supp(x'')|\le k,ドル and so on.
\item Eventually, we have $x=x_1+\epsilon x_2 + \epsilon^2 x_3 + \cdots + \epsilon^{i-1}x_i + \dotsb,ドル where all the $x_i \in S$.
\end{enumerate}
Now choose $m=C_0 \log N/ \epsilon^2=O(\frac{k}{\epsilon^2}\log \frac{n}{\epsilon k})$ for a large constant $C_0$ and $\epsilon<\frac{1}{2}$; then $\|Ax_i\|_2=(1\pm \epsilon)\|x_i\|_2$ for all $x_i\in S$ with high probability, by a union bound over the net $S$. This yields, for every $x\in T_k$ with $\|x\|_2=1,ドル
\begin{align*}
\|Ax\|_2&=\|A(x_1+\epsilon x_2 + \epsilon^2 x_3 + \cdots + \epsilon^{i-1}x_i + \cdots )\|_2\\
&\le \|Ax_1\|_2 + \epsilon \|Ax_2\|_2 + \epsilon^2 \|Ax_3\|_2 + \cdots + \epsilon^{i-1} \|Ax_i\|_2 + \cdots \\
&\le (1+\epsilon)(1+\epsilon+\epsilon^2+\cdots + \epsilon^{i-1}+\cdots)\\
&\le (1+\epsilon)\frac{1}{1-\epsilon}\\
&\le 1+O(\epsilon).
\end{align*}
A symmetric calculation gives the lower bound: $\|Ax\|_2 \ge \|Ax_1\|_2 - \epsilon\|Ax_2\|_2 - \epsilon^2\|Ax_3\|_2 - \cdots \ge (1-\epsilon)\|x_1\|_2 - (1+\epsilon)\frac{\epsilon}{1-\epsilon} \ge 1-O(\epsilon),ドル using $\|x_1\|_2\ge \|x\|_2-\epsilon=1-\epsilon$. So $A$ preserves the norm of every $x\in T_k$ up to a factor of 1ドル\pm O(\epsilon)$.
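As a quick numerical sanity check of this phenomenon (not a proof, and with arbitrarily chosen parameters), one can sample such a Gaussian matrix and measure $\|Ax\|_2/\|x\|_2$ over many random $k$-sparse unit vectors:

\begin{verbatim}
import numpy as np

def sparse_distortion(m, n, k, trials, rng):
 # Sample A with i.i.d. N(0, 1/m) entries and record ||Ax||_2 / ||x||_2
 # over random k-sparse unit vectors x. Checking random x is only a
 # sanity check; it does not certify RIP, which quantifies over all x.
 A = rng.standard_normal((m, n)) / np.sqrt(m)
 ratios = []
 for _ in range(trials):
 x = np.zeros(n)
 support = rng.choice(n, size=k, replace=False)
 x[support] = rng.standard_normal(k)
 x /= np.linalg.norm(x)
 ratios.append(np.linalg.norm(A @ x)) # ||x||_2 = 1, so this is the ratio
 return min(ratios), max(ratios)

if __name__ == "__main__":
 rng = np.random.default_rng(0)
 lo, hi = sparse_distortion(m=200, n=2000, k=10, trials=1000, rng=rng)
 print("min/max of ||Ax||_2 over random 10-sparse unit x:", lo, hi)
\end{verbatim}

Of course, checking random $x$ says nothing about the worst case over all of $T_k$; that is exactly what the net argument above handles.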
\begin{definition}[Restricted Isometry Property]
An $m\times n$ matrix $A$ has the $(k,\epsilon)$-restricted isometry property (RIP) if for all $x$ with $\|x\|_0\le k,ドル
$$(1-\epsilon)\|x\|_2 \le \|Ax\|_2 \le (1+\epsilon)\|x\|_2.$$
\end{definition}

An essentially equivalent way (up to constant factors in $\epsilon$) to define RIP is: for every subset $S$ of size $k,ドル $\|(A_S)^T A_S - I\|_2 \le \epsilon,ドル where $A_S$ is the submatrix formed by the columns in $S$ and $\|A\|_p=\sup_{x\neq \vec{0}}\frac{ \|Ax\|_p}{\|x\|_p}$ is the operator norm. From now on, we assume $\epsilon$ is a small constant such as 1ドル/10$ or 1ドル/100$, unless specified otherwise.

In general, we are interested in constructing an $m \times n$ RIP matrix $A$ with the following properties:
\begin{enumerate}
\item It is easy to check whether $A$ satisfies RIP.
\item $A$ can be stored in $o(mn)$ space.
\item The product $Ax$ can be computed in $o(mk)$ time for $x \in T_k$.
\item $m$ is as small as possible.
\end{enumerate}

\begin{example}[Random (sub)Gaussian Matrix]
We generate $A$ by independently sampling a (sub)Gaussian variable (with variance 1ドル/m$) for every entry. With overwhelming probability ($\ge 1-e^{-\Omega(m)}$), $A$ is an RIP matrix, provided $m=\Omega(\frac{1}{\eps^2}k \log (n/k))$. However, we do not know how to verify that a given $A$ satisfies RIP, even though it holds with very high probability, and a dense random matrix is also bad in terms of storage and multiplication time.
\end{example}

\begin{example}[Coherent Matrix]
A matrix $A$ with $n$ columns $\{a_1,a_2,\cdots, a_n\}$ is defined to be $\alpha$-coherent iff $\frac{|\inprod{a_i,a_j}|}{\|a_i\|_2\cdot \|a_j\|_2}\le \alpha$ for all $i\neq j$. Let $A'$ be the normalized version of $A$ (normalize every column to a unit vector), with columns $a'_1,\cdots,a'_n$. We can show $A'$ is a $(k,\alpha \cdot k)$-RIP matrix: for any $k$-sparse vector $x,ドル
$$\|A'x\|^2_2=\sum_{i \in supp(x)} \sum_{j \in supp(x)}x_{i}x_j\inprod{a'_i,a'_j}\le\sum_{i \in supp(x)}x^2_i + \sum_{i\neq j}\alpha |x_ix_j| \le \|x\|^2_2+\alpha k \|x\|^2_2,$$
where the last step uses $\sum_{i\neq j}|x_i x_j|\le \|x\|_1^2 \le k\|x\|_2^2$; the lower bound $\|A'x\|_2^2\ge (1-\alpha k)\|x\|_2^2$ follows in the same way.

Coherence is easy to verify, but this approach needs a large $m$ if we want to use it to get an RIP matrix. For example, suppose we generate every $a_i$ by independently choosing $\pm 1$ in every entry. It is not too difficult to see (compute the variance of $\inprod{a_i,a_j}$) that $\alpha = \Omega(1/\sqrt{m})$ with high probability, so achieving $\alpha = O(1/k)$ forces $m\ge \Omega(1/\alpha^2) = \Omega(k^2)$. Finding an explicit RIP matrix with $m$ much smaller than $k^2$ is a challenging open problem. Some progress was made by Bourgain et al.~\cite{BDFKK11}, who obtained $m=k^{2-\delta}$ for a universal constant $\delta,ドル but $\delta$ is tiny.
\end{example}

\begin{example}[Fourier Matrix]
An $n\times n$ Fourier matrix $F$ is defined by $F_{j,k}=\omega^{jk},ドル where $\omega=e^{2\pi i/n}$ and $i$ here denotes the imaginary unit~\cite{FM}. Another way to construct a $(k,\epsilon)$-RIP matrix is to sample a row subset $S$ of size $m=O(k\log n \log^3 \log k)$ and take the (suitably rescaled) $|S|\times n$ submatrix $A$; then $A$ is an RIP matrix w.h.p.~\cite{CGV2013}. There is an algorithm that multiplies $A$ by $x$ in time $\tilde{O}(m)$ and stores $A$ in space $O(k \log^2 n \log^3 \log k)$. However, we do not know how to verify that a matrix $A$ generated this way is an RIP matrix. The same construction also works for the Hadamard matrix, and a similar one works for circulant matrices.
\end{example}
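To illustrate the kind of object this example describes, here is a small Python sketch (an illustration only; the values of $n,ドル $m,ドル and $k$ are arbitrary, and $m$ is not the bound from the theorem) of a subsampled Fourier measurement operator applied via the FFT, so the full $m\times n$ matrix is never stored:

\begin{verbatim}
import numpy as np

def subsampled_fourier(n, m, rng):
 # Choose a random set of m row indices of the n x n DFT matrix; the
 # resulting operator is represented implicitly by these indices alone.
 rows = rng.choice(n, size=m, replace=False)

 def apply(x):
 # np.fft.fft computes all n entries of Fx in O(n log n) time; we keep
 # only the m sampled rows. Each entry of F has modulus 1, so dividing
 # by sqrt(m) gives the implicit matrix unit-norm columns.
 return np.fft.fft(x)[rows] / np.sqrt(m)

 return apply

if __name__ == "__main__":
 rng = np.random.default_rng(0)
 n, m, k = 4096, 400, 10
 A = subsampled_fourier(n, m, rng)
 x = np.zeros(n)
 x[rng.choice(n, size=k, replace=False)] = rng.standard_normal(k)
 print("||Ax||_2 =", np.linalg.norm(A(x)), " ||x||_2 =", np.linalg.norm(x))
\end{verbatim}

The point of the construction is exactly this: the matrix is specified by the $m$ sampled row indices, and applying it costs one FFT.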
One interesting question about RIP matrices is how sparse an RIP matrix can be. A negative result is that every column must have at least $\Omega(k)$ nonzero entries (see HW3).

\section{Compressed Sensing}

Let $A \in \mathbb{R}^{m\times n},ドル with every column of $A$ normalized to have norm roughly 1ドル$. We are given $y=Ax+e$ for some $x\in \mathbb{R}^n$ with $\|x\|_0 \le k,ドル where $e$ is a noise vector. The goal of compressed sensing is, given $y,ドル to efficiently recover a vector $\hat{x}$ such that $\|x-\hat{x}\|_2 \le C \|e\|_2$ for some universal constant $C$.

Compressed sensing has wide applications in industry. For example, it has been used in image processing, magnetic resonance imaging (MRI), oil exploration, spectrum sensing, and feature testing. It takes advantage of the data being sparse in some basis. We first discuss the differences between compressed sensing and sparse recovery, then introduce Iterative Hard Thresholding to recover $\hat{x}$.

Compressed sensing is very similar to ``sparse recovery'' or ``heavy hitters'', problems we saw earlier in class with Count-Min and Count-Sketch. ``Compressed sensing'' and ``sparse recovery'' are terms for essentially the same problem that grew out of different communities: ``compressed sensing'' from math/statistics/signal processing, and ``sparse recovery'' from streaming algorithms in computer science. That said, there are noticeable differences in problem formulation and approaches between the two communities, so it makes sense to preserve the distinction. Note that the following list isn't formal or definitive; it reflects a general sense of the differences between two communities working in the same area, and there is work that blurs the lines.
\begin{enumerate}
\item In sparse recovery, the matrix $A$ is random and the algorithm only needs to find the correct answer w.h.p.\ over the choice of $A$ for each fixed input $x$ (as with Count-Sketch). In compressed sensing, we have to choose the matrix $A$ before seeing $y,ドル and the algorithm should be able to recover $\hat{x}$ for every $y=Ax+e$ where $x,e$ satisfy the requirements.
\item In compressed sensing, there is a noise vector $e$ and one often assumes that $x$ is exactly $k$-sparse, while in sparse recovery one generally assumes that $x$ is not $k$-sparse but that $Ax$ is observed exactly. This distinction is generally not too important, and algorithms that work in one noise model typically also work in the other.
\item Sparse recovery cares more about running time. Sparse recovery algorithms strive for $n \log^c n$ or ideally $k \log^c n$ time, while compressed sensing algorithms are often happy with $n^c$ time.
\item In sparse recovery, the algorithm is closely tied to the particular matrix it uses. In compressed sensing, the algorithm works as long as the matrix $A$ has some property $P$. For example, if $A$ satisfies RIP, then solving $\arg\min_{\hat{x}:\|A\hat{x}-y\|_2 \le \epsilon} \|\hat{x}\|_{1}$ gives a good recovery $\hat{x}$ of $x$; this can be done by convex optimization ($L_1$ minimization) or by iterative methods.
\end{enumerate}

\section{Iterative Hard Thresholding}

We focus on Iterative Hard Thresholding (IHT) in the rest of these notes. We first describe the algorithm and then give the analysis. Let $A$ be a $(C\cdot k,\epsilon)$-RIP matrix and let $H_k(x)$ denote the projection of $x$ onto its top $k$ coordinates, i.e., the vector that keeps the $k$ largest-magnitude entries of $x$ and zeroes out the rest.

Given $y=Ax+e$ where $\|x\|_0 \le k$ and $\|e\|_2$ is small, the algorithm works as follows, for an appropriate choice of $t$:
\begin{enumerate}
\item $x^{(1)}=\vec{0}$.
\item For $i=1,2,\cdots,t$:
\item $\quad x^{(i+1)}=H_k\big( x^{(i)} +A^{T} (y-Ax^{(i)}) \big)$.
\end{enumerate}
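A direct Python translation of this loop (an illustration only; the dimensions, noise level, and iteration count below are arbitrary choices, and the Gaussian matrix is only heuristically RIP-like) is:

\begin{verbatim}
import numpy as np

def hard_threshold(v, k):
 # H_k(v): keep the k largest-magnitude entries of v, zero out the rest.
 out = np.zeros_like(v)
 top = np.argsort(np.abs(v))[-k:]
 out[top] = v[top]
 return out

def iht(A, y, k, t):
 # Iterative Hard Thresholding: x <- H_k(x + A^T (y - A x)), t times.
 x = np.zeros(A.shape[1])
 for _ in range(t):
 x = hard_threshold(x + A.T @ (y - A @ x), k)
 return x

if __name__ == "__main__":
 rng = np.random.default_rng(0)
 m, n, k = 400, 1000, 10
 A = rng.standard_normal((m, n)) / np.sqrt(m) # heuristically RIP-like, not verified
 x = np.zeros(n)
 x[rng.choice(n, size=k, replace=False)] = rng.standard_normal(k)
 e = 0.01 * rng.standard_normal(m)
 x_hat = iht(A, A @ x + e, k, t=50)
 print("recovery error:", np.linalg.norm(x - x_hat),
 " noise norm:", np.linalg.norm(e))
\end{verbatim}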
We are going to prove that $\|x^{(t+1)}-x\|_2 \le O(\|e\|_2)$ for $t=O(\log \frac{\|x\|_2}{\|e\|_2})$. The intuition behind the algorithm is that $A^{T} (y-Ax^{(i)}) = A^{T} A (x-x^{(i)}) + A^T e$. Because $\|A^{T}_S A_S - I\|_2 \le \epsilon$ for any column subset $S$ of size $C\cdot k,ドル we can think of $A^T A$ as approximately the identity on sparse vectors, so $A^{T} A (x-x^{(i)}) + A^T e \approx x-x^{(i)}$.

\begin{proof}
Let $x_0=x - x^{(i)}$ and $z=A^T Ax_0 + A^T e,ドル and let $H$ be the support set of $x_0$ (so $|H|\le 2k$). Because $x^{(i+1)}=H_k(x^{(i)}+z),ドル we first bound $\|z-x_0\|_2$. For any column subset $S$ of size at most $(C-2)k,ドル we bound $\|(z-x_0)_S\|_2$ as follows:
\begin{align*}
\|(z-x_0)_S\|_2 &= \| \big((A^T A - I)x_0 + A^T e\big)_S \|_2\\
&\le \|\big( (A^T A -I) x_0 \big)_{S\cup H} \|_2 + \|(A^T e)_S\|_2\\
&\le \|(A^T A -I )_{(S \cup H) \times (S \cup H)}\|_2 \cdot \|x_0\|_2+\|A_S\|_2 \cdot \|e\|_2\\
&\le \epsilon \|x_0\|_2 + (1+\epsilon) \|e\|_2,
\end{align*}
where the last step uses the RIP of $A$ on sets of size $|S\cup H|\le Ck$.

However, our goal is to show that $\|x - x^{(i+1)}\|_2$ shrinks by a constant factor in each step. Since $x^{(i+1)}$ keeps only the top $k$ coordinates of $x^{(i)}+z,ドル we need the following lemma to convert the bound on $\|(z-x_0)_S\|_2$ into a bound on $\|x - x^{(i+1)}\|_2$.

\begin{lemma}
Let $x,z \in \mathbb{R}^n,ドル where $x$ is $k$-sparse with support set $H,ドル and let $S$ be the set of the top $k$ coordinates of $z$ (in magnitude). Then
$$\|x-z_S\|^2_2 \le 5 \|(x-z)_{H \cup S}\|^2_2.$$
\end{lemma}
\begin{proof}
Coordinates in $H\cap S$ contribute the same term $(x_j-z_j)^2$ to both sides, and coordinates outside $H\cup S$ contribute nothing. Pair up each $i\in H\setminus S$ with a distinct $j \in S \setminus H$ (recall $|S|=|H|=k,ドル so $|H\setminus S|=|S\setminus H|,ドル and $|z_j| \ge |z_i|$ by the definition of $S$). Since $x_j=0$ for $j\in S\setminus H,ドル it is enough to prove for each pair that
$$x_i^2+z_j^2 \le 5\big ( (z_i-x_i)^2+z_j^2\big ).$$
We consider two cases:
\begin{enumerate}
\item $|z_i|\ge|x_i|/2$: then $x_i^2 \le 4 z_i^2 \le 4 z_j^2$.
\item $|z_i|<|x_i|/2$: then $x_i^2 \le 4 (x_i-z_i)^2$.
\end{enumerate}
In either case the claimed inequality holds.
\end{proof}

Continuing with the proof of convergence, apply the lemma with the vector $x^{(i)}+z$ playing the role of $z$ in the lemma (its top-$k$ restriction is exactly $x^{(i+1)}$), and let $S$ be the support of $x^{(i+1)}$. Note that $|S\cup supp(x)|\le 2k \le (C-2)k$ for $C\ge 4,ドル so the bound above applies to this set. If $\epsilon<0.1$ and $\|x^{(i)}-x\|_2 \ge 12 \|e\|_2,ドル we have
\begin{align*}
\|x - x^{(i+1)}\|_2 &\le \sqrt{5}\, \|\big(x - (x^{(i)}+z)\big)_{S \cup supp(x)}\|_2 \\
&= \sqrt{5}\, \|(x_0 - z)_{S \cup supp(x)}\|_2 \\
&\le \sqrt{5}\, \epsilon \|x_0\|_2 + \sqrt{5}(1+\epsilon)\|e\|_2 \\
&\le \|x^{(i)} - x\|_2/4 + 3 \|e\|_2 \\
& \le \|x^{(i)} - x\|_2/2.
\end{align*}
So the error $\|x^{(i)}-x\|_2$ halves in every step until it drops below 12ドル\|e\|_2$. Therefore, for $t=O(\log \frac{\|x\|_2}{\|e\|_2}),ドル we have $\|x^{(t+1)}-x\|_2=O(\|e\|_2)$.
\end{proof}

\begin{thebibliography}{42}

\bibitem[Sphere]{Sphere}
\newblock Sphere. Wikipedia, https://en.wikipedia.org/wiki/Sphere.

\bibitem[NPW12]{NPW2012}
Jelani~Nelson, Eric~Price, Mary~Wootters.
\newblock New constructions of RIP matrices with fast multiplication and fewer rows.
\newblock In {\em SODA 2014}, pages 1515--1528.

\bibitem[CGV13]{CGV2013}
Mahdi~Cheraghchi, Venkatesan~Guruswami, Ameya~Velingker.
\newblock Restricted isometry of Fourier matrices and list decodability of random linear codes.
\newblock In {\em SODA 2013}, pages 432--442.

\bibitem[BDFKK11]{BDFKK11}
Jean~Bourgain, Stephen~Dilworth, Kevin~Ford, Sergei~Konyagin, Denka~Kutzarova.
\newblock Breaking the $k^2$ barrier for explicit RIP matrices.
\newblock In {\em STOC 2011}, pages 637--644.

\bibitem[FM]{FM}
\newblock Fourier Matrix. MathWorld, http://mathworld.wolfram.com/FourierMatrix.html.

\end{thebibliography}

\end{document}
