diff --git a/doc/fundamentals.tex b/doc/fundamentals.tex
index c829fd13..578f644f 100644
--- a/doc/fundamentals.tex
+++ b/doc/fundamentals.tex
@@ -450,7 +450,7 @@ \subsection{Backpropagation in DAGs}\label{s:dag}
 d\bx_{t}
 \leftarrow
 d\bx_{t} + \frac{d\langle \bp_L, f_{\pi_L}(\bx_0,\dots,\bx_{t-1})\rangle}{d\bx_t}.
 \]
-Here, for uniformity with the other iterations, we use the fact that $d\bx_l$ are initialized to zero an\emph{accumulate} the values instead of storing them. In practice, the update operation needs to be carried out only for the variables $\bx_l$ that are actual inputs to $f_{\pi_L}$, which is often a tiny fraction of all the variables in the DAG.
+Here, for uniformity with the other iterations, we use the fact that $d\bx_l$ are initialized to zero and \emph{accumulate} the values instead of storing them. In practice, the update operation needs to be carried out only for the variables $\bx_l$ that are actual inputs to $f_{\pi_L}$, which is often a tiny fraction of all the variables in the DAG.
 After the update, each $d\bx_t$ contains the projected derivative of function $h_L$ with respect to the corresponding variable:
 \[
diff --git a/doc/site/docs/wrappers.md b/doc/site/docs/wrappers.md
index d5debe75..08e9cfbf 100644
--- a/doc/site/docs/wrappers.md
+++ b/doc/site/docs/wrappers.md
@@ -27,7 +27,7 @@ cellarray `net.layers` with a list of layers. For example:
     net.layers{1} = struct(...
      'name', 'conv1', ...
      'type', 'conv', ...
-     'weights', {randn(10,10,3,2,'single'), randn(2,1,'single')}, ...
+     'weights', {{randn(10,10,3,2,'single'), randn(2,1,'single')}}, ...
      'pad', 0, ...
      'stride', 1) ;
     net.layers{2} = struct(...
diff --git a/matlab/simplenn/vl_simplenn_display.m b/matlab/simplenn/vl_simplenn_display.m
index 3bdd96de..58e51ae2 100644
--- a/matlab/simplenn/vl_simplenn_display.m
+++ b/matlab/simplenn/vl_simplenn_display.m
@@ -13,7 +13,7 @@
 % `inputSize`:: auto
 % Specifies the size of the input tensor X that will be passed to
 % the network as input. This information is used in order to
-% estiamte the memory required to process the network. When this
-% option is not used, VL_SIMPLENN_DISPLAY() tires to use values
+% estimate the memory required to process the network. When this
+% option is not used, VL_SIMPLENN_DISPLAY() tries to use values
 % in the NET structure to guess the input size:
 % NET.META.INPUTSIZE and NET.META.NORMALIZATION.IMAGESIZE
diff --git a/matlab/vl_nnbilinearsampler.m b/matlab/vl_nnbilinearsampler.m
index ef182760..d4a5dd59 100644
--- a/matlab/vl_nnbilinearsampler.m
+++ b/matlab/vl_nnbilinearsampler.m
@@ -16,18 +16,18 @@
 % For output image n, GRID(1,:,:,n) specifies the vertical location
 % v of a sample in the input image X and GRID(2,:,:,n) the
 % horizontal location u. The convention follows standard
-% impelemntations of this operator in the literature. Namely:
+% implementations of this operator in the literature. Namely:
 %
 % 1. The grid coordinates are normalized in the range [-1,1]. This
 % means that (-1,-1) is the center of the upper-left pixel in the
 % input image and (+1,+1) the center of the bottom-right pixel.
 %
-% 2. The V,U coordiante planes are stacked in the fisrt dimension of
+% 2. The V,U coordinate planes are stacked in the first dimension of
 % GRID instead of in the third, as it would be more natural in
 % MatConvNet (as these could be interpreted as 'channels' in
 % GRID).
 %
-% Further, No can be a multiple of N; in this case, it is assumed
+% Further, No must be a multiple of N; in this case, it is assumed
 % that there are No/N transforms per input image, hence, the
 % transforms [1 ... No/N] are applied to the first image, [No/N+1
 % ... 2*No/N] are applied to the second image, etc.
diff --git a/matlab/vl_nnloss.m b/matlab/vl_nnloss.m
index 3343d06f..bb87a529 100644
--- a/matlab/vl_nnloss.m
+++ b/matlab/vl_nnloss.m
@@ -25,7 +25,7 @@
 %
 % In the third form, C has dimension H x W x D x N and specifies
 % attributes rather than categories. Here elements in C are either
-% +1 or -1 and C, where +1 denotes that an attribute is present and
+% +1 or -1, where +1 denotes that an attribute is present and
 % -1 that it is not. The key difference is that multiple attributes
 % can be active at the same time, while categories are mutually
 % exclusive. By default, the loss is *summed* across attributes
diff --git a/matlab/vl_tmove.m b/matlab/vl_tmove.m
index 79737210..1ccecc9b 100644
--- a/matlab/vl_tmove.m
+++ b/matlab/vl_tmove.m
@@ -42,7 +42,7 @@
 % format = {'single', [1 1], 'x0' ;
 %           'double', [10 5], 'x1' }
 %
-% As ane extension, it is possible to declare all or some of the
+% As an extension, it is possible to declare all or some of the
 % tensors as GPU ones, by adding a fourth column to FORMAT:
 %
 % format = {'single', [1 1], 'x0', 'cpu' ;
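The wrappers.md hunk above touches a classic MATLAB pitfall: struct() interprets a cell-array value as a template for a struct *array*, so a cell-valued field must be wrapped in a second pair of braces. A minimal sketch of the difference, in plain MATLAB (no MatConvNet required):

    % Double braces: one struct whose 'weights' field is a 1x2 cell array,
    % which is what the simplenn wrapper expects.
    s1 = struct('weights', {{randn(10,10,3,2,'single'), randn(2,1,'single')}}) ;
    assert(numel(s1) == 1 && iscell(s1.weights) && numel(s1.weights) == 2) ;

    % Single braces: a 1x2 struct ARRAY, with one tensor per element.
    s2 = struct('weights', {randn(10,10,3,2,'single'), randn(2,1,'single')}) ;
    assert(numel(s2) == 2) ;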
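The grid convention documented in the vl_nnbilinearsampler.m hunk is easiest to check with an identity warp, where the output should reproduce the input. A minimal sketch, assuming MatConvNet is compiled and on the path; the sizes below are illustrative, not from the patch:

    H = 16 ; W = 16 ; N = 2 ;
    x = randn(H, W, 3, N, 'single') ;
    % Pixel centers in normalized coordinates: (-1,-1) is the center of
    % the upper-left pixel, (+1,+1) the center of the bottom-right pixel.
    [u, v] = meshgrid(linspace(-1, 1, W), linspace(-1, 1, H)) ;
    grid = zeros(2, H, W, N, 'single') ;
    for n = 1:N
      grid(1,:,:,n) = v ;  % vertical coordinate v in the FIRST plane
      grid(2,:,:,n) = u ;  % horizontal coordinate u in the second plane
    end
    y = vl_nnbilinearsampler(x, grid) ;  % identity warp: y matches x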